# MovieLens MLN Recommendation via PyTorch

adapted from https://github.com/fastai/fastai

In [265]:
import math
import copy
import pickle
from pathlib import Path
from itertools import zip_longest

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch import optim
from torch.optim.lr_scheduler import _LRScheduler

import networkx as nx

In [266]:
def set_random_seed(state=1):
    gens = (np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for set_state in gens:
        set_state(state)

RANDOM_STATE = 1
set_random_seed(RANDOM_STATE)

In [267]:
def read_data(path):
    files = {}
    path = Path(path)
    for filename in path.glob('*'):
        if filename.suffix == '.csv':
            files[filename.stem] = pd.read_csv(filename)
        elif filename.suffix == '.dat':
            if filename.stem == 'ratings':
                columns = ['userId', 'movieId', 'rating', 'timestamp']
            else:
                columns = ['movieId', 'title', 'genres']
            data = pd.read_csv(filename, sep='::', names=columns, engine='python')
            files[filename.stem] = data
    return files['ratings'], files['movies']

In [268]:
# pick one of the available folders
ratings, movies = read_data('/home/weiss/rs_data/ml-latest-small')

In [269]:

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [270]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [271]:
def create_dataset(ratings, top=None):
    if top is not None:
        ratings.groupby('userId')['rating'].count()

    unique_users = ratings.userId.unique()
    user_to_index = {old: new for new, old in enumerate(unique_users)}
    new_users = ratings.userId.map(user_to_index)

    unique_movies = ratings.movieId.unique()
    movie_to_index = {old: new for new, old in enumerate(unique_movies)}
    new_movies = ratings.movieId.map(movie_to_index)

    n_users = unique_users.shape[0]
    n_movies = unique_movies.shape[0]

    X = pd.DataFrame({'user_id': new_users, 'movie_id': new_movies})
    y = ratings['rating'].astype(np.float32)
    return (n_users, n_movies), (X, y), (user_to_index, movie_to_index)

(n, m), (X, y), _ = create_dataset(ratings)
print(f'Embeddings: {n} users, {m} movies')
print(f'Dataset shape: {X.shape}')
print(f'Target shape: {y.shape}')

Embeddings: 610 users, 9724 movies
Dataset shape: (100836, 2)
Target shape: (100836,)


In [272]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.4, random_state=RANDOM_STATE)
X_valid, X_test, y_valid, y_test = train_test_split(X_valid, y_valid, test_size=0.5, random_state=RANDOM_STATE)
datasets = {'train': (X_train, y_train), 'val': (X_valid, y_valid), 'test': (X_test, y_test)}
dataset_sizes = {'train': len(X_train), 'val': len(X_valid), 'test': len(X_test)}

In [273]:
class RatingsIterator:

    def __init__(self, X, y, batch_size=32, shuffle=True):
        X, y = np.asarray(X), np.asarray(y)
        if shuffle:
            index = np.random.permutation(X.shape[0])
            X, y = X[index], y[index]
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.n_batches = int(math.ceil(X.shape[0] // batch_size))
        self._current = 0

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        if self._current >= self.n_batches:
            raise StopIteration()
        k = self._current
        self._current += 1
        bs = self.batch_size
        return self.X[k*bs:(k + 1)*bs], self.y[k*bs:(k + 1)*bs]

(       user_id  movie_id
92457      596      1140
77404      482       105
65287      417      4214
59648      386       180
69512      447      7564
...        ...       ...
10684       67      1033
28784      198       184
97062      603       519
62054      410      2432
21419      139      4921

[20168 rows x 2 columns], 92457    5.0
77404    4.0
65287    4.5
59648    3.0
69512    2.0
        ... 
10684    3.0
28784    5.0
97062    2.0
62054    2.0
21419    3.0
Name: rating, Length: 20168, dtype: float32)


In [311]:
#Create a graph
G = nx.Graph()

#Add nodes
G.add_nodes_from(X_train.user_id.unique(), bipartite=0)
G.add_nodes_from(X_train.movie_id.unique(), bipartite=1)

for X, y in RatingsIterator(X_train, y_train, batch_size=1):
    print(X[0][0], X[0][1], y[0])

#G.add_weighted_edges_from((X[0][0], X[0][1], y[0]))
#Add weights (ratings) for edges
#G.add_weighted_edges_from([(userId, movieId, rating) for (userId, movieId, rating)
#              in train[['userId', 'movieId', 'rating']].to_numpy()])

print(nx.info(G))

317 2740 4.0
482 61 5.0
231 5932 2.5
386 1635 1.5
404 361 3.5
595 253 3.5
598 1367 3.0
447 5820 1.5
447 1866 4.5
523 1166 5.0
473 1612 2.5
447 7868 3.5
153 926 5.0
413 827 3.0
44 2324 4.5
562 3432 2.0
413 125 3.0
274 2448 5.0
324 5633 2.0
473 2394 3.5
589 875 3.5
307 15 2.5
9 900 4.5
261 2651 3.0
22 356 4.0
596 383 4.0
143 621 3.5
596 6599 4.0
5 654 3.0
294 2657 3.5
552 1718 5.0
225 779 4.0
5 1 5.0
56 1668 4.0
14 1031 4.5
379 889 3.0
609 6442 2.5
139 1123 3.0
561 2285 4.5
274 2468 4.0
303 660 5.0
424 2190 4.0
61 1796 4.0
386 2177 3.5
135 532 4.0
583 323 5.0
287 50 5.0
248 227 3.0
124 809 4.0
401 343 5.0
221 1313 2.5
509 1652 3.0
41 1015 4.0
225 757 3.5
364 457 2.5
351 1075 5.0
278 1091 3.5
599 1838 1.5
569 471 3.5
379 1102 5.0
218 92 3.5
158 232 4.5
281 728 4.5
261 475 5.0
273 3446 3.5
554 1872 4.0
102 2657 5.0
386 2038 4.0
402 162 4.5
437 660 3.5
417 1342 4.0
473 2687 3.0
304 175 4.5
331 1125 4.0
306 639 0.5
560 1915 4.0
409 343 4.0
519 1055 4.5
65 263 3.5
69 3504 5.0
18 1567 3.0
189 

KeyboardInterrupt: 