In [93]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
import numpy as np, pandas as pd
import pickle, gc
gc.collect()


movies_pivot = pd.read_csv('data/kafka_log-movielog6_stream_processed(initial).csv', header=None)
print(movies_pivot.shape)
movies_pivot.head(2)

(124208, 5)


Unnamed: 0,0,1,2,3,4
0,2022-09-21T19:52:45,189292,rate,casino+jack+and+the+united+states+of+money+2010,2
1,2022-09-21T19:52:42,200691,rate,quick+change+1990,3


In [101]:
with open("model/top_k_movies.pkl",'wb') as f:
    top_k_movies = movies_pivot.sort_values(4, ascending=False)[3].tolist()
    pickle.dump(top_k_movies, f, protocol=pickle.HIGHEST_PROTOCOL)

In [75]:
"""creating a pivot table"""
movies_pivot = movies_pivot.groupby(1).aggregate({3:'first', 4: 'mean'})
movies_pivot.columns = ['movie_id', 'rate']
print(movies_pivot.shape)
movies_pivot.head(2)

(67786, 2)


Unnamed: 0_level_0,movie_id,rate
1,Unnamed: 1_level_1,Unnamed: 2_level_1
2,peter+pan+1953,4.0
3,ruhr+2009,3.0


In [76]:
user_label = LabelEncoder().fit_transform(movies_pivot.index)
movie_label = LabelEncoder().fit_transform(movies_pivot.movie_id)
mappings = {
    'user': {k: v for k,v in zip(user_label, movies_pivot.index)},
    'movie': {k: v for k,v in zip(movie_label, movies_pivot['movie_id'])}
}
with open("model/id_mapping.pkl",'wb') as f:
    pickle.dump(mappings, f, protocol=pickle.HIGHEST_PROTOCOL)
#
movies_pivot= movies_pivot.reset_index(drop=True)
movies_pivot.index.name = 'user_id'
movies_pivot['movie_id'] = movie_label
movies_pivot.to_csv("data/user_movie_rating.csv")
movies_pivot.head(2)

Unnamed: 0_level_0,movie_id,rate
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10492,4.0
1,11463,3.0


Model

In [77]:
%load_ext autoreload
%autoreload 2
from model import MatrixFactorization
from torch.utils.data import Dataset, DataLoader
import torch

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [82]:
class MovieRating(Dataset):
 
    def __init__(self, file_name="data/user_movie_features.csv"):
        df=pd.read_csv(file_name)

        x = df[['user_id','movie_id']].values
        y = df['rate'].values
        self.n_user = df['user_id'].nunique()
        self.n_movie = df['movie_id'].nunique()
        self.x_train=torch.tensor(x, dtype=torch.long)
        self.y_train=torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.y_train)

    def __getitem__(self,idx):
        return self.x_train[idx], self.y_train[idx]

In [86]:
# model training
dataset = MovieRating('data/user_movie_rating.csv')
dataloader = DataLoader(dataset, batch_size=128)
model = MatrixFactorization(dataset.n_user, dataset.n_movie, n_factors=20)
loss_fn = torch.nn.HuberLoss() 
optimizer = torch.optim.SparseAdam(model.parameters(), lr=1e-4)

In [87]:
for n_ep in range(20):
    if n_ep%5==0: print(f"epoch: {n_ep}")
    for idx, (data, target) in enumerate(dataloader):
        # predict
        prediction = model(data)
        loss = loss_fn(prediction, target)
        if idx%20==0: print(f"Batch {idx}: loss:{loss:.2f}")
        # backpropagate
        loss.backward()
        # update weights
        optimizer.step()

epoch: 0
Batch 0: loss:29.855697631835938
Batch 20: loss:33.918785095214844
Batch 40: loss:30.947935104370117
Batch 60: loss:33.46770095825195
Batch 80: loss:31.281450271606445
Batch 100: loss:28.151803970336914
Batch 120: loss:44.713321685791016
Batch 140: loss:34.772544860839844
Batch 160: loss:37.8205451965332
Batch 180: loss:27.98522186279297
Batch 200: loss:36.814205169677734
Batch 220: loss:28.732946395874023
Batch 240: loss:32.466651916503906
Batch 260: loss:30.418352127075195
Batch 280: loss:41.42574691772461
Batch 300: loss:35.4355354309082
Batch 320: loss:33.94124221801758
Batch 340: loss:46.16339111328125
Batch 360: loss:33.44694519042969
Batch 380: loss:35.7506103515625
Batch 400: loss:33.844810485839844
Batch 420: loss:33.492469787597656
Batch 440: loss:28.627086639404297
Batch 460: loss:33.78052520751953
Batch 480: loss:23.22759246826172
Batch 500: loss:34.55928039550781
Batch 520: loss:25.939205169677734
Batch 0: loss:19.003795623779297
Batch 20: loss:22.167936325073242


KeyboardInterrupt: 

In [89]:
torch.save(model.state_dict(), 'model_movie_recommendation.pth')