In [1]:
# all config/downloads to use fastai
# !pip install pandas --upgrade
#!pip install plotly --upgrade
#!pip install fastai==0.7.0
#!pip install torchtext==0.2.3
!pip install torch
#!pip install torchvision
#!pip install Pillow>=4.1.1
#!pip install image
#!pip install matplotlib



In [2]:
from google.colab import drive
drive.mount('/mnt')

Mounted at /mnt


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

In [6]:
data_pd = pd.read_csv('ratings.csv')
movies_pd = pd.read_csv('movies.csv')
#data_pd = pd.read_csv('ratings.csv') если из локальной папки
#movies_pd = pd.read_csv('movies.csv')
data_pd = data_pd.sample(frac=1)

In [7]:
u_temp = list(data_pd.userId.unique())
u_temp.sort()
m_temp = list(data_pd.movieId.unique())
m_temp.sort()

In [8]:
movie_dict = dict(list(zip(m_temp,range(len(m_temp)))))
user_dict = dict(list(zip(u_temp,range(len(u_temp)))))

In [9]:
data_pd['userId'] = data_pd['userId'].map(lambda x: user_dict[x])
data_pd['movieId'] = data_pd['movieId'].map(lambda x: movie_dict[x])

In [10]:
X = data_pd[['userId','movieId']].values
y = data_pd[['rating']].values

In [11]:
films = np.transpose(X)[1]
counts = np.bincount(films)

In [12]:
y = y.astype(np.float)

In [13]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
class RatingDataset():
  def __init__(self, train, label):
    self.feature_= train
    self.label_= label
  def __len__(self):
    #return size of dataset
    return len(self.feature_)
  def __getitem__(self, idx):
    return torch.tensor(self.feature_[idx]),torch.tensor(self.label_[idx])

In [15]:
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm

In [16]:
bs = 5000
train_dataloader = DataLoader(RatingDataset(x_train, y_train), batch_size=bs, shuffle=True)
test_dataloader = DataLoader(RatingDataset(x_test, y_test), batch_size=bs)

In [17]:
class MatrixFactorization(torch.nn.Module):
    
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        self.user_biases = torch.nn.Embedding(n_users, 1)
        self.item_biases = torch.nn.Embedding(n_items,1)
        torch.nn.init.xavier_uniform_(self.user_factors.weight)
        torch.nn.init.xavier_uniform_(self.item_factors.weight)
        self.user_biases.weight.data.fill_(0.)
        self.item_biases.weight.data.fill_(0.)
        
    def forward(self, user, item):
        pred = self.user_biases(user) + self.item_biases(item)
        pred += (self.user_factors(user) * self.item_factors(item)).sum(1, keepdim=True)
        #pred = pred.float()
        pred.to('cuda')
        return pred.squeeze()

In [18]:
n, m =  len(data_pd.userId.unique()), len(data_pd.movieId.unique())

In [19]:
nfactor = 300
model = MatrixFactorization(n, m, n_factors=nfactor)

In [20]:
dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# dev = torch.device("cpu")

In [21]:
dev

device(type='cuda')

In [22]:
model.parameters

<bound method Module.parameters of MatrixFactorization(
  (user_factors): Embedding(610, 300)
  (item_factors): Embedding(9724, 300)
  (user_biases): Embedding(610, 1)
  (item_biases): Embedding(9724, 1)
)>

In [23]:
dev

device(type='cuda')

In [24]:
loss_func = torch.nn.MSELoss()
model.to(dev)

MatrixFactorization(
  (user_factors): Embedding(610, 300)
  (item_factors): Embedding(9724, 300)
  (user_biases): Embedding(610, 1)
  (item_biases): Embedding(9724, 1)
)

In [25]:
model.parameters
model.double()

MatrixFactorization(
  (user_factors): Embedding(610, 300)
  (item_factors): Embedding(9724, 300)
  (user_biases): Embedding(610, 1)
  (item_biases): Embedding(9724, 1)
)

In [26]:
epoches = 100
train_loss_data = []
test_loss_data = []
for epoch in range(0, epoches):
    pbar = tqdm(enumerate(train_dataloader), total=len(train_dataloader))  # progress bar
    count = 0
    cum_loss = 0.
    for i,( train_batch, label_batch) in pbar:
        count = 1 + i
        # Predict and calculate loss for user factor and bias
        optimizer = torch.optim.SGD([model.user_biases.weight,model.user_factors.weight], lr=0.01, weight_decay=1e-5) # learning rate
        prediction = model(train_batch[:,0].to(dev), train_batch[:,1].to(dev))
        loss = loss_func(prediction, label_batch.to(dev))    
        # Backpropagate
        loss.backward()
        # Update the parameters
        optimizer.step()
        optimizer.zero_grad()
        #predict and calculate loss for item factor and bias
        optimizer = torch.optim.SGD([model.item_biases.weight,model.item_factors.weight], lr=0.01, weight_decay=1e-5) # learning rate
        prediction = model(train_batch[:,0].to(dev), train_batch[:,1].to(dev))
        loss = loss_func(prediction, label_batch.to(dev))
        # Backpropagate
        loss.backward()
        # Update the parameters
        optimizer.step()
        optimizer.zero_grad()
        cum_loss += loss.item()
        pbar.set_description('training loss at {} batch {}: {}'.format(epoch,i,loss.item()))
    train_loss = cum_loss/count
    pbar = tqdm(enumerate(test_dataloader), total=len(test_dataloader))  # progress bar
    cum_loss =0.
    count = 0
    for i,( test_batch, label_batch) in pbar:
        count = 1 + i
        with torch.no_grad():
            prediction = model(test_batch[:,0].to(dev), test_batch[:,1].to(dev))
            loss = loss_func(prediction, label_batch.to(dev))
            cum_loss += loss.item()
            pbar.set_description('test loss at {} batch {}: {}'.format(epoch,i,loss.item()))
    test_loss = cum_loss/count
    train_loss_data.append(train_loss)#for graph building
    test_loss_data.append(test_loss)
    if (np.mean(np.diff(test_loss_data)[:-10]) >= -0.02):
      break

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
training loss at 0 batch 16: 13.632511659753243: 100%|██████████| 17/17 [00:01<00:00,  9.80it/s]
  return F.mse_loss(input, target, reduction=self.reduction)
test loss at 0 batch 4: 12.94891434870736: 100%|██████████| 5/5 [00:00<00:00, 18.25it/s]
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
training loss at 1 batch 16: 12.863684706672705: 100%|██████████| 17/17 [00:01<00:00, 10.24it/s]
test loss at 1 batch 4: 12.834923586646942: 100%|██████████| 5/5 [00:00<00:00, 11.60it/s]
training loss at 2 batch 16: 13.129981905029386: 100%|██████████| 17/17 [00:01<00:00, 10.24it/s]
test loss at 2 batch 4: 12.723104670733766: 100%|██████████| 5/5 [00:00<00:00, 15.38it/s]
training loss at 3 batch 16: 13.140416595520351: 100%|██████████| 17/17 [00:01<00:00, 10.65it/s]
test loss at 3 batch 4: 12.613056213617483: 100%|██████████| 5/5 [00:00<00:00, 14.91it/s]
training los

In [30]:
movie_id = movie_dict[random.randint(1, 20)] # random film
sim_dict = {} #films similarity
for m, id in zip(model.item_factors.weight.data.cpu().numpy(), movie_dict.keys()): #cos metric
    sim_dict[id] = np.dot(model.item_factors.weight.data.cpu().numpy()[movie_id], m)/(np.linalg.norm(model.item_factors.weight.data.cpu().numpy()[movie_id])*np.linalg.norm(m))

In [31]:
sim_pd = pd.DataFrame(sim_dict.items(), columns=['movieId', 'similarity']).merge(movies_pd, on='movieId')

Films, similar to "Matrix":

In [32]:
sim_pd.sort_values('similarity', ascending=False)[0:30][['title', 'genres', 'similarity']]

Unnamed: 0,title,genres,similarity
19,Money Train (1995),Action|Comedy|Crime|Drama|Thriller,1.0
4184,Amen. (2002),Drama,0.193525
3134,Exit Wounds (2001),Action|Thriller,0.18616
1102,Metro (1997),Action|Comedy|Crime|Drama|Thriller,0.185538
3682,Cousins (1989),Comedy|Romance,0.183668
2112,Universal Soldier: The Return (1999),Action|Sci-Fi,0.181719
5739,Sweet Liberty (1986),Comedy,0.178442
3229,A.I. Artificial Intelligence (2001),Adventure|Drama|Sci-Fi,0.177547
5039,"Duck, You Sucker (1971)",Action|Western,0.174478
3805,Cadillac Man (1990),Comedy|Crime,0.174027


In [29]:
import random
userId = random.randint(1, 20) #check of first 20 persons randomly

In [33]:
ratings_dict = {}
user_bias = model.user_biases.weight.data.cpu().numpy()[userId][0]
for pred, item_bias, id in zip(model.item_factors.weight.data.cpu().numpy().dot(model.user_factors.weight.data.cpu().numpy()[userId]),
model.item_biases.weight.data.cpu().numpy(), movie_dict.keys()):
  if not id in np.transpose(data_pd[data_pd['userId'] == user_dict[42]][['movieId']].to_numpy())[0]:
    ratings_dict[id] = pred + item_bias[0] + user_bias

In [34]:
ratings_pd = pd.DataFrame(ratings_dict.items(), columns=['movieId', 'rating'])
ratings_pd = ratings_pd.merge(movies_pd, on='movieId')

Best 30 films for random user:

In [35]:
ratings_pd.sort_values('rating', ascending=False)[0:30][['title', 'genres', 'rating']]

Unnamed: 0,title,genres,rating
259,Forrest Gump (1994),Comedy|Drama|Romance|War,0.752453
1662,"Matrix, The (1999)",Action|Sci-Fi|Thriller,0.682532
218,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,0.680423
418,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,0.658688
189,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,0.614926
374,Schindler's List (1993),Drama|War,0.571937
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.562603
83,Braveheart (1995),Action|Drama|War,0.555509
416,Terminator 2: Judgment Day (1991),Action|Sci-Fi,0.543097
756,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,0.53772
