In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../')

In [4]:
from src.utils import loading, Spark
import pyspark.ml as M
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from tqdm.notebook import tqdm

In [5]:
from src import indexTransformer
from scipy import sparse
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler

In [6]:
spark = Spark()

Spark UI address http://127.0.0.1:4040


In [8]:
datas = loading(spark, '../data/interim')

In [9]:
datas

{'train_0.5_0.5': DataFrame[userId: string, movieId: string, rating: string],
 'train_0.25_0.75': DataFrame[userId: string, movieId: string, rating: string],
 'test_0.5_0.5': DataFrame[userId: string, movieId: string, rating: string],
 'test_0.25_0.75': DataFrame[userId: string, movieId: string, rating: string],
 'train_0.75_0.25': DataFrame[userId: string, movieId: string, rating: string],
 'test_0.75_0.25': DataFrame[userId: string, movieId: string, rating: string]}

In [10]:
cast_int = lambda df: df.select([F.col(c).cast('int') for c in ['userId', 'movieId']] + \
                                [F.col('rating').cast('float')])
training, test = cast_int(datas['train_0.75_0.25']), cast_int(datas['test_0.75_0.25'])

In [11]:
print(f'''
        training set num of rows {training.count()},
        test set num of rows {test.count()},
        training set num of users {training.select('userId').distinct().count()},
        training set num of movies {training.select('movieId').distinct().count()},
        test set num of users {test.select('userId').distinct().count()},
        test set num of movies {test.select('movieId').distinct().count()},
        ''')


        training set num of rows 89977,
        test set num of rows 30342,
        training set num of users 23503,
        training set num of movies 1102,
        test set num of users 16470,
        test set num of movies 1102,
        


In [12]:
indexer = indexTransformer()
indexer.fit(training)
training = indexer.transform(training).select('userId_idx', 'movieId_idx', 'rating').toPandas().values
test =  indexer.transform(test).select('userId_idx', 'movieId_idx', 'rating').toPandas().values

In [13]:
X = training.copy()

In [14]:
row = X[:, 0]
col = X[:, 1]
data = X[:, 2]

In [21]:
ratings = sparse.csr_matrix((data, (row, col)))

In [22]:
import torch

In [23]:
torch.__version__

'1.6.0'

In [30]:
n_users, n_items = ratings.shape

In [31]:
class MatrixFactorization(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        self.user_factors = torch.nn.Embedding(n_users, n_factors, sparse=True)
        self.item_factors = torch.nn.Embedding(n_items, n_factors, sparse=True)

    def forward(self, user, item):
        return (self.user_factors(user) * self.item_factors(item)).sum(1)

In [34]:
model = MatrixFactorization(n_users, n_items, n_factors=20)
loss_func = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)  # learning rate

In [35]:
rows, cols = ratings.nonzero()
p = np.random.permutation(len(rows))
rows, cols = rows[p], cols[p]

for row, col in zip(*(rows, cols)):
    # Set gradients to zero
    optimizer.zero_grad()
    
    # Turn data into tensors
    rating = torch.FloatTensor([ratings[row, col]])
    row = torch.LongTensor([row])
    col = torch.LongTensor([col])

    # Predict and calculate loss
    prediction = model(row, col)
    loss = loss_func(prediction, rating)

    # Backpropagate
    loss.backward()

    # Update the parameters
    optimizer.step()

In [37]:
model.eval()

MatrixFactorization(
  (user_factors): Embedding(23503, 20, sparse=True)
  (item_factors): Embedding(1102, 20, sparse=True)
)

In [48]:
pred = []
for row, col in zip(*(rows, cols)):
    row = torch.LongTensor([row])
    col = torch.LongTensor([col])
    pred.append(int(model(row, col)[0]))

In [49]:
pred

[4,
 -4,
 2,
 -6,
 0,
 1,
 4,
 -1,
 4,
 -4,
 1,
 -4,
 1,
 3,
 5,
 -3,
 -9,
 5,
 0,
 -1,
 -7,
 0,
 0,
 2,
 -2,
 -1,
 -5,
 -6,
 1,
 0,
 0,
 12,
 1,
 0,
 2,
 -4,
 0,
 0,
 6,
 -1,
 0,
 -7,
 -6,
 0,
 -6,
 0,
 0,
 0,
 2,
 -1,
 -1,
 0,
 0,
 0,
 1,
 -1,
 -2,
 -1,
 -1,
 1,
 -4,
 -7,
 2,
 8,
 3,
 4,
 0,
 4,
 4,
 0,
 -2,
 0,
 -2,
 5,
 0,
 -4,
 -6,
 -3,
 -4,
 9,
 2,
 -1,
 8,
 -3,
 -1,
 0,
 2,
 -2,
 3,
 4,
 -3,
 5,
 -4,
 6,
 8,
 1,
 -16,
 -1,
 4,
 4,
 -4,
 -8,
 0,
 -1,
 5,
 -8,
 0,
 -1,
 -2,
 -5,
 -9,
 2,
 0,
 -4,
 4,
 2,
 2,
 2,
 0,
 -4,
 -3,
 2,
 -4,
 -1,
 -9,
 0,
 -1,
 1,
 -3,
 0,
 5,
 1,
 7,
 2,
 -7,
 0,
 4,
 2,
 4,
 -2,
 3,
 -5,
 0,
 0,
 6,
 2,
 0,
 6,
 11,
 -3,
 4,
 1,
 7,
 -1,
 -4,
 6,
 2,
 1,
 -4,
 6,
 -1,
 -2,
 0,
 0,
 -1,
 -6,
 -10,
 1,
 4,
 -2,
 0,
 5,
 -3,
 -8,
 -2,
 7,
 -11,
 13,
 1,
 0,
 4,
 0,
 -2,
 5,
 3,
 3,
 5,
 9,
 -3,
 4,
 8,
 -5,
 3,
 0,
 0,
 3,
 -4,
 3,
 -7,
 8,
 -3,
 2,
 2,
 -1,
 -2,
 -2,
 0,
 -11,
 0,
 1,
 1,
 0,
 -8,
 -3,
 0,
 1,
 -6,
 1,
 5,
 0,
 -7,
 0,
 4,
 -2,
 2,
 3,
 

In [24]:
class BiasedMatrixFactorization(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        self.user_factors = torch.nn.Embedding(n_users, n_factors, sparse=True)
        self.item_factors = torch.nn.Embedding(n_items, n_factors, sparse=True)
        self.user_biases = torch.nn.Embedding(n_users, 1, sparse=True)
        self.item_biases = torch.nn.Embedding(n_items, 1, sparse=True)

    def forward(self, user, item):
        pred = self.user_biases(user) + self.item_biases(item)
        pred += (
            (self.user_factors(user) * self.item_factors(item))
            .sum(dim=1, keepdim=True)
        )
        return pred.squeeze()

In [25]:

reg_loss_func = torch.optim.SGD(model.parameters(), lr=1e-6, weight_decay=1e-5)


NameError: name 'optimizer' is not defined