In [None]:
#@title Step 1: Imports

import pandas as pd
import numpy as np
import math
import torch
%matplotlib inline
import matplotlib.pyplot as plt 
import os                        
import sklearn.datasets    
import json
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from scipy.special import softmax
from sklearn.metrics import mean_squared_error

In [None]:
#@title Step 2: Download data
!pip install kaggle

!mkdir ~/.kaggle

kaggle_username = "wmd0701" #@param {type:"string"}
kaggle_api_key = "8f525bc765511d324f8509b938d7f39c" #@param {type:"string"}

assert len(kaggle_username) > 0 and len(kaggle_api_key) > 0

api_token = {"username": kaggle_username,"key": kaggle_api_key}

with open('kaggle.json', 'w') as file:
    json.dump(api_token, file)

!mv kaggle.json ~/.kaggle/kaggle.json

!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c cil-collaborative-filtering-2021

!unzip data_train.csv.zip 
!unzip sampleSubmission.csv.zip
print()

# Dimensions of the (users x movies) rating grid used by the rest of the notebook.
number_of_users = 10000
number_of_movies = 1000

# Training ratings: an "Id" column and a "Prediction" (rating) column.
data_pd = pd.read_csv('data_train.csv')
print(data_pd.head())
print('\nShape', data_pd.shape)

# Sample submission, read directly from the downloaded zip archive.
submission_pd = pd.read_csv('sampleSubmission.csv.zip')
print("\n\n", submission_pd.head())
print("\nShape:", submission_pd.shape)

# Distinct rating values and how many times each one occurs.
print("\n\nSummary:", np.unique(data_pd["Prediction"].to_numpy(), return_counts=True))

# Fraction of the user x movie grid that has no rating.
sparsity = 1.0 - len(data_pd) / (number_of_users * number_of_movies)
print("\nSparsity:", sparsity)

Downloading data_train.csv.zip to /content
  0% 0.00/3.33M [00:00<?, ?B/s]
100% 3.33M/3.33M [00:00<00:00, 112MB/s]
Downloading sampleSubmission.csv.zip to /content
  0% 0.00/2.92M [00:00<?, ?B/s]
100% 2.92M/2.92M [00:00<00:00, 96.4MB/s]
Archive:  data_train.csv.zip
  inflating: data_train.csv          
Archive:  sampleSubmission.csv.zip
  inflating: sampleSubmission.csv    

       Id  Prediction
0  r44_c1           4
1  r61_c1           3
2  r67_c1           4
3  r72_c1           3
4  r86_c1           5

Shape (1176952, 2)


         Id  Prediction
0   r37_c1           3
1   r73_c1           3
2  r156_c1           3
3  r160_c1           3
4  r248_c1           3

Shape: (1176952, 2)


Summary: (array([1, 2, 3, 4, 5]), array([ 43508,  99180, 274327, 324700, 435237]))

Sparsity: 0.8823048


In [None]:
#@title Step 3: Split data

# Fraction of the rows used for training; the remainder is held out.
train_size = 0.9

# A fixed random_state keeps the split identical across runs.
train_pd, test_pd = train_test_split(
    data_pd,
    train_size=train_size,
    random_state=0,
)
print(train_pd.shape, test_pd.shape, sep="\n")

def extract_users_items_predictions(data_pd):
    """Split a ratings frame into zero-based user ids, movie ids and ratings.

    Parameters
    ----------
    data_pd : pandas.DataFrame
        Frame with an ``Id`` column of strings containing ``r<row>_c<col>``
        (1-based numbers) and a ``Prediction`` column.

    Returns
    -------
    users, movies, predictions : tuple of numpy.ndarray
        Three 1-D arrays with one entry per row of ``data_pd``. ``users`` and
        ``movies`` hold the row and column numbers from ``Id`` shifted to
        0-based indices; ``predictions`` holds the ``Prediction`` values.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    ids = data_pd.Id.str.extract(r'r(\d+)_c(\d+)').values.astype(int) - 1
    # Column slicing (rather than split + squeeze) keeps the results 1-D even
    # when the frame has a single row.
    users, movies = ids[:, 0], ids[:, 1]
    predictions = data_pd.Prediction.values
    return users, movies, predictions

# Index arrays and ratings for the training split, the held-out split and the
# full data set, extracted in that order.
(
    (train_users, train_movies, train_predictions),
    (test_users, test_movies, test_predictions),
    (all_users, all_movies, all_predictions),
) = map(extract_users_items_predictions, (train_pd, test_pd, data_pd))

(1059256, 2)
(117696, 2)


In [None]:
#@title Step 4: Generate user-movie rating matrix

# Dense (users x movies) rating matrix; 0 marks an entry without a rating.
data = np.zeros((number_of_users, number_of_movies), dtype=int)

# whether to use 10% data for validation
validate = True

# Pick which ratings populate the matrix: the training split only (so the
# held-out split stays unseen) or every available rating.
if validate:
    fill_users, fill_movies, fill_predictions = train_users, train_movies, train_predictions
else:
    fill_users, fill_movies, fill_predictions = all_users, all_movies, all_predictions

# One integer-array assignment replaces a Python loop over every rating. As in
# the loop, a repeated (user, movie) pair keeps the last value assigned.
data[fill_users, fill_movies] = fill_predictions

In [None]:
#@title Step 5: CSR matrix and fit KNN model

# Movies become the rows (movies x users); that matrix is then stored in CSR
# form, which costs less memory than the dense array.
t_data = data.T
csr = csr_matrix(t_data)

# Brute-force nearest-neighbor search over movies with the cosine distance metric.
model = NearestNeighbors(algorithm='brute', metric='cosine')
model.fit(csr)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [None]:
#@title Step 6: Validation

# consider 5 nearest neighbors among 500 neighbors
n_neighbors = 500
neighbor_count_limit = 5

# A movie's neighbor list does not depend on the user being predicted, so the
# model is queried once for all movies instead of once per held-out rating.
# Row m holds the neighbors of movie m, sorted by increasing cosine distance.
all_distances, all_indices = model.kneighbors(t_data, n_neighbors=n_neighbors)

# Fallback for pairs where the user rated none of the movie's neighbors:
# the mean of all observed (non-zero) ratings in the matrix.
fallback_pred = float(t_data[t_data != 0].mean())
fallback_count = 0

my_preds = []
for i, (user, movie) in enumerate(zip(test_users, test_movies)):
    neighbors = all_indices[movie]

    # ratings the current user gave to each neighbor movie (0 = not rated)
    neighbor_ratings = t_data[neighbors, user]

    # Keep only rated neighbors, closest first, and exclude the movie itself by
    # index rather than assuming it is always the first returned neighbor.
    usable = np.flatnonzero((neighbor_ratings != 0) & (neighbors != movie))
    usable = usable[:neighbor_count_limit]

    if usable.size == 0:
        # When n_neighbors is too low there may be no rated neighbors. Use the
        # fallback instead of aborting, so my_preds stays aligned one-to-one
        # with test_predictions for the metric computed afterwards.
        fallback_count += 1
        my_preds.append(fallback_pred)
    else:
        # cosine similarity = 1 - cosine distance
        neighbor_simi = 1 - all_distances[movie][usable]
        neighbor_weight = softmax(neighbor_simi)
        my_preds.append(np.dot(neighbor_weight, neighbor_ratings[usable]))

    if i%1000 == 0:
        print("current iteration:", i)

if fallback_count > 0:
    print("No rated neighbor for", fallback_count, "pairs; used the mean rating", fallback_pred)

current iteration: 0
current iteration: 1000
current iteration: 2000
current iteration: 3000
current iteration: 4000
current iteration: 5000
current iteration: 6000
current iteration: 7000
current iteration: 8000
current iteration: 9000
current iteration: 10000
current iteration: 11000
current iteration: 12000
current iteration: 13000
current iteration: 14000
current iteration: 15000
current iteration: 16000
current iteration: 17000
current iteration: 18000
current iteration: 19000
current iteration: 20000
current iteration: 21000
current iteration: 22000
current iteration: 23000
current iteration: 24000
current iteration: 25000
current iteration: 26000
current iteration: 27000
current iteration: 28000
current iteration: 29000
current iteration: 30000
current iteration: 31000
current iteration: 32000
current iteration: 33000
current iteration: 34000
current iteration: 35000
current iteration: 36000
current iteration: 37000
current iteration: 38000
current iteration: 39000
current itera

In [None]:
#@title Step 7: Calculate MSE

# Mean squared error between the ratings of the test split and the predictions
# collected in my_preds.
mse = mean_squared_error(y_true=test_predictions, y_pred=my_preds)
mse

1.5173301656416667

In [None]:
#@title (optional) Self-defined MSE calculation
# Hand-rolled MSE: sum of squared residuals divided by the number of test ratings.
mse = ((np.asarray(my_preds) - test_predictions) ** 2).sum() / len(test_predictions)
mse

1.5173301656416667