### Libraries to install

In [1]:
import pandas as pd
import surprise as sp

### Constants

In [2]:
BYTES_TO_MB_DIV = 0.000001

### Function to check the memory usage of the dataframe

In [3]:
def df_mem_usage(df):
    print()
    mem = round(df.memory_usage().sum() * BYTES_TO_MB_DIV, 3) 
    print("Memory usage is " + str(mem) + " MB")

### Reading the dataset

In [4]:
%%time

cols = ['%%MatrixMarket','matrix','coordinate']

dtypes = {
    '%%MatrixMarket':'int32', 
    'matrix':'int16', 
    'coordinate':'int8'
}

df = pd.read_csv('data/netflix_mm', delim_whitespace=True, usecols=cols, dtype=dtypes, skiprows=range(1, 3))
df.columns = ['userID', 'itemID', 'rating']

print(df.head())
df_mem_usage(df)

   userID  itemID  rating
0       1       1       3
1       2       1       5
2       3       1       4
3       5       1       3
4       6       1       3

Memory usage is 693.505 MB
CPU times: user 38.6 s, sys: 4.62 s, total: 43.2 s
Wall time: 48.5 s


In [5]:
df_read = df.head(10000)

# Cosine similarity

**Documentation of scikit-surprise:**

-https://surprise.readthedocs.io/en/stable/matrix_factorization.html <br>
-https://datascience.stackexchange.com/questions/6814/how-to-split-train-test-in-recommender-systems

### Declare a reader object to parse the ratings

In [6]:
reader = sp.Reader(rating_scale=(1,5))

### Load the dataframe in scikit-surprise's format using the Reader

In [7]:
%%time

data = sp.Dataset.load_from_df(df_read[['userID', 'itemID', 'rating']], reader)

CPU times: user 20.3 ms, sys: 11 ms, total: 31.3 ms
Wall time: 41.6 ms


### Divide the data into a training and test set

In [8]:
trainset, testset = sp.model_selection.train_test_split(data, test_size=.25)

### Run the Grid search

In [9]:
algo = sp.KNNBasic(sim_options = {'name': 'cosine'})

### Train the model on the training set and evaluate on the test set

In [10]:
%time

algo.fit(trainset)

CPU times: user 6 µs, sys: 41 µs, total: 47 µs
Wall time: 50.8 µs
Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f9377143860>

In [13]:
predictions = algo.test(testset)
print(("Cosine similarity RMSE is: ").format(sp.accuracy.rmse(predictions)))

RMSE: 1.2814
Cosine similarity RMSE is: 
