In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors


In [88]:

ratings = pd.read_csv('ml-latest-small/ratings.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')


In [89]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [90]:
movie_stats = ratings.groupby('movieId')['rating'].agg(['count', 'mean'])
C = movie_stats['count'].mean()
m = movie_stats['mean'].mean()

In [91]:
C

10.369806663924312

In [92]:
m

3.262448274810963

In [93]:
movie_stats['bayesian_avg'] = (C * m + movie_stats['count'] * movie_stats['mean']) / (C + movie_stats['count'])

In [94]:
movie_stats

Unnamed: 0_level_0,count,mean,bayesian_avg
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,215,3.920930,3.890632
2,110,3.431818,3.417227
3,52,3.259615,3.260086
4,7,2.357143,2.897612
5,49,3.071429,3.104793
...,...,...,...
193581,1,4.000000,3.327318
193583,1,3.500000,3.283341
193585,1,3.500000,3.283341
193587,1,3.500000,3.283341


In [95]:
min_C = 30
min_m = 3.5
# Filter out movies with count below min_C and Bayesian average below min_m
filtered_movies = movie_stats[(movie_stats['count'] >= min_C) & (movie_stats['bayesian_avg'] >= min_m)]

In [96]:
filtered_movies

Unnamed: 0_level_0,count,mean,bayesian_avg
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,215,3.920930,3.890632
6,102,3.946078,3.882991
11,70,3.671429,3.618659
16,82,3.926829,3.852243
17,67,3.776119,3.707273
...,...,...,...
122904,54,3.833333,3.741365
134130,48,4.000000,3.868969
134853,43,3.813953,3.706795
139385,31,3.903226,3.742608


In [97]:
filtered_movielist = filtered_movies.index

In [98]:
filtered_movielist

Index([     1,      6,     11,     16,     17,     25,     29,     32,     34,
           36,
       ...
       115617, 116797, 119145, 122882, 122886, 122904, 134130, 134853, 139385,
       152081],
      dtype='int64', name='movieId', length=488)

In [99]:
# Lets get the top interactive users as well 
active_users = ratings.groupby('userId')['rating'].agg(['count'])

In [100]:
active_users

Unnamed: 0_level_0,count
userId,Unnamed: 1_level_1
1,232
2,29
3,39
4,216
5,44
...,...
606,1115
607,187
608,831
609,37


In [101]:
avg_user_count = active_users['count'].mean()

In [102]:
avg_user_count

165.30491803278687

In [103]:
active_users = active_users[active_users['count'] >= 150]

In [104]:
active_users

Unnamed: 0_level_0,count
userId,Unnamed: 1_level_1
1,232
4,216
6,314
7,152
18,502
...,...
605,221
606,1115
607,187
608,831


In [105]:
active_userlist = active_users.index

In [106]:
active_userlist

Index([  1,   4,   6,   7,  18,  19,  20,  21,  28,  33,
       ...
       596, 597, 599, 600, 603, 605, 606, 607, 608, 610],
      dtype='int64', name='userId', length=176)

In [107]:
ratings = ratings[ratings['userId'].isin(active_userlist)]

In [108]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [109]:
final_ratings = ratings[ratings['movieId'].isin(filtered_movielist)]

In [110]:
final_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
7,1,110,4.0,964982176
...,...,...,...,...
100742,610,122904,3.0,1493845981
100760,610,134130,3.5,1479543002
100763,610,134853,3.5,1493845106
100780,610,139385,4.5,1493846777


In [111]:
final_ratings = final_ratings.merge(movies[['movieId', 'title']])

In [112]:
final_ratings.drop_duplicates()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,1,4.0,964982703,Toy Story (1995)
1,1,6,4.0,964982224,Heat (1995)
2,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995)
3,1,50,5.0,964982931,"Usual Suspects, The (1995)"
4,1,110,4.0,964982176,Braveheart (1995)
...,...,...,...,...,...
24545,610,122904,3.0,1493845981,Deadpool (2016)
24546,610,134130,3.5,1479543002,The Martian (2015)
24547,610,134853,3.5,1493845106,Inside Out (2015)
24548,610,139385,4.5,1493846777,The Revenant (2015)


In [85]:
pivot_ratings = final_ratings.pivot_table(index = 'title',columns = 'userId', values = 'rating')

In [86]:
pivot_ratings.fillna(0,inplace=True)

In [87]:
pivot_ratings

userId,1,4,6,7,18,19,20,21,28,33,...,596,597,599,600,603,605,606,607,608,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,3.5
12 Angry Men (1957),0.0,5.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,4.0,4.0,3.0,0.0,0.0,3.0,0.0,...,4.0,0.0,5.0,4.0,5.0,0.0,5.0,0.0,3.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,4.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,3.5,5.0
300 (2007),0.0,0.0,0.0,0.0,3.5,0.0,0.0,3.5,4.5,0.0,...,0.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,5.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Yes Man (2008),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Young Frankenstein (1974),5.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,5.0,3.5,4.0,5.0,0.0,3.5,0.0,0.0,0.0
Zodiac (2007),0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
Zombieland (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.5,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5


In [75]:
from sklearn.metrics.pairwise import cosine_similarity

In [79]:
similarity_scores = cosine_similarity(pivot_ratings)

In [118]:
def recommend(movie_name):
    # index fetch
    index = np.where(pivot_ratings.index == movie_name)[0][0]
    similar_items = sorted(list(enumerate(similarity_scores[index])),key = lambda x:x[1],reverse = True)[1:6]
    for i in similar_items:
        print(pivot_ratings.index[i[0]])
    

In [134]:
pivot_ratings.shape

(488, 176)

In [136]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [137]:
# Train-test split
train_data, test_data = train_test_split(pivot_ratings, test_size=0.2, random_state=42)

# User-based collaborative filtering
def collaborative_filtering(train_data, test_data):
    # Calculate cosine similarity matrix
    similarity_matrix = cosine_similarity(train_data.T, train_data.T)
    
    # Predict ratings for test data
    predicted_ratings = np.dot(similarity_matrix, test_data.T) / np.array([np.abs(similarity_matrix).sum(axis=1)]).T
    
    # Convert predicted ratings to DataFrame
    pred_df = pd.DataFrame(predicted_ratings, index=train_data.columns, columns=test_data.index)
    
    return pred_df

# Run collaborative filtering
predicted_ratings = collaborative_filtering(train_data, test_data)

# Evaluate the model
mse = mean_squared_error(test_data.values.flatten(), predicted_ratings.values.flatten())
print("Mean Squared Error:", mse)

Mean Squared Error: 3.750798435712361
