In [1]:
# Importing libraries
import os
import pandas as pd
import numpy as np
import sklearn.metrics.pairwise as pw

## Read the dataset, display the first few rows to understand it, and display the count of ratings (rows) in the dataset.

In [2]:
# Read the ratings dataset.
# The dataset folder defaults to the original local path, but it can be
# overridden with the MOVIELENS_DIR environment variable so the notebook also
# runs on machines where the data lives somewhere else.
baseLocation = os.environ.get(
    "MOVIELENS_DIR",
    "D:/TUNI/Courses/Period-2/DATA.ML.360 [Recommender]/Assignment 1/ml-latest-small/",
)
# Joining with "" guarantees a trailing separator; file paths in this notebook
# are built by plain string concatenation (baseLocation + "<file>.csv").
baseLocation = os.path.join(baseLocation, "")
ratings_data = pd.read_csv(baseLocation + "ratings.csv")

# Display the first 5 ratings
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# Remove the 'timestamp' column from the ratings dataset because it's not needed.
# drop() returns a new frame, and errors='ignore' keeps the cell idempotent:
# re-running it does not raise KeyError when the column is already gone.
ratings_data = ratings_data.drop(columns='timestamp', errors='ignore')

# Display the first 5 ratings
ratings_data.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [4]:
# Number of ratings (one row per rating)
print(ratings_data.shape[0])

# Number of distinct users
print(ratings_data['userId'].nunique(dropna=False))

100836
610


In [5]:
# Load the movie metadata from movies.csv in the dataset folder
movies_csv_path = f"{baseLocation}movies.csv"
movies_data = pd.read_csv(movies_csv_path)

# Preview the first 5 movies
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Remove the "genres" column from the movies dataset because it is unnecessary.
# drop() returns a new frame, and errors='ignore' keeps the cell idempotent:
# re-running it does not raise KeyError when the column is already gone.
movies_data = movies_data.drop(columns='genres', errors='ignore')

# Display the first 5 movies
movies_data.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


## Implement the item-based collaborative filtering approach, using the cosine similarity for computing similarities between items

In [7]:
# Attach the movie title to every rating; rows are matched on movieId
mergedData = pd.merge(movies_data, ratings_data, on="movieId")
mergedData.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [8]:
# Build the user-item rating matrix: one row per userId, one column per movieId
user_item_data = mergedData.pivot_table(index='userId', columns='movieId', values='rating')
# User/movie pairs without a rating are empty after the pivot; store them as 0
user_item_data = user_item_data.fillna(0)
user_item_data.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Calculating Cosine Similarities

In [9]:
# Item-item cosine similarity.
# Transposing puts one movie per row, so each movie is compared through its
# vector of user ratings.
item_vectors = user_item_data.T
cosineSim = pw.cosine_similarity(item_vectors, item_vectors)
cosineSim

array([[1.        , 0.41056206, 0.2969169 , ..., 0.        , 0.        ,
        0.        ],
       [0.41056206, 1.        , 0.28243799, ..., 0.        , 0.        ,
        0.        ],
       [0.2969169 , 0.28243799, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [10]:
# Wrap the similarity matrix in a DataFrame labelled by movieId on both axes,
# so similarities can be looked up as cosineSim.loc[movie_a, movie_b]
movie_ids = user_item_data.columns
cosineSim = pd.DataFrame(cosineSim, index=movie_ids, columns=movie_ids)

In [11]:
# Create a user history dictionary:
#   userHistory[userId] = [movieIds the user rated, movieIds the user has not rated]
# A stored value of 0 in user_item_data means "not rated".
# Each user row is split with one boolean mask instead of one scalar .loc
# lookup per (user, movie) cell, which is far faster on the full matrix.
movie_ids = user_item_data.columns
userHistory = {}
for user_id, user_ratings in user_item_data.iterrows():
    not_rated = (user_ratings == 0).to_numpy()
    userHistory[user_id] = [
        movie_ids[~not_rated].tolist(),  # rated movies, in column order
        movie_ids[not_rated].tolist(),   # unrated movies, in column order
    ]

## Implementation of the prediction function presented in class for predicting movie scores

In [12]:
# Predicted scores per user: R[userId] = {movieId: predicted score}
R = {}

def predictionScore(userId):
    """Predict a score for every movie the user has not rated and store it in R.

    For each unrated movie p the score is the similarity-weighted average of
    the user's own ratings:

        score(p) = sum_i sim(i, p) * rating(userId, i) / sum_i sim(i, p)

    where i runs over the movies the user has rated. A movie whose
    similarities to the rated movies sum to 0 gets no prediction.

    Reads the module-level ``userHistory``, ``cosineSim`` and
    ``user_item_data``; writes the result to ``R[userId]`` (overwriting any
    previous entry for that user).

    Parameters
    ----------
    userId
        Key of the user in ``userHistory`` and row label in ``user_item_data``.

    Returns
    -------
    None
        The predictions are stored in ``R[userId]`` as ``{movieId: score}``.

    Raises
    ------
    KeyError
        If ``userId`` is not a key of ``userHistory``.
    """
    rated, unrated = userHistory[userId]

    # Nothing to predict from (or nothing to predict): same outcome as summing
    # over an empty set, i.e. no movie receives a score.
    if not rated or not unrated:
        R[userId] = {}
        return

    # Rows = movies the user rated, columns = candidate (unrated) movies.
    similarities = cosineSim.loc[rated, unrated].to_numpy(dtype=float)
    ratings = user_item_data.loc[userId, rated].to_numpy(dtype=float)

    # One matrix-vector product replaces the per-pair scalar lookups.
    numerators = ratings @ similarities
    denominators = similarities.sum(axis=0)

    R[userId] = {
        movie_id: numerator / denominator
        for movie_id, numerator, denominator in zip(unrated, numerators, denominators)
        if denominator != 0  # skip movies with no similarity mass at all
    }

## Select a user from the dataset, and for this user, show the 20 most relevant movies that the recommender suggests

In [13]:
# Pick one user (userId 100) and compute their predicted scores; results are stored in R
predictionScore(userId=100)

In [14]:
# Flatten the predictions into movie ids ordered from highest to lowest score.
# NOTE(review): every user stored in R is appended in turn, so this list is a
# single user's ranking only while R holds exactly one user.
RecommendedMovieIds = [
    movie_id
    for user_predictions in R.values()
    for movie_id, _score in sorted(
        user_predictions.items(), key=lambda pair: pair[1], reverse=True
    )
]

## Top 20 Movies to recommend

In [15]:
# Show the 20 best-scored movies from most to least relevant.
# Filtering with isin() alone returns rows in movies_data order, which throws
# away the ranking, so the matching rows are re-ordered by their position in
# RecommendedMovieIds.
top_movie_ids = RecommendedMovieIds[:20]
rank_of = {}
for position, movie_id in enumerate(top_movie_ids):
    rank_of.setdefault(movie_id, position)  # first occurrence wins if an id repeats

candidates = movies_data[movies_data['movieId'].isin(top_movie_ids)]
# Positional, stable ordering: keeps the original index labels and columns
ranked_positions = candidates['movieId'].map(rank_of).to_numpy().argsort(kind='stable')
recommended_df = candidates.iloc[ranked_positions]
recommended_df

Unnamed: 0,movieId,title
624,791,"Last Klezmer: Leopold Kozlowski, His Life and ..."
847,1116,"Single Girl, A (Fille seule, La) (1995)"
864,1137,Hustler White (1996)
866,1144,"Line King: The Al Hirschfeld Story, The (1996)"
1659,2226,"Ring, The (1927)"
1875,2493,"Harmonists, The (1997)"
2838,3795,"Five Senses, The (1999)"
4193,6049,Ethan Frome (1993)
5453,26095,"Carabineers, The (Carabiniers, Les) (1963)"
6945,65350,"General Died at Dawn, The (1936)"
