In [2]:
#import packages
import pandas as pd
import numpy as np
import os
from sklearn.metrics import mean_squared_error 

In [3]:
#return a dataframe of all ratings
def get_ratings(part='u.data'):
    """Return a DataFrame of user-movie ratings."""
    return pd.read_csv(
        os.path.join('ml-100k', part), header=None, sep='\t',
        names=['user_id', 'item_id', 'rating', 'timestamp'],
    ).rename(columns={'item_id': 'movie_id'})

get_ratings()
#read data file and transform it into data frame
df_train = get_ratings('ua.base')
df_test = get_ratings('ua.test')

In [4]:
#get movies from the movie file
ITEM_PROPS = ['movie_id', 'movie_title', 'video_release_date', 'unknown', 'IMDb_URL']
GENRES = ['Action', 'Adventure', 'Animation',
          'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
          'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
          'Thriller', 'War', 'Western']

def get_movies():
    """Return a DataFrame of all movies."""
    return pd.read_csv(
        os.path.join('ml-100k', 'u.item'), header=None, index_col=False, sep='|', encoding="utf-16",
        names=ITEM_PROPS + GENRES,
    )
movies = get_movies()
movies.head()


Unnamed: 0,movie_id,movie_title,video_release_date,unknown,IMDb_URL,Action,Adventure,Animation,Childrens,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0


In [5]:
df_train.describe()
df_train = pd.merge(df_train,movies[movies.columns[0:2]],on=['movie_id'])
df_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title
0,1,1,5,874965758,Toy Story (1995)
1,2,1,4,888550871,Toy Story (1995)
2,6,1,4,883599478,Toy Story (1995)
3,10,1,4,877888877,Toy Story (1995)
4,13,1,3,882140487,Toy Story (1995)


In [6]:
#calculate the mean of the user ratings
ratings = pd.DataFrame(df_train.groupby('movie_title')['rating'].mean())
ratings['rating_freq'] = df_train.groupby('movie_title')['rating'].count()
ratings.head()


Unnamed: 0_level_0,rating,rating_freq
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),2.375,8
1-900 (1994),2.6,5
101 Dalmatians (1996),2.94,100
12 Angry Men (1957),4.327434,113
187 (1997),3.026316,38


In [7]:
# sort the rating table by frequency
ratings.sort_values('rating_freq',ascending=False).head()


Unnamed: 0_level_0,rating,rating_freq
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Star Wars (1977),4.365657,495
Fargo (1996),4.148984,443
Return of the Jedi (1983),4.01139,439
Contact (1997),3.791262,412
"English Patient, The (1996)",3.6925,400


In [8]:
# define user-item matrix
user_item_matrix = df_train.pivot_table(index='user_id', columns='movie_title',values='rating')
user_item_matrix = user_item_matrix.fillna(0)
user_item_matrix.head()

movie_title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,� k�ldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0


In [63]:
def user_similarity(userid):
    user_ratings = item_user_matrix[userid]
    similar_m = item_user_matrix.corrwith(user_ratings)
    corr = pd.DataFrame(similar_m,columns=['Correlation'])
    corr.dropna(inplace=True)
    recm_list = corr.sort_values('Correlation',ascending=False)
    return recm_list


In [45]:
item_user_matrix = user_item_matrix.T

In [52]:

def item_similarity(movietitle):
    item_ratings = user_item_matrix[movietitle]
    similar_m = user_item_matrix.corrwith(item_ratings)
    corr = pd.DataFrame(similar_m,columns=['Correlation'])
    corr.dropna(inplace=True)
    corr = corr.join(ratings['rating_freq'])
    recm_list = corr[corr['rating_freq']>100].sort_values('Correlation',ascending=False)
    
    return  recm_list


In [127]:
# return k similar users
def predict_rating_by_user(k,userid,movietitle):
    sim_list = user_similarity(userid)
    itemrating = item_user_matrix.loc[movietitle]
    selected_user = {}
    count = 0
    i = 0
    while count != k:
        user = sim_list['Correlation'].index[i]
        i += 1
        old_rating = item_user_matrix.loc[movietitle][user]
        if old_rating != 0:
            count += 1
            selected_user[user] = old_rating
    rating_sum = 0
    sim_sum = 0
    for user in selected_user.keys():
        rating_sum += sim_list['Correlation'][user]*selected_user[user]
        sim_sum += sim_list['Correlation'][user]
    predicted_rating = (rating_sum/sim_sum).round(2)
    return predicted_rating


predict_rating_by_user(2,1,"187 (1997)")

3.0

In [107]:

def predict_rating_by_item(k,userid,movietitle):
    sim_list = item_similarity(movietitle)
    userrating = user_item_matrix.loc[userid]
    selected_item = {}
    count = 0
    i = 0
    while count != k:
        movie = sim_list['Correlation'].index[i]
        i += 1
        old_rating = user_item_matrix.loc[userid][movie]
        if old_rating != 0:
            count += 1
            selected_item[movie] = old_rating
    rating_sum = 0
    sim_sum = 0
    for movie in selected_item.keys():
        rating_sum += sim_list['Correlation'][movie]*selected_item[movie]
        sim_sum += sim_list['Correlation'][movie]
    predicted_rating = (rating_sum/sim_sum).round(2)
    return predicted_rating

predict_rating_by_item(2,1,"187 (1997)")


1.93

## Test data

In [27]:
df_test = pd.merge(df_test,movies[movies.columns[0:2]],on=['movie_id'])
df_test.head(100)

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title
0,1,20,4,887431883,Angels and Insects (1995)
1,63,20,3,875748004,Angels and Insects (1995)
2,115,20,3,881171009,Angels and Insects (1995)
3,189,20,5,893264466,Angels and Insects (1995)
4,296,20,5,884196921,Angels and Insects (1995)
...,...,...,...,...,...
95,828,171,3,891036568,Delicatessen (1991)
96,870,171,4,875050698,Delicatessen (1991)
97,875,171,5,876465370,Delicatessen (1991)
98,1,189,3,888732928,"Grand Day Out, A (1992)"


In [28]:
test_user_item_matrix = df_test.pivot_table(index='user_id', columns='movie_title',values='rating')
test_user_item_matrix.head()

movie_title,'Til There Was You (1997),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",8 1/2 (1963),...,Wonderland (1997),"World of Apu, The (Apur Sansar) (1959)","Wrong Trousers, The (1993)",Wyatt Earp (1994),Year of the Horse (1997),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",unknown
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,1.0,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,2.0,,,,,,,,,...,,,,,,,,,,


In [125]:

def itemCf_mse():
    mse = 0
    for i,row in df_test[0:50].iterrows():
        user_id = row['user_id']
        movie_title = row['movie_title']
        rating = row['rating']
        predicted_rating = predict_rating_by_item(2,user_id,movie_title)
        diff = predicted_rating - rating
        mse += pow(diff,2)
        
    return mse
mse_itemcf = itemCf_mse()


44.0567

In [132]:
#calculate the rmse of top 50 testing columns
rmse_itemcf = mse_itemcf/50

0.881134

In [134]:
def userCf_mse():
    mse = 0
    for i,row in df_test[0:50].iterrows():
        user_id = row['user_id']
        movie_title = row['movie_title']
        rating = row['rating']
        predicted_rating = predict_rating_by_user(2,user_id,movie_title)
        diff = predicted_rating - rating
        mse += pow(diff,2)
        
    return mse
mse_usercf = userCf_mse()

In [None]:
rmse_usercf = mse_itemcf/50
print(rmse_usercf)