In [93]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Read Rating Data

In [94]:
# Load the ratings table from CSV.
# NOTE(review): the path is absolute and specific to one machine's home
# directory; it has to be adjusted before the notebook can run elsewhere.
rating_data=pd.read_csv('/Users/ziranmin/Desktop/Sophia课/ml-latest-small/ratings.csv')
# (rows, columns) of the loaded table.
rating_data.shape

(100836, 4)

In [95]:
# Preview the first rows to check the column layout.
rating_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


# Reshape to Matrix

In [96]:
# Pivot the long-format ratings into a movie x user table: one row per
# movieId, one column per userId, each cell holding that user's rating
# (NaN where the user has not rated the movie).
reshap_rating_data = rating_data.pivot(index='movieId', columns='userId', values='rating')

In [97]:
# Turn the movieId index back into an ordinary first column.
rating_matrix = reshap_rating_data.reset_index()
# Clear the leftover 'userId' label on the column axis. Assigning None works
# on every pandas version, whereas `del ...columns.name` raises AttributeError
# on versions where Index.name is a property without a deleter.
rating_matrix.columns.name = None
rating_matrix.head()

Unnamed: 0,movieId,1,2,3,4,5,6,7,8,9,...,601,602,603,604,605,606,607,608,609,610
0,1,4.0,,,,4.0,,4.5,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
1,2,,,,,,4.0,,4.0,,...,,4.0,,5.0,3.5,,,2.0,,
2,3,4.0,,,,,5.0,,,,...,,,,,,,,2.0,,
3,4,,,,,,3.0,,,,...,,,,,,,,,,
4,5,,,,,,5.0,,,,...,,,,3.0,,,,,,


# Found a faster method to remove NaN pairs
# No need to use a for loop

# New Pearson

In [98]:
import math
def pearson(rating_matrix, id_1, id_2):
    """Pearson correlation between two users over the movies both have rated.

    Parameters
    ----------
    rating_matrix : pandas.DataFrame
        Table with one column per user; NaN marks a movie the user has not
        rated.
    id_1, id_2 : column labels
        The two user columns to compare.

    Returns
    -------
    float or int
        The correlation coefficient, or 0 when it is undefined: no or only
        one co-rated movie, or one of the users gave the same rating to every
        co-rated movie (zero variance).
    """
    # Work on plain float arrays so the masking below does not depend on
    # how the installed pandas version handles np.compress on a Series.
    user_1 = np.asarray(rating_matrix[id_1], dtype=float)
    user_2 = np.asarray(rating_matrix[id_2], dtype=float)

    # True where BOTH users have a rating.
    both_rated = ~(np.isnan(user_1) | np.isnan(user_2))
    common_1 = user_1[both_rated]
    common_2 = user_2[both_rated]

    # Special case: correlation needs at least two points.
    if common_1.size < 2:
        return 0

    # Special case: a constant vector has zero variance, so the coefficient
    # is undefined. Checking here avoids the RuntimeWarnings np.corrcoef
    # would emit before returning NaN.
    if common_1.max() == common_1.min() or common_2.max() == common_2.min():
        return 0

    result = np.corrcoef(common_1, common_2)[0][1]

    # Safety net for any remaining undefined result.
    if np.isnan(result):
        return 0

    return result

# New Cosine

In [99]:
def cosine(rating_matrix, id_1, id_2):
    """Cosine similarity between two users over the movies both have rated.

    Parameters
    ----------
    rating_matrix : pandas.DataFrame
        Table with one column per user; NaN marks a movie the user has not
        rated.
    id_1, id_2 : column labels
        The two user columns to compare.

    Returns
    -------
    float or int
        dot(a, b) / (|a| * |b|) computed on the co-rated ratings only, or 0
        when there is no co-rated movie or one of the vectors has zero norm.
    """
    # Work on plain float arrays so the masking below does not depend on
    # how the installed pandas version handles np.compress on a Series.
    user_1 = np.asarray(rating_matrix[id_1], dtype=float)
    user_2 = np.asarray(rating_matrix[id_2], dtype=float)

    # True where BOTH users have a rating.
    both_rated = ~(np.isnan(user_1) | np.isnan(user_2))
    common_1 = user_1[both_rated]
    common_2 = user_2[both_rated]

    # Special case: nothing in common.
    if common_1.size == 0:
        return 0

    denominator = np.linalg.norm(common_1) * np.linalg.norm(common_2)

    # Special case: an all-zero vector would make this 0/0 (NaN plus a
    # RuntimeWarning). Return 0, matching how pearson() reports an undefined
    # similarity.
    if denominator == 0:
        return 0

    return np.dot(common_1, common_2) / denominator

# With the new similarity functions, Top Match runs much faster
# Top Match

In [100]:
def topMatch(rating_matrix, id_1, sim_function, n=20):
    """Rank the other users by their similarity to ``id_1``.

    Parameters
    ----------
    rating_matrix : pandas.DataFrame
        Table whose first column is skipped (the movie id column in this
        notebook) and whose remaining columns are treated as users.
    id_1 : column label
        The user to find matches for.
    sim_function : callable
        Called as ``sim_function(rating_matrix, id_1, other)`` and expected
        to return a comparable score (higher means more similar).
    n : int or None, optional
        How many matches to return. Defaults to 20; ``None`` returns the
        full ranking.

    Returns
    -------
    list of (score, user) tuples
        Sorted in descending order. Ties on score are broken by the larger
        user label first, because the tuples are compared as a whole.
    """
    # Skip the first column: it is not a user.
    candidates = rating_matrix.columns.values.tolist()[1:]

    ranking = [
        (sim_function(rating_matrix, id_1, other), other)
        for other in candidates
        if other != id_1
    ]
    ranking.sort(reverse=True)

    return ranking if n is None else ranking[:n]

In [101]:
# The 20 users most similar to user 1 under Pearson correlation,
# as (score, userId) pairs in descending order.
topMatch(rating_matrix, 1, pearson)

[(1.0, 550),
 (1.0, 333),
 (1.0, 146),
 (1.0, 106),
 (0.9999999999999999, 598),
 (0.9622504486493764, 473),
 (0.9258200997725513, 511),
 (0.9185586535436918, 9),
 (0.87831006565368, 13),
 (0.8728715609439694, 366),
 (0.8669214468630109, 401),
 (0.8664002254439636, 535),
 (0.8660254037844387, 481),
 (0.8660254037844387, 154),
 (0.8215838362577493, 90),
 (0.8029550685469661, 499),
 (0.8017837257372732, 157),
 (0.790569415042095, 139),
 (0.7869358789643608, 476),
 (0.7745966692414833, 487)]

In [102]:
# Same ranking for user 1, scored with cosine similarity instead.
topMatch(rating_matrix, 1, cosine)

[(1.0, 388),
 (1.0, 383),
 (1.0, 358),
 (1.0, 315),
 (1.0, 291),
 (1.0, 253),
 (1.0, 245),
 (1.0, 184),
 (1.0, 85),
 (1.0, 77),
 (0.9999999999999999, 12),
 (0.9999999999999998, 2),
 (0.999193447324659, 511),
 (0.9988130559615213, 278),
 (0.9986560262721613, 550),
 (0.9986331420538472, 366),
 (0.9986178293325098, 472),
 (0.9981149841863163, 459),
 (0.9980597354377347, 114),
 (0.9978250350798125, 65)]

# Example in video 

https://www.bing.com/videos/search?q=recommendation+systems+collaborative+filtering+university+of+washington&&view=detail&mid=1DB77199CB8E00F3384B1DB77199CB8E00F3384B&&FORM=VDRVRV

In [103]:
# Load the small example ratings table (long format, like ratings.csv).
# NOTE(review): the path is absolute and specific to one machine's home
# directory; it has to be adjusted before the notebook can run elsewhere.
df_0=pd.read_csv('/Users/ziranmin/Desktop/Sophia课/ml-latest-small/example.csv')

In [104]:
# Pivot the example ratings into a movie x user table (rows: movieId,
# columns: userId, values: rating).
df_1 = df_0.pivot(index='movieId', columns='userId', values='rating')

In [105]:
# Turn the movieId index back into an ordinary first column.
df_2 = df_1.reset_index()
# Clear the leftover 'userId' label on the column axis. Assigning None works
# on every pandas version, whereas `del ...columns.name` raises AttributeError
# on versions where Index.name is a property without a deleter.
df_2.columns.name = None
df_2

Unnamed: 0,movieId,1,2,3,4,5,6,7
0,1,2.5,3.0,2.5,,3.0,3.0,
1,2,3.5,3.5,3.0,3.5,4.0,4.0,4.5
2,3,3.0,1.5,,3.0,2.0,,
3,4,3.5,5.0,3.5,4.0,3.0,5.0,4.0
4,5,2.5,3.5,,2.5,2.0,3.5,1.0
5,6,3.0,3.0,4.0,4.5,3.0,3.0,


# Recommend Function 

In [106]:
def recommend(df_2, user, sim_function):
    total_user_num = df_2.shape[1]
    
    A = df_2.loc[df_2[user].isnull()].drop(['movieId',user], axis=1).fillna(0).values
    
    sim_l = []
    for i in df_2.columns.values.tolist()[1:]:
        if i != user:
            curr_sim = sim_function(df_2, user, i)
            sim_l += [curr_sim]
            
    sim_l_array = np.array(sim_l)
    B = sim_l_array.reshape((total_user_num -2,1))
    
    sum_score = np.dot(A,B)
    
    watched_unwatched = A.copy()
    watched_unwatched[watched_unwatched>0] = 1
    sum_sim = np.dot(watched_unwatched, B)
    weighted_score = sum_score/sum_sim
    
    unseen_id = df_2.loc[df_2[user].isnull()]['movieId'].tolist()
    
    result_rank = []
    for i in range(len(unseen_id)):
        result_rank += [(weighted_score[i][0], unseen_id[i])]
        
    result_rank.sort(reverse = True)
    return result_rank

# Recommend Movie for User 7

In [107]:
# Rank the movies user 7 has not rated in the example table, weighting the
# other users by Pearson similarity.
recommend(df_2, 7, pearson)

[(3.1192015867855525, 6), (3.0022347306071273, 1), (2.5309807037655645, 3)]

In [108]:
# Same recommendation for user 7, weighting by cosine similarity.
recommend(df_2, 7, cosine)

[(3.423868647883322, 6), (2.7959647128789165, 1), (2.382318132517556, 3)]

# Back to Large Dataset

# Top Recommendations for User 1 by Pearson (first 80 shown)

In [109]:
# Spot-check: Pearson similarity between users 1 and 86 on the full data.
pearson(rating_matrix,1,86)

0.23742778106092

In [110]:
# First 80 entries of the Pearson-weighted ranking of movies user 1 has not rated.
recommend(rating_matrix, 1, pearson)[:80]

[(78.92869344315301, 1312),
 (56.29599486428759, 193),
 (54.28591122451324, 634),
 (36.4032268281137, 30816),
 (32.84226694454812, 4440),
 (31.011681906297962, 3046),
 (31.011681906297962, 1406),
 (30.570386845499826, 704),
 (26.741059596132477, 181),
 (22.653141446063774, 680),
 (18.60536683152634, 56336),
 (18.123028191510876, 881),
 (17.293178826790406, 6619),
 (16.613930401415484, 645),
 (12.231688207657403, 49910),
 (11.562507391059407, 711),
 (10.678374958619793, 5643),
 (10.405907946885208, 619),
 (10.35040567869407, 5696),
 (9.05123263538052, 86),
 (8.940151826530647, 667),
 (8.698411354324147, 5420),
 (8.672632703143368, 15),
 (8.542595960155584, 8531),
 (8.451451218442825, 1670),
 (8.164749760339003, 935),
 (7.19032237318008, 126),
 (7.035619528010808, 281),
 (6.90645231468517, 3787),
 (6.551884912150186, 6078),
 (6.121041841199566, 3884),
 (6.078581703816919, 3283),
 (5.978846123427465, 452),
 (5.9747006159752996, 3324),
 (5.761927947380438, 7380),
 (5.72008893315892, 3364),

# Top Recommendations for User 1 by Cosine (first 60 shown)

In [111]:
# First 60 entries of the cosine-weighted ranking of movies user 1 has not rated.
recommend(rating_matrix, 1, cosine)[:60]

[(5.000000000000001, 141718),
 (5.000000000000001, 86721),
 (5.000000000000001, 44943),
 (5.000000000000001, 6192),
 (5.000000000000001, 5745),
 (5.000000000000001, 5723),
 (5.000000000000001, 5607),
 (5.000000000000001, 5241),
 (5.000000000000001, 5088),
 (5.000000000000001, 3096),
 (5.000000000000001, 2824),
 (5.0, 158027),
 (5.0, 147410),
 (5.0, 146684),
 (5.0, 146662),
 (5.0, 143559),
 (5.0, 143031),
 (5.0, 142444),
 (5.0, 139640),
 (5.0, 138966),
 (5.0, 6611),
 (5.0, 6408),
 (5.0, 6402),
 (5.0, 6201),
 (5.0, 6086),
 (5.0, 6021),
 (5.0, 5746),
 (5.0, 5537),
 (5.0, 5513),
 (5.0, 5490),
 (5.0, 5468),
 (5.0, 5416),
 (5.0, 5328),
 (5.0, 5244),
 (5.0, 5059),
 (5.0, 3086),
 (5.0, 3073),
 (5.0, 2972),
 (5.0, 2969),
 (5.0, 2512),
 (5.0, 2196),
 (5.0, 2075),
 (5.0, 1759),
 (5.0, 1631),
 (5.0, 1349),
 (5.0, 1310),
 (5.0, 1151),
 (5.0, 1140),
 (5.0, 876),
 (5.0, 633),
 (5.0, 496),
 (5.0, 495),
 (5.0, 467),
 (5.0, 148),
 (5.0, 99),
 (4.999999999999999, 6442),
 (4.999999999999999, 5889),
 (4.99