In [45]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Read Rating Data

In [3]:
# Load the ratings table. NOTE(review): this is an absolute, machine-specific
# path; point it at your own copy of ml-latest-small/ratings.csv before running.
rating_data = pd.read_csv(
    '/Users/ziranmin/Desktop/Sophia课/ml-latest-small/ratings.csv'
)
rating_data.shape

(100836, 4)

In [4]:
# Peek at the first five rows to confirm the columns that were loaded.
rating_data.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


# Reshape to Matrix

In [5]:
# Wide layout: one row per movieId, one column per userId, cells hold the rating
# (NaN wherever a user has no rating for that movie).
reshap_rating_data = rating_data.pivot(
    values='rating',
    index='movieId',
    columns='userId',
)

In [6]:
# Move movieId out of the index into an ordinary first column, and clear the
# leftover "userId" label on the column axis. rename_axis(None, axis=1) is used
# instead of `del rating_matrix.columns.name`, which raises AttributeError on
# pandas >= 1.0 (Index.name is a property without a deleter).
rating_matrix = reshap_rating_data.reset_index().rename_axis(None, axis=1)
rating_matrix.head()

Unnamed: 0,movieId,1,2,3,4,5,6,7,8,9,...,601,602,603,604,605,606,607,608,609,610
0,1,4.0,,,,4.0,,4.5,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
1,2,,,,,,4.0,,4.0,,...,,4.0,,5.0,3.5,,,2.0,,
2,3,4.0,,,,,5.0,,,,...,,,,,,,,2.0,,
3,4,,,,,,3.0,,,,...,,,,,,,,,,
4,5,,,,,,5.0,,,,...,,,,3.0,,,,,,


# Found a faster way to remove NaN pairs
# A boolean mask replaces the explicit for loop

# New Pearson

In [7]:
import math
def pearson(rating_matrix, id_1, id_2):
    """Pearson correlation between two users over the movies both have rated.

    Parameters
    ----------
    rating_matrix : mapping of column label -> ratings (e.g. a DataFrame)
        One column per user, one row per movie, NaN where a user has no rating.
    id_1, id_2 : column labels of the two users to compare.

    Returns
    -------
    float
        The correlation coefficient, or 0.0 when it is undefined: fewer than
        two co-rated movies, or one user's co-rated ratings are all identical.
    """
    # Work on plain float arrays: np.compress on a Series depended on
    # Series.compress, which no longer exists in pandas >= 1.0.
    ratings_1 = np.asarray(rating_matrix[id_1], dtype=float)
    ratings_2 = np.asarray(rating_matrix[id_2], dtype=float)

    # Keep only the movies that both users have rated.
    both_rated = ~(np.isnan(ratings_1) | np.isnan(ratings_2))
    common_1 = ratings_1[both_rated]
    common_2 = ratings_2[both_rated]

    # Special case: correlation needs at least two paired observations.
    if len(common_1) < 2:
        return 0.0

    # Special case: zero variance makes the coefficient 0/0; report 0 explicitly
    # instead of relying on a NaN produced under silenced warnings.
    if common_1.min() == common_1.max() or common_2.min() == common_2.max():
        return 0.0

    result = np.corrcoef(common_1, common_2)[0][1]

    # Safety net for any remaining undefined result.
    if np.isnan(result):
        return 0.0

    return float(result)

In [10]:
# Spot check: Pearson similarity between user 1 and user 5.
pearson(rating_matrix, id_1=1, id_2=5)

0.26874902634577136

# New Cosine

In [15]:
def cosine(rating_matrix, id_1, id_2):
    """Cosine similarity between two users over the movies both have rated.

    Parameters
    ----------
    rating_matrix : mapping of column label -> ratings (e.g. a DataFrame)
        One column per user, one row per movie, NaN where a user has no rating.
    id_1, id_2 : column labels of the two users to compare.

    Returns
    -------
    float
        dot(a, b) / (|a| * |b|) over the co-rated movies, or 0.0 when there
        are no co-rated movies or either co-rated vector has zero length.

    Notes
    -----
    Only co-rated movies enter the computation, so two users who share a single
    movie with positive ratings always score 1.0.
    """
    # Work on plain float arrays: np.compress on a Series depended on
    # Series.compress, which no longer exists in pandas >= 1.0.
    ratings_1 = np.asarray(rating_matrix[id_1], dtype=float)
    ratings_2 = np.asarray(rating_matrix[id_2], dtype=float)

    # Keep only the movies that both users have rated.
    both_rated = ~(np.isnan(ratings_1) | np.isnan(ratings_2))
    common_1 = ratings_1[both_rated]
    common_2 = ratings_2[both_rated]

    # Special case: nothing in common.
    if len(common_1) == 0:
        return 0.0

    # Special case: an all-zero vector would give 0/0 = NaN, and a NaN score
    # breaks the ordering of any ranking built on top of this function.
    denominator = np.linalg.norm(common_1) * np.linalg.norm(common_2)
    if denominator == 0:
        return 0.0

    return float(np.dot(common_1, common_2) / denominator)

# With the new similarity functions, Top Match runs much faster
# Top Match

In [76]:
def topMatch(rating_matrix, id_1, sim_function, n=20):
    """Rank the other users by similarity to `id_1` and return the best `n`.

    Parameters
    ----------
    rating_matrix : DataFrame
        First column is the movie id; every later column is one user's ratings.
    id_1 : column label of the reference user.
    sim_function : callable(rating_matrix, id_a, id_b) -> number
        Similarity measure, e.g. `pearson` or `cosine`.
    n : int, optional
        How many matches to return (default 20, the previous fixed cut-off).

    Returns
    -------
    list of (score, user_id) tuples, highest score first; ties are ordered by
    descending user id because the tuples are sorted as a whole.
    """
    # Skip the leading id column, then leave out the reference user.
    candidates = rating_matrix.columns.values.tolist()[1:]
    scored = [
        (sim_function(rating_matrix, id_1, other), other)
        for other in candidates
        if other != id_1
    ]
    return sorted(scored, reverse=True)[:n]

In [77]:
# Users most similar to user 1 under Pearson correlation.
topMatch(rating_matrix, id_1=1, sim_function=pearson)

[(1.0, 550),
 (1.0, 333),
 (1.0, 146),
 (1.0, 106),
 (0.9999999999999999, 598),
 (0.9622504486493764, 473),
 (0.9258200997725513, 511),
 (0.9185586535436918, 9),
 (0.87831006565368, 13),
 (0.8728715609439694, 366),
 (0.8669214468630109, 401),
 (0.8664002254439636, 535),
 (0.8660254037844387, 481),
 (0.8660254037844387, 154),
 (0.8215838362577493, 90),
 (0.8029550685469661, 499),
 (0.8017837257372732, 157),
 (0.790569415042095, 139),
 (0.7869358789643608, 476),
 (0.7745966692414833, 487)]

In [78]:
# Users most similar to user 1 under cosine similarity.
topMatch(rating_matrix, id_1=1, sim_function=cosine)

[(1.0, 388),
 (1.0, 383),
 (1.0, 358),
 (1.0, 315),
 (1.0, 291),
 (1.0, 253),
 (1.0, 245),
 (1.0, 184),
 (1.0, 85),
 (1.0, 77),
 (0.9999999999999999, 12),
 (0.9999999999999998, 2),
 (0.999193447324659, 511),
 (0.9988130559615213, 278),
 (0.9986560262721613, 550),
 (0.9986331420538472, 366),
 (0.9986178293325098, 472),
 (0.9981149841863163, 459),
 (0.9980597354377347, 114),
 (0.9978250350798125, 65)]

# Worked example from the video

https://www.bing.com/videos/search?q=recommendation+systems+collaborative+filtering+university+of+washington&&view=detail&mid=1DB77199CB8E00F3384B1DB77199CB8E00F3384B&&FORM=VDRVRV

In [17]:
# Small hand-made example used to check the recommender. NOTE(review): this is
# an absolute, machine-specific path; adjust it to where example.csv lives.
df_0 = pd.read_csv(
    '/Users/ziranmin/Desktop/Sophia课/ml-latest-small/example.csv'
)

In [18]:
# Wide layout for the example: rows are movieId, columns are userId, cells are ratings.
df_1 = df_0.pivot(
    values='rating',
    index='movieId',
    columns='userId',
)

In [19]:
# Move movieId out of the index into an ordinary first column, and clear the
# leftover "userId" label on the column axis. rename_axis(None, axis=1) is used
# instead of `del df_2.columns.name`, which raises AttributeError on
# pandas >= 1.0 (Index.name is a property without a deleter).
df_2 = df_1.reset_index().rename_axis(None, axis=1)
df_2

Unnamed: 0,movieId,1,2,3,4,5,6,7
0,1,2.5,3.0,2.5,,3.0,3.0,
1,2,3.5,3.5,3.0,3.5,4.0,4.0,4.5
2,3,3.0,1.5,,3.0,2.0,,
3,4,3.5,5.0,3.5,4.0,3.0,5.0,4.0
4,5,2.5,3.5,,2.5,2.0,3.5,1.0
5,6,3.0,3.0,4.0,4.5,3.0,3.0,


# Recommend Function 

In [42]:
def recommend(df_2, user, sim_function, normalize=False):
    """Score the movies `user` has not rated using the other users' ratings.

    Each unrated movie gets the sum, over all other users, of
    (similarity to `user`) * (that user's rating), with missing ratings
    counted as 0.

    Parameters
    ----------
    df_2 : DataFrame
        Rating matrix with a 'movieId' column and one column per user
        (NaN where a user has no rating).
    user : column label of the user to recommend for.
    sim_function : callable(df_2, id_a, id_b) -> number
        Similarity measure, e.g. `pearson` or `cosine`.
    normalize : bool, optional
        False (default) returns the raw weighted sums, which grow with the
        number of users who rated a movie. True divides each sum by the total
        absolute similarity of the users who actually rated that movie, giving
        a similarity-weighted average rating; movies with zero total weight
        score 0.

    Returns
    -------
    list of (score, movieId) tuples, highest score first.
    """
    # Pick the comparison users by label so the rating columns and the
    # similarity vector are guaranteed to be in the same order, wherever the
    # 'movieId' column happens to sit.
    other_users = [
        col for col in df_2.columns.tolist()
        if col != 'movieId' and col != user
    ]

    # Rows (movies) the target user has not rated yet.
    unseen = df_2[user].isnull()
    unseen_ratings = df_2.loc[unseen, other_users]
    unseen_id = df_2.loc[unseen, 'movieId'].tolist()

    similarities = np.array(
        [sim_function(df_2, user, other) for other in other_users],
        dtype=float,
    )

    # (movies x users) @ (users,) -> one weighted sum per unseen movie.
    scores = np.asarray(unseen_ratings.fillna(0), dtype=float) @ similarities

    if normalize:
        # Total |similarity| of the users who rated each movie.
        rated = np.asarray(unseen_ratings.notna(), dtype=float)
        weight = rated @ np.abs(similarities)
        scores = np.divide(
            scores, weight, out=np.zeros_like(scores), where=weight > 0
        )

    return sorted(zip(scores.tolist(), unseen_id), reverse=True)

# Recommend Movie for User 7

In [43]:
# Rank user 7's unrated movies in the example matrix using Pearson similarity.
recommend(df_2, user=7, sim_function=pearson)

[(8.899751858472692, 6), (8.074754105841562, 3), (5.883808341404687, 1)]

In [44]:
# Rank user 7's unrated movies in the example matrix using cosine similarity.
recommend(df_2, user=7, sim_function=cosine)

[(19.57976491173302, 6), (13.317747025677896, 1), (9.043215897374766, 3)]

# Back to Large Dataset

# Top 10 Recommendations for user 1 by Pearson

In [79]:
# Ten highest-scoring unrated movies for user 1 on the full dataset (Pearson).
recommend(rating_matrix, user=1, sim_function=pearson)[:10]

[(181.4308362928555, 318),
 (127.66425274604327, 4993),
 (116.75905021212643, 2762),
 (112.05535688493067, 5952),
 (111.54988474188364, 7153),
 (102.84307756959298, 589),
 (99.39252598597905, 150),
 (97.69001519793534, 32),
 (96.62926045097862, 858),
 (94.61511411002596, 4226)]

# Top 10 Recommendations for user 1 by Cosine

In [59]:
# Ten highest-scoring unrated movies for user 1 on the full dataset (cosine).
recommend(rating_matrix, user=1, sim_function=cosine)[:10]

[(1355.9595822792298, 318),
 (857.3599987171442, 589),
 (790.6650063159748, 858),
 (778.9077160623111, 4993),
 (745.8677915399023, 150),
 (729.8757903467765, 7153),
 (723.2136524022485, 5952),
 (681.0802765058897, 32),
 (670.3355800317212, 2762),
 (663.2921179184383, 588)]

# Both similarity measures give similar top-10 results for user 1 (9 of the 10 movies appear in both lists, in a different order)