In [169]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read Rating Data

In [170]:
rating_data=pd.read_csv('/Users/ziranmin/Desktop/Sophia课/ml-latest-small/ratings.csv')
rating_data.shape

(100836, 4)

In [171]:
rating_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


# Reshape to Matrix

In [172]:
reshap_rating_data = rating_data.pivot(index='movieId', columns='userId', values='rating')

In [173]:
rating_matrix = reshap_rating_data.reset_index()
del rating_matrix.columns.name
rating_matrix.head()

Unnamed: 0,movieId,1,2,3,4,5,6,7,8,9,...,601,602,603,604,605,606,607,608,609,610
0,1,4.0,,,,4.0,,4.5,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
1,2,,,,,,4.0,,4.0,,...,,4.0,,5.0,3.5,,,2.0,,
2,3,4.0,,,,,5.0,,,,...,,,,,,,,2.0,,
3,4,,,,,,3.0,,,,...,,,,,,,,,,
4,5,,,,,,5.0,,,,...,,,,3.0,,,,,,


# Pearson

In [174]:
import math
def pearson(rating_matrix, id_1, id_2):
    # find movies that both users have rated
    user_1 = rating_matrix[id_1]
    user_2 = rating_matrix[id_2]
    
    bad = ~np.logical_or(np.isnan(user_1), np.isnan(user_2))
    
    user_1_l = np.compress(bad, user_1)
    user_2_l = np.compress(bad, user_2)
    
    #special case
    if len(user_1_l) == 0:
        return 0
    
    result = np.corrcoef(user_1_l,user_2_l)[0][1]
    
    #special case
    if math.isnan(result):
        return 0

    return result

# Cosine

In [175]:
def cosine(rating_matrix, id_1, id_2):
    # find movies that both users have rated
    user_1 = rating_matrix[id_1]
    user_2 = rating_matrix[id_2]
    
    bad = ~np.logical_or(np.isnan(user_1), np.isnan(user_2))
    
    user_1_l = np.compress(bad, user_1)
    user_2_l = np.compress(bad, user_2)
            
    #special case
    if len(user_1_l) == 0:
        return 0

    
    dot_product = np.dot(user_1_l, user_2_l)
    norm_a = np.linalg.norm(user_1_l)
    norm_b = np.linalg.norm(user_2_l)
    return dot_product / (norm_a * norm_b)

# Top Match

In [176]:
def topMatch(rating_matrix, id_1, sim_function):
    best_id = 0
    best_sim = -10
    for i in rating_matrix.columns.values.tolist()[1:]:
        if i != id_1:
            current_score = sim_function(rating_matrix, id_1, i)
            if current_score > best_sim:
                best_sim = current_score
                best_id = i
    return best_id

In [177]:
topMatch(rating_matrix, 1, cosine)

77

# Example in video 

https://www.bing.com/videos/search?q=recommendation+systems+collaborative+filtering+university+of+washington&&view=detail&mid=1DB77199CB8E00F3384B1DB77199CB8E00F3384B&&FORM=VDRVRV

In [178]:
df_0=pd.read_csv('/Users/ziranmin/Desktop/Sophia课/ml-latest-small/example.csv')

In [179]:
df_1 = df_0.pivot(index='movieId', columns='userId', values='rating')

In [180]:
df_2 = df_1.reset_index()
del df_2.columns.name
df_2

Unnamed: 0,movieId,1,2,3,4,5,6,7
0,1,2.5,3.0,2.5,,3.0,3.0,
1,2,3.5,3.5,3.0,3.5,4.0,4.0,4.5
2,3,3.0,1.5,,3.0,2.0,,
3,4,3.5,5.0,3.5,4.0,3.0,5.0,4.0
4,5,2.5,3.5,,2.5,2.0,3.5,1.0
5,6,3.0,3.0,4.0,4.5,3.0,3.0,


# Recommend Movie for user 7

# Find Movie (with ratings) user 7 hasn't seen

In [181]:
df_2.loc[df_2[7].isnull()]

Unnamed: 0,movieId,1,2,3,4,5,6,7
0,1,2.5,3.0,2.5,,3.0,3.0,
2,3,3.0,1.5,,3.0,2.0,,
5,6,3.0,3.0,4.0,4.5,3.0,3.0,


In [182]:
df_2.loc[df_2[7].isnull()].drop(['movieId',7], axis=1).fillna(0)

Unnamed: 0,1,2,3,4,5,6
0,2.5,3.0,2.5,0.0,3.0,3.0
2,3.0,1.5,0.0,3.0,2.0,0.0
5,3.0,3.0,4.0,4.5,3.0,3.0


In [183]:
df_2.loc[df_2[7].isnull()].drop(['movieId',7], axis=1).fillna(0).values

array([[2.5, 3. , 2.5, 0. , 3. , 3. ],
       [3. , 1.5, 0. , 3. , 2. , 0. ],
       [3. , 3. , 4. , 4.5, 3. , 3. ]])

In [184]:
A = df_2.loc[df_2[7].isnull()].drop(['movieId',7], axis=1).fillna(0).values

# Calculate user 7's similarity with other users

In [185]:
sim_l = []
for i in df_2.columns.values.tolist()[1:]:
    if i != 7:
        curr_sim = pearson(df_2, 7, i)
        sim_l += [curr_sim]

In [186]:
sim_l

[0.9912407071619304,
 0.3812464258315117,
 -0.9999999999999999,
 0.8934051474415642,
 0.924473451641905,
 0.6628489803598702]

In [187]:
sim_l_array = np.array(sim_l)

In [188]:
sim_l_array.reshape((6,1))

array([[ 0.99124071],
       [ 0.38124643],
       [-1.        ],
       [ 0.89340515],
       [ 0.92447345],
       [ 0.66284898]])

In [189]:
B = sim_l_array.reshape((6,1))

# Use Dot Product to sum ratings multiplied by similarities for unwatched movies

In [190]:
sum_score = np.dot(A,B)
sum_score

array([[5.88380834],
       [8.07475411],
       [8.89975186]])

# Sum Similarity

In [191]:
watched_unwatched = A.copy()

In [192]:
watched_unwatched[watched_unwatched>0] = 1

In [193]:
watched_unwatched

array([[1., 1., 1., 0., 1., 1.],
       [1., 1., 0., 1., 1., 0.],
       [1., 1., 1., 1., 1., 1.]])

In [194]:
sum_sim = np.dot(watched_unwatched, B)

In [195]:
sum_sim

array([[1.95980956],
       [3.19036573],
       [2.85321471]])

# Get Weighted Rating 

In [196]:
weighted_score = sum_score/sum_sim

In [197]:
weighted_score

array([[3.00223473],
       [2.5309807 ],
       [3.11920159]])

In [198]:
unseen_id = df_2.loc[df_2[7].isnull()]['movieId'].tolist()
unseen_id

[1, 3, 6]

In [199]:
result_rank = []
for i in range(len(unseen_id)):
    result_rank += [(weighted_score[i][0], unseen_id[i])]

In [200]:
result_rank.sort(reverse = True)

In [201]:
result_rank

[(3.1192015867855525, 6), (3.0022347306071273, 1), (2.5309807037655645, 3)]

# Try RMSE

In [202]:
AA = df_2.loc[df_2[7].notna()].drop(['movieId',7], axis=1).fillna(0).values
AA

array([[3.5, 3.5, 3. , 3.5, 4. , 4. ],
       [3.5, 5. , 3.5, 4. , 3. , 5. ],
       [2.5, 3.5, 0. , 2.5, 2. , 3.5]])

In [203]:
sim_l = []
for i in df_2.columns.values.tolist()[1:]:
    if i != 7:
        curr_sim = pearson(df_2, 7, i)
        sim_l += [curr_sim]
        
sim_l_array = np.array(sim_l)
BB = sim_l_array.reshape((6,1))
BB

array([[ 0.99124071],
       [ 0.38124643],
       [-1.        ],
       [ 0.89340515],
       [ 0.92447345],
       [ 0.66284898]])

In [204]:
total_sum_score = np.dot(AA,BB)
total_sum_score

array([[11.27991271],
       [11.53686045],
       [10.21489546]])

In [205]:
total_watched_unwatched = AA.copy()
total_watched_unwatched[total_watched_unwatched>0] = 1
total_watched_unwatched

array([[1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1.],
       [1., 1., 0., 1., 1., 1.]])

In [206]:
total_sum_sim = np.dot(total_watched_unwatched, BB)
total_sum_sim

array([[2.85321471],
       [2.85321471],
       [3.85321471]])

In [207]:
tol_weighted_score = total_sum_score/total_sum_sim
tol_weighted_score

array([[3.95340479],
       [4.04346031],
       [2.65100604]])

In [208]:
seen_index = df_2.loc[df_2[7].notna()].index.tolist()
seen_index

[1, 3, 4]

In [209]:
actual_rated_score = np.array(df_2.loc[df_2[7].notna()][7]).reshape(-1,1)
actual_rated_score

array([[4.5],
       [4. ],
       [1. ]])

In [210]:
square_error = np.square(tol_weighted_score - actual_rated_score)
square_error

array([[2.98766324e-01],
       [1.88879870e-03],
       [2.72582093e+00]])

In [211]:
np.sqrt(sum(square_error)/len(square_error))

array([1.00440298])

# Single Row RMSE

In [212]:
def single_row_rmse(df_2, user, sim_function):
    total_user_num = df_2.shape[1]
    AA = df_2.loc[df_2[user].notna()].drop(['movieId',user], axis=1).fillna(0).values
    
    sim_l = []
    for i in df_2.columns.values.tolist()[1:]:
        if i != user:
            curr_sim = pearson(df_2, user, i)
            sim_l += [curr_sim]
        
    sim_l_array = np.array(sim_l)
    BB = sim_l_array.reshape((total_user_num-2,1))
    
    total_sum_score = np.dot(AA, BB)
    total_watched_unwatched = AA.copy()
    total_watched_unwatched[total_watched_unwatched>0] = 1
    
    total_sum_sim = np.dot(total_watched_unwatched, BB)
    
    tol_weighted_score = total_sum_score/total_sum_sim
    
    actual_rated_score = np.array(df_2.loc[df_2[user].notna()][user]).reshape(-1,1)
    
    square_error = np.square(tol_weighted_score - actual_rated_score)
    
    return np.sqrt(sum(square_error)/len(square_error))

In [213]:
single_row_rmse(df_2, 7, pearson)

array([1.00440298])

In [214]:
for i in range(1,8):
    print(single_row_rmse(df_2, i, pearson))

[0.50122282]
[0.73040852]
[1.0694377]
[0.65412783]
[0.57797704]
[0.63343321]
[1.00440298]


In [217]:
for i in range(1,8):
    print(single_row_rmse(df_2, i, cosine))

[0.50122282]
[0.73040852]
[1.0694377]
[0.65412783]
[0.57797704]
[0.63343321]
[1.00440298]


In [215]:
def single_row_rmse_sum(df_2, user, sim_function):
    total_user_num = df_2.shape[1]
    AA = df_2.loc[df_2[user].notna()].drop(['movieId',user], axis=1).fillna(0).values
    
    sim_l = []
    for i in df_2.columns.values.tolist()[1:]:
        if i != user:
            curr_sim = pearson(df_2, user, i)
            sim_l += [curr_sim]
        
    sim_l_array = np.array(sim_l)
    BB = sim_l_array.reshape((total_user_num-2,1))
    
    total_sum_score = np.dot(AA, BB)
    total_watched_unwatched = AA.copy()
    total_watched_unwatched[total_watched_unwatched>0] = 1
    
    total_sum_sim = np.dot(total_watched_unwatched, BB)
    
    tol_weighted_score = total_sum_score/total_sum_sim
    
    actual_rated_score = np.array(df_2.loc[df_2[user].notna()][user]).reshape(-1,1)
    
    square_error = np.square(tol_weighted_score - actual_rated_score)
    
    return sum(square_error)

In [97]:
def total_rmse(df_2,sim_function):
    total_rating_num = sum(df_2.drop(['movieId'], axis=1).notna().sum())
    user_list = df_2.columns.tolist()[1:]
    square_error = 0
    for i in user_list:
        square_error += single_row_rmse_sum(df_2, i, sim_function)
    return np.sqrt(square_error/total_rating_num)

In [216]:
total_rmse(df_2,pearson)

array([0.7262337])

In [218]:
total_rmse(df_2,cosine)

array([0.7262337])

# Try larger dataset

In [219]:
single_row_rmse(rating_matrix, 2, pearson)

  c /= stddev[:, None]
  c = cov(x, y, rowvar)
  c *= 1. / np.float64(fact)
  c *= 1. / np.float64(fact)
  c /= stddev[None, :]


array([nan])

In [220]:
total_user_num = rating_matrix.shape[1]

In [221]:
AA = rating_matrix.loc[rating_matrix[1].notna()].drop(['movieId',1], axis=1).fillna(0).values
AA

array([[0. , 0. , 0. , ..., 2.5, 3. , 5. ],
       [0. , 0. , 0. , ..., 2. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 5. ],
       ...,
       [0. , 0. , 3. , ..., 1.5, 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [222]:
sim_l = []
for i in rating_matrix.columns.values.tolist()[1:]:
    if i != 1:
        curr_sim = pearson(rating_matrix, 1, i)
        sim_l += [curr_sim]

  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar)
  c *= 1. / np.float64(fact)
  c *= 1. / np.float64(fact)


In [223]:
len(sim_l)

609

In [224]:
sim_l_array = np.array(sim_l)

In [225]:
BB = sim_l_array.reshape((total_user_num-2,1))

In [226]:
total_sum_score = np.dot(AA, BB)

In [227]:
total_watched_unwatched = AA.copy()
total_watched_unwatched[total_watched_unwatched>0] = 1

In [228]:
total_watched_unwatched

array([[0., 0., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 1., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [229]:
total_sum_sim = np.dot(total_watched_unwatched, BB)

In [230]:
tol_weighted_score = total_sum_score/total_sum_sim

  """Entry point for launching an IPython kernel.


In [231]:
actual_rated_score = np.array(rating_matrix.loc[rating_matrix[1].notna()][1]).reshape(-1,1)


In [232]:
square_error = np.square(tol_weighted_score - actual_rated_score)


In [233]:
np.sqrt(sum(square_error)/len(square_error))

array([nan])

# Find NaN

In [250]:
sum(square_error)

array([nan])

In [251]:
np.argwhere(np.isnan((total_sum_score/total_sum_sim)))

  """Entry point for launching an IPython kernel.


array([[96,  0]])

In [252]:
tol_weighted_score[96]

array([nan])

In [253]:
total_sum_score[96]

array([0.])

In [258]:
total_sum_sim[96]

array([0.])

In [256]:
AA[96]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.