In [6]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read Rating Data

In [68]:
rating_data=pd.read_csv('/Users/ziranmin/Desktop/Sophia课/ml-latest-small/ratings.csv')
rating_data.shape

(100836, 4)

In [174]:
rating_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


# Reshape to Matrix

In [70]:
reshap_rating_data = rating_data.pivot(index='movieId', columns='userId', values='rating')

In [71]:
rating_matrix = reshap_rating_data.reset_index()
del rating_matrix.columns.name
rating_matrix.head()

Unnamed: 0,movieId,1,2,3,4,5,6,7,8,9,...,601,602,603,604,605,606,607,608,609,610
0,1,4.0,,,,4.0,,4.5,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
1,2,,,,,,4.0,,4.0,,...,,4.0,,5.0,3.5,,,2.0,,
2,3,4.0,,,,,5.0,,,,...,,,,,,,,2.0,,
3,4,,,,,,3.0,,,,...,,,,,,,,,,
4,5,,,,,,5.0,,,,...,,,,3.0,,,,,,


# Pearson

In [137]:
import math
def pearson(rating_matrix, id_1, id_2):
    # find movies that both users have rated
    user_1 = rating_matrix[id_1]
    user_2 = rating_matrix[id_2]
    
    user_1_l = []
    user_2_l = []
    
    for i in range(len(user_1)):
        if math.isnan(user_1[i]) == False and math.isnan(user_2[i]) == False:
            user_1_l += [user_1[i]]
            user_2_l += [user_2[i]]
    
    #special case
    if len(user_1_l) == 0:
        return 0
    
    result = np.corrcoef(user_1_l,user_2_l)[0][1]
    
    #special case
    if math.isnan(result):
        return 0

    return result

# Cosine

In [162]:
def cosine(rating_matrix, id_1, id_2):
    # find movies that both users have rated
    user_1 = rating_matrix[id_1]
    user_2 = rating_matrix[id_2]
    
    user_1_l = []
    user_2_l = []
    
    for i in range(len(user_1)):
        if math.isnan(user_1[i]) == False and math.isnan(user_2[i]) == False:
            user_1_l += [user_1[i]]
            user_2_l += [user_2[i]]
            
    #special case
    if len(user_1_l) == 0:
        return 0

    
    dot_product = np.dot(user_1_l, user_2_l)
    norm_a = np.linalg.norm(user_1_l)
    norm_b = np.linalg.norm(user_2_l)
    return dot_product / (norm_a * norm_b)

# Top Match

In [158]:
def topMatch(rating_matrix, id_1, sim_function):
    best_id = 0
    best_sim = -10
    for i in rating_matrix.columns.values.tolist()[1:]:
        if i != id_1:
            current_score = sim_function(rating_matrix, id_1, i)
            if current_score > best_sim:
                best_sim = current_score
                best_id = i
    return best_id

In [163]:
topMatch(rating_matrix, 1, cosine)

0

# Example in video 

https://www.bing.com/videos/search?q=recommendation+systems+collaborative+filtering+university+of+washington&&view=detail&mid=1DB77199CB8E00F3384B1DB77199CB8E00F3384B&&FORM=VDRVRV

In [155]:
df_0=pd.read_csv('/Users/ziranmin/Desktop/Sophia课/ml-latest-small/example.csv')

In [156]:
df_1 = df_0.pivot(index='movieId', columns='userId', values='rating')

In [157]:
df_2 = df_1.reset_index()
del df_2.columns.name
df_2

Unnamed: 0,movieId,1,2,3,4,5,6,7
0,1,2.5,3.0,2.5,,3.0,3.0,
1,2,3.5,3.5,3.0,3.5,4.0,4.0,4.5
2,3,3.0,1.5,,3.0,2.0,,
3,4,3.5,5.0,3.5,4.0,3.0,5.0,4.0
4,5,2.5,3.5,,2.5,2.0,3.5,1.0
5,6,3.0,3.0,4.0,4.5,3.0,3.0,


# Add Movie Name DataFrame

In [251]:
d = {'moiveId': [1, 2,3,4,5,6], 'name': ['lady in the water','snakes on a plane','just my luck','superman returns','you, me and dupree','the night listener']}
movie = pd.DataFrame(data=d)
movie

Unnamed: 0,moiveId,name
0,1,lady in the water
1,2,snakes on a plane
2,3,just my luck
3,4,superman returns
4,5,"you, me and dupree"
5,6,the night listener


# Recommend Movie for user 7

# Find Movie (with ratings) user 7 hasn't seen

In [210]:
df_2.loc[df_2[7].isnull()]

Unnamed: 0,movieId,1,2,3,4,5,6,7
0,1,2.5,3.0,2.5,,3.0,3.0,
2,3,3.0,1.5,,3.0,2.0,,
5,6,3.0,3.0,4.0,4.5,3.0,3.0,


In [219]:
df_2.loc[df_2[7].isnull()].drop(['movieId',7], axis=1).fillna(0)

Unnamed: 0,1,2,3,4,5,6
0,2.5,3.0,2.5,0.0,3.0,3.0
2,3.0,1.5,0.0,3.0,2.0,0.0
5,3.0,3.0,4.0,4.5,3.0,3.0


In [220]:
df_2.loc[df_2[7].isnull()].drop(['movieId',7], axis=1).fillna(0).values

array([[2.5, 3. , 2.5, 0. , 3. , 3. ],
       [3. , 1.5, 0. , 3. , 2. , 0. ],
       [3. , 3. , 4. , 4.5, 3. , 3. ]])

In [221]:
A = df_2.loc[df_2[7].isnull()].drop(['movieId',7], axis=1).fillna(0).values

# Calculate user 7's similarity with other users

In [181]:
sim_l = []
for i in df_2.columns.values.tolist()[1:]:
    if i != 7:
        curr_sim = pearson(df_2, 7, i)
        sim_l += [curr_sim]

In [182]:
sim_l

[0.9912407071619304,
 0.3812464258315117,
 -0.9999999999999999,
 0.8934051474415642,
 0.924473451641905,
 0.6628489803598702]

In [189]:
sim_l_array = np.array(sim_l)

In [193]:
sim_l_array.reshape((6,1))

array([[ 0.99124071],
       [ 0.38124643],
       [-1.        ],
       [ 0.89340515],
       [ 0.92447345],
       [ 0.66284898]])

In [196]:
B = sim_l_array.reshape((6,1))

# Use Dot Product to get weighted ratings for unwatched movies

In [233]:
weighted_score = np.dot(A,B)
weighted_score

array([[5.88380834],
       [8.07475411],
       [8.89975186]])

In [230]:
unseen_id = df_2.loc[df_2[7].isnull()]['movieId'].tolist()
unseen_id

[1, 3, 6]

In [232]:
np.dot(A,B)[1][0]

8.074754105841562

In [245]:
result_rank = []
for i in range(len(unseen_id)):
    result_rank += [(weighted_score[i][0], unseen_id[i])]

In [246]:
result_rank.sort(reverse = True)

In [247]:
result_rank

[(8.899751858472692, 6), (8.074754105841562, 3), (5.883808341404687, 1)]

In [256]:
movie.loc[movie['movieId'] == 1]]

SyntaxError: invalid syntax (<ipython-input-256-91ddf7b9cecc>, line 1)

In [257]:
movie

Unnamed: 0,moiveId,name
0,1,lady in the water
1,2,snakes on a plane
2,3,just my luck
3,4,superman returns
4,5,"you, me and dupree"
5,6,the night listener
