In [112]:
import pandas as pd
import numpy as np
from scipy import stats

#### Read Excel sheets

In [88]:
df1 = pd.read_excel (r'./UUCF-Assignment-Spreadsheet.xls',sheet_name='movie-row')
df2 = pd.read_excel (r'./UUCF-Assignment-Spreadsheet.xls',sheet_name='user-row')
df3 = pd.read_excel (r'./UUCF-Assignment-Spreadsheet.xls',sheet_name='correlations')
df1 = df1.replace(np.nan, 0);
df2 = df2.replace(np.nan, 0);
df3 = df3.replace(np.nan, 0);

#### Compute User-User correlations

Formula can be checked in https://en.wikipedia.org/wiki/Collaborative_filtering

In [270]:
UserList = list(df3.columns)[1:]
ItemList = list(df2.columns)[1:]
numUsers = len(UserList)
numItems = len(ItemList)
UserRows = {UserList[i]: i for i in range(numUsers)}
ItemRows = {ItemList[i]: i for i in range(numItems)}

In [255]:
for user1 in UserList:
    for user2 in UserList:
        array1, array2 = [], []
        for i in range(len(ItemList)):
            if df1[user1].array[i] != 0 and df1[user2].array[i] != 0:
                array1.append(df1[user1].array[i])
                array2.append(df1[user2].array[i])
        array1 = array1-np.mean(array1)
        array2 = array2-np.mean(array2)
        df3[user1][UserRows[user2]] = np.dot(array1, array2)/np.linalg.norm(array1)/np.linalg.norm(array2)

# some checkings
print(df3[1648][UserRows[1648]])
print(df3[1648][UserRows[5136]])
print(df3[918][UserRows[2824]])

# print results
print('5 closest neighbors to User 3712: ', [UserList[i] for i in np.argsort(-df3[3712].array)[1:6]])
print('5 closest neighbors to User 3867: ', [UserList[i] for i in np.argsort(-df3[3867].array)[1:6]])
print('5 closest neighbors to User 89: ', [UserList[i] for i in np.argsort(-df3[89].array)[1:6]])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3[user1][UserRows[user2]] = np.dot(array1, array2)/np.linalg.norm(array1)/np.linalg.norm(array2)


1.0
0.40298018845699635
-0.31706324373711403
5 closest neighbors to User 3712:  [2824, 3867, 5062, 442, 3853]
5 closest neighbors to User 3867:  [2492, 3853, 2486, 3712, 2288]
5 closest neighbors to User 89:  [4809, 5136, 860, 5062, 3525]


#### Predicted scores
Predict scores by taking the correlation-weighted average of the ratings of the top-five neighbors (for each target user) for each movie.

$$\frac{\sum_{n=1}^5 r_n w_n}{\sum_{n=1}^5 w_n}$$

The weight for each contributed rating is the user-user correlation when that neighbor has rated the movie, but 0 when the neighbor has not rated the movie).

In [337]:
user = 89
neighbor_rates, neighbor_weights = [], []
for neighbor in [UserList[i] for i in np.argsort(-df3[user].array)[1:6]]:
    neighbor_rates.append(df1[neighbor])
    neighbor_weights.append(df3[user].array[UserRows[neighbor]])
user_rates = []
for item in range(numItems):
    temp_rate, temp_weight = 0, 0
    for neighbor in range(5):
        if neighbor_rates[neighbor][item] != 0:
            temp_weight += neighbor_weights[neighbor]
            temp_rate += neighbor_rates[neighbor][item]*neighbor_weights[neighbor]
    if temp_weight != 0:
        user_rates.append(temp_rate/temp_weight)
    else: 
        user_rates.append(0)

# print k results
k = 12;
count = 1
for topitem in np.argsort(-np.array(user_rates))[:k]:
    print('Top ', count, ' item is ', ItemList[topitem], '\n'+'score: ', round(user_rates[topitem],3))
    count += 1

Top  1  item is  238: The Godfather (1972) 
score:  4.894
Top  2  item is  278: The Shawshank Redemption (1994) 
score:  4.882
Top  3  item is  807: Seven (a.k.a. Se7en) (1995) 
score:  4.774
Top  4  item is  275: Fargo (1996) 
score:  4.771
Top  5  item is  424: Schindler's List (1993) 
score:  4.729
Top  6  item is  122: The Lord of the Rings: The Return of the King (2003) 
score:  4.696
Top  7  item is  13: Forrest Gump (1994) 
score:  4.601
Top  8  item is  38: Eternal Sunshine of the Spotless Mind (2004) 
score:  4.551
Top  9  item is  453: A Beautiful Mind (2001) 
score:  4.543
Top  10  item is  120: The Lord of the Rings: The Fellowship of the Ring (2001) 
score:  4.528
Top  11  item is  121: The Lord of the Rings: The Two Towers (2002) 
score:  4.528
Top  12  item is  105: Back to the Future (1985) 
score:  4.521


#### Predicted scores with normalization
Formula is
$$\bar{r}_u + \frac{\sum_{n=1}^5 (r_n - \bar{r}_n) w_n}{\sum_{n=1}^5 w_n}$$


In [341]:
user = 89
neighbors = np.argsort(-df3[user].array)[1:6]
neighbor_rates, neighbor_weights = [], []
for neighbor in [UserList[i] for i in neighbors]:
    neighbor_rates.append(df1[neighbor])
    neighbor_weights.append(df3[user].array[UserRows[neighbor]])
user_rates = []
rubar = np.sum(df1[user].array)/np.count_nonzero(df1[user].array)
rnbars = [np.sum(df1[UserList[nei]].array)/np.count_nonzero(df1[UserList[nei]].array) for nei in neighbors]
for item in range(numItems):
    temp_rate, temp_weight = 0, 0
    for neighbor in range(5):
        if neighbor_rates[neighbor][item] != 0:
            temp_weight += neighbor_weights[neighbor]
            temp_rate += (neighbor_rates[neighbor][item]-rnbars[neighbor])*neighbor_weights[neighbor]
    if temp_weight != 0:
        user_rates.append(temp_rate/temp_weight + rubar)
    else: 
        user_rates.append(0)

# print k results
k = 3;
count = 1
for topitem in np.argsort(-np.array(user_rates))[:k]:
    print('Top ', count, ' item is ', ItemList[topitem], '\n'+'score: ', round(user_rates[topitem],3))
    count += 1

Top  1  item is  238: The Godfather (1972) 
score:  5.322
Top  2  item is  278: The Shawshank Redemption (1994) 
score:  5.261
Top  3  item is  275: Fargo (1996) 
score:  5.241


In [342]:
user = 3525
neighbors = np.argsort(-df3[user].array)[1:6]
neighbor_rates, neighbor_weights = [], []
for neighbor in [UserList[i] for i in neighbors]:
    neighbor_rates.append(df1[neighbor])
    neighbor_weights.append(df3[user].array[UserRows[neighbor]])
user_rates = []
rubar = np.sum(df1[user].array)/np.count_nonzero(df1[user].array)
rnbars = [np.sum(df1[UserList[nei]].array)/np.count_nonzero(df1[UserList[nei]].array) for nei in neighbors]
for item in range(numItems):
    temp_rate, temp_weight = 0, 0
    for neighbor in range(5):
        if neighbor_rates[neighbor][item] != 0:
            temp_weight += neighbor_weights[neighbor]
            temp_rate += (neighbor_rates[neighbor][item]-rnbars[neighbor])*neighbor_weights[neighbor]
    if temp_weight != 0:
        user_rates.append(temp_rate/temp_weight + rubar)
    else: 
        user_rates.append(0)

# print k results
k = 3;
count = 1
for topitem in np.argsort(-np.array(user_rates))[:k]:
    print('Top ', count, ' item is ', ItemList[topitem], '\n'+'score: ', round(user_rates[topitem],3))
    count += 1

Top  1  item is  238: The Godfather (1972) 
score:  4.76
Top  2  item is  424: Schindler's List (1993) 
score:  4.663
Top  3  item is  134: O Brother Where Art Thou? (2000) 
score:  4.585
