# Collaborative Filtering


Collaborative Filtering: Item-Item Recommendation System

In [64]:
import numpy as np
import pandas as pd
from numpy import dot
from numpy.linalg import norm
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

In [447]:
# Rows are users and columns are items; 0 marks a rating that has not been given.
# The task is to predict the fifth user's rating of the sixth item
# (row index 4, column index 5), so that cell is stored as 0 like every other
# unknown rating instead of being left out of the row.

matrix = [[1,0,0,2,0,1],
          [0,0,2,4,0,0],
          [3,4,4,0,5,3],
          [0,3,0,1,4,0],
          [3,4,5,2,0,0],
          [0,2,0,0,0,5],
          [0,0,0,3,4,0],
          [2,0,4,0,0,0],
          [0,0,0,4,0,5],
          [0,0,0,3,2,0],
          [4,2,2,5,1,4],
          [0,5,0,0,3,0]]

In [160]:
# Build the user-by-item frame and turn the 0 placeholders into NaN so that
# unrated cells are treated as missing rather than as a rating of zero.
matrix_df = pd.DataFrame(matrix).replace(0, np.nan)

In [161]:
matrix_df # Original dataframe 

Unnamed: 0,0,1,2,3,4,5
0,1.0,,,2.0,,1.0
1,,,2.0,4.0,,
2,3.0,4.0,4.0,,5.0,3.0
3,,3.0,,1.0,4.0,
4,3.0,4.0,5.0,2.0,,
5,,2.0,,,,5.0
6,,,,3.0,4.0,
7,2.0,,4.0,,,
8,,,,4.0,,5.0
9,,,,3.0,2.0,


In [227]:
# Mean-centre every item (column) by subtracting that item's average rating.
# Subtracting the Series of column means from the frame aligns on the column
# labels, so this handles any number of items instead of a hard-coded six.
df = matrix_df - matrix_df.mean()

# Unrated cells (NaN) become 0 so they add nothing to the dot products used
# for the cosine similarity.
df = df.fillna(0)

In [228]:
df

Unnamed: 0,0,1,2,3,4,5
0,-1.6,0.0,0.0,-1.0,0.0,-2.6
1,0.0,0.0,-1.4,1.0,0.0,0.0
2,0.4,0.666667,0.6,0.0,1.833333,-0.6
3,0.0,-0.333333,0.0,-2.0,0.833333,0.0
4,0.4,0.666667,1.6,-1.0,0.0,0.0
5,0.0,-1.333333,0.0,0.0,0.0,1.4
6,0.0,0.0,0.0,0.0,0.833333,0.0
7,-0.6,0.0,0.6,0.0,0.0,0.0
8,0.0,0.0,0.0,1.0,0.0,1.4
9,0.0,0.0,0.0,0.0,-1.166667,0.0


In [200]:
# Cosine similarity between two rating vectors

def sim(i,j):
    """
    Return the cosine similarity of vectors ``i`` and ``j``, rounded to 2 decimals.

    Parameters
    ----------
    i, j : sequences of numbers with the same length.

    Returns
    -------
    ``dot(i, j) / (norm(i) * norm(j))`` rounded to two decimal places, or
    ``0.0`` when either vector has zero length (norm 0), where the cosine is
    undefined and the plain formula would divide by zero and yield NaN.
    """
    denominator = norm(i) * norm(j)
    if denominator == 0:
        # An all-zero vector has no direction; treat it as "not similar".
        return 0.0
    return round(dot(i, j) / denominator, 2)

In [235]:
# Collect the cosine similarity between one item and every item
def sim_list(item):
    """
    Return the similarities between column ``item`` of ``df`` and each column.

    Parameters
    ----------
    item : positional column index into the mean-centred frame ``df``.

    Returns
    -------
    A list with one entry per column of ``df``; entry ``i`` is
    ``sim(column i, column item)``.
    """
    # The target column is the same for every comparison, so read it once.
    target = list(df.iloc[:, item])
    # Use the frame's own width rather than a fixed item count.
    return [sim(list(df.iloc[:, i]), target) for i in range(df.shape[1])]

In [244]:
# Item-item similarity table: key = item index, value = the list returned by
# sim_list for that item (its similarity to each of the six items).
cosine_similarity_map = defaultdict(
    list,
    {item: sim_list(item) for item in range(6)},
)

In [245]:
cosine_similarity_map # item-item cosine similarity

defaultdict(list,
            {0: [1.0, -0.22, -0.24, 0.51, -0.31, 0.59],
             1: [-0.22, 1.0, 0.46, -0.28, 0.4, -0.31],
             2: [-0.24, 0.46, 1.0, -0.62, 0.47, -0.1],
             3: [0.51, -0.28, -0.62, 1.0, -0.53, 0.41],
             4: [-0.31, 0.4, 0.47, -0.53, 1.0, -0.18],
             5: [0.59, -0.31, -0.1, 0.41, -0.18, 1.0]})

In [248]:
def distance(item,i):
    """
    Return how far item ``i`` is from ``item`` in similarity terms.

    The value is ``item``'s similarity to itself minus its similarity to
    ``i``, both read from ``cosine_similarity_map[item]``.
    """
    similarities = cosine_similarity_map[item]
    self_similarity = similarities[item]
    return self_similarity - similarities[i]

In [263]:
# Item-item distance table: key = item index, value = list of distances from
# that item to each of the six items.
distance_map = defaultdict(
    list,
    {
        item: [distance(item, other) for other in range(6)]
        for item in range(6)
    },
)

In [291]:
distance_map

defaultdict(list,
            {0: [0.0, 1.22, 1.24, 0.49, 1.31, 0.41000000000000003],
             1: [1.22, 0.0, 0.54, 1.28, 0.6, 1.31],
             2: [1.24, 0.54, 0.0, 1.62, 0.53, 1.1],
             3: [0.49, 1.28, 1.62, 0.0, 1.53, 0.5900000000000001],
             4: [1.31, 0.6, 0.53, 1.53, 0.0, 1.18],
             5: [0.41000000000000003,
              1.31,
              1.1,
              0.5900000000000001,
              1.18,
              0.0]})

In [286]:
# Collect (item, neighbour) pairs whose distance lies strictly between 0 and 1:
# distance > 0 leaves out pairs at distance 0 (which includes an item paired
# with itself), distance < 1 keeps only the closer items.
index_list = []

for i in range(6):
    for j in range(6):
        # Chained comparison instead of `d<1 & (d>0)`: `&` binds tighter than
        # `<`, so that expression was evaluated as `d < (1 & (d>0))` and only
        # gave the intended answer by accident (it accepts negative values).
        if 0 < distance_map[i][j] < 1:
            index_list.append((i,j))

In [290]:
index_list       

[(0, 3),
 (0, 5),
 (1, 2),
 (1, 4),
 (2, 1),
 (2, 4),
 (3, 0),
 (3, 5),
 (4, 1),
 (4, 2),
 (5, 0),
 (5, 3)]

In [294]:
# Group the pairs from index_list by their first element:
# item index -> list of neighbour item indices, in the order they were found.
index_dict = defaultdict(list)

for item, neighbour in index_list:
    neighbours_of_item = index_dict[item]
    neighbours_of_item.append(neighbour)

In [298]:
index_dict

defaultdict(list,
            {0: [3, 5], 1: [2, 4], 2: [1, 4], 3: [0, 5], 4: [1, 2], 5: [0, 3]})

In [331]:
# Un-centred ratings with unrated cells stored as 0 (used as the rating values
# in the weighted average below).
matrix_impute = matrix_df.replace(np.nan,0)

def cal_for_rec(user, item):
    """
    Predict one user's rating of one item from item-item cosine similarity.

    The prediction is the similarity-weighted average of the user's ratings
    of the item's neighbours (``index_dict[item]``):
    ``sum(sim * rating) / sum(sim)``.

    Parameters
    ----------
    user : positional row index into ``matrix_impute``.
    item : positional column index of the item to predict.

    Returns
    -------
    The prediction rounded to one decimal place, or ``np.nan`` when the item
    has no neighbours or the neighbour similarities sum to zero (no weighted
    average can be formed).

    Notes
    -----
    NOTE(review): ratings come from ``matrix_impute``, so a neighbour the user
    has not rated enters the numerator as 0 while its similarity still counts
    in the denominator. This pulls predictions toward 0, and a result of
    exactly 0 means none of the neighbours was rated by this user.
    """
    neighbours = index_dict[item]
    if not neighbours:
        # Nothing to average over (previously an IndexError).
        return np.nan

    cos_list = sim_list(item)
    similarities = [cos_list[neighbour] for neighbour in neighbours]
    total_similarity = sum(similarities)
    if total_similarity == 0:
        # Avoid a 0-division that would surface as NaN/inf plus a warning.
        return np.nan

    # Ratings are read from the pre-normalized (un-centred) matrix.
    ratings = [matrix_impute.iloc[user, neighbour] for neighbour in neighbours]
    weighted_sum = sum(s * r for s, r in zip(similarities, ratings))
    return round(weighted_sum / total_similarity, 1)

In [302]:
cal_for_rec(4, 5) # calculating user 4 to item 5

2.6

## Calculate/predict all unknown ratings

In [337]:
# Work on a copy so the original ratings in matrix_df stay untouched
dff=matrix_df.copy()

In [409]:
# Fill every cell that has no rating yet with its prediction; known ratings
# are left as they are. The mask is taken before any cell is written.
unrated = dff.isnull()
for i in range(dff.shape[0]):
    for j in range(dff.shape[1]):
        if unrated.iloc[i, j]:
            dff.iloc[i,j]=cal_for_rec(i,j)

In [410]:
dff

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.0,0.0,2.0,0.0,1.0
1,1.9,1.1,2.0,4.0,1.1,1.6
2,3.0,4.0,4.0,3.0,5.0,3.0
3,0.5,3.0,3.5,1.0,4.0,0.4
4,3.0,4.0,5.0,2.0,4.5,2.6
5,2.7,2.0,1.0,2.2,0.9,5.0
6,1.4,1.9,2.0,3.0,4.0,1.2
7,2.0,2.1,4.0,1.1,2.2,1.2
8,4.5,0.0,0.0,4.0,0.0,5.0
9,1.4,0.9,1.0,3.0,2.0,1.2


In [412]:
# Separate copy so the filled-in frame dff keeps its 0 entries
dff_nan=dff.copy()

In [413]:
# A prediction of exactly 0 means none of the item's neighbours had a rating
# from that user (unrated cells are stored as 0 in matrix_impute), so turn
# those values back into NaN.
dff_nan=dff_nan.replace(0, np.nan)
dff_nan

Unnamed: 0,0,1,2,3,4,5
0,1.0,,,2.0,,1.0
1,1.9,1.1,2.0,4.0,1.1,1.6
2,3.0,4.0,4.0,3.0,5.0,3.0
3,0.5,3.0,3.5,1.0,4.0,0.4
4,3.0,4.0,5.0,2.0,4.5,2.6
5,2.7,2.0,1.0,2.2,0.9,5.0
6,1.4,1.9,2.0,3.0,4.0,1.2
7,2.0,2.1,4.0,1.1,2.2,1.2
8,4.5,,,4.0,,5.0
9,1.4,0.9,1.0,3.0,2.0,1.2


In [416]:
sum(matrix_df.isnull().sum()) # null values before

37

In [418]:
sum(dff_nan.isnull().sum()) # null values after

9

In [419]:
(37-9)/37 # Reduced 76% of null values

0.7567567567567568

In [423]:
((72-9)-(72-37))/72 # Added prediction takes up 39% of total value counts

0.3888888888888889

In [445]:
# Relabel with 1-based names: rows become user_1, user_2, ... and columns
# become item_1, item_2, ...
result = dff_nan.rename(
    index=lambda row: "user_" + str(row + 1),
    columns=lambda col: "item_" + str(col + 1),
)

In [446]:
result # Now we can predict 10 users' rating on all items

Unnamed: 0,item_1,item_2,item_3,item_4,item_5,item_6
user_1,1.0,,,2.0,,1.0
user_2,1.9,1.1,2.0,4.0,1.1,1.6
user_3,3.0,4.0,4.0,3.0,5.0,3.0
user_4,0.5,3.0,3.5,1.0,4.0,0.4
user_5,3.0,4.0,5.0,2.0,4.5,2.6
user_6,2.7,2.0,1.0,2.2,0.9,5.0
user_7,1.4,1.9,2.0,3.0,4.0,1.2
user_8,2.0,2.1,4.0,1.1,2.2,1.2
user_9,4.5,,,4.0,,5.0
user_10,1.4,0.9,1.0,3.0,2.0,1.2
