# Item to item collaborative filtering (Sklearn)
### By Lawrence Wang

### Import libraries

In [390]:
# Data Processing Lib
import numpy as np
import pandas as pd
import scipy.stats

# Data Visualization Lib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Sklearn
from sklearn.metrics.pairwise import cosine_similarity

# Make our coding aesthetic
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

### Read the data

In [391]:
df = pd.read_csv(
    "https://raw.githubusercontent.com/ycwang15/Rec_Sys_assignments/Data/Rec_Sys_df.csv"
)

<IPython.core.display.Javascript object>

In [392]:
df.head()

Unnamed: 0,Critics,Mac,Estee Lauder,Bobbi brown,SUQQU,YSL,Chanel,Dior,LANC√îME
0,Allen,,3.0,,3.0,4.0,,,4.0
1,Sohee,3.0,4.0,5.0,5.0,4.0,4.0,2.0,5.0
2,Didi,1.0,2.0,,,5.0,5.0,,5.0
3,Shao,3.0,,3.0,,,,5.0,
4,Gloria,3.0,3.0,4.0,4.0,4.0,5.0,3.0,


<IPython.core.display.Javascript object>

### Tidy the data

In [393]:
formatted_df = pd.melt(df, ["Critics"], var_name="brand", value_name="rating")
formatted_df = formatted_df.sort_values(by=["Critics"])

<IPython.core.display.Javascript object>

In [394]:
formatted_df.head(15)

Unnamed: 0,Critics,brand,rating
0,Allen,Mac,
40,Allen,YSL,4.0
30,Allen,SUQQU,3.0
70,Allen,LANC√îME,4.0
60,Allen,Dior,
20,Allen,Bobbi brown,
10,Allen,Estee Lauder,3.0
50,Allen,Chanel,
32,Didi,SUQQU,
2,Didi,Mac,1.0


<IPython.core.display.Javascript object>

In [395]:
matrix = formatted_df.pivot_table(index="brand", columns="Critics", values="rating")
matrix

Critics,Allen,Didi,Gloria,Grace,Keer,Molly,Pan,Shao,Sohee,Sophie
brand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Bobbi brown,,,4.0,,4.0,3.0,4.0,3.0,5.0,
Chanel,,5.0,5.0,,5.0,,4.0,,4.0,5.0
Dior,,,3.0,,,,2.0,5.0,2.0,
Estee Lauder,3.0,2.0,3.0,,3.0,4.0,3.0,,4.0,1.0
LANC√îME,4.0,5.0,,3.0,,2.0,5.0,,5.0,
Mac,,1.0,3.0,,1.0,5.0,,3.0,3.0,2.0
SUQQU,3.0,,4.0,,2.0,2.0,4.0,,5.0,
YSL,4.0,5.0,4.0,,5.0,,,,4.0,5.0


<IPython.core.display.Javascript object>

### Get the value (actual - average) for each brand

In [396]:
df_norm = matrix.subtract(matrix.mean(axis=1), axis=0)
df_norm

Critics,Allen,Didi,Gloria,Grace,Keer,Molly,Pan,Shao,Sohee,Sophie
brand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Bobbi brown,,,0.166667,,0.166667,-0.833333,0.166667,-0.833333,1.166667,
Chanel,,0.333333,0.333333,,0.333333,,-0.666667,,-0.666667,0.333333
Dior,,,0.0,,,,-1.0,2.0,-1.0,
Estee Lauder,0.125,-0.875,0.125,,0.125,1.125,0.125,,1.125,-1.875
LANC√îME,0.0,1.0,,-1.0,,-2.0,1.0,,1.0,
Mac,,-1.571429,0.428571,,-1.571429,2.428571,,0.428571,0.428571,-0.571429
SUQQU,-0.333333,,0.666667,,-1.333333,-1.333333,0.666667,,1.666667,
YSL,-0.5,0.5,-0.5,,0.5,,,,-0.5,0.5


<IPython.core.display.Javascript object>

### Leveraging 𝑆𝑖𝑚𝑖𝑙𝑎𝑟𝑖𝑡𝑦(𝐴,𝐵)=cos(𝜃)=𝐴⋅𝐵 / ||𝐴||×||𝐵|| to calculate the similarity for each brand.

In [397]:
cosine_similarity_df = cosine_similarity(df_norm.fillna(0))
cosine_similarity_df

array([[ 1.        , -0.40016337, -0.72760688,  0.09912721,  0.63012604,
        -0.35955325,  0.67033306, -0.28295823],
       [-0.40016337,  1.        ,  0.47140452, -0.55048188, -0.30618622,
        -0.34942295, -0.56853524,  0.47140452],
       [-0.72760688,  0.47140452,  1.        , -0.19462474, -0.28867513,
         0.05111986, -0.35176324,  0.16666667],
       [ 0.09912721, -0.55048188, -0.19462474,  1.        , -0.25282495,
         0.61485994,  0.04694525, -0.62279916],
       [ 0.63012604, -0.30618622, -0.28867513, -0.25282495,  1.        ,
        -0.6197954 ,  0.65279121,  0.        ],
       [-0.35955325, -0.34942295,  0.05111986,  0.61485994, -0.6197954 ,
         1.        , -0.01541322, -0.54527854],
       [ 0.67033306, -0.56853524, -0.35176324,  0.04694525,  0.65279121,
        -0.01541322,  1.        , -0.50251891],
       [-0.28295823,  0.47140452,  0.16666667, -0.62279916,  0.        ,
        -0.54527854, -0.50251891,  1.        ]])

<IPython.core.display.Javascript object>

### Convert array into dataframe, allowing us to look it clearly

In [398]:
df_array = pd.DataFrame(cosine_similarity_df)
df_array

Unnamed: 0,0,1,2,3,4,5,6,7
0,1.0,-0.400163,-0.727607,0.099127,0.630126,-0.359553,0.670333,-0.282958
1,-0.400163,1.0,0.471405,-0.550482,-0.306186,-0.349423,-0.568535,0.471405
2,-0.727607,0.471405,1.0,-0.194625,-0.288675,0.05112,-0.351763,0.166667
3,0.099127,-0.550482,-0.194625,1.0,-0.252825,0.61486,0.046945,-0.622799
4,0.630126,-0.306186,-0.288675,-0.252825,1.0,-0.619795,0.652791,0.0
5,-0.359553,-0.349423,0.05112,0.61486,-0.619795,1.0,-0.015413,-0.545279
6,0.670333,-0.568535,-0.351763,0.046945,0.652791,-0.015413,1.0,-0.502519
7,-0.282958,0.471405,0.166667,-0.622799,0.0,-0.545279,-0.502519,1.0


<IPython.core.display.Javascript object>

### Reset the index and columns name

In [399]:
reset_name = df_norm.index.to_list()

<IPython.core.display.Javascript object>

In [400]:
df_array.set_axis(reset_name, axis=1, inplace=True)

<IPython.core.display.Javascript object>

In [401]:
df_array.index = reset_name

<IPython.core.display.Javascript object>

In [402]:
df_array

Unnamed: 0,Bobbi brown,Chanel,Dior,Estee Lauder,LANC√îME,Mac,SUQQU,YSL
Bobbi brown,1.0,-0.400163,-0.727607,0.099127,0.630126,-0.359553,0.670333,-0.282958
Chanel,-0.400163,1.0,0.471405,-0.550482,-0.306186,-0.349423,-0.568535,0.471405
Dior,-0.727607,0.471405,1.0,-0.194625,-0.288675,0.05112,-0.351763,0.166667
Estee Lauder,0.099127,-0.550482,-0.194625,1.0,-0.252825,0.61486,0.046945,-0.622799
LANC√îME,0.630126,-0.306186,-0.288675,-0.252825,1.0,-0.619795,0.652791,0.0
Mac,-0.359553,-0.349423,0.05112,0.61486,-0.619795,1.0,-0.015413,-0.545279
SUQQU,0.670333,-0.568535,-0.351763,0.046945,0.652791,-0.015413,1.0,-0.502519
YSL,-0.282958,0.471405,0.166667,-0.622799,0.0,-0.545279,-0.502519,1.0


<IPython.core.display.Javascript object>

---

---

### Find the brands that users didn't rating before

In [403]:
not_rating_all_users = []
for user in matrix.columns:
    not_rating_before = matrix[matrix[user].isnull()].index.to_list()
    not_rating_all_users.append(not_rating_before)

<IPython.core.display.Javascript object>

In [404]:
not_rating_all_users

[['Bobbi brown', 'Chanel', 'Dior', 'Mac'],
 ['Bobbi brown', 'Dior', 'SUQQU'],
 ['LANC√îME'],
 ['Bobbi brown', 'Chanel', 'Dior', 'Estee Lauder', 'Mac', 'SUQQU', 'YSL'],
 ['Dior', 'LANC√îME'],
 ['Chanel', 'Dior', 'YSL'],
 ['Mac', 'YSL'],
 ['Chanel', 'Estee Lauder', 'LANC√îME', 'SUQQU', 'YSL'],
 [],
 ['Bobbi brown', 'Dior', 'LANC√îME', 'SUQQU']]

<IPython.core.display.Javascript object>

#### Who do we want to recommend?

In [405]:
target_user = input("Target user is:")

Target user is:Allen


<IPython.core.display.Javascript object>

#### Which brand(s) she never used before?

In [406]:
never_used = not_rating_all_users[matrix.columns.get_loc(target_user)]
never_used

['Bobbi brown', 'Chanel', 'Dior', 'Mac']

<IPython.core.display.Javascript object>

### Get the top3 similarity brands' name and similarity value

In [407]:
similarity_list = []
for brand in df_array.columns:
    sm = df_array[brand].sort_values(ascending=False)[1:].head(3)
    similarity_list.append(sm)

<IPython.core.display.Javascript object>

In [408]:
top3_for_never_used = []
for brand in never_used:
    top3 = similarity_list[df_array.columns.get_loc(brand)]
    top3_for_never_used.append(top3)

<IPython.core.display.Javascript object>

In [409]:
top3_for_never_used

[SUQQU           0.670333
 LANC√îME        0.630126
 Estee Lauder    0.099127
 Name: Bobbi brown, dtype: float64,
 Dior        0.471405
 YSL         0.471405
 LANC√îME   -0.306186
 Name: Chanel, dtype: float64,
 Chanel    0.471405
 YSL       0.166667
 Mac       0.051120
 Name: Dior, dtype: float64,
 Estee Lauder    0.614860
 Dior            0.051120
 SUQQU          -0.015413
 Name: Mac, dtype: float64]

<IPython.core.display.Javascript object>

### Convert above list into dataframe, to make the next step easier 

In [410]:
df_brand_sm = pd.DataFrame(top3_for_never_used).T
df_brand_sm

Unnamed: 0,Bobbi brown,Chanel,Dior,Mac
SUQQU,0.670333,,,-0.015413
LANC√îME,0.630126,-0.306186,,
Estee Lauder,0.099127,,,0.61486
Dior,,0.471405,,0.05112
YSL,,0.471405,0.166667,
Chanel,,,0.471405,
Mac,,,0.05112,


<IPython.core.display.Javascript object>

### Check the target user rating history again, and fill out null value as 0, to allow us to calculate in the future.

In [411]:
df_target_user = matrix[target_user].to_frame().fillna(0)
df_target_user

Unnamed: 0_level_0,Allen
brand,Unnamed: 1_level_1
Bobbi brown,0.0
Chanel,0.0
Dior,0.0
Estee Lauder,3.0
LANC√îME,4.0
Mac,0.0
SUQQU,3.0
YSL,4.0


<IPython.core.display.Javascript object>

### Combine the above dataframes, fill null value as 0 to allow us to calculate.

In [412]:
df_final = df_target_user.join(df_brand_sm).fillna(0)
df_final

Unnamed: 0_level_0,Allen,Bobbi brown,Chanel,Dior,Mac
brand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bobbi brown,0.0,0.0,0.0,0.0,0.0
Chanel,0.0,0.0,0.0,0.471405,0.0
Dior,0.0,0.0,0.471405,0.0,0.05112
Estee Lauder,3.0,0.099127,0.0,0.0,0.61486
LANC√îME,4.0,0.630126,-0.306186,0.0,0.0
Mac,0.0,0.0,0.0,0.05112,0.0
SUQQU,3.0,0.670333,0.0,0.0,-0.015413
YSL,4.0,0.0,0.471405,0.166667,0.0


<IPython.core.display.Javascript object>

### Calculate the rating for each brand that the target user never used before with loop.

In [413]:
final_rating = []
for i in never_used:
    sum_rating = sum(df_final[i] * df_final[target_user])
    final_rating.append(sum_rating)

<IPython.core.display.Javascript object>

### Return the maximum rating and according to the index find the corresponding brand name.

In [414]:
brand_recommend = never_used[final_rating.index(max(final_rating))]

<IPython.core.display.Javascript object>

### Get the result, return the rating and the brand that we should recommend to the target user.

In [415]:
print(
    "The brand we should recommend to the",
    target_user,
    "is",
    brand_recommend,
    "the rating is",
    round(max(final_rating), 2),
)

The brand we should recommend to the Allen is Bobbi brown the rating is 4.83


<IPython.core.display.Javascript object>

### Which part can be improved?

**In future research, I personally think that the threshold with the lowest rating should be added. If the highest rating score is still very low, then the recommendation will become meaningless. For this small data set, if the threshold with the lowest value is set, then It may appear that many people cannot get recommended brands. But for the larger data, it makes sense to set the minimum rating score threshold.**

---

---

## The link to the lightFM
https://github.com/ycwang15/Rec_Sys_assignments/blob/main/Y_Wang_M3_assn_lightFM.ipynb