# Hybrid Recommender System

# Load Dataset


In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [77]:
item_name = "Bakewell"
random_user = "Adam"

In [3]:
df = pd.read_csv("data.csv")

In [4]:
df.head(5)

Unnamed: 0,TransactionNo,Items,DateTime,Daypart,DayType,User,Rating,overview
0,1,Bread,2016-10-30 09:58:11,Morning,Weekend,Gloria,3,"A staple food made from flour, water, and yeas..."
1,2,Scandinavian,2016-10-30 10:05:34,Morning,Weekend,Louise,5,"Reflecting the flavors of Northern Europe, Sca..."
2,2,Scandinavian,2016-10-30 10:05:34,Morning,Weekend,Daniel,5,"Reflecting the flavors of Northern Europe, Sca..."
3,3,Hot chocolate,2016-10-30 10:07:57,Morning,Weekend,Shannon,4,"Though not a cake in the traditional sense, ho..."
4,3,Jam,2016-10-30 10:07:57,Morning,Weekend,Manuel,3,"A sweet spread made from fruits, commonly used..."


In [5]:
df.shape

(20507, 8)

# Let's Start!

In [6]:
# unique items:
df["Items"].nunique()

94

In [7]:
df["Items"].value_counts().head()

Coffee    5471
Bread     3325
Tea       1435
Cake      1025
Pastry     856
Name: Items, dtype: int64

In [8]:
# create the User Item Df:
user_item_df = df.pivot_table(index=["User"], columns=["Items"], values="Rating")
user_item_df.shape

(503, 94)

In [9]:
user_item_df.head(10)

Items,Adjustment,Afternoon with the baker,Alfajores,Argentina Night,Art Tray,Bacon,Baguette,Bakewell,Bare Popcorn,Basket,...,The BART,The Nomad,Tiffin,Toast,Truffles,Tshirt,Valentine's card,Vegan Feast,Vegan mincepie,Victorian Sponge
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron,,2.0,,,,,,,,,...,,,,2.0,,,,,,
Abigail,,,,,,,5.0,,,,...,,,,,,,,,,
Adam,,0.0,0.0,,,,,,,,...,,,,2.0,1.0,,,,,
Adrian,,,,,,,3.0,,,,...,,,,1.0,,,3.0,,,
Agnes,,,,,,,,,,,...,,,,0.0,,,,,,
Alan,,,4.5,,,,,4.0,,0.0,...,,,,,,,,,,
Albert,,1.0,,,,,3.0,,,,...,,,4.0,,,,,,,
Alfred,,,4.0,,,,,,,,...,,,2.0,,,,2.0,,,
Alice,,,,,,,0.0,,,,...,,,,,,,,,,
Alicia,,,,,,,,,,,...,,,,0.5,2.0,,,,2.5,


In [10]:
# item-based item recommendation example:
item_name = user_item_df[item_name]
user_item_df.corrwith(item_name).sort_values(ascending=False).head(10) # top 10 items most similarity with item_name

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Items
Chimichurri Oil      1.000000
Spread               1.000000
Bare Popcorn         1.000000
Hack the stack       1.000000
Hot chocolate        1.000000
Mighty Protein       0.887764
Basket               0.658872
Postcard             0.585369
Honey                0.562565
Lemon and coconut    0.443607
dtype: float64

In [11]:
item_name

User
Aaron      1.00
Abigail    3.25
Adam       4.00
Adrian      NaN
Agnes      2.50
           ... 
Wilson      NaN
Yolanda    0.50
Yvonne      NaN
Zach        NaN
Zachary    2.00
Name: Hot chocolate, Length: 503, dtype: float64

In [12]:
# Let's determine the movies that the user watched.

# Let's choose random user:
# random_user = int(pd.Series(user_movie_df.index).sample(1, random_state=45).values)


In [13]:
# Let's reduce the #dataset to random user:
random_user_df = user_item_df[user_item_df.index == random_user]
random_user_df

Items,Adjustment,Afternoon with the baker,Alfajores,Argentina Night,Art Tray,Bacon,Baguette,Bakewell,Bare Popcorn,Basket,...,The BART,The Nomad,Tiffin,Toast,Truffles,Tshirt,Valentine's card,Vegan Feast,Vegan mincepie,Victorian Sponge
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Adam,,0.0,0.0,,,,,,,,...,,,,2.0,1.0,,,,,


In [14]:
# Let's choose non-NaN. Movies watched by all 28491:
items_bought = random_user_df.columns[random_user_df.notna().any()].tolist() # items which random user bought
items_bought

['Afternoon with the baker',
 'Alfajores',
 'Bread',
 'Brownie',
 'Cake',
 'Coffee',
 'Coke',
 'Cookies',
 'Hearty & Seasonal',
 'Hot chocolate',
 'Jam',
 'Jammie Dodgers',
 'Juice',
 'Medialuna',
 'Mineral water',
 'Muffin',
 'Pastry',
 'Salad',
 'Sandwich',
 'Soup',
 'Tacos/Fajita',
 'Tea',
 'Toast',
 'Truffles']

In [15]:
#let's verify:
user_item_df.loc[user_item_df.index == random_user, user_item_df.columns == "Juice"] # return ratings of random user for a item

Items,Juice
User,Unnamed: 1_level_1
Adam,3.0


In [16]:
# How many items have random user bought:
len(items_bought)

24

In [17]:
# we have reduced the dataset based on items watched by random user:
items_bought_df = user_item_df[items_bought] # only return items bought
items_bought_df.head()
# items_bought_df.shape

Items,Afternoon with the baker,Alfajores,Bread,Brownie,Cake,Coffee,Coke,Cookies,Hearty & Seasonal,Hot chocolate,...,Mineral water,Muffin,Pastry,Salad,Sandwich,Soup,Tacos/Fajita,Tea,Toast,Truffles
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron,2.0,,2.0,3.0,2.333333,2.25,,0.0,,1.0,...,,1.0,5.0,0.0,4.5,1.0,,1.666667,2.0,
Abigail,,,2.285714,3.5,3.25,3.578947,,5.0,,3.25,...,5.0,4.0,1.5,2.0,,5.0,,1.666667,,
Adam,0.0,0.0,3.285714,4.0,3.5,2.916667,0.0,0.5,2.0,4.0,...,2.0,3.0,0.0,2.0,5.0,4.0,0.0,3.0,2.0,1.0
Adrian,,,3.285714,0.0,1.0,2.0,,,,,...,4.0,,,,3.666667,1.0,,0.0,1.0,
Agnes,,,4.0,,1.5,2.636364,,2.0,,2.5,...,,4.0,0.0,,2.0,,,3.5,0.0,


In [18]:
# information on how many items each user bought in total:
user_item_count = items_bought_df.T.notnull().sum()

user_item_count = user_item_count.reset_index()
user_item_count.columns = ["User","item_count"] # number of items, which random user bought, were bought by each user , max is random user
user_item_count.head()

Unnamed: 0,User,item_count
0,Aaron,15
1,Abigail,13
2,Adam,24
3,Adrian,10
4,Agnes,13


In [19]:
# 3 user bought 10 items:
user_item_count[user_item_count["item_count"] == 10].count()

User          43
item_count    43
dtype: int64

In [20]:
# 60% of items bought by random user:
perc = len(items_bought) * 60 / 100
perc

14.4

In [21]:
# People who have bought more than 60% items together with random user:
users_same_items = user_item_count[user_item_count["item_count"] > perc]["User"] # only calculate with users who bought more than 60% items together with random user 
users_same_items.head()

0      Aaron
2       Adam
6     Albert
9     Alicia
21       Ann
Name: User, dtype: object

In [25]:
# Let's combine the data of random user and similar users:
final_df = pd.concat([items_bought_df[items_bought_df.index.isin(users_same_items)],
                      random_user_df[items_bought]])
# final_df = items_bought_df[items_bought_df.index.isin(users_same_items)]

# final_df.shape
final_df.T.corr() # corr between two user based on items which random user bought 
# final_df

User,Aaron,Adam,Albert,Alicia,Ann,Annie,Arthur,Austin,Barry,Brandon,...,Shari,Shelly,Simon,Spencer,Susan,Theresa,Tommy,Vivian,William,Adam
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron,1.000000,0.076119,0.462425,0.142444,-0.238939,-0.330538,0.349338,-0.228658,0.508999,-0.228301,...,0.039038,0.599828,0.109074,0.042534,-0.157952,-0.176667,0.121871,-0.358289,0.240899,0.076119
Adam,0.076119,1.000000,-0.084226,-0.315293,0.280761,0.074519,-0.017725,0.440950,0.494416,0.442111,...,0.490510,-0.353699,0.037743,-0.446293,-0.108726,0.363922,-0.045312,0.021381,0.388837,1.000000
Albert,0.462425,-0.084226,1.000000,0.692633,-0.490877,-0.318284,0.236616,-0.176322,0.171141,0.077527,...,-0.501821,-0.069378,0.658611,0.181290,0.238446,-0.085097,0.399037,-0.294959,-0.231621,-0.084226
Alicia,0.142444,-0.315293,0.692633,1.000000,-0.227059,-0.557927,-0.006910,0.248624,-0.155664,-0.313249,...,-0.080334,-0.127168,-0.179007,0.150729,-0.212103,-0.316410,0.239822,-0.431651,-0.125598,-0.315293
Ann,-0.238939,0.280761,-0.490877,-0.227059,1.000000,0.152029,-0.524517,0.555554,0.082296,0.234783,...,0.163872,-0.328440,-0.436271,-0.227154,-0.406169,0.252304,-0.466186,0.138426,0.439804,0.280761
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Theresa,-0.176667,0.363922,-0.085097,-0.316410,0.252304,0.585470,-0.418828,0.362673,0.269192,0.265566,...,0.103357,-0.289309,-0.122327,-0.117522,-0.016662,1.000000,-0.248531,0.367295,0.276169,0.363922
Tommy,0.121871,-0.045312,0.399037,0.239822,-0.466186,-0.315614,0.042556,-0.394884,-0.023106,-0.006015,...,-0.623745,0.039669,0.381354,0.480337,0.275807,-0.248531,1.000000,-0.017506,-0.511199,-0.045312
Vivian,-0.358289,0.021381,-0.294959,-0.431651,0.138426,0.204141,0.087972,0.171006,0.278577,0.318351,...,-0.380980,-0.450744,-0.143593,-0.399412,0.137707,0.367295,-0.017506,1.000000,-0.032108,0.021381
William,0.240899,0.388837,-0.231621,-0.125598,0.439804,-0.149204,0.441397,0.250164,0.409945,0.151578,...,-0.029175,-0.239983,-0.309597,-0.088906,-0.171942,0.276169,-0.511199,-0.032108,1.000000,0.388837


In [26]:
#corr for all users:
# caculate corr between each pair users who bought more 60% items together with random user
corr_df = final_df.T.corr().unstack().sort_values().drop_duplicates()
corr_df = pd.DataFrame(corr_df, columns=["corr"])
corr_df.index.names = ['user_1', 'user_2']
corr_df = corr_df.reset_index()
corr_df.head()

Unnamed: 0,user_1,user_2,corr
0,Louis,Henry,-0.884492
1,Connie,Janet,-0.871703
2,Gene,Brandon,-0.862302
3,Aaron,Kay,-0.850448
4,Joan,Shari,-0.834144


In [55]:
# Users with a correlation of %30 or more with random user:
top_users = corr_df[(corr_df["user_1"] == random_user) & (corr_df["corr"] >= 0.3)][
    ["user_2", "corr"]].reset_index(drop=True) # correlation >= 40% with random user 

top_users = top_users.sort_values(by='corr', ascending=False) # corr between User and random user
top_users.rename(columns={"user_2": "User"}, inplace=True)
top_users

Unnamed: 0,User,corr
10,Adam,1.0
9,Gloria,0.533943
8,Barry,0.494416
7,Shari,0.49051
6,Brandon,0.442111
5,Janet,0.414107
4,Leslie,0.404898
3,Leslie,0.404898
2,Theresa,0.363922
1,Cathy,0.301187


In [52]:
# let's see the ratings of users:
# create a dataframe that insert Items, Rating into top_users
top_users_ratings = top_users.merge(df[["User", "Items", "Rating"]], how='inner')

top_users_ratings = top_users_ratings[top_users_ratings["User"] != random_user]
top_users_ratings.head()

Unnamed: 0,User,corr,Items,Rating
49,Gloria,0.533943,Bread,3
50,Gloria,0.533943,Coffee,3
51,Gloria,0.533943,Juice,2
52,Gloria,0.533943,Medialuna,3
53,Gloria,0.533943,Coffee,5


In [56]:
# Calculate the Weighted Average Recommendation Score and keep the first 5 items.

#Let's do a single score with the most similar by corr * rating:
top_users_ratings['weighted_rating'] = top_users_ratings['corr'] * top_users_ratings['Rating'] # a new column weighted_rating = corr * ratings
top_users_ratings.groupby('Items').agg({"weighted_rating": "mean"}) # caculate weighted ratings which random user can rate for items 

recommendation_df = top_users_ratings.groupby('Items').agg({"weighted_rating": "mean"})
recommendation_df = recommendation_df.reset_index()
recommendation_df

Unnamed: 0,Items,weighted_rating
0,Afternoon with the baker,1.188075
1,Alfajores,0.56807
2,Baguette,1.977663
3,Bakewell,2.135773
4,Bread,1.212299
5,Brioche and salami,0.404898
6,Brownie,1.609235
7,Cake,1.274912
8,Chicken Stew,0.981021
9,Coffee,0.986417


In [57]:
# weighted rating greater than 4:
recommendation_df[recommendation_df["weighted_rating"] > 1]

# items which random user will like:
items_to_be_recommend = recommendation_df[recommendation_df["weighted_rating"] > 1].sort_values("weighted_rating", ascending=False)

# items_to_be_recommend.merge(movie[["Items", "title"]])

# #Let's see the top 5 movies:
items_to_be_recommend[:5]

Unnamed: 0,Items,weighted_rating
3,Bakewell,2.135773
2,Baguette,1.977663
33,Spanish Brunch,1.873567
21,Jammie Dodgers,1.759114
6,Brownie,1.609235


In [109]:
# Make an item-based suggestion based on the name of the movie that the user has watched with the highest score.

# ▪ 5 suggestions user-based
# ▪ 5 suggestions item-based
?

# The last highly-rated movie by user 108170:

# user = "Alice"
# item = df[(df["User"] == user) & (df["Rating"] == 5.0)].sort_values(by="DateTime", ascending=False)["Items"][0:1].values[0]
# print(item)

In [58]:
# ▪ 5 suggestions user-based
items_to_be_recommend[:5]['Items'].to_list()

['Bakewell', 'Baguette', 'Spanish Brunch', 'Jammie Dodgers', 'Brownie']

In [113]:
# ▪ 5 suggestions item-based
# item_name = df["Items"].unique()
item_name_col = user_item_df[item_name]
moveis_from_item_based = user_item_df.corrwith(item_name_col).sort_values(ascending=False)
mask = moveis_from_item_based.index != item_name
moveis_from_item_based = moveis_from_item_based[mask]
moveis_from_item_based[0:5].index.to_list()

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


['Duck egg', 'Focaccia', 'Granola', 'Mineral water', 'Keeping It Local']