
#This Baseline Model is implemented from the paper : 
# **Factorization Meets the Neighborhood: a Multifaceted Collaborative Filtering Model** by Yehuda Koren

Sir told us to implement this paper. Based on this he gave us another paper **Scalable Collaborative Filtering with Jointly Derived Neighborhood Interpolation Weights**

# The given below math has been dervied by ourselves from understanding the paper by which we have implemented our code.

---
* **Baseline Equation**
---
\begin{equation}
b_{ui} = \mu + b_u + b_i
\end{equation}
---
---
* **Optimization Problem**
---
\begin{equation}
\min_{b_u, b_i} \sum_{(u,i) \in R_{train}} (r_{ui} - b_{ui})^2 + \lambda (||b_u||^2 + ||b_i||^2)
\end{equation}
---
---
* **Gradient Descent**
---
\begin{equation}
\frac{\partial}{\partial b_u} = -2 \sum_{i \in I_u} (r_{ui} - \mu - b_u - b_i) + 2 \lambda b_u
\end{equation}

\begin{equation}
\frac{\partial}{\partial b_i} = -2 \sum_{u \in U_i} (r_{ui} - \mu - b_u - b_i) + 2 \lambda b_i
\end{equation}

\begin{equation}
b_{u}^{(k+1)} = b_{u}^{(k)} - \gamma \cdot \frac{\partial}{\partial b_u} J(b_u^{(k)}, b_i^{(k)})
\end{equation}

\begin{equation}
b_{i}^{(k+1)} = b_{i}^{(k)} - \gamma \cdot \frac{\partial}{\partial b_i} J(b_u^{(k)}, b_i^{(k)})
\end{equation}

\begin{aligned}
b_{u}^{(k+1)} &= b_{u}^{(k)} + \gamma \cdot \left( \sum_{i \in I_u} (r_{ui} - \mu - b_u^{(k)} - b_i^{(k)}) - \lambda b_u^{(k)} \right) \\
b_{i}^{(k+1)} &= b_{i}^{(k)} + \gamma \cdot \left( \sum_{u \in U_i} (r_{ui} - \mu - b_u^{(k)} - b_i^{(k)}) - \lambda b_i^{(k)} \right)
\end{aligned}

---
* **Update Rule**
---
\begin{equation}
b_{u}^{(k+1)} = b_{u}^{(k)} + \gamma \cdot \left( e_{ui} - \lambda \cdot b_{u}^{(k)} \right)
\end{equation}


# Importing Libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import os
import pickle
from IPython.display import display

# Importing Data and preprocessing

In [3]:
data = pd.read_csv('/content/drive/MyDrive/RS Data/Assignment 2/short-recipes-20.csv')


In [4]:
df = data.copy()

In [5]:
df = data.pivot_table(index='user_id',columns='recipe_id',values='rating')


## Train Test Split

In [6]:
zero_mask = df == 0
train = pd.DataFrame(columns=df.columns)
test = pd.DataFrame(columns=df.columns)
from sklearn.model_selection import train_test_split

for row_index, row_data in df.iterrows():
  train_data, test_data = train_test_split(row_data, test_size=0.2)
  train_data[zero_mask.loc[row_index]] = 0
  train = train.append(train_data)
  test = test.append(test_data)
train = train.fillna(0)


In [7]:
train = train.fillna(0)

## Initializing mu, bu and bi parameters 

In [8]:
mask = train != 0

# calculate mean of non-zero values
mu = np.mean(train[mask]).mean()

In [9]:
bu= np.random.randn(train.shape[0])
bi= np.random.randn(train.shape[1])

In [10]:
bu

array([-1.42354828, -0.38441031,  0.12388244, ...,  1.7156224 ,
       -0.23372841, -0.46469502])

In [11]:
bi

array([-0.20587831,  1.13470694, -0.86770647, ...,  1.72737742,
       -1.31715559,  0.23112701])

# Baseline Estimate on Ratings

In [None]:
iter = 15
lam =0.09
lr=0.002

for _ in range(iter):
  print(f"iteration {_}")

  del_bu,del_bi =0,0
  delt=0
  for i in range(train.shape[0]):
    print(f'iteration {_}, user {i}')
    for j in (np.where(np.array(train.iloc[i,:]) != 0)[0]):
      b_u = bu[i]
      b_i = bi[j]
      delt = (train.iloc[i,j] - mu - b_u -b_i)
      del_bu = -delt + lam*bu[i]
      del_bi = -delt + lam*bi[j]
      bu[i] = bu[i]-(lr*del_bu)
      bi[j]= bi[j]-(lr*del_bi)



In [46]:
iter = 10
lam =0.09
lr=0.005

for _ in range(iter):
  print(f"iteration {_}")

  del_bu,del_bi =0,0
  delt=0
  for i in range(train.shape[0]):
    print(f'iteration {_}, user {i}')
    for j in (np.where(np.array(train.iloc[i,:]) != 0)[0]):
      b_u = bu[i]
      b_i = bi[j]
      delt = (train.iloc[i,j] - mu - b_u -b_i)
      del_bu = -delt + lam*bu[i]
      del_bi = -delt + lam*bi[j]
      bu[i] = bu[i]-(lr*del_bu)
      bi[j]= bi[j]-(lr*del_bi)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
iteration 7, user 120
iteration 7, user 121
iteration 7, user 122
iteration 7, user 123
iteration 7, user 124
iteration 7, user 125
iteration 7, user 126
iteration 7, user 127
iteration 7, user 128
iteration 7, user 129
iteration 7, user 130
iteration 7, user 131
iteration 7, user 132
iteration 7, user 133
iteration 7, user 134
iteration 7, user 135
iteration 7, user 136
iteration 7, user 137
iteration 7, user 138
iteration 7, user 139
iteration 7, user 140
iteration 7, user 141
iteration 7, user 142
iteration 7, user 143
iteration 7, user 144
iteration 7, user 145
iteration 7, user 146
iteration 7, user 147
iteration 7, user 148
iteration 7, user 149
iteration 7, user 150
iteration 7, user 151
iteration 7, user 152
iteration 7, user 153
iteration 7, user 154
iteration 7, user 155
iteration 7, user 156
iteration 7, user 157
iteration 7, user 158
iteration 7, user 159
iteration 7, user 160
iteration 7, user 161
iteration 7

In [47]:
bu

array([ 0.0463612 , -0.12778221, -0.07434244, ..., -0.31461793,
        0.10581531,  0.32926396])

In [48]:
bi

array([-0.1371722 ,  0.80111608, -0.34159401, ...,  1.32963318,
       -1.10363828,  0.09206979])

# Predicting Rating Matrix with filled Values

In [49]:
BU=bu.reshape([-1,1])

In [50]:
BI = bi.reshape([-1,1])

In [51]:
R= np.dot(BU,BI.T)

In [52]:
R_ = R+mu

In [59]:
R_

array([[4.67728555, 4.72078572, 4.66780831, ..., 4.74528841, 4.63247902,
        4.68791349],
       [4.70117319, 4.58127664, 4.72729466, ..., 4.51374155, 4.82467036,
        4.67188014],
       [4.69384274, 4.62408809, 4.70903995, ..., 4.58479684, 4.76569219,
        4.67680033],
       ...,
       [4.72680186, 4.43159954, 4.79111662, ..., 4.26531858, 5.03086941,
        4.65467821],
       [4.6691301 , 4.76841537, 4.64749914, ..., 4.82434057, 4.56686319,
        4.69338741],
       [4.63847916, 4.94742367, 4.57117043, ..., 5.1214453 , 4.32025671,
        4.71396028]])

In [54]:
np.save("bu.npy",BU)
np.save("bi.npy",BI)

In [55]:
mu

4.683645020416299

In [56]:
diff = train[mask] - R_

# square the differences
squared_diff = np.square(diff)

# take mean of the squared differences
mean_squared_diff = np.mean(squared_diff)
mean_squared_diff = (np.nansum(mean_squared_diff)/np.count_nonzero(~np.isnan(mean_squared_diff)))
# take square root of the mean
rmse = np.sqrt(mean_squared_diff)


In [57]:
rmse

0.6272442914453036

## Creating Recipe Name Dictionary

In [None]:
ds =pd.read_csv("/content/drive/MyDrive/RS Data/Assignment 2/RAW_recipes.csv")
ds.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [None]:
recipe_dict = {recipe_id: ds.loc[ds['id'] == recipe_id, 'name'].iloc[0] for recipe_id in train.keys()}

In [None]:
recipe_dict

In [None]:
with open('recipe_names.pkl', 'wb') as fp:
    pickle.dump(recipe_dict, fp)
    print('dictionary saved successfully to file')

dictionary saved successfully to file


# Run From Here to directly use the Recommendation System

## Loading Parameters

In [60]:
mu= 4.6818430002382145
rating = pd.read_csv("/content/drive/MyDrive/RS Data/Assignment 2/Baseline/Train_data.csv") # Empty Rating Matrix
bi =np.load("/content/drive/MyDrive/RS Data/Assignment 2/Baseline/bi.npy")
bu =np.load("/content/drive/MyDrive/RS Data/Assignment 2/Baseline/bi.npy")


In [61]:
with open('/content/drive/MyDrive/RS Data/Assignment 2/Baseline/recipe_names.pkl', 'rb') as fp:
    names = pickle.load(fp)
    print('Recipe_Name dictionary imported successfully')
    

Recipe_Name dictionary imported successfully


# Recommendation System

In [62]:
class Recommendation_system():
  def __init__(self,train,mu,bi,bu,names):
    self.train = train
    self.mu = mu
    self.bi = bi
    self.bu = bu
    self.R_ = self.mu + np.dot(self.bu,self.bi.T)
    self.dct = {user_id: index for index, user_id in enumerate(self.train.index)}
    self.rec_name = names
  
  def BaseLine(self):

    user_id = int(input("Enter Your User Id:  "))
    os.system('cls')
    print("\n\n")
    
    print(f"Welcome User {user_id}")
    print("\n\n")
    
    uid = self.dct[user_id]
    unrated_items = np.where(self.train.iloc[uid, :] == 0)[0]

    # Sort the predicted ratings for unrated items in descending order
    sorted_ratings = np.argsort(self.R_[uid, unrated_items])[::-1]

    # Recommend the top N items to the user
    N = 5
    recommended_items = unrated_items[sorted_ratings][:N]
    print(f" We have these recommendations for you today: \n ")
    new_dict = {}
    for idx in recommended_items:
      key = list(self.rec_name.keys())[idx]
      value = self.rec_name[key]
      new_dict[key] = value

    output_df = pd.DataFrame.from_dict(new_dict, orient='index', columns=['Recpie_Name'])
    output_df = output_df.rename_axis('Recipe_ID')
    display(output_df)
    return recommended_items

  def get_recommendations(self):
    os.system('cls')
    print("You are using Baseline Estimated Recommendation System")
    self.BaseLine()

In [63]:
RS=Recommendation_system(train,mu,BI,BU,names)
RS.get_recommendations()

You are using Baseline Estimated Recommendation System
Enter Your User Id:  1533



Welcome User 1533



 We have these recommendations for you today: 
 


Unnamed: 0_level_0,Recpie_Name
Recipe_ID,Unnamed: 1_level_1
138526,grilled chicken spedinis
247634,herb s feather pancakes
176043,blender frosties
124186,almond pancake mix
302603,chili beef skewers


Note:  

1.   **RMSE: 0.6277**
2.   Almost Every item have predicted ratings less than 5
3.   The variety in rating prediction for each user and item 
4.   Recommended items are close to the test split items

