In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

## Loading Data

In [3]:
df = pd.read_csv("/content/drive/MyDrive/RS Data/Assignment 2/short-recipes-20.csv")
df.head(5)

Unnamed: 0.1,Unnamed: 0,name,user_id,recipe_id,rating,minutes
0,21,calm your nerves tonic,65056,39959,5.0,5
1,76,homemade vegetable soup from a can,189616,87098,5.0,12
2,77,homemade vegetable soup from a can,369715,87098,4.0,12
3,170,i stole the idea from mirj sesame noodles,49304,90921,5.0,18
4,171,i stole the idea from mirj sesame noodles,82648,90921,5.0,18


In [4]:
df.shape

(122610, 6)

In [5]:
len(df["user_id"].unique())

1706

In [6]:
len(df["recipe_id"].unique())

38665

## Train-Test Splitting

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,name,user_id,recipe_id,rating,minutes
0,21,calm your nerves tonic,65056,39959,5.0,5
1,76,homemade vegetable soup from a can,189616,87098,5.0,12
2,77,homemade vegetable soup from a can,369715,87098,4.0,12
3,170,i stole the idea from mirj sesame noodles,49304,90921,5.0,18
4,171,i stole the idea from mirj sesame noodles,82648,90921,5.0,18


In [8]:
def split_group(group, split_ratio=0.8):
    n_total = len(group)
    n_train = int(n_total * split_ratio)
    idx = group.index
    train_idx = idx[:n_train].tolist()
    test_idx = idx[n_train:].tolist()
    return train_idx, test_idx

In [9]:
grouped = df.groupby('user_id')
indices = grouped.apply(split_group)

# concatenate the train and test indices for each group
train_idx = pd.Index([idx for indices in indices for idx in indices[0]])
test_idx = pd.Index([idx for indices in indices for idx in indices[1]])

# create the train and test dataframes
train_df = df.loc[train_idx]
test_df = df.loc[test_idx]

In [10]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,name,user_id,recipe_id,rating,minutes
10406,59299,basil garlic butter,1533,32311,5.0,15
16051,94713,broccoli with cheddar vinaigrette,1533,81473,5.0,10
18791,108924,californian apple crunch,1533,14807,5.0,19
27166,163923,cinnamon roll toast,1533,57549,5.0,10
29882,177669,costa rican marinated mango,1533,24415,5.0,20


In [11]:
train_df.shape

(97412, 6)

In [12]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,name,user_id,recipe_id,rating,minutes
84080,484932,peggy s cheese savories,1533,28407,5.0,18
86166,495490,pizza breadsticks,1533,17387,5.0,15
102278,592111,spicy banana fritters zitumbuwa,1533,52077,5.0,15
114754,656590,trout almondine,1533,30979,5.0,15
115748,660164,turkey nachos,1533,34061,5.0,20


In [13]:
test_df.shape

(25198, 6)

In [14]:
unique_users = train_df['user_id'].unique() 
unique_recipes = train_df['recipe_id'].unique()

In [15]:
users_dict = {unique_users[i] : i for i in range(len(unique_users))}
recipes_dict = {unique_recipes[i] : i for i in range(len(unique_recipes))}

In [16]:
users_dict_rev = {v: k for k, v in users_dict.items()}
recipes_dict_rev = {v: k for k, v in recipes_dict.items()}

In [17]:
train_df['user_id'] = train_df['user_id'].map(users_dict)
train_df['recipe_id'] = train_df['recipe_id'].map(recipes_dict)
test_df['user_id'] = test_df['user_id'].map(users_dict)
test_df['recipe_id'] = test_df['recipe_id'].map(recipes_dict)

## Creating Pivot Table

In [18]:
train_pt = train_df.pivot_table(index='user_id', columns='recipe_id', values='rating')
train_pt_new = train_pt.copy()
test_pt = test_df.pivot_table(index='user_id', columns='recipe_id', values='rating')

In [19]:
train_pt

recipe_id,0,1,2,3,4,5,6,7,8,9,...,31470,31471,31472,31473,31474,31475,31476,31477,31478,31479
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1701,,,,,,,,,,,...,,,,,,,,,,
1702,,,,,,,,,,,...,,,,,,,,,,
1703,,,,,,,,,,,...,,,,,,,,,,
1704,,,,,,,,,,,...,5.0,5.0,5.0,5.0,5.0,5.0,,,,


## Creating Sparse Matrix

In [20]:
import scipy.sparse as sp

train_sparse = sp.csr_matrix(train_pt.fillna(0).values)

test_sparse = sp.csr_matrix(test_pt.fillna(0).values)


In [21]:
train_dense = train_sparse.toarray()
test_dense = test_sparse.toarray()
train_dense.shape

(1706, 31480)

In [22]:
np.count_nonzero(train_dense[0]),np.count_nonzero(test_dense[0])

(16, 1)

## Filling missing Values based on User-User Affinity score
( We are taking 100 Neighbours in consideration)


### Applying KNN Model

In [23]:
from sklearn.neighbors import NearestNeighbors

knn_model = NearestNeighbors(metric='cosine', algorithm='brute',n_neighbors=100)
knn_model.fit(train_dense)

In [24]:
distances, indices = knn_model.kneighbors(train_dense, n_neighbors=101)

In [25]:
indices

array([[   0, 1072,   67, ..., 1162, 1163, 1159],
       [   1,  758,  173, ..., 1690, 1056,   29],
       [   2, 1495,   67, ...,  280, 1076,  983],
       ...,
       [1703,  984,  958, ...,  890,  907,  910],
       [1704,  737, 1129, ..., 1620,    6, 1317],
       [1705,  687, 1580, ..., 1477, 1673,  294]])

In [26]:
distances

array([[0.        , 0.84325172, 0.91316094, ..., 1.        , 1.        ,
        1.        ],
       [0.        , 0.89540275, 0.8977789 , ..., 0.96325633, 0.96339413,
        0.96358819],
       [0.        , 0.8587903 , 0.90274656, ..., 0.97579111, 0.97609018,
        0.97619656],
       ...,
       [0.        , 0.91543062, 0.91919116, ..., 0.97067708, 0.97078181,
        0.97078181],
       [0.        , 0.92116085, 0.93463668, ..., 0.985398  , 0.98543882,
        0.985493  ],
       [0.        , 0.85072225, 0.87776669, ..., 0.95159981, 0.95216351,
        0.95216351]])

## Recommendation for single User

In [31]:
from scipy.sparse import find
from sklearn.metrics.pairwise import cosine_similarity

def get_top_k_recommendations(user_id, k, train_sparse, users_dict):
    user_id = users_dict[user_id]
    original_ratings = train_sparse.getrow(user_id).toarray()[0]
    
    # Create a mask for recipes that have not been rated by the user
    mask = original_ratings == 0
    
    # Fill missing values and select only the unrated recipes
    train_sparse_new = fill_missing_values(user_id, train_sparse)
    recommendations = train_sparse_new.getrow(user_id).toarray()[0, mask]
    
    # Sort the recommended recipes in descending order by predicted rating
    top_k = (-recommendations).argsort()[:k]
    top_k_recipe_ids = [recipes_dict_rev[x] for x in top_k]
    return top_k_recipe_ids

def predict_rating(user_id, recipe_id, train_sparse, dists, inds):
    # Get the indices of the K nearest neighbors for the user
    dist = dists[user_id]
    ind = inds[user_id]
    
    # Calculate the weighted average of the ratings for the K nearest neighbors
    neighbor_ratings = train_sparse.getcol(recipe_id).toarray().ravel()[ind]
    neighbor_distances = 1 - dist
    
    mask = ~np.isnan(neighbor_ratings)
    numerator = np.sum(neighbor_ratings[mask] * neighbor_distances[mask])
    denominator = np.sum(neighbor_distances[mask])
    
    if denominator == 0:
        return None
    else:
        return numerator / denominator

def fill_missing_values(user_id, train_sparse):
    train_sparse_new = train_sparse.copy()
    
    # Find unrated recipes for the user
    row, col, _ = find(train_sparse.getrow(user_id))
    unrated_recipe_ids = set(range(train_sparse.shape[1])) - set(col)
    
    # Compute distances and indices of nearest neighbors for the user
    dists = cosine_similarity(train_sparse)
    inds = dists.argsort()[:, ::-1]
    
    # Fill in missing values for each unrated recipe
    for recipe_id in unrated_recipe_ids:
        predicted_rating = predict_rating(user_id, recipe_id, train_sparse, dists, inds)
        if predicted_rating is not None:
            train_sparse_new[user_id, recipe_id] = predicted_rating
    
    return train_sparse_new


In [32]:
top_recipes = get_top_k_recommendations(47559,10, train_sparse, users_dict)
top_recipes

[235618, 206009, 506781, 273428, 57921, 109314, 26489, 38905, 20378, 166675]

## Evaluation

In [33]:
def evaluate_recommendations(user_id, k):
    # get the actual rated recipes for the user
    user_ID = users_dict[user_id]
    actual_rated_recipes = test_pt.loc[user_ID]
    actual_rated_recipes = actual_rated_recipes[actual_rated_recipes.notna()]
    actual_rated_recipes_set = set(actual_rated_recipes.index)
    
    # get the top-k recommended recipes for the user
    top_k_recommendations = get_top_k_recommendations(user_id, k,train_sparse,users_dict)
    top_k_recommendations_set = set(top_k_recommendations)
    
    # calculate precision and recall
    intersection = actual_rated_recipes_set.intersection(top_k_recommendations_set)
    if len(top_k_recommendations_set) == 0:
      precision = 0.0
    else:
      precision = len(intersection) / len(top_k_recommendations_set)
    recall = len(intersection) / len(actual_rated_recipes_set)
    
    return precision, recall

In [36]:
P,R = evaluate_recommendations(23333,10)
print("Precision: ",P)
print("Recall: ",R)

Precision:  0.0
Recall:  0.0
