# Importing Important Libraries

In [1]:
import operator

import numpy as np
import pandas as pd
import scipy as sp
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline

# Dataset Import and Preparations

## Importing the dataset

In [2]:
# Read the training extract into a DataFrame and preview its first rows.
csv_path = "../input/restaurantrecommendationdata/train_100k.csv"
dataset = pd.read_csv(csv_path)
dataset.head()

Unnamed: 0,customer_id,gender,latitude_x,longitude_x,latitude_y,longitude_y,vendor_category_en,delivery_charge,serving_distance,commission,delivery_available,discount_percentage,language,rank,restaurent_rating,restaurent_tag_name,restaurant_id
0,SZ5JI7X,male,-0.505,0.095,0.6187,0.5273,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,11,4.5,"American,Burgers,Free Delivery,Hot Dogs,Pasta",106
1,WKYG878,male,0.2096,0.281,-0.601,0.096,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,1,4.4,"American,Burgers,Desserts,Mojitos ,Pasta",90
2,UXCWXNG,male,0.1357,-78.6,0.6187,0.5273,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,11,4.5,"American,Burgers,Free Delivery,Hot Dogs,Pasta",106
3,B9HSJBN,male,-0.88,0.0755,-0.601,0.096,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,1,4.4,"American,Burgers,Desserts,Mojitos ,Pasta",90
4,K3RGL6T,male,0.3879,0.5815,-0.115,0.546,Restaurants,0.7,15.0,0.0,Yes,0.0,EN,11,4.3,"American,Burgers,Fries,Sandwiches",43


## Checking whether any column has missing values

In [3]:
# Per-column count of missing entries (isna is the same check as isnull).
dataset.isna().sum()

customer_id            0
gender                 0
latitude_x             0
longitude_x            0
latitude_y             0
longitude_y            0
vendor_category_en     0
delivery_charge        0
serving_distance       0
commission             0
delivery_available     0
discount_percentage    0
language               0
rank                   0
restaurent_rating      0
restaurent_tag_name    0
restaurant_id          0
dtype: int64

## Let's check the number of rows and columns in our restaurant dataset

In [4]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape

(100000, 17)

### So our restaurant dataset has a total of 17 columns and 100000 rows. However, the customer id is of object type, which our model cannot handle directly, so in the next section we will preprocess the data and generate a unique integer id for each customer. Since there are no missing values, we don't need to do any imputation for this dataset.

# Collaborative Filtering Recommendation

### In this section we will build an interaction matrix that will be used to train our collaborative filtering recommender. Because collaborative filtering works with the interactions between items and users, we can't use the dataset as it is.

## Important Functions

In [5]:
def generate_id(df, column_name, id_column):
    """Add an integer id column that encodes the distinct values of another column.

    Each distinct value of ``df[column_name]`` gets the integer position at
    which it first appears (0 for the first distinct value, 1 for the second,
    and so on). The ids are written to ``df[id_column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated in place.
    column_name : str
        Name of the existing column whose values are encoded.
    id_column : str
        Name of the column that receives the integer ids (created or
        overwritten).

    Returns
    -------
    None
    """
    # Build the value -> id lookup once; a dict lookup per row replaces the
    # linear `list.index` scan that made the loop quadratic on large frames.
    value_to_id = {
        value: position
        for position, value in enumerate(df[column_name].unique().tolist())
    }

    df[id_column] = [value_to_id[value] for value in df[column_name]]

def similar_user_recs(user, interaction_mat, user_sim, n_similar=10, n_recs=5):
    """Recommend items by polling the users most similar to ``user``.

    Each of the ``n_similar`` most similar users "votes" for the item(s) they
    interacted with the most (all items tied at that user's maximum score).
    Items are then ranked by the number of votes received.

    Parameters
    ----------
    user : hashable
        Label of the target user; must be a column of ``interaction_mat``.
    interaction_mat : pandas.DataFrame
        Interaction scores with items as the index and users as the columns.
    user_sim : pandas.DataFrame
        User-by-user similarity scores; both axes are labelled by user.
    n_similar : int, default 10
        Number of most similar users that are polled.
    n_recs : int, default 5
        Maximum number of recommendations returned.

    Returns
    -------
    list of (item, votes) tuples or str
        Up to ``n_recs`` pairs sorted by votes, highest first. When ``user``
        is not a column of ``interaction_mat`` a message string is returned
        instead.
    """
    if user not in interaction_mat.columns:
        return('No data available on user {}'.format(user))

    # Remove the target user by label rather than assuming they sort first:
    # other users can tie at the same similarity, in which case slicing off
    # position 0 may discard a real neighbour and keep the user themselves.
    similarities = user_sim[user].drop(labels=user, errors='ignore')
    # Stable sort so tied neighbours keep their index order.
    sim_users = similarities.sort_values(ascending=False, kind='mergesort').index[:n_similar]

    most_common = {}
    for neighbour in sim_users:
        scores = interaction_mat.loc[:, neighbour]
        max_score = scores.max()
        # A neighbour with no positive interaction has no favourite; without
        # this guard every item would tie at the maximum and receive a vote.
        if not max_score > 0:
            continue
        for item in scores[scores == max_score].index.tolist():
            most_common[item] = most_common.get(item, 0) + 1

    sorted_list = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_list[:n_recs]

## Unique id generation for customer_id column

In [6]:
# Adds the integer 'id_customer' column to `dataset` in place, then previews it.
generate_id(df=dataset, column_name='customer_id', id_column='id_customer')
dataset.head()

Unnamed: 0,customer_id,gender,latitude_x,longitude_x,latitude_y,longitude_y,vendor_category_en,delivery_charge,serving_distance,commission,delivery_available,discount_percentage,language,rank,restaurent_rating,restaurent_tag_name,restaurant_id,id_customer
0,SZ5JI7X,male,-0.505,0.095,0.6187,0.5273,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,11,4.5,"American,Burgers,Free Delivery,Hot Dogs,Pasta",106,0
1,WKYG878,male,0.2096,0.281,-0.601,0.096,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,1,4.4,"American,Burgers,Desserts,Mojitos ,Pasta",90,1
2,UXCWXNG,male,0.1357,-78.6,0.6187,0.5273,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,11,4.5,"American,Burgers,Free Delivery,Hot Dogs,Pasta",106,2
3,B9HSJBN,male,-0.88,0.0755,-0.601,0.096,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,1,4.4,"American,Burgers,Desserts,Mojitos ,Pasta",90,3
4,K3RGL6T,male,0.3879,0.5815,-0.115,0.546,Restaurants,0.7,15.0,0.0,Yes,0.0,EN,11,4.3,"American,Burgers,Fries,Sandwiches",43,4


### Let's see the unique restaurant ids that are present in our dataset. We will need them later in our process.

In [7]:
# Distinct restaurant ids as a plain list; show the first ten.
unique_restaurants = dataset['restaurant_id'].unique()
restaurant_id_list = list(unique_restaurants)
restaurant_id_list[:10]

[106, 90, 43, 82, 189, 4, 191, 192, 157, 33]

### The unique customer ids that we generated will also be used later. I show them here just to give a glimpse of what to expect.

In [8]:
# Distinct generated customer ids as a plain list; show the first ten.
unique_customers = dataset['id_customer'].unique()
customer_list = list(unique_customers)
customer_list[:10]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

## Interaction Matrix Creation

### In this section we will generate an interaction matrix that will be used for our recommendation system. First we will build a dictionary, then we will convert that dictionary into a pandas DataFrame.

In [9]:
# Count orders for every (customer, restaurant) pair in a single vectorised
# pass instead of filtering the whole frame once per pair.
# reindex pins the row/column order to customer_list / restaurant_id_list and
# fills pairs that never occur with 0.
order_counts = pd.crosstab(dataset['id_customer'], dataset['restaurant_id']).reindex(
    index=customer_list, columns=restaurant_id_list, fill_value=0
)

# Keyed by the restaurant id as a string; each value is the list of order
# counts per customer, in customer_list order.
interaction_dict = {
    str(id_name): order_counts[id_name].tolist() for id_name in restaurant_id_list
}

In [10]:
# Each dictionary key becomes a column; each list position becomes a row.
interaction_dataset = pd.DataFrame.from_dict(interaction_dict)
interaction_dataset

Unnamed: 0,106,90,43,82,189,4,191,192,157,33,28,44,105,289,13,265,104,148,110
0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
1,0,1,1,1,1,1,1,0,0,0,0,0,0,0,1,1,0,0,1
2,2,2,2,1,0,2,1,1,0,0,1,1,0,0,0,1,0,1,1
3,1,1,1,0,0,0,0,0,1,1,0,1,1,0,1,0,0,1,0
4,1,0,1,2,1,2,2,1,2,0,1,1,1,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15262,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
15263,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
15264,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
15265,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Now we have a dataset with one column per restaurant and one row per customer id. This is our interaction matrix: each row records how many orders that customer has placed with each restaurant. These counts serve as the weights of the user/item interactions in our interaction matrix.

## Transpose Dataset to fit the interaction matrix

In [11]:
# Build the transposed frame (restaurants as rows, customers as columns)
# directly from interaction_dict rather than transposing interaction_dataset
# in place, so re-running this cell cannot flip the matrix back.
interaction_dataset = pd.DataFrame(interaction_dict).T
interaction_dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15257,15258,15259,15260,15261,15262,15263,15264,15265,15266
106,1,0,2,1,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
90,0,1,2,1,0,2,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43,0,1,2,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
82,1,1,1,0,2,2,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
189,0,1,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,1,1
4,0,1,2,0,2,2,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
191,0,1,1,0,2,2,1,1,1,0,...,1,0,0,0,0,0,0,0,0,0
192,0,0,1,0,1,2,0,2,0,0,...,0,0,1,0,0,0,0,0,0,0
157,1,0,0,1,2,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
33,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Here we will convert our dataset to a sparse matrix, as it lets our system work on the interaction matrix efficiently

In [12]:
# Compressed Sparse Row copy of the interaction counts. Uses the explicitly
# imported `sparse` submodule rather than relying on `sp.sparse` having been
# loaded as a side effect of another import.
inter_sparse_data = sparse.csr_matrix(interaction_dataset.values)
inter_sparse_data

<19x15267 sparse matrix of type '<class 'numpy.longlong'>'
	with 90005 stored elements in Compressed Sparse Row format>

## Finding Cosine Similarity Between Users

In [13]:
# Transposing puts one customer per row, so the result is customer-by-customer.
user_similarity = cosine_similarity(inter_sparse_data.transpose())

# Label both axes with the customer ids used as columns of interaction_dataset.
user_labels = interaction_dataset.columns
user_sim = pd.DataFrame(user_similarity, index=user_labels, columns=user_labels)
user_sim

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15257,15258,15259,15260,15261,15262,15263,15264,15265,15266
0,1.000000,0.166667,0.306186,0.333333,0.588348,0.283473,0.129099,0.433013,0.447214,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.500000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.166667,1.000000,0.680414,0.333333,0.588348,0.692935,0.688530,0.384900,0.447214,0.500000,...,0.333333,0.000000,0.000000,0.000000,0.000000,0.333333,0.000000,0.000000,0.333333,0.333333
2,0.306186,0.680414,1.000000,0.544331,0.680545,0.771517,0.737865,0.412479,0.182574,0.510310,...,0.204124,0.204124,0.204124,0.000000,0.408248,0.408248,0.204124,0.000000,0.000000,0.000000
3,0.333333,0.333333,0.544331,1.000000,0.457604,0.440959,0.602464,0.192450,0.000000,0.166667,...,0.000000,0.333333,0.000000,0.333333,0.333333,0.000000,0.333333,0.333333,0.000000,0.000000
4,0.588348,0.588348,0.680545,0.457604,1.000000,0.778312,0.557007,0.679366,0.526235,0.392232,...,0.392232,0.196116,0.196116,0.196116,0.196116,0.392232,0.196116,0.196116,0.196116,0.196116
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15262,0.000000,0.333333,0.408248,0.000000,0.392232,0.377964,0.258199,0.000000,0.000000,0.500000,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000
15263,0.000000,0.000000,0.204124,0.333333,0.196116,0.188982,0.000000,0.000000,0.000000,0.500000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000
15264,0.000000,0.000000,0.000000,0.333333,0.196116,0.188982,0.258199,0.288675,0.000000,0.000000,...,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000
15265,0.000000,0.333333,0.000000,0.000000,0.196116,0.000000,0.258199,0.000000,0.447214,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000


# Making Recommendations

In [14]:
# Recommendations for the customer whose generated id is 0.
similar_user_recs(user=0, interaction_mat=interaction_dataset, user_sim=user_sim)

[('82', 10), ('104', 10), ('157', 8), ('106', 7), ('105', 1)]

### Here the first value of each tuple is the restaurant id and the second value is the number of similar users (out of the 10 most similar) who ordered from that restaurant most often.

 # Conclusion
 ### This is a test of a recommendation system on the restaurant dataset. It can be enhanced by applying various conditions and by improving the weights in the interaction matrix. Location also has a huge impact on the interactions; however, to keep things simple I have ignored it for now. Kindly comment if you have any suggestions or advice regarding this notebook.