In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
import operator

In [2]:
def generate_id(df, column_name, id_column):
    
    value_list = df[column_name].unique().tolist()
    int_value_list = [i for i in range(len(value_list))]
    id_list = []
    
    for i in df[column_name]:
        id_list.append(int_value_list[ value_list.index(i) ])
    
    df[id_column] = id_list    

def similar_user_recs(user, interaction_mat, user_sim):

    if user not in interaction_mat.columns:
        return('No data available on user {}'.format(user))

    sim_users = user_sim.sort_values(by=user, ascending=False).index[1:11]
    best = []
    most_common = {}

    for i in sim_users:
        max_score = interaction_mat.loc[:, i].max()
        best.append(interaction_mat[interaction_mat.loc[:, i]==max_score].index.tolist())
    for i in range(len(best)):
        for j in best[i]:
            if j in most_common:
                most_common[j] += 1
            else:
                most_common[j] = 1
    sorted_list = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_list[:5]  

In [3]:
data = pd.read_csv("../input/restaurantrecommendationdata/train_100k.csv")
data = data[:50000]
data

Unnamed: 0,customer_id,gender,latitude_x,longitude_x,latitude_y,longitude_y,vendor_category_en,delivery_charge,serving_distance,commission,delivery_available,discount_percentage,language,rank,restaurent_rating,restaurent_tag_name,restaurant_id
0,SZ5JI7X,male,-0.5050,0.0950,0.6187,0.5273,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,11,4.5,"American,Burgers,Free Delivery,Hot Dogs,Pasta",106
1,WKYG878,male,0.2096,0.2810,-0.6010,0.0960,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,1,4.4,"American,Burgers,Desserts,Mojitos ,Pasta",90
2,UXCWXNG,male,0.1357,-78.6000,0.6187,0.5273,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,11,4.5,"American,Burgers,Free Delivery,Hot Dogs,Pasta",106
3,B9HSJBN,male,-0.8800,0.0755,-0.6010,0.0960,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,1,4.4,"American,Burgers,Desserts,Mojitos ,Pasta",90
4,K3RGL6T,male,0.3879,0.5815,-0.1150,0.5460,Restaurants,0.7,15.0,0.0,Yes,0.0,EN,11,4.3,"American,Burgers,Fries,Sandwiches",43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,X18R5HU,male,-16.6700,-3.4410,-0.8610,-0.0650,Sweets & Bakes,0.0,10.0,0.0,Yes,0.0,EN,11,4.3,"Desserts,Free Delivery,Fresh Juices,Healthy Fo...",265
49996,FBLBZVK,male,-0.1160,0.1165,-0.6010,0.0960,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,1,4.4,"American,Burgers,Desserts,Mojitos ,Pasta",90
49997,WKC3778,male,0.2124,-78.6000,-0.0600,0.6530,Restaurants,0.7,8.0,0.0,Yes,0.0,EN,11,4.5,"Fresh Juices,Milkshakes,Mojitos ,Sandwiches,Sh...",191
49998,S3ML00U,male,-1.3680,0.0918,-0.9680,0.0808,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,11,4.5,"American,Burgers,Free Delivery,Hot Dogs,Pasta",105


In [4]:
generate_id(data, 'customer_id', 'id_customer')
data

Unnamed: 0,customer_id,gender,latitude_x,longitude_x,latitude_y,longitude_y,vendor_category_en,delivery_charge,serving_distance,commission,delivery_available,discount_percentage,language,rank,restaurent_rating,restaurent_tag_name,restaurant_id,id_customer
0,SZ5JI7X,male,-0.5050,0.0950,0.6187,0.5273,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,11,4.5,"American,Burgers,Free Delivery,Hot Dogs,Pasta",106,0
1,WKYG878,male,0.2096,0.2810,-0.6010,0.0960,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,1,4.4,"American,Burgers,Desserts,Mojitos ,Pasta",90,1
2,UXCWXNG,male,0.1357,-78.6000,0.6187,0.5273,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,11,4.5,"American,Burgers,Free Delivery,Hot Dogs,Pasta",106,2
3,B9HSJBN,male,-0.8800,0.0755,-0.6010,0.0960,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,1,4.4,"American,Burgers,Desserts,Mojitos ,Pasta",90,3
4,K3RGL6T,male,0.3879,0.5815,-0.1150,0.5460,Restaurants,0.7,15.0,0.0,Yes,0.0,EN,11,4.3,"American,Burgers,Fries,Sandwiches",43,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,X18R5HU,male,-16.6700,-3.4410,-0.8610,-0.0650,Sweets & Bakes,0.0,10.0,0.0,Yes,0.0,EN,11,4.3,"Desserts,Free Delivery,Fresh Juices,Healthy Fo...",265,7749
49996,FBLBZVK,male,-0.1160,0.1165,-0.6010,0.0960,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,1,4.4,"American,Burgers,Desserts,Mojitos ,Pasta",90,11303
49997,WKC3778,male,0.2124,-78.6000,-0.0600,0.6530,Restaurants,0.7,8.0,0.0,Yes,0.0,EN,11,4.5,"Fresh Juices,Milkshakes,Mojitos ,Sandwiches,Sh...",191,9620
49998,S3ML00U,male,-1.3680,0.0918,-0.9680,0.0808,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,11,4.5,"American,Burgers,Free Delivery,Hot Dogs,Pasta",105,4710


In [5]:
data["filtered_restaurant_tag"] = list( map(lambda x: ' '.join(elem for elem in x.split(",")), data['restaurent_tag_name']) )
#data["filtered_restaurant_tag"] = data['gender'] + " " + data['vendor_category_en'] + " " + data["filtered_restaurant_tag"]
data

Unnamed: 0,customer_id,gender,latitude_x,longitude_x,latitude_y,longitude_y,vendor_category_en,delivery_charge,serving_distance,commission,delivery_available,discount_percentage,language,rank,restaurent_rating,restaurent_tag_name,restaurant_id,id_customer,filtered_restaurant_tag
0,SZ5JI7X,male,-0.5050,0.0950,0.6187,0.5273,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,11,4.5,"American,Burgers,Free Delivery,Hot Dogs,Pasta",106,0,American Burgers Free Delivery Hot Dogs Pasta
1,WKYG878,male,0.2096,0.2810,-0.6010,0.0960,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,1,4.4,"American,Burgers,Desserts,Mojitos ,Pasta",90,1,American Burgers Desserts Mojitos Pasta
2,UXCWXNG,male,0.1357,-78.6000,0.6187,0.5273,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,11,4.5,"American,Burgers,Free Delivery,Hot Dogs,Pasta",106,2,American Burgers Free Delivery Hot Dogs Pasta
3,B9HSJBN,male,-0.8800,0.0755,-0.6010,0.0960,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,1,4.4,"American,Burgers,Desserts,Mojitos ,Pasta",90,3,American Burgers Desserts Mojitos Pasta
4,K3RGL6T,male,0.3879,0.5815,-0.1150,0.5460,Restaurants,0.7,15.0,0.0,Yes,0.0,EN,11,4.3,"American,Burgers,Fries,Sandwiches",43,4,American Burgers Fries Sandwiches
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,X18R5HU,male,-16.6700,-3.4410,-0.8610,-0.0650,Sweets & Bakes,0.0,10.0,0.0,Yes,0.0,EN,11,4.3,"Desserts,Free Delivery,Fresh Juices,Healthy Fo...",265,7749,Desserts Free Delivery Fresh Juices Healthy Fo...
49996,FBLBZVK,male,-0.1160,0.1165,-0.6010,0.0960,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,1,4.4,"American,Burgers,Desserts,Mojitos ,Pasta",90,11303,American Burgers Desserts Mojitos Pasta
49997,WKC3778,male,0.2124,-78.6000,-0.0600,0.6530,Restaurants,0.7,8.0,0.0,Yes,0.0,EN,11,4.5,"Fresh Juices,Milkshakes,Mojitos ,Sandwiches,Sh...",191,9620,Fresh Juices Milkshakes Mojitos Sandwiches Sh...
49998,S3ML00U,male,-1.3680,0.0918,-0.9680,0.0808,Restaurants,0.0,15.0,0.0,Yes,0.0,EN,11,4.5,"American,Burgers,Free Delivery,Hot Dogs,Pasta",105,4710,American Burgers Free Delivery Hot Dogs Pasta


In [6]:
tf = TfidfVectorizer()
tf_matrix = tf.fit_transform(data["filtered_restaurant_tag"])
#cosine_sim = linear_kernel(tf_matrix, tf_matrix)

In [7]:
tf_matrix = tf_matrix.astype(int)
tf_matrix

<50000x40 sparse matrix of type '<class 'numpy.int64'>'
	with 302729 stored elements in Compressed Sparse Row format>

In [8]:
cosine_sim = linear_kernel(tf_matrix, tf_matrix)

In [9]:
results = dict()

for idx, row in data.iterrows():
    
    similar_indices = cosine_sim[idx].argsort()[:-100:-1]
    similar_items = [(cosine_sim[idx][i], data['id_customer'][i]) for i in similar_indices]
    
    results[row['id_customer']] = similar_items[1:]

In [10]:
results[0][:10]

[(0.0, 4596),
 (0.0, 2087),
 (0.0, 3568),
 (0.0, 9539),
 (0.0, 2511),
 (0.0, 7206),
 (0.0, 9538),
 (0.0, 9537),
 (0.0, 9536),
 (0.0, 3654)]

In [11]:
restaurant_id_list = list(data['restaurant_id'].unique())
restaurant_id_list[:10]

[106, 90, 43, 82, 189, 4, 191, 192, 157, 33]

In [12]:
customer_list = list(data['id_customer'].unique())
customer_list[:10]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [13]:
interaction_dict = dict()
for id_name in restaurant_id_list:
    temp_user_order = []
    for customer in customer_list:
        temp_user_order.append( data[ (data['id_customer']==customer) & (data['restaurant_id']==id_name) ].shape[0] )
    interaction_dict[str(id_name)] = temp_user_order

In [14]:
interaction_dataset = pd.DataFrame(interaction_dict)
interaction_dataset

Unnamed: 0,106,90,43,82,189,4,191,192,157,33,28,44,105,289,13,265,104,148,110
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,1,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,1,0,1,1,0,1,1,0,0,0,1,1,0,0,0,1,0,1,0
3,1,1,1,0,0,0,0,0,1,1,0,1,1,0,0,0,0,1,0
4,0,0,1,1,1,1,1,1,1,0,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14307,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
14308,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
14309,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
14310,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [15]:
interaction_dataset = interaction_dataset.T
interaction_dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14302,14303,14304,14305,14306,14307,14308,14309,14310,14311
106,1,0,1,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
90,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43,0,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
82,0,1,1,0,1,2,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
189,0,1,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,1,0,1,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
191,0,0,1,0,1,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
192,0,0,0,0,1,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
157,0,0,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
33,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
inter_sparse_data = sp.sparse.csr_matrix(interaction_dataset.values)
inter_sparse_data

<19x14312 sparse matrix of type '<class 'numpy.longlong'>'
	with 47360 stored elements in Compressed Sparse Row format>

In [17]:
user_similarity = cosine_similarity(inter_sparse_data.T)
user_sim = pd.DataFrame(user_similarity, index = interaction_dataset.columns, columns = interaction_dataset.columns)
user_sim

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14302,14303,14304,14305,14306,14307,14308,14309,14310,14311
0,1.000000,0.000000,0.333333,0.353553,0.000000,0.000000,0.447214,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,1.000000,0.408248,0.288675,0.516398,0.492366,0.547723,0.166667,0.408248,0.235702,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.408248,0.000000,0.000000,0.000000
2,0.333333,0.408248,1.000000,0.471405,0.421637,0.603023,0.447214,0.408248,0.333333,0.384900,...,0.000000,0.333333,0.333333,0.0,0.000000,0.333333,0.333333,0.000000,0.000000,0.333333
3,0.353553,0.288675,0.471405,1.000000,0.335410,0.319801,0.316228,0.144338,0.000000,0.000000,...,0.000000,0.353553,0.353553,0.0,0.000000,0.000000,0.000000,0.353553,0.353553,0.353553
4,0.000000,0.516398,0.421637,0.335410,1.000000,0.667424,0.282843,0.516398,0.474342,0.365148,...,0.316228,0.000000,0.000000,0.0,0.316228,0.316228,0.316228,0.316228,0.316228,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14307,0.000000,0.000000,0.333333,0.000000,0.316228,0.000000,0.000000,0.000000,0.500000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000
14308,0.000000,0.408248,0.333333,0.000000,0.316228,0.301511,0.000000,0.000000,0.000000,0.577350,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000
14309,0.000000,0.000000,0.000000,0.353553,0.316228,0.000000,0.000000,0.408248,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000
14310,0.000000,0.000000,0.000000,0.353553,0.316228,0.301511,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000


In [18]:
similar_user_recs(106, interaction_dataset, user_sim)

[('189', 10), ('43', 3), ('105', 3), ('265', 3), ('44', 2)]

## AWS Sample- Retail Demo Store

In [19]:
df_items = pd.read_csv('/kaggle/input/awssampleretaildemostore/items.csv')
df_users = pd.read_csv('/kaggle/input/awssampleretaildemostore/users.csv')
df_raw_users = pd.read_csv('/kaggle/input/awssampleretaildemostore/raw_users.csv')
df_interactions = pd.read_csv('/kaggle/input/awssampleretaildemostore/interactions.csv')
df_raw_products = pd.read_csv('/kaggle/input/awssampleretaildemostore/raw_products.csv')

In [20]:
df_items.head()

Unnamed: 0,ITEM_ID,CATEGORY,STYLE
0,e1669081-8ffc-4dec-97a6-e9176d7f6651,apparel,scarf
1,cfafd627-7d6b-43a5-be05-4c7937be417d,housewares,kitchen
2,6e6ad102-7510-4a02-b8ce-5a0cd6f431d1,apparel,jacket
3,49b89871-5fe7-4898-b99d-953e15fb42b2,electronics,speaker
4,5cb18925-3a3c-4867-8f1c-46efd7eba067,footwear,sandals


In [21]:
df_users.head()

Unnamed: 0,USER_ID,AGE,GENDER
0,1,31,M
1,2,58,F
2,3,43,M
3,4,38,M
4,5,24,M


In [22]:
df_interactions.head()

Unnamed: 0,ITEM_ID,USER_ID,EVENT_TYPE,TIMESTAMP,DISCOUNT
0,94cc3c8d-7efd-4f7b-84d0-9996f7e90c2f,3156,ProductViewed,1591803788,No
1,94cc3c8d-7efd-4f7b-84d0-9996f7e90c2f,3156,ProductViewed,1591803788,No
2,e2c8393e-2109-4a91-966f-f30274d0515d,332,ProductViewed,1591803812,Yes
3,e2c8393e-2109-4a91-966f-f30274d0515d,332,ProductViewed,1591803812,Yes
4,494d3480-3c7d-448e-8d3b-834b06fff156,3981,ProductViewed,1591803830,Yes


In [23]:
df_raw_users.head()

Unnamed: 0,id,username,email,first_name,last_name,addresses,age,gender,persona,discount_persona
0,1,user1,mark.johnson@example.com,Mark,Johnson,"[{'first_name': 'Mark', 'last_name': 'Johnson'...",31,M,furniture_homedecor_housewares,lower_priced_products
1,2,user2,kristen.calderon@example.com,Kristen,Calderon,"[{'first_name': 'Kristen', 'last_name': 'Calde...",58,F,tools_housewares_apparel,discount_indifferent
2,3,user3,joseph.maddox@example.com,Joseph,Maddox,"[{'first_name': 'Joseph', 'last_name': 'Maddox...",43,M,floral_beauty_jewelry,lower_priced_products
3,4,user4,jay.lewis@example.com,Jay,Lewis,"[{'first_name': 'Jay', 'last_name': 'Lewis', '...",38,M,books_apparel_homedecor,discount_indifferent
4,5,user5,anthony.valdez@example.com,Anthony,Valdez,"[{'first_name': 'Anthony', 'last_name': 'Valde...",24,M,instruments_books_electronics,discount_indifferent


In [24]:
df_raw_products.head()

Unnamed: 0,id,url,sk,name,category,style,description,price,image,gender_affinity,current_stock,featured
0,e1669081-8ffc-4dec-97a6-e9176d7f6651,http://d3idkbp2p2okv.cloudfront.net/#/product/...,,Sans Pareil Scarf,apparel,scarf,Sans pareil scarf for women,124.99,http://d3idkbp2p2okv.cloudfront.net/images/app...,F,12,
1,cfafd627-7d6b-43a5-be05-4c7937be417d,http://d3idkbp2p2okv.cloudfront.net/#/product/...,,Chef Knife,housewares,kitchen,A must-have for your kitchen,57.99,http://d3idkbp2p2okv.cloudfront.net/images/hou...,,9,
2,6e6ad102-7510-4a02-b8ce-5a0cd6f431d1,http://d3idkbp2p2okv.cloudfront.net/#/product/...,,Gainsboro Jacket,apparel,jacket,This gainsboro jacket for women is perfect for...,133.99,http://d3idkbp2p2okv.cloudfront.net/images/app...,F,13,
3,49b89871-5fe7-4898-b99d-953e15fb42b2,http://d3idkbp2p2okv.cloudfront.net/#/product/...,,High Definition Speakers,electronics,speaker,High definition speakers to fill the house wit...,196.99,http://d3idkbp2p2okv.cloudfront.net/images/ele...,,6,
4,5cb18925-3a3c-4867-8f1c-46efd7eba067,http://d3idkbp2p2okv.cloudfront.net/#/product/...,,Spiffy Sandals,footwear,sandals,This spiffy pair of sandals for woman is perfe...,9.99,http://d3idkbp2p2okv.cloudfront.net/images/foo...,F,14,
