# Collaborative Filtering

### Uses reviews by user and reviews for restaurants to find recommendations

### Import necessary modules

In [1]:
import pandas as pd
import time
from IPython.display import display
import matplotlib.pyplot as plt
import math
import numpy as np
import seaborn as sns
from datetime import datetime
from datetime import date
from dateutil import parser
import collections
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

from nltk.corpus import stopwords
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
from itertools import chain
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from itertools import combinations
from nltk.tokenize import WordPunctTokenizer

%matplotlib inline

### Load Data into three DFs: user, review and restaurant info DFs

In [2]:
start = time.time()
restaurant_info = pd.read_csv(r'\\rf3prd431n1\F$\Viraj_work\misc_2\Data_1\restaurant_info.csv', low_memory=False)
user_info = pd.read_csv(r'\\rf3prd431n1\F$\Viraj_work\misc_2\Data_1\user_info_for_restaurants.csv',low_memory=False)
review_info = pd.read_csv(r'\\rf3prd431n1\F$\Viraj_work\misc_2\Data_1\review_info_for_restaurants.csv',low_memory=False)
done = time.time()
elapsed = done-start
print('Reading data took {} seconds'.format(round(elapsed, 2)))

Reading data took 157.3 seconds


### Filter the city and delete rest of the files to clean up memory

In [3]:
city = 'Phoenix'
restaurant_info_city = restaurant_info[restaurant_info.city==city]
review_info_city = review_info[review_info.business_id.isin(restaurant_info_city.business_id)]
user_info_city = user_info[user_info.user_id.isin(review_info_city.user_id)]

del restaurant_info
del review_info
del user_info

In [4]:
user_ids_larger_1 = pd.value_counts(review_info_city.user_id, sort=False) > 1
user_ids_larger_1 = user_ids_larger_1[user_ids_larger_1].index
rest_lens = review_info_city[review_info_city['user_id'].isin(user_ids_larger_1)]
print(rest_lens.shape)
assert np.all(rest_lens.user_id.value_counts() > 1)

(331412, 10)


In [5]:
def assign_to_set(df):
    sampled_ids = np.random.choice(df.index,
                                   size=np.int64(np.ceil(df.index.size * 0.2)),
                                   replace=False)
    df.ix[sampled_ids, 'for_testing'] = True
    return df

rest_lens['for_testing'] = False
grouped = rest_lens.groupby('user_id', group_keys=False).apply(assign_to_set)
rest_lens_train = rest_lens[grouped.for_testing == False]
rest_lens_test = rest_lens[grouped.for_testing == True]
print(rest_lens.shape)
print(rest_lens_train.shape)
print(rest_lens_test.shape)
assert len(rest_lens_train.index & rest_lens_test.index) == 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """


(331412, 11)
(237882, 11)
(93530, 11)


### Obtain User and item-based similarities

In [6]:
def fast_similarity(ratings, kind, epsilon=1e-9):
    # epsilon -> small number for handling dived-by-zero errors
    if kind == 'user':
        print(kind)
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        print(kind)
        sim = ratings.T.dot(ratings) + epsilon
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)


rating_matrix_train = pd.pivot_table(rest_lens_train,index=['user_id'],values='stars',columns=['business_id'])
rating_matrix_train.fillna(0,inplace=True)
rating_matrix_test = pd.pivot_table(rest_lens_test,index=['user_id'],values='stars',columns=['business_id'])
rating_matrix_test.fillna(0,inplace=True)

start = time.time()
user_similarity = fast_similarity(rating_matrix_train.to_numpy(), kind='user')
item_similarity = fast_similarity(rating_matrix_train.to_numpy(), kind='item')
done = time.time()
elapsed = done-start
print('Calculating similarities took {} seconds'.format(round(elapsed, 2)))

user
item
Calculating similarities took 191.46 seconds


In [7]:
from sklearn.metrics import mean_squared_error

def predict_fast_simple(ratings, similarity, kind):
    if kind == 'user':
        return similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif kind == 'item':
        return ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    
def get_mse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

start = time.time()
user_prediction = predict_fast_simple(rating_matrix_train, user_similarity, kind='user')
print('User-based CF MSE: ' + str(get_mse(user_prediction, rating_matrix_test.to_numpy())))
item_prediction = predict_fast_simple(rating_matrix_train.to_numpy(), item_similarity, kind='item')
print('Item-based CF MSE: ' + str(get_mse(item_prediction, rating_matrix_test.to_numpy())))

done = time.time()
elapsed = done-start
print('Predicting took {} seconds'.format(round(elapsed, 2)))

User-based CF MSE: 16.35333556329218
Item-based CF MSE: 16.24519623725275
Predicting took 255.35 seconds


### Recommendations based on User-Item Collaborative Filtering


In [28]:
rest_11 = ['rest' + str(i) for i in range(1,1001)]
final_df = pd.DataFrame(columns=rest_11)
for ctr in tqdm(range(len(item_similarity))):
    sim_scores=[]
    for i,j in enumerate(item_similarity[ctr]):
        k=restaurant_info_city['stars'].iloc[i]
        sim_scores.append((restaurant_info_city['name'].iloc[i],j,k))
    sim_scores = sorted(sim_scores, key=lambda x: (x[1],x[2]) , reverse=True)
    a_series = pd.Series(sim_scores[0:1000], index=rest_11)
    final_df = final_df.append(a_series, ignore_index=True)
item_final_df = final_df

100%|██████████████████████████████████████| 3968/3968 [28:55<00:00,  1.82it/s]


### Recommendations based on user-user collaborative filtering

In [49]:
rest_11 = ['rest' + str(i) for i in range(1,17)]
final_df = pd.DataFrame(columns=rest_11)
for ctr in tqdm(range(len(user_similarity[0:1000]))):
    sim_scores=[]
    for i,j in enumerate(user_similarity[ctr]):
        sim_scores.append((i,j))
    sim_scores = sorted(sim_scores, key=lambda x: x[1] , reverse=True)
    a_series = pd.Series(sim_scores[0:16], index=rest_11)
    final_df = final_df.append(a_series, ignore_index=True)

rest_rec_df = pd.DataFrame(columns=['Recommendations'])
#for idx in len(final_df)
for idx in tqdm(range(len(final_df))):
    list_rec = []
    sim_users = [x[0] for x in final_df.iloc[idx][1:]]
    for user in sim_users:
        B = rating_matrix_train.iloc[user]
        business_list = B[B.values>4.0].index.values.tolist()
        for rest_id in business_list:
            rest_name = restaurant_info_city[restaurant_info_city.business_id==rest_id]['name'].values
            list_rec.append(rest_name[0])
    rest_rec_df = rest_rec_df.append({'Recommendations': set(list_rec)}, ignore_index=True)    

100%|██████████████████████████████████████| 1000/1000 [01:20<00:00, 12.54it/s]
100%|██████████████████████████████████████| 1000/1000 [00:27<00:00, 36.34it/s]


In [50]:
rest_rec_df.head(10)

Unnamed: 0,Recommendations
0,"{Asian Cafe, Satisfied Frog On Bell, El Bravo ..."
1,"{HEK Yeah BBQ, Carolina's Mexican Food, Rocket..."
2,{Be Coffee + Food + Stuff}
3,"{La Grande Orange Pizzeria, Pork On A Fork, Ta..."
4,{Hooters}
5,{}
6,{Olive & Ivy}
7,{The Vig Uptown}
8,"{Culinary Dropout, Da Vang Restaurant, Pho Tha..."
9,"{Adobo Dragon, Hana Japanese Eatery}"


# Get Recommendations

In [26]:
from collections import OrderedDict

list_rest = ['The Madison Deli','Hopdoddy Burger Bar',"Oregano's Pizza Bistro",
             'Comedor Guadalajara','Black Forest Mill','Five Guys']

### Recommendations based on user-similarity

In [37]:
from collections import defaultdict
rating_dict = defaultdict(lambda:0)
for iter_rest in list_rest:
    rest_id = restaurant_info_city[restaurant_info_city.name==iter_rest].business_id
    if len(rest_id) == 1:
        rest_id = restaurant_info_city[restaurant_info_city.name==iter_rest].business_id.iloc[0]
        rating_dict[rest_id] = 4
    else:
        for instance in rest_id.values.tolist():
            rating_dict[instance] = 4

dummy = pd.DataFrame(columns = rating_matrix_train.columns)
dummy_2 = dummy.append(rating_dict,ignore_index=True)
dummy_2.index = ['new_user']
rating_matrix_train_new = rating_matrix_train.append(dummy_2)
rating_matrix_train_new.fillna(0,inplace=True)

start = time.time()
# Calculate New User Similarities
user_similarity_new = fast_similarity(rating_matrix_train_new.to_numpy(), kind='user')
done = time.time()
elapsed = done-start
print('Calculating new user similarities took {} seconds'.format(round(elapsed, 2)))

rest_11 = ['rest' + str(i) for i in range(1,51)]
sim_scores=[]
for i,j in enumerate(user_similarity_new[-1]):
    sim_scores.append((i,j))
sim_scores = sorted(sim_scores, key=lambda x: x[1] , reverse=True)
a_series = pd.Series(sim_scores[0:50], index=rest_11)

list_rec_user = []
sim_users = [x[0] for x in a_series.values[1:]]
for user in sim_users:
    B = rating_matrix_train_new.iloc[user]
    business_list = B[B.values>4.0].index.values.tolist()
    for rest_id in business_list:
        rest_name = restaurant_info_city[restaurant_info_city.business_id==rest_id]['name'].values
        list_rec_user.append(rest_name[0])
list_rec_user = list(OrderedDict.fromkeys(list_rec_user)) 

user
Calculating new user similarities took 611.13 seconds


### Recommendations based on item similarity

In [32]:
recommendations_full = []
for iter_rest in list_rest:
    rest_id = restaurant_info_city[restaurant_info_city.name==iter_rest].business_id
    recommendations_name = []
    if len(rest_id) == 1:
        rest_id = restaurant_info_city[restaurant_info_city.name==iter_rest].business_id.iloc[0]
        rating_mat_train_trans = rating_matrix_train.T
        location_of_rest = rating_mat_train_trans.index.get_loc(rest_id)
        item_rec_series = item_final_df.loc[location_of_rest]
        item_recs_df = pd.DataFrame.from_dict(dict(item_rec_series)).T
        item_recs_list = list(item_recs_df[0])
        recommendations_name.extend(item_recs_list)
    else:
        for instance in rest_id.values.tolist():
            rating_mat_train_trans = rating_matrix_train.T
            location_of_rest = rating_mat_train_trans.index.get_loc(instance)
            item_rec_series = item_final_df.loc[location_of_rest]
            item_recs_df = pd.DataFrame.from_dict(dict(item_rec_series)).T
            item_recs_list = list(item_recs_df[0])
            recommendations_name.extend(item_recs_list)
    recommendations_full.extend(recommendations_name)
recommendations_full = list(OrderedDict.fromkeys(recommendations_full))    

In [42]:
list_rec_user.extend(recommendations_full)
list_rec_user = list(OrderedDict.fromkeys(list_rec_user))

In [43]:
print('\n{} recommendations found\n'.format(len(list_rec_user)))
print_flag = True
counter = 0
while print_flag:
    try:
        print('\nHere are some recommendations:\n')
        print(list_rec_user[counter:counter+5])
        counter = counter+5
        flag = input('\nDo you want more recommendations? (Y/N) \n')
        if flag == 'Y':
            continue
        else:
            print('\nThank you. Enjoy your next Restaurant!')
            print_flag = False
    except:
        print('\n\nEnd of Recommendation List. Please choose from available recommendations or change preferences')


2599 recommendations found


Here are some recommendations:

['Hopdoddy Burger Bar', 'Comedor Guadalajara', 'Dilla Libre Cantina', 'Five Guys', "Oregano's Pizza Bistro"]

Do you want more recommendations? (Y/N) 
Y

Here are some recommendations:

['Wicked Bakery', 'Casita Del Mar', 'Z-Grill', 'Yama Sushi House', 'Sushi Sonora']

Do you want more recommendations? (Y/N) 
Y

Here are some recommendations:

['Culichi Sushi', 'Tonys Original Burger Factory', "Fiorella's", "Rolberto's", 'Meritage - An Urban Tavern']

Do you want more recommendations? (Y/N) 
N

Thank you. Enjoy your next Restaurant!
