# Collaborative Filtering

### Uses reviews by user and reviews for restaurants to find recommendations

### Import necessary modules

In [1]:
import pandas as pd
import time
from IPython.display import display
import matplotlib.pyplot as plt
import math
import numpy as np
import seaborn as sns
from datetime import datetime
from datetime import date
from dateutil import parser
import collections
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

from nltk.corpus import stopwords
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
from itertools import chain
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from itertools import combinations
from nltk.tokenize import WordPunctTokenizer

%matplotlib inline

### Load Data into three DFs: user, review and restaurant info DFs

In [2]:
start = time.time()
restaurant_info = pd.read_csv(r'\\rf3prd431n1\F$\Viraj_work\misc_2\Data_1\restaurant_info.csv', low_memory=False)
user_info = pd.read_csv(r'\\rf3prd431n1\F$\Viraj_work\misc_2\Data_1\user_info_for_restaurants.csv',low_memory=False)
review_info = pd.read_csv(r'\\rf3prd431n1\F$\Viraj_work\misc_2\Data_1\review_info_for_restaurants.csv',low_memory=False)
done = time.time()
elapsed = done-start
print('Reading data took {} seconds'.format(round(elapsed, 2)))

Reading data took 108.27 seconds


### Filter the city and delete rest of the files to clean up memory

In [3]:
city = 'Phoenix'
restaurant_info_city = restaurant_info[restaurant_info.city==city]
review_info_city = review_info[review_info.business_id.isin(restaurant_info_city.business_id)]
user_info_city = user_info[user_info.user_id.isin(review_info_city.user_id)]

del restaurant_info
del review_info
del user_info

In [4]:
user_ids_larger_1 = pd.value_counts(review_info_city.user_id, sort=False) > 1
user_ids_larger_1 = user_ids_larger_1[user_ids_larger_1].index
rest_lens = review_info_city[review_info_city['user_id'].isin(user_ids_larger_1)]
print(rest_lens.shape)
assert np.all(rest_lens.user_id.value_counts() > 1)

(331412, 10)


In [6]:
def assign_to_set(df):
    sampled_ids = np.random.choice(df.index,
                                   size=np.int64(np.ceil(df.index.size * 0.2)),
                                   replace=False)
    df.ix[sampled_ids, 'for_testing'] = True
    return df

rest_lens['for_testing'] = False
grouped = rest_lens.groupby('user_id', group_keys=False).apply(assign_to_set)
rest_lens_train = rest_lens[grouped.for_testing == False]
rest_lens_test = rest_lens[grouped.for_testing == True]
print(rest_lens.shape)
print(rest_lens_train.shape)
print(rest_lens_test.shape)
assert len(rest_lens_train.index & rest_lens_test.index) == 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """


(331412, 11)
(237882, 11)
(93530, 11)


### Obtain User and item-based similarities

In [7]:
def fast_similarity(ratings, kind, epsilon=1e-9):
    # epsilon -> small number for handling dived-by-zero errors
    if kind == 'user':
        print(kind)
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        print(kind)
        sim = ratings.T.dot(ratings) + epsilon
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)


rating_matrix_train = pd.pivot_table(rest_lens_train,index=['user_id'],values='stars',columns=['business_id'])
rating_matrix_train.fillna(0,inplace=True)
rating_matrix_test = pd.pivot_table(rest_lens_test,index=['user_id'],values='stars',columns=['business_id'])
rating_matrix_test.fillna(0,inplace=True)

start = time.time()
user_similarity = fast_similarity(rating_matrix_train.to_numpy(), kind='user')
item_similarity = fast_similarity(rating_matrix_train.to_numpy(), kind='item')
done = time.time()
elapsed = done-start
print('Calculating similarities took {} seconds'.format(round(elapsed, 2)))

user
item
Calculating similarities took 202.34 seconds


In [9]:
from sklearn.metrics import mean_squared_error

def predict_fast_simple(ratings, similarity, kind):
    if kind == 'user':
        return similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif kind == 'item':
        return ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    
def get_mse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

start = time.time()
user_prediction = predict_fast_simple(rating_matrix_train, user_similarity, kind='user')
print('User-based CF MSE: ' + str(get_mse(user_prediction, rating_matrix_test.to_numpy())))
item_prediction = predict_fast_simple(rating_matrix_train.to_numpy(), item_similarity, kind='item')
print('Item-based CF MSE: ' + str(get_mse(item_prediction, rating_matrix_test.to_numpy())))

done = time.time()
elapsed = done-start
print('Predicting took {} seconds'.format(round(elapsed, 2)))

User-based CF MSE: 16.377537446054856
Item-based CF MSE: 16.26825853829439
Predicting took 63.36 seconds


In [15]:
rest_11 = ['rest' + str(i) for i in range(1,1001)]
final_df = pd.DataFrame(columns=rest_11)
for ctr in tqdm(range(len(item_similarity))):
    sim_scores=[]
    for i,j in enumerate(item_similarity[ctr]):
        k=restaurant_info_city['stars'].iloc[i]
        sim_scores.append((restaurant_info_city['name'].iloc[i],j,k))
    sim_scores = sorted(sim_scores, key=lambda x: (x[1],x[2]) , reverse=True)
    a_series = pd.Series(sim_scores[0:1000], index=rest_11)
    final_df = final_df.append(a_series, ignore_index=True)

100%|██████████████████████████████████████| 3964/3964 [25:18<00:00,  1.70it/s]


In [17]:
final_df.head(10)

Unnamed: 0,rest1,rest2,rest3,rest4,rest5,rest6,rest7,rest8,rest9,rest10,...,rest991,rest992,rest993,rest994,rest995,rest996,rest997,rest998,rest999,rest1000
0,"(Taco Bell, 1.0, 3.0)","(Tropical Smoothie Cafe, 0.2304604807744351, 3.5)","(Filiberto's Mexican Food, 0.15052812771815174...","(Boston Market, 0.12883132528626093, 3.0)","(Raising Cane's, 0.12539520404718804, 3.5)","(Bear And The Honey Specialty Bakery, 0.112751...","(Jack in the Box, 0.10767304610836526, 2.0)","(Subway, 0.10310715337201987, 3.5)","(Peter Piper Pizza, 0.09935643173535201, 3.5)","(My Slice of the Pie Pizzeria, 0.0987184237760...",...,"(Mariscos Bahia De Guaymas, 9.395990083207513...","(ZK Grill, 9.395990083207513e-12, 4.0)","(Chick-fil-A, 9.395990083207513e-12, 3.5)","(Maximillians Coyote Cafe, 9.395990083207513e-...","(Schlotzskys Deli, 9.395990083207513e-12, 2.5)","(E-Z Buffet, 9.395990083207513e-12, 2.0)","(Palatte, 9.371099999529637e-12, 4.0)","(Panda Express, 9.351877210058245e-12, 3.0)","(Capri Market & Deli, 9.297600041203884e-12, 5.0)","(Leafy Sea Dragon, 9.297600041203884e-12, 5.0)"
1,"(Nee House Chinese Restaurant, 1.0, 3.5)","(Kwan & Wok Chinese Restaurant, 0.132669104664...","(SOLO Trattoria, 0.12539994245346842, 4.5)","(McDonald's, 0.10193912752488543, 3.0)","(Good Taste House, 0.09945545739658587, 2.5)","(Generation Y Design, 0.09929361821364144, 4.5)","(Dragon Bowl, 0.09501013935183851, 4.0)","(La Cocina Economica, 0.09441534317944332, 4.0)","(Rosie McCaffrey's Irish Pub & Restaurant, 0.0...","(Ziggy's IL Posto, 0.0933670743895788, 4.0)",...,"(Gojo Ethiopian Restaurant, 0.0143939684813481...","(Burger Shoppe, 0.014383274957562524, 4.0)","(Ingallina's Box Lunch, 0.01438093010037784, 3.5)","(Persian King, 0.014351276530171493, 3.5)","(Twisted Curry, 0.014340148486464569, 4.5)","(Sardella's Pizza & Wings, 0.01432890507990468...","(Humphrey's Kitchen & Koffee, 0.01427415019714...","(Subway, 0.014272205534606537, 2.5)","(Panera Bread, 0.014250298836135104, 2.0)","(Province Urban Kitchen & Bar, 0.0141495952228..."
2,"(Vals Getaway DES Cafeteria, 1.0, 3.0)","(Spinato's Pizza, 0.20232565951193443, 4.0)","(Original ChopShop, 0.1805597678887862, 4.0)","(Burger King, 0.12139539574230751, 2.0)","(Alida Restaurant Supply, 0.10924558756364385,...","(Panda Express, 0.10290702075278949, 2.5)","(Cafe On The Way, 0.09575627721490212, 5.0)","(La Fontanella Restaurant, 0.09528537154237327...","(The Plaza Bistro & Catering, 0.09269795493860...","(Giant Manhattan Pizza & Pasta, 0.090279883948...",...,"(Choon, 1.1561466260071055e-11, 3.5)","(Crazy Jim's, 1.1561466260071055e-11, 3.5)","(Marcia's Long Wong's, 1.1561466260071055e-11,...","(Paisley Violin, 1.1561466260071055e-11, 3.5)","(Brennan's Pub & Grub, 1.1561466260071055e-11,...","(PINO, 1.1561466260071055e-11, 3.5)","(Denny's, 1.1561466260071055e-11, 3.0)","(Whataburger, 1.1561466260071055e-11, 3.0)","(Logan's Roadhouse, 1.1561466260071055e-11, 3.0)","(Yo Mama's Good Cook'n, 1.1561466260071055e-11..."
3,"(Sushi Mocorito, 1.0, 3.0)","(Clarendon Kitchen + Bar C4, 0.215121482333735...","(Wild Tuna Sushi and Spirits, 0.19818812256932...","(Dickey's Barbecue Pit, 0.18571070798006273, 3.0)","(Plaza Bonita, 0.15855049805625135, 3.5)","(Wingstop, 0.132125415047235, 3.0)","(Lai Lai Mongolian Bbq, 0.12676154911590007, 3.5)","(McDonald's, 0.12057275466248572, 1.5)","(San Carlos Bay Seafood Restaurant, 0.11891287...","(Subway, 0.11536243243901527, 2.5)",...,"(Pizza Hut, 7.0905938301051465e-12, 2.0)","(Kyoto Bowl, 7.0905938301051465e-12, 2.0)","(La Taqueria De Jalisco, 7.01309867452966e-12,...","(Pizza People Food Truck, 7.01309867452966e-12...","(Cafe Rio, 7.01309867452966e-12, 4.0)","(St. Francis Restaurant, 7.01309867452966e-12,...","(Taqueria La Korita, 7.01309867452966e-12, 4.0)","(Altamimi Restutant, 7.01309867452966e-12, 4.0)","(High & Rye, 7.01309867452966e-12, 3.5)","(Sardella's Pizza & Wings, 7.01309867452966e-1..."
4,"(Oregano's Pizza Bistro, 0.9999999999999998, 3.5)","(Yogurt Mart, 0.16884628332656088, 2.0)","(Draw 10 Bar & Grill, 0.15134153492426894, 4.0)","(Chelsea's Kitchen, 0.1468228550675322, 4.0)","(The Ol Pizzeria & Cafe, 0.13980332148557134, ...","(Tariq Restaurant, 0.13620738143244107, 4.0)","(Pita Jungle, 0.11872206766233943, 3.5)","(Rally's, 0.11745828405549401, 2.5)","(Smackin' Wings, 0.11652517012921594, 4.5)","(Rokerij, 0.11557579567069953, 4.0)",...,"(Royal Palms Resort and Spa, in the Unbound Co...","(Potbelly Sandwich Shop, 9.571679098274112e-12...","(Chico Malo, 9.571679098274112e-12, 4.0)","(Nogales Hot Dogs, 9.571679098274112e-12, 4.0)","(The Rose and Crown, 9.571679098274112e-12, 3.5)","(Noodles A GoGo, 9.571679098274112e-12, 3.0)","(Frank's Pizza & Wings, 9.571679098274112e-12,...","(Zinc Brasserie, 9.571679098274112e-12, 3.0)","(Pho Bang Restaurant, 9.571679098274112e-12, 3.0)","(Philly Affairs, 9.571679098274112e-12, 3.0)"
5,"(Santanas Mexican Food, 1.0, 3.0)","(Palms Restaurant and Market, 0.08095441172500...","(Giant Manhattan Pizza & Pasta, 0.078955228797...","(Oscar's Pizza, 0.07583402087796932, 2.5)","(Firehouse Subs, 0.07417964248623424, 4.0)","(Fatburger, 0.06929800405592003, 3.0)","(Biscuits, 0.06502713643186547, 4.0)","(CC'S Mesquite Broiler, 0.06316418303868729, 3.5)","(Panda Express, 0.061509513185148905, 3.0)","(Taqueria La Korita, 0.0596326207010694, 4.0)",...,"(Choon, 5.05560139155286e-12, 3.5)","(Crazy Jim's, 5.05560139155286e-12, 3.5)","(Marcia's Long Wong's, 5.05560139155286e-12, 3.5)","(Paisley Violin, 5.05560139155286e-12, 3.5)","(Brennan's Pub & Grub, 5.05560139155286e-12, 3.5)","(PINO, 5.05560139155286e-12, 3.5)","(Denny's, 5.05560139155286e-12, 3.0)","(Whataburger, 5.05560139155286e-12, 3.0)","(Logan's Roadhouse, 5.05560139155286e-12, 3.0)","(Yo Mama's Good Cook'n, 5.05560139155286e-12, ..."
6,"(Five Guys, 1.0000000000000002, 3.5)","(Del Taco, 0.2371293305171645, 2.5)","(Moon Valley Grill, 0.23539701236371996, 4.0)","(O Bar & Grill, 0.20265015962545727, 1.5)","(Tacos Huicho, 0.19023098935654995, 4.5)","(Chick-fil-A, 0.13518451761480468, 2.5)","(Pete's Fish & Chips, 0.12361284652592897, 3.0)","(The Good Egg - Uptown, 0.12070759759739236, 3.0)","(Green Chile Grill, 0.12067160600562839, 4.5)","(Pizza Hut, 0.11327304709950527, 2.0)",...,"(Del Taco, 1.0218988991294978e-11, 1.5)","(Plaza Bonita, 1.0196254838571187e-11, 3.5)","(Luu's Chicken Bowl, 1.01289520909276e-11, 4.5)","(Flaming Wok, 1.01289520909276e-11, 4.0)","(Pocho's, 1.01289520909276e-11, 4.0)","(Pho Binh Minh, 1.01289520909276e-11, 4.0)","(Mediterranean Pita, 1.01289520909276e-11, 4.0)","(Snooze, An A.M. Eatery, 1.01289520909276e-11,...","(J.K. Sushi, 1.01289520909276e-11, 3.5)","(Ben Brothers Market, 1.01289520909276e-11, 3.5)"
7,"(Pizza Hut, 1.0, 2.0)","(R.Kidd's Pizza & Wings, 0.22271770159972334, ...","(Baja Loco Mesquite Grill & Cantina, 0.0929824...","(Jack-In-the Box Drive Thru, 0.092623368936063...","(ThirdSpace, 0.06780635037243385, 4.5)","(Panera Bread, 0.0569987973486639, 3.0)","(Wendy's, 0.03702900210524054, 3.0)","(2 Fat Guys Grilled Cheese, 0.0352578551469853...","(The Greek Pita, 0.034610879954114514, 4.5)","(Cherry Tree Smokehouse BBQ, 0.032725037326897...",...,"(Eribertos Mexican Food, 1.7417664808793052e-1...","(Charlie D's Catfish & Chicken, 1.732917483002...","(Wild Game Grill, 1.732917483002491e-11, 4.5)","(Comedor Guadalajara, 1.732917483002491e-11, 4.0)","(Imperio 667 Mariscos & Sushi, 1.7329174830024...","(Akaihana Sushi & Grill, 1.732917483002491e-11...","(Kool Jerk, 1.732917483002491e-11, 4.0)","(Denny's, 1.732917483002491e-11, 3.5)","(Einstein Bros Bagels, 1.732917483002491e-11, ...","(Barros Pizza, 1.732917483002491e-11, 3.0)"
8,"(Mariscos El Dorado Sin, 1.0, 2.0)","(Fattoush Restaurant, 0.15124041846997158, 3.5)","(Taylor's Place, 0.11478340719546638, 4.5)","(Casita Del Mar, 0.11372686419821781, 4.5)","(Chick-fil-A, 0.10756429941162123, 3.5)","(Kona Grill, 0.10358720791491557, 3.5)","(El Kora, 0.10028512003482302, 2.5)","(Tatum's Restaurant, 0.09074425108305843, 2.5)","(Royal Palms Resort and Spa, in the Unbound Co...","(Shenanigans Bar & Grill, 0.0889913228219233, ...",...,"(Long John Silvers, 2.2227711218151442e-11, 3.0)","(Eye Opener Family Restaurant, 2.2227711218151...","(Waffle House, 2.2227711218151442e-11, 3.0)","(Mimi's Cafe, 2.2227711218151442e-11, 3.0)","(Las Islas Del Capitan, 2.2227711218151442e-11...","(Fusilli's, 2.2227711218151442e-11, 2.5)","(Subway, 2.2227711218151442e-11, 1.0)","(Playas Del Novilleros, 1.8148850212981917e-11...","(Mariscos El Dorado Sin, 1.8148850212981917e-1...","(Mi Pueblo Mexican Food, 1.8148850212981917e-1..."
9,"(12 East Cafe, 0.9999999999999999, 3.0)","(Chipotle Mexican Grill, 0.15894053677122327, ...","(Lucky's Burgers & Shakes, 0.1169770677277359,...","(Smashburger, 0.08219370883167365, 3.0)","(Rev Cafe, 0.07913281349444069, 3.5)","(Taco Bell, 0.075247548360615, 2.5)","(Blue Pacific Super Buffet, 0.0746071320474584...","(Tandoori Times 3 Indian Bistro, 0.07027413828...","(Mi Pueblo Mexican Food, 0.07018624064599972, ...","(Old Station Sub Shop, 0.06874069982761126, 3.0)",...,"(Kabob Palace, 8.104408984562337e-12, 5.0)","(Taqueria Castillo, 8.104408984562337e-12, 5.0)","(Tacos Chiwas, 8.104408984562337e-12, 4.5)","(La Grande Orange Pizzeria, 8.104408984562337e...","(The Best Ever Subs and More, 8.10440898456233...","(Krachai Thai Kitchen, 8.104408984562337e-12, ...","(Song Lynn, 8.104408984562337e-12, 4.5)","(Z's Greek, 8.104408984562337e-12, 4.5)","(El Sabroso Hot Dogs, 8.104408984562337e-12, 4.5)","(Reign Of Thai, 8.104408984562337e-12, 4.5)"


In [144]:
rest_11 = ['rest' + str(i) for i in range(1,17)]
final_df = pd.DataFrame(columns=rest_11)
for ctr in tqdm(range(len(user_similarity[0:1000]))):
    sim_scores=[]
    for i,j in enumerate(user_similarity[ctr]):
        sim_scores.append((i,j))
    sim_scores = sorted(sim_scores, key=lambda x: x[1] , reverse=True)
    a_series = pd.Series(sim_scores[0:16], index=rest_11)
    final_df = final_df.append(a_series, ignore_index=True)

100%|██████████████████████████████████████| 1000/1000 [01:03<00:00, 15.58it/s]


In [160]:
rest_rec_df = pd.DataFrame(columns=['Recommendations'])
#for idx in len(final_df)
for idx in tqdm(range(len(final_df))):
    list_rec = []
    sim_users = [x[0] for x in final_df.iloc[idx][1:]]
    for user in sim_users:
        B = rating_matrix_train.iloc[user]
        business_list = B[B.values>4.0].index.values.tolist()
        for rest_id in business_list:
            rest_name = restaurant_info_city[restaurant_info_city.business_id==rest_id]['name'].values
            list_rec.append(rest_name[0])
    rest_rec_df = rest_rec_df.append({'Recommendations': set(list_rec)}, ignore_index=True)

100%|██████████████████████████████████████| 1000/1000 [00:22<00:00, 43.91it/s]


In [175]:
rest_rec_df.iloc[9].values

array([{'Adobo Dragon', 'Hana Japanese Eatery'}], dtype=object)

In [170]:
rest_rec_df

Unnamed: 0,Recommendations
0,"{Viet Kitchen, Asian Cafe, Las Jicaras Mexican..."
1,"{Naked BBQ, Rustic Cafe, Paradise Valley Pizza..."
2,{Ollie Vaughn's}
3,"{La Grande Orange Pizzeria, Bobby Q}"
4,{Hooters}
5,{}
6,"{The Henry, Jobot Coffee & Diner, Olive & Ivy}"
7,{The Vig Uptown}
8,"{Doughbird, Great Wall Cuisine, Pappadeaux Sea..."
9,"{Adobo Dragon, Hana Japanese Eatery}"
