In [79]:
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split

In [73]:
# reading Restraunt's data files
r_accepts = pd.read_csv("data/chefmozaccepts.csv", encoding = 'latin1')
r_cuisine = pd.read_csv("data/chefmozcuisine.csv", encoding = 'latin1')
r_hours = pd.read_csv("data/chefmozhours4.csv", encoding = 'latin1')
r_parking = pd.read_csv("data/chefmozparking.csv", encoding = 'latin1')
r_geoplaces = pd.read_csv("data/geoplaces2.csv", encoding = 'latin1')
# reading Consumers' data files
u_cuisine = pd.read_csv("data/usercuisine.csv", encoding = 'latin1')
u_payment = pd.read_csv("data/userpayment.csv", encoding = 'latin1')
u_profile = pd.read_csv("data/userprofile.csv", encoding = 'latin1')
# reading user-item rating file
ratings = pd.read_csv("data/rating_final.csv", encoding = 'latin1')

## Model-based Recommendation Engine

In [74]:
# print(rating.describe())
ratings['tot_rating'] = ratings.apply(lambda x: round((int(x.rating) + int(x.food_rating) + int(x.service_rating))/3), axis=1)
del ratings['rating'], ratings['food_rating'], ratings['service_rating']
ratings.head()

Unnamed: 0,userID,placeID,tot_rating
0,U1077,135085,2
1,U1077,135038,2
2,U1077,132825,2
3,U1077,135060,2
4,U1068,135104,1


In [122]:
# print(r_geoplaces.info())
r_info = r_geoplaces.copy()
val = ['the_geom_meter', 'fax', 'zip', 'url', 'name', 'address', 'city', 'state', 'country', 'other_services', 'latitude', 'longitude']
for i in val:
    del r_info[i]
r_info.head()

Unnamed: 0,placeID,alcohol,smoking_area,dress_code,accessibility,price,Rambience,franchise,area
0,134999,No_Alcohol_Served,none,informal,no_accessibility,medium,familiar,f,closed
1,132825,No_Alcohol_Served,none,informal,completely,low,familiar,f,open
2,135106,Wine-Beer,only at bar,informal,partially,medium,familiar,f,open
3,132667,No_Alcohol_Served,none,informal,completely,low,familiar,t,closed
4,132613,No_Alcohol_Served,permitted,informal,completely,medium,familiar,t,closed


In [123]:
onehot = ['alcohol', 'smoking_area', 'dress_code', 'accessibility', 'price', 'Rambience', 'franchise', 'area']
for i in onehot:
    tmp = pd.get_dummies(r_geoplaces[i], prefix=[i])
    r_info = pd.concat([r_info, tmp], axis=1)
    del r_info[i]
r_info.head()    

Unnamed: 0,placeID,['alcohol']_Full_Bar,['alcohol']_No_Alcohol_Served,['alcohol']_Wine-Beer,['smoking_area']_none,['smoking_area']_not permitted,['smoking_area']_only at bar,['smoking_area']_permitted,['smoking_area']_section,['dress_code']_casual,...,['accessibility']_partially,['price']_high,['price']_low,['price']_medium,['Rambience']_familiar,['Rambience']_quiet,['franchise']_f,['franchise']_t,['area']_closed,['area']_open
0,134999,0,1,0,1,0,0,0,0,0,...,0,0,0,1,1,0,1,0,1,0
1,132825,0,1,0,1,0,0,0,0,0,...,0,0,1,0,1,0,1,0,0,1
2,135106,0,0,1,0,0,1,0,0,0,...,1,0,0,1,1,0,1,0,0,1
3,132667,0,1,0,1,0,0,0,0,0,...,0,0,1,0,1,0,0,1,1,0
4,132613,0,1,0,0,0,0,1,0,0,...,0,0,0,1,1,0,0,1,1,0


In [124]:
metadata = pd.merge(r_info, ratings, left_on='placeID', right_on='placeID')
metadata.head()

Unnamed: 0,placeID,['alcohol']_Full_Bar,['alcohol']_No_Alcohol_Served,['alcohol']_Wine-Beer,['smoking_area']_none,['smoking_area']_not permitted,['smoking_area']_only at bar,['smoking_area']_permitted,['smoking_area']_section,['dress_code']_casual,...,['price']_low,['price']_medium,['Rambience']_familiar,['Rambience']_quiet,['franchise']_f,['franchise']_t,['area']_closed,['area']_open,userID,tot_rating
0,134999,0,1,0,1,0,0,0,0,0,...,0,1,1,0,1,0,1,0,U1093,2
1,134999,0,1,0,1,0,0,0,0,0,...,0,1,1,0,1,0,1,0,U1066,1
2,134999,0,1,0,1,0,0,0,0,0,...,0,1,1,0,1,0,1,0,U1040,1
3,134999,0,1,0,1,0,0,0,0,0,...,0,1,1,0,1,0,1,0,U1110,2
4,134999,0,1,0,1,0,0,0,0,0,...,0,1,1,0,1,0,1,0,U1121,2


In [125]:
t = []
for i in metadata.columns:
    if i not in ['placeID', 'userID', 'tot_rating']:
        t.append(i)
x = pd.DataFrame(scale(metadata[t]), index=metadata.index, columns=t)
# x = pd.concat([metadata[['placeID', 'userID']], x], axis=1)
y = metadata['tot_rating']
x.head()

Unnamed: 0,['alcohol']_Full_Bar,['alcohol']_No_Alcohol_Served,['alcohol']_Wine-Beer,['smoking_area']_none,['smoking_area']_not permitted,['smoking_area']_only at bar,['smoking_area']_permitted,['smoking_area']_section,['dress_code']_casual,['dress_code']_formal,...,['accessibility']_partially,['price']_high,['price']_low,['price']_medium,['Rambience']_familiar,['Rambience']_quiet,['franchise']_f,['franchise']_t,['area']_closed,['area']_open
0,-0.27204,0.723617,-0.615516,0.908555,-0.49029,-0.128986,-0.209916,-0.499731,-0.333174,-0.11048,...,-0.25713,-0.502421,-0.656937,1.006048,0.241539,-0.241539,0.395442,-0.395442,0.350463,-0.350463
1,-0.27204,0.723617,-0.615516,0.908555,-0.49029,-0.128986,-0.209916,-0.499731,-0.333174,-0.11048,...,-0.25713,-0.502421,-0.656937,1.006048,0.241539,-0.241539,0.395442,-0.395442,0.350463,-0.350463
2,-0.27204,0.723617,-0.615516,0.908555,-0.49029,-0.128986,-0.209916,-0.499731,-0.333174,-0.11048,...,-0.25713,-0.502421,-0.656937,1.006048,0.241539,-0.241539,0.395442,-0.395442,0.350463,-0.350463
3,-0.27204,0.723617,-0.615516,0.908555,-0.49029,-0.128986,-0.209916,-0.499731,-0.333174,-0.11048,...,-0.25713,-0.502421,-0.656937,1.006048,0.241539,-0.241539,0.395442,-0.395442,0.350463,-0.350463
4,-0.27204,0.723617,-0.615516,0.908555,-0.49029,-0.128986,-0.209916,-0.499731,-0.333174,-0.11048,...,-0.25713,-0.502421,-0.656937,1.006048,0.241539,-0.241539,0.395442,-0.395442,0.350463,-0.350463


In [126]:
# train-test split - randomized
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.75, test_size=0.25, random_state = 50)

### SVM Classifier

In [127]:
from sklearn import svm
from sklearn.metrics import accuracy_score
model = svm.SVC()
model.fit(x_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [128]:
pred = model.predict(x_test)
cm = accuracy_score(y_test, pred)
print(cm)

0.4879725085910653


### Decision Tree Classifier

In [129]:
from sklearn import tree
model = tree.DecisionTreeClassifier(criterion='gini')
model.fit(x_train,y_train)
pred = model.predict(x_test)
accuracy_score(y_test, pred)

0.48109965635738833

## Implementing Item-based Collaborative Filtering
Preferring item-based CF over user-based CF as user based CF is prone to fake profiles/users

In [151]:
ratings.head()

Unnamed: 0,userID,placeID,tot_rating
0,U1077,135085,2
1,U1077,135038,2
2,U1077,132825,2
3,U1077,135060,2
4,U1068,135104,1


In [158]:
r_info = r_geoplaces[['placeID','name']]
r_info.head()

Unnamed: 0,placeID,name
0,134999,Kiku Cuernavaca
1,132825,puesto de tacos
2,135106,El Rincón de San Francisco
3,132667,little pizza Emilio Portes Gil
4,132613,carnitas_mata


In [159]:
new_ratings = pd.merge(ratings, r_info, left_on="placeID", right_on = "placeID")
new_ratings.info() # no missing values
new_ratings.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1161 entries, 0 to 1160
Data columns (total 4 columns):
userID        1161 non-null object
placeID       1161 non-null int64
tot_rating    1161 non-null int64
name          1161 non-null object
dtypes: int64(2), object(2)
memory usage: 45.4+ KB


Unnamed: 0,userID,placeID,tot_rating,name
0,U1077,135085,2,Tortas Locas Hipocampo
1,U1108,135085,1,Tortas Locas Hipocampo
2,U1081,135085,1,Tortas Locas Hipocampo
3,U1056,135085,2,Tortas Locas Hipocampo
4,U1134,135085,2,Tortas Locas Hipocampo


In [160]:
train, val = train_test_split(new_ratings, train_size=0.75, test_size=0.25, random_state = 50)
train.info()
val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 870 entries, 202 to 109
Data columns (total 4 columns):
userID        870 non-null object
placeID       870 non-null int64
tot_rating    870 non-null int64
name          870 non-null object
dtypes: int64(2), object(2)
memory usage: 34.0+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 291 entries, 993 to 255
Data columns (total 4 columns):
userID        291 non-null object
placeID       291 non-null int64
tot_rating    291 non-null int64
name          291 non-null object
dtypes: int64(2), object(2)
memory usage: 11.4+ KB


In [156]:
# print(len(train['placeID'].unique()))
# print(len(val['placeID'].unique()))
set(val['placeID'].unique()).issubset(set(train['placeID'].unique())) # verifying that each restraunt of val is in train

True

In [161]:
# considering overall rating only
overallRating = train.pivot_table(index=['userID'],columns=['name'],values='tot_rating')
overallRating.head()

name,Abondance Restaurante Bar,Arrachela Grill,Cabana Huasteca,Cafe Chaires,Cafeteria cenidet,Cafeteria y Restaurant El Pacifico,Carls Jr,Carnitas Mata Calle 16 de Septiembre,Carreton de Flautas y Migadas,Cenaduria El RincÃ³n de Tlaquepaque,...,puesto de tacos,rockabilly,shi ro ie,sirloin stockade,tacos abi,tacos de barbacoa enfrente del Tec,tacos de la estacion,tacos los volcanes,tortas hawai,vips
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U1001,,,,,,,,,,,...,2.0,,,,,,,,,
U1002,,,,,,,,,,,...,1.0,,,,,,,,,
U1003,,,2.0,,,,,,,,...,2.0,2.0,,,,,,,,
U1004,,,,,,2.0,,,,,...,,,,,,,,2.0,,
U1005,,,,,,2.0,,,,,...,,,,,,,,,,


In [162]:
corrMatrix = overallRating.corr(method='pearson', min_periods=5) # find correlation between each pair of restraunts for atleast 5 ratings
corrMatrix.head()

name,Abondance Restaurante Bar,Arrachela Grill,Cabana Huasteca,Cafe Chaires,Cafeteria cenidet,Cafeteria y Restaurant El Pacifico,Carls Jr,Carnitas Mata Calle 16 de Septiembre,Carreton de Flautas y Migadas,Cenaduria El RincÃ³n de Tlaquepaque,...,puesto de tacos,rockabilly,shi ro ie,sirloin stockade,tacos abi,tacos de barbacoa enfrente del Tec,tacos de la estacion,tacos los volcanes,tortas hawai,vips
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abondance Restaurante Bar,1.0,,,,,,,,,,...,,,,,,,,,,
Arrachela Grill,,,,,,,,,,,...,,,,,,,,,,
Cabana Huasteca,,,1.0,,,,,,,,...,0.789352,,,,,,,,,
Cafe Chaires,,,,1.0,,,,,,,...,,,,,,,,,,
Cafeteria cenidet,,,,,1.0,,,,,,...,,,,,,,,,,


In [163]:
x = overallRating.loc['U1001'].dropna()
x.head()

name
El Rincon de San Francisco          2.0
Restaurant El Muladar de Calzada    1.0
Restaurant de Mariscos de Picon     1.0
Restaurant los Compadres            1.0
Restaurante Versalles               1.0
Name: U1001, dtype: float64

In [164]:
a = corrMatrix['Abondance Restaurante Bar'].dropna()
a.head()

name
Abondance Restaurante Bar    1.0
Potzocalli                   0.0
Name: Abondance Restaurante Bar, dtype: float64

Assuming that user should rate top 3 recommendations as 2, next 7 as 1 and rest as 0

In [165]:
def find_recommendations(user_id, u_rating):
    
    user_rating = overallRating.loc[user_id].dropna()
    # a = corrMatrix[r_name].dropna()
    simCandidates = pd.Series()

    for i in range(0, len(user_rating.index)):
        # print("Adding sims for " + user_rating.index[i] + "...")
        # Retrieve similar restraunts to this one that user rated
        sims = corrMatrix[user_rating.index[i]].dropna()
        # Now scale its similarity by how well the user rated this restraunt, adding 1 to rating to avoid diminishing affect of 0
        sims = sims.map(lambda x: x * (user_rating[i]+1))
        # Add the score to the list of similarity candidates
        simCandidates = simCandidates.append(sims)

    simCandidates.sort_values(inplace = True, ascending = False)
    # grouping and summing redundant occurances
    simCandidates = simCandidates.groupby(simCandidates.index).sum()
    simCandidates.sort_values(inplace = True, ascending = False)
    
    r_recomm = list(simCandidates.index)
    return r_recomm

def find_rating(user_id, r_name, u_rating):
    r_recomm = find_recommendations(user_id, u_rating)
    if r_name in r_recomm[0:3]:
        return 2
    elif r_name in r_recomm[3:10]:
        return 1
    else:
        return 0

In [167]:
val['pred_ratings'] = val[['userID','name','tot_rating']].apply(lambda x: find_rating(x[0],x[1],x[2]), axis=1)
accuracy_score(val['tot_rating'],val['pred_ratings'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


0.2542955326460481

In [168]:
# manual check
val['err'] = val[['tot_rating','pred_ratings']].apply(lambda x: 1 if (x[0]==x[1]) else 0, axis = 1)
metric = val['err'].sum()/len(val)
print(metric)

0.2542955326460481


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Model-based RS (~49%) is better than our item-based RS (~26%)

### Improvements:
1. We can use Content-based RS or Context-aware recommender systems
2. To improve the results, we can ensemble various techniques and their results!
3. Also, we can choose different cut-offs for different rating in the recommended places.