## Simple Approaches to Recommender Systems

Data source: https://www.kaggle.com/datasets/uciml/restaurant-data-with-consumer-ratings

### Popularity-Based Recommenders

In [1]:
import numpy as np
import pandas as pd

In [5]:
# Load the two CSV tables used in this section: `frame` holds the user ratings
# (userID, placeID, rating, food_rating, service_rating) and `cuisine` maps
# each placeID to a cuisine label (Rcuisine).
frame = pd.read_csv("data/rating_final.csv")
cuisine = pd.read_csv("data/chefmozcuisine.csv")

In [3]:
# Preview the first five rows of the ratings table.
frame.head(n=5)

Unnamed: 0,userID,placeID,rating,food_rating,service_rating
0,U1077,135085,2,2,2
1,U1077,135038,2,2,1
2,U1077,132825,2,2,2
3,U1077,135060,1,2,2
4,U1068,135104,1,1,2


In [6]:
# Preview the first five rows of the place-to-cuisine table.
cuisine.head(n=5)

Unnamed: 0,placeID,Rcuisine
0,135110,Spanish
1,135109,Italian
2,135107,Latin_American
3,135106,Mexican
4,135105,Fast_Food


#### Recommending based on counts

In [10]:
# Count how many ratings each restaurant (placeID) received.
frame['rating'].groupby(frame['placeID']).count()

placeID
132560     4
132561     4
132564     4
132572    15
132583     4
          ..
135088     6
135104     7
135106    10
135108    11
135109     4
Name: rating, Length: 130, dtype: int64

In [11]:
# Same per-place rating counts, kept as a one-column DataFrame indexed by
# placeID so it can be sorted and merged with other tables later.
rating_by_count = frame.groupby('placeID')['rating'].count().to_frame()
rating_by_count

Unnamed: 0_level_0,rating
placeID,Unnamed: 1_level_1
132560,4
132561,4
132564,4
132572,15
132583,4
...,...
135088,6
135104,7
135106,10
135108,11


In [15]:
# Order the places from most-rated to least-rated.
rating_by_count = rating_by_count.sort_values(by='rating', ascending=False)

In [16]:
# Display the counts, now sorted with the most-rated places first.
rating_by_count

Unnamed: 0_level_0,rating
placeID,Unnamed: 1_level_1
135085,36
132825,32
135032,28
135052,25
132834,25
...,...
132766,3
132717,3
135011,3
132668,3


In [21]:
# Top 10 Cuisine
summary = pd.merge(rating_by_count, cuisine, on='placeID').head(10)
summary

Unnamed: 0,placeID,rating,Rcuisine
0,135085,36,Fast_Food
1,132825,32,Mexican
2,135032,28,Cafeteria
3,135032,28,Contemporary
4,135052,25,Bar
5,135052,25,Bar_Pub_Brewery
6,132834,25,Mexican
7,135060,22,Seafood
8,135042,20,Chinese
9,132862,18,International


In [23]:
# Summarise the cuisine labels. Mexican is the most frequently served cuisine
# in the dataset (see `top` / `freq`), so places that serve it are good
# candidates for a popularity-based recommendation.
cuisine.loc[:, 'Rcuisine'].describe()

count         916
unique         59
top       Mexican
freq          239
Name: Rcuisine, dtype: object

#### Recommending based on Correlation

##### Item Based Similarity

In [25]:
# Reload the ratings and cuisine tables and add the restaurant profile table.
frame = pd.read_csv('data/rating_final.csv')
cuisine = pd.read_csv("data/chefmozcuisine.csv")
# The profile file is not valid UTF-8, so an explicit single-byte encoding is
# required. The previous 'mbcs' codec exists only on Windows and raises
# LookupError elsewhere; 'latin-1' is available on every platform and can
# decode any byte sequence.
# NOTE(review): on a Windows machine with a Western locale 'mbcs' behaves like
# cp1252, which differs from latin-1 only for bytes 0x80-0x9F - confirm no
# restaurant names rely on those characters.
geodata = pd.read_csv("data/geoplaces2.csv", encoding='latin-1')

In [26]:
# Preview the first five restaurant profiles.
geodata.head(n=5)

Unnamed: 0,placeID,latitude,longitude,the_geom_meter,name,address,city,state,country,fax,...,alcohol,smoking_area,dress_code,accessibility,price,url,Rambience,franchise,area,other_services
0,134999,18.915421,-99.184871,0101000020957F000088568DE356715AC138C0A525FC46...,Kiku Cuernavaca,Revolucion,Cuernavaca,Morelos,Mexico,?,...,No_Alcohol_Served,none,informal,no_accessibility,medium,kikucuernavaca.com.mx,familiar,f,closed,none
1,132825,22.147392,-100.983092,0101000020957F00001AD016568C4858C1243261274BA5...,puesto de tacos,esquina santos degollado y leon guzman,s.l.p.,s.l.p.,mexico,?,...,No_Alcohol_Served,none,informal,completely,low,?,familiar,f,open,none
2,135106,22.149709,-100.976093,0101000020957F0000649D6F21634858C119AE9BF528A3...,El Rincï¿½n de San Francisco,Universidad 169,San Luis Potosi,San Luis Potosi,Mexico,?,...,Wine-Beer,only at bar,informal,partially,medium,?,familiar,f,open,none
3,132667,23.752697,-99.163359,0101000020957F00005D67BCDDED8157C1222A2DC8D84D...,little pizza Emilio Portes Gil,calle emilio portes gil,victoria,tamaulipas,?,?,...,No_Alcohol_Served,none,informal,completely,low,?,familiar,t,closed,none
4,132613,23.752903,-99.165076,0101000020957F00008EBA2D06DC8157C194E03B7B504E...,carnitas_mata,lic. Emilio portes gil,victoria,Tamaulipas,Mexico,?,...,No_Alcohol_Served,permitted,informal,completely,medium,?,familiar,t,closed,none


In [27]:
# Keep only the columns needed to translate a placeID into a restaurant name.
places = geodata.loc[:, ['placeID', 'name']]
places.head(n=5)

Unnamed: 0,placeID,name
0,134999,Kiku Cuernavaca
1,132825,puesto de tacos
2,135106,El Rincï¿½n de San Francisco
3,132667,little pizza Emilio Portes Gil
4,132613,carnitas_mata


##### Grouping and Ranking Data

In [29]:
# Mean overall rating per place, as a one-column DataFrame indexed by placeID.
rating_avg = frame.groupby('placeID')['rating'].mean().to_frame()
rating_avg.head(n=5)

Unnamed: 0_level_0,rating
placeID,Unnamed: 1_level_1
132560,0.5
132561,0.75
132564,1.25
132572,1.0
132583,1.0


In [31]:
# Attach the number of ratings behind each mean; the grouped Series is aligned
# to rating_avg on its placeID index.
rating_avg['rating_count'] = frame.groupby('placeID')['rating'].count()
rating_avg.head(n=5)

Unnamed: 0_level_0,rating,rating_count
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
132560,0.5,4
132561,0.75,4
132564,1.25,4
132572,1.0,15
132583,1.0,4


In [32]:
# Summary statistics for the per-place mean rating and rating count.
rating_avg.describe()

Unnamed: 0,rating,rating_count
count,130.0,130.0
mean,1.179622,8.930769
std,0.349354,6.124279
min,0.25,3.0
25%,1.0,5.0
50%,1.181818,7.0
75%,1.4,11.0
max,2.0,36.0


In [34]:
# Top place that have been rated
rating_avg.sort_values('rating_count',ascending=False).head()

Unnamed: 0_level_0,rating,rating_count
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
135085,1.333333,36
132825,1.28125,32
135032,1.178571,28
135052,1.28,25
132834,1.0,25


In [38]:
# Add the restaurant names to the rating statistics, then list the five
# most-rated places.
summary = rating_avg.merge(places, on='placeID')
summary.sort_values(by='rating_count', ascending=False).head(n=5)

Unnamed: 0,placeID,rating,rating_count,name
123,135085,1.333333,36,Tortas Locas Hipocampo
31,132825,1.28125,32,puesto de tacos
80,135032,1.178571,28,Cafeteria y Restaurant El Pacifico
98,135052,1.28,25,La Cantina Restaurante
33,132834,1.0,25,Gorditas Doa Gloria


In [39]:
# Look up the cuisine of the most-rated place (placeID 135085).
cuisine.loc[cuisine['placeID'].eq(135085)]

Unnamed: 0,placeID,Rcuisine
44,135085,Fast_Food


##### Preparing Data for Analysis

In [42]:
# User-by-place matrix of overall ratings: one row per userID, one column per
# placeID, NaN where a user did not rate a place.
place_crosstab = frame.pivot_table(values='rating', index='userID', columns='placeID')
place_crosstab

placeID,132560,132561,132564,132572,132583,132584,132594,132608,132609,132613,...,135080,135081,135082,135085,135086,135088,135104,135106,135108,135109
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U1001,,,,,,,,,,,...,,,,0.0,,,,,,
U1002,,,,,,,,,,,...,,,,1.0,,,,1.0,,
U1003,,,,,,,,,,,...,2.0,,,,,,,,,
U1004,,,,,,,,,,,...,,,,,,,,2.0,,
U1005,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U1134,,,,0.0,,,,,,,...,1.0,,,2.0,,,,,,
U1135,,,,,,,,,,,...,,,,0.0,,,,0.0,,
U1136,,,,,,,,,,,...,,,,,,,,,,
U1137,,,,,,,,,,,...,,,,2.0,,,,,,


In [46]:
# Ratings column for placeID 135085, then only the users who actually rated it
# (comparing NaN with 0 is False, so unrated rows drop out).
tortas_rating = place_crosstab.loc[:, 135085]
tortas_rating.loc[tortas_rating.ge(0)]

userID
U1001    0.0
U1002    1.0
U1007    1.0
U1013    1.0
U1016    2.0
U1027    1.0
U1029    1.0
U1032    1.0
U1033    2.0
U1036    2.0
U1045    2.0
U1046    1.0
U1049    0.0
U1056    2.0
U1059    2.0
U1062    0.0
U1077    2.0
U1081    1.0
U1084    2.0
U1086    2.0
U1089    1.0
U1090    2.0
U1092    0.0
U1098    1.0
U1104    2.0
U1106    2.0
U1108    1.0
U1109    2.0
U1113    1.0
U1116    2.0
U1120    0.0
U1122    2.0
U1132    2.0
U1134    2.0
U1135    0.0
U1137    2.0
Name: 135085, dtype: float64

##### Evaluating Similarity Based on Correlation

In [50]:
# Correlate every place's rating column with the ratings of placeID 135085,
# then keep only the places for which a correlation could be computed.
similar_to_tortas = place_crosstab.corrwith(tortas_rating)
corr_tortas = similar_to_tortas.to_frame(name='PearsonR').dropna()

In [51]:
# Display the correlation of each remaining place with placeID 135085.
corr_tortas

Unnamed: 0_level_0,PearsonR
placeID,Unnamed: 1_level_1
132572,-0.428571
132723,0.301511
132754,0.930261
132825,0.700745
132834,0.814823
132856,0.475191
132861,0.5
132862,0.559017
132872,0.840168
132921,0.493013


In [52]:
# Put each place's total rating count next to its correlation so that
# correlations backed by very few ratings can be filtered out.
tortas_core_summary = corr_tortas.join(rating_avg[['rating_count']], how='left')
tortas_core_summary

Unnamed: 0_level_0,PearsonR,rating_count
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
132572,-0.428571,15
132723,0.301511,12
132754,0.930261,13
132825,0.700745,32
132834,0.814823,25
132856,0.475191,14
132861,0.5,7
132862,0.559017,18
132872,0.840168,12
132921,0.493013,17


In [54]:
# Valid recommendations are places that have a rating count greater than 10
tortas_core_summary[tortas_core_summary['rating_count']>=10].sort_values('PearsonR', ascending=False).head(10)

Unnamed: 0_level_0,PearsonR,rating_count
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
135076,1.0,13
135085,1.0,36
135066,1.0,12
132754,0.930261,13
135045,0.912871,13
135062,0.898933,21
135028,0.892218,15
135042,0.881409,20
135046,0.867722,11
132872,0.840168,12


In [55]:
# Note from the course video: a PearsonR of exactly 1.0 is not meaningful here.
# According to the video it appears when only one user reviewed both places and
# gave them the same score; a correlation resting on so few shared reviews says
# nothing about similarity, so those places (135076 and 135066) are left out.
# NOTE(review): Pearson's r is undefined for a single pair of ratings, so these
# 1.0 values more likely come from two shared reviewers - confirm.
# placeID 135085 is the reference place itself and is kept for comparison.
# The IDs below were copied by hand from the filtered table shown above.
places_corr_tortas = pd.DataFrame(
    {'placeID': [135085, 132754, 135045, 135062, 135028, 135042, 135046]}
)
summary = pd.merge(places_corr_tortas, cuisine, on='placeID')
summary  # only 5 rows come back: 2 of the 7 place IDs are not in the cuisine dataset

Unnamed: 0,placeID,Rcuisine
0,135085,Fast_Food
1,132754,Mexican
2,135028,Mexican
3,135042,Chinese
4,135046,Fast_Food


In [56]:
# Attach the restaurant names to the recommended places and their cuisines.
summary.merge(places, on='placeID')

Unnamed: 0,placeID,Rcuisine,name
0,135085,Fast_Food,Tortas Locas Hipocampo
1,132754,Mexican,Cabana Huasteca
2,135028,Mexican,La Virreina
3,135042,Chinese,Restaurant Oriental Express
4,135046,Fast_Food,Restaurante El Reyecito
