![title](Chapter-1.png "Header")
___
# Chapter 1 - Simple Approaches to Recommender Systems
## Segment 3 - Making Recommendations Based on Correlation

In [6]:
import numpy as np
import pandas as pd
import chardet

In [7]:
# Sniff the character encoding of the places file from its first
# 100,000 bytes and print chardet's verdict.
with open("geoplaces2.csv", "rb") as rawdata:
    head_bytes = rawdata.read(100000)

result = chardet.detect(head_bytes)
print(result)

{'encoding': 'ISO-8859-1', 'confidence': 0.7292665071770335, 'language': ''}


These datasets are hosted on: https://archive.ics.uci.edu/ml/datasets/Restaurant+%26+consumer+data

They were originally published by: Blanca Vargas-Govea, Juan Gabriel González-Serna, Rafael Ponce-Medellín. Effects of relevant contextual features in the performance of a restaurant recommender system. In RecSys’11: Workshop on Context Aware Recommender Systems (CARS-2011), Chicago, IL, USA, October 23, 2011.

In [8]:
# Load the three source tables. Only the places file is read with an
# explicit encoding.
frame = pd.read_csv("rating_final.csv")
cuisine = pd.read_csv("chefmozcuisine.csv")
geodata = pd.read_csv("geoplaces2.csv", encoding="ISO-8859-1")

In [9]:
# (rows, columns) of the places table.
geodata.shape

(130, 21)

In [10]:
# Transpose the first five rows so each column of the wide table reads
# as a row.
geodata.head().transpose()

Unnamed: 0,0,1,2,3,4
placeID,134999,132825,135106,132667,132613
latitude,18.9154,22.1474,22.1497,23.7527,23.7529
longitude,-99.1849,-100.983,-100.976,-99.1634,-99.1651
the_geom_meter,0101000020957F000088568DE356715AC138C0A525FC46...,0101000020957F00001AD016568C4858C1243261274BA5...,0101000020957F0000649D6F21634858C119AE9BF528A3...,0101000020957F00005D67BCDDED8157C1222A2DC8D84D...,0101000020957F00008EBA2D06DC8157C194E03B7B504E...
name,Kiku Cuernavaca,puesto de tacos,El Rincón de San Francisco,little pizza Emilio Portes Gil,carnitas_mata
address,Revolucion,esquina santos degollado y leon guzman,Universidad 169,calle emilio portes gil,lic. Emilio portes gil
city,Cuernavaca,s.l.p.,San Luis Potosi,victoria,victoria
state,Morelos,s.l.p.,San Luis Potosi,tamaulipas,Tamaulipas
country,Mexico,mexico,Mexico,?,Mexico
fax,?,?,?,?,?


In [11]:
# Keep only the columns needed to look up a place's name by its ID.
places = geodata.loc[:, ['placeID', 'name']]
places.head()

Unnamed: 0,placeID,name
0,134999,Kiku Cuernavaca
1,132825,puesto de tacos
2,135106,El Rincón de San Francisco
3,132667,little pizza Emilio Portes Gil
4,132613,carnitas_mata


In [14]:
#cuisine.head()

## Grouping and Ranking Data

In [16]:
# Mean rating per place, held as a one-column frame indexed by placeID.
rating = frame.groupby('placeID')['rating'].mean().to_frame()
# Show the five places with the highest mean rating.
rating.sort_values(by='rating', ascending=False).head()

Unnamed: 0_level_0,rating
placeID,Unnamed: 1_level_1
132955,2.0
135034,2.0
134986,2.0
132922,1.833333
132755,1.8


In [17]:
# Number of ratings each place received; the assignment aligns on the
# placeID index shared with `rating`.
rating['rating_count'] = frame.groupby('placeID')['rating'].count()
rating.head()

Unnamed: 0_level_0,rating,rating_count
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
132560,0.5,4
132561,0.75,4
132564,1.25,4
132572,1.0,15
132583,1.0,4


In [18]:
# Summary statistics for the per-place mean rating and rating count.
rating.describe()

Unnamed: 0,rating,rating_count
count,130.0,130.0
mean,1.179622,8.930769
std,0.349354,6.124279
min,0.25,3.0
25%,1.0,5.0
50%,1.181818,7.0
75%,1.4,11.0
max,2.0,36.0


In [21]:
# The five places that received the most ratings.
rating.sort_values(by='rating_count', ascending=False).head()

Unnamed: 0_level_0,rating,rating_count
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
135085,1.333333,36
132825,1.28125,32
135032,1.178571,28
135052,1.28,25
132834,1.0,25


In [22]:
# Name of the most-rated place.
places.loc[places['placeID'] == 135085]

Unnamed: 0,placeID,name
121,135085,Tortas Locas Hipocampo


In [23]:
# Cuisine listed for the most-rated place.
cuisine.loc[cuisine['placeID'] == 135085]

Unnamed: 0,placeID,Rcuisine
44,135085,Fast_Food


## Preparing Data For Analysis

In [24]:
# User-by-place matrix of ratings (pivot_table's default aggregation is
# the mean). User/place pairs with no rating are NaN.
places_crosstab = frame.pivot_table(values='rating', index='userID', columns='placeID')
places_crosstab.head(10).T

userID,U1001,U1002,U1003,U1004,U1005,U1006,U1007,U1008,U1009,U1010
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
132560,,,,,,,,,,
132561,,,,,,,,,,
132564,,,,,,,,,,
132572,,,,,,1.0,1.0,,,
132583,,,,,,,,,,
132584,,,,,,,,,,
132594,,,,,,,,,,
132608,,,,,,,,,,
132609,,,,,,,,,,
132613,,,,,,,,,,


In [26]:
# Column of ratings for place 135085, one entry per user.
Tortas_ratings = places_crosstab[135085]
# Show only users with a rating; NaN entries fail the >= 0 comparison.
Tortas_ratings.loc[Tortas_ratings.ge(0)]

userID
U1001    0.0
U1002    1.0
U1007    1.0
U1013    1.0
U1016    2.0
U1027    1.0
U1029    1.0
U1032    1.0
U1033    2.0
U1036    2.0
U1045    2.0
U1046    1.0
U1049    0.0
U1056    2.0
U1059    2.0
U1062    0.0
U1077    2.0
U1081    1.0
U1084    2.0
U1086    2.0
U1089    1.0
U1090    2.0
U1092    0.0
U1098    1.0
U1104    2.0
U1106    2.0
U1108    1.0
U1109    2.0
U1113    1.0
U1116    2.0
U1120    0.0
U1122    2.0
U1132    2.0
U1134    2.0
U1135    0.0
U1137    2.0
Name: 135085, dtype: float64

## Evaluating Similarity Based on Correlation

In [27]:
# Correlate every place's rating column with the ratings of place 135085.
similar_to_Tortas = places_crosstab.corrwith(Tortas_ratings)

# Wrap the result in a one-column frame and discard places whose
# correlation came out as NaN.
corr_Tortas = pd.DataFrame(similar_to_Tortas, columns=['PearsonR']).dropna()
corr_Tortas.head()

  c = cov(x, y, rowvar)
  c *= 1. / np.float64(fact)


Unnamed: 0_level_0,PearsonR
placeID,Unnamed: 1_level_1
132572,-0.428571
132723,0.301511
132754,0.930261
132825,0.700745
132834,0.814823


In [31]:
# Attach each place's rating count to its correlation, matching on the
# placeID index (left join, which is join's default).
Tortas_corr_summary = corr_Tortas.join(rating['rating_count'], how='left')
Tortas_corr_summary.head()

Unnamed: 0_level_0,PearsonR,rating_count
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
132572,-0.428571,15
132723,0.301511,12
132754,0.930261,13
132825,0.700745,32
132834,0.814823,25


In [33]:
# Keep places with at least 10 ratings, then list the ten with the
# highest correlation.
(
    Tortas_corr_summary
    .loc[Tortas_corr_summary['rating_count'] >= 10]
    .sort_values(by='PearsonR', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,PearsonR,rating_count
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
135076,1.0,13
135085,1.0,36
135066,1.0,12
132754,0.930261,13
135045,0.912871,13
135062,0.898933,21
135028,0.892218,15
135042,0.881409,20
135046,0.867722,11
132872,0.840168,12


In [34]:
# Hand-entered place IDs whose cuisines we want to look up.
places_corr_Tortas = pd.DataFrame(
    {'placeID': [135085, 132754, 135045, 135062, 135028, 135042, 135046]},
    index=np.arange(7),
)
# merge defaults to an inner join, so IDs with no row in `cuisine` are
# dropped from the result.
summary = places_corr_Tortas.merge(cuisine, on='placeID')
summary

Unnamed: 0,placeID,Rcuisine
0,135085,Fast_Food
1,132754,Mexican
2,135028,Mexican
3,135042,Chinese
4,135046,Fast_Food


In [35]:
# Name of place 135046.
places.loc[places['placeID'] == 135046]

Unnamed: 0,placeID,name
42,135046,Restaurante El Reyecito


In [37]:
# Count, number of distinct values, and most frequent value of the
# cuisine labels.
cuisine['Rcuisine'].describe()

count         916
unique         59
top       Mexican
freq          239
Name: Rcuisine, dtype: object