### The objective of this notebook is to transform the data into a shape that can be used to build models

### Importing Libraries

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import sklearn.metrics.pairwise as pw
from sklearn.metrics.pairwise import pairwise_distances

### Loading Dataset

In [2]:
# Business attributes to load; passed as `usecols` when reading the business CSV.
cols = [
    'business_id',
    'name',
    'city',
    'state',
    'latitude',
    'longitude',
    'stars',
    'review_count',
    'open',
    'categories',
]

In [None]:
# Review attributes to load; passed as `usecols` when reading the review CSV.
review_cols = [
    'business_id',
    'user_id',
]

In [3]:
# Read the business table, restricted to the columns listed in `cols`.
df_business = pd.read_csv(
    r'C:\Users\Yogesh\Downloads\yelp-csv\yelp_academic_dataset_business.csv',
    usecols=cols,
    low_memory=False,
)

In [None]:
# Read the review table, restricted to the columns listed in `review_cols`.
df_review = pd.read_csv(
    r'C:\Users\Yogesh\Downloads\yelp-csv\yelp_academic_dataset_review.csv',
    usecols=review_cols,
)

In [4]:
# Preview the first five business rows.
df_business.head(n=5)

Unnamed: 0,business_id,categories,latitude,name,review_count,state,open,city,stars,longitude
0,vcNAWiLM4dR7D2nwwJ7nCA,Doctors;Health & Medical,33.499313,"Eric Goldberg, MD",7,AZ,True,Phoenix,3.5,-111.983758
1,JwUE5GmEO-sH1FuwJgKBlQ,Restaurants,43.238893,Pine Cone Restaurant,26,WI,True,De Forest,4.0,-89.335844
2,uGykseHzyS5xAMWoN6YUqA,American (Traditional);Restaurants,43.252267,Deforest Family Restaurant,16,WI,True,De Forest,4.0,-89.353437
3,LRKJF43s9-3jG9Lgx4zODg,Food;Ice Cream & Frozen Yogurt;Fast Food;Resta...,43.251045,Culver's,7,WI,True,De Forest,4.5,-89.374983
4,RgDg-k9S5YD_BaxMckifkg,Chinese;Restaurants,43.240875,Chang Jiang Chinese Kitchen,3,WI,True,De Forest,4.0,-89.343722


### Data Transformation

In [5]:
# Keep only businesses whose `open` flag equals 1, then split the
# ';'-separated `categories` string so each category gets its own row.
df_business = df_business.loc[df_business['open'] == 1]
df_explode = df_business.assign(
    categories=df_business['categories'].str.split(';')
).explode('categories')

In [7]:
# Preview the first ten rows after exploding the categories.
df_explode.head(n=10)

Unnamed: 0,business_id,categories,latitude,name,review_count,state,open,city,stars,longitude
0,vcNAWiLM4dR7D2nwwJ7nCA,Doctors,33.499313,"Eric Goldberg, MD",7,AZ,True,Phoenix,3.5,-111.983758
0,vcNAWiLM4dR7D2nwwJ7nCA,Health & Medical,33.499313,"Eric Goldberg, MD",7,AZ,True,Phoenix,3.5,-111.983758
1,JwUE5GmEO-sH1FuwJgKBlQ,Restaurants,43.238893,Pine Cone Restaurant,26,WI,True,De Forest,4.0,-89.335844
2,uGykseHzyS5xAMWoN6YUqA,American (Traditional),43.252267,Deforest Family Restaurant,16,WI,True,De Forest,4.0,-89.353437
2,uGykseHzyS5xAMWoN6YUqA,Restaurants,43.252267,Deforest Family Restaurant,16,WI,True,De Forest,4.0,-89.353437
3,LRKJF43s9-3jG9Lgx4zODg,Food,43.251045,Culver's,7,WI,True,De Forest,4.5,-89.374983
3,LRKJF43s9-3jG9Lgx4zODg,Ice Cream & Frozen Yogurt,43.251045,Culver's,7,WI,True,De Forest,4.5,-89.374983
3,LRKJF43s9-3jG9Lgx4zODg,Fast Food,43.251045,Culver's,7,WI,True,De Forest,4.5,-89.374983
3,LRKJF43s9-3jG9Lgx4zODg,Restaurants,43.251045,Culver's,7,WI,True,De Forest,4.5,-89.374983
4,RgDg-k9S5YD_BaxMckifkg,Chinese,43.240875,Chang Jiang Chinese Kitchen,3,WI,True,De Forest,4.0,-89.343722


In [8]:
# Frequency of each individual category across the exploded rows.
df_explode['categories'].value_counts()

Restaurants           11347
Shopping               5864
Food                   4428
Beauty & Spas          3214
Nightlife              2309
                      ...  
Egyptian                  1
Singaporean               1
Brewing Supplies          1
Meditation Centers        1
Firewood                  1
Name: categories, Length: 714, dtype: int64

#### Keeping only the restaurant records

In [33]:
# Rows whose exploded category is exactly 'Restaurants'.
df_restaurants = df_explode.loc[df_explode['categories'] == 'Restaurants']

#### Group by state to check the number of restaurants and the number of reviews in every state

In [35]:
# Per state: count of restaurant rows and the sum of their review counts.
df_restaurants.groupby('state').agg(
    {
        'state': ['count'],
        'review_count': ['sum'],
    }
)

Unnamed: 0_level_0,state,review_count
Unnamed: 0_level_1,count,sum
state,Unnamed: 1_level_2,Unnamed: 2_level_2
AZ,5773,305943
EDH,884,13180
ELN,2,10
FIF,1,3
GA,1,16
KHL,1,7
MLN,47,407
NV,3692,344744
ON,207,1932
WI,738,25901


In [36]:
# Restrict to restaurants whose state is 'WI' and show the (rows, columns) shape.
df_restaurants_WI = df_restaurants.loc[df_restaurants['state'] == 'WI']
df_restaurants_WI.shape

In [38]:
## Dropping columns which are not required
# Reassign the result instead of using inplace=True: df_restaurants_WI was
# produced by boolean filtering of another frame, so an in-place drop makes
# pandas emit SettingWithCopyWarning. A non-inplace drop returns a new frame
# and avoids the warning.
df_restaurants_WI = df_restaurants_WI.drop(
    columns=['city', 'state', 'latitude', 'longitude',
             'review_count', 'open', 'categories']
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [39]:
# Preview the first five rows of the trimmed frame.
df_restaurants_WI.head(n=5)

Unnamed: 0,business_id,name,stars
1,JwUE5GmEO-sH1FuwJgKBlQ,Pine Cone Restaurant,4.0
2,uGykseHzyS5xAMWoN6YUqA,Deforest Family Restaurant,4.0
3,LRKJF43s9-3jG9Lgx4zODg,Culver's,4.5
4,RgDg-k9S5YD_BaxMckifkg,Chang Jiang Chinese Kitchen,4.0
8,rdAdANPNOcvUtoFgcaY9KA,Green Lantern Restaurant,3.5


### Merging Review and Business Datasets

In [55]:
# Inner join on business_id: keeps only reviews whose business appears in
# df_restaurants_WI, then shows the (rows, columns) shape of the result.
df_merged = df_review.merge(df_restaurants_WI, how='inner', on='business_id')
df_merged.shape

In [57]:
# Preview the first five merged rows.
df_merged.head(n=5)

Unnamed: 0,user_id,business_id,name,stars
0,zvNimI98mrmhgNOOrzOiGg,JwUE5GmEO-sH1FuwJgKBlQ,Pine Cone Restaurant,4.0
1,p4ySEi8PEli0auZGBsy6gA,JwUE5GmEO-sH1FuwJgKBlQ,Pine Cone Restaurant,4.0
2,ZYaumz29bl9qHpu-KVtMGA,JwUE5GmEO-sH1FuwJgKBlQ,Pine Cone Restaurant,4.0
3,SvS7NXWG2B2kFoaHaWdGfg,JwUE5GmEO-sH1FuwJgKBlQ,Pine Cone Restaurant,4.0
4,qOYI9O0ecMJ9VaqcM9phNw,JwUE5GmEO-sH1FuwJgKBlQ,Pine Cone Restaurant,4.0


In [58]:
## Number of distinct restaurants (business_id values) in the merged data
df_merged['business_id'].nunique()

728

In [59]:
## Number of distinct users (user_id values) in the merged data
df_merged['user_id'].nunique()

7404

### Exporting the transformed Dataset

In [60]:
# Persist the merged frame as CSV. index=True is the pandas default, so the
# row index is written as the first (unnamed) column of the file.
df_merged.to_csv(
    path_or_buf=r'C:\Users\Yogesh\Downloads\yelp-csv\merged.csv',
    index=True,
)