### Combine all the extracted yelp data into one file: FILE_1.csv, ensure Business_IDs are unique

In [1]:
import csv

In [2]:
types = ['chinese', 'italian', 'indian','mexican','american','sushi']

In [3]:
# combine all the file into FILE_1, ensure restaurant business_ids are unique
business_id_set = set()
line_count = 0
for t in types:
    print(t)
    file_name = 'yelp_'+t+'.csv'
    new_file = 'FILE_1.csv'
    with open(file_name) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        with open(new_file,'a') as out:
            csv_out = csv.writer(out)
            for row in csv_reader:
                # each row: ['Business_ID', 'Name', 'Address', 'Coordinates', 'Num_of_Reviews', 'Rating', 'Zip_Code']
                if line_count == 0:
                    csv_out.writerow(['Business_ID', 'Cuisine', 'Rating', 'Num_of_Reviews'])
                    line_count += 1
                else:
                    if row[0] and row[0] not in business_id_set:
                        csv_out.writerow([row[0], t, row[5], row[4]])
                        business_id_set.add(row[0])
                        line_count += 1
                    else:
                        pass

chinese
italian
indian
mexican
american
sushi


In [4]:
print(line_count-1)
assert line_count-1 == len(business_id_set)

5578


### Prepare training data into FILE_2.csv, also edit FILE_1.csv by filtering out data that is in FILE2.csv 

In [5]:
import pandas as pd
# sort all restaurants based on rating and num_of_reviews
# pick top 100 and last 100 as label 1 and 0
# using pandas to accelerate the process
df = pd.read_csv('FILE_1.csv', delimiter = ',')

In [6]:
df.head()

Unnamed: 0,Business_ID,Cuisine,Rating,Num_of_Reviews
0,wobICp6lQP7Y6dfnJ_4Ujw,chinese,2.5,439
1,lynQoI3w_pzYfHGeuUU-Qg,chinese,4.5,650
2,_XLLSXPIjgoZQtwU3RCcQA,chinese,4.0,274
3,0CjK3esfpFcxIopebzjFxA,chinese,4.0,5672
4,X8ZS-dgiMIJvhwf9SaDnjw,chinese,4.5,1152


In [7]:
len(df)

5578

In [8]:
df = df.convert_objects(convert_numeric=True)

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  """Entry point for launching an IPython kernel.


In [9]:
df.dtypes

Business_ID        object
Cuisine            object
Rating            float64
Num_of_Reviews    float64
dtype: object

In [10]:
df.describe()

Unnamed: 0,Rating,Num_of_Reviews
count,5577.0,5577.0
mean,3.886408,233.096468
std,0.523872,383.432258
min,1.0,1.0
25%,3.5,39.0
50%,4.0,119.0
75%,4.0,275.0
max,5.0,9134.0


In [11]:
like = df.nlargest(100,['Rating','Num_of_Reviews'])

In [12]:
# substract like from dataframe 
testing = pd.concat([df, like, like]).drop_duplicates(keep=False)

In [13]:
len(testing)

5478

In [14]:
like['Recommended'] = 1

In [15]:
like.describe()

Unnamed: 0,Rating,Num_of_Reviews,Recommended
count,100.0,100.0,100.0
mean,5.0,43.51,1.0
std,0.0,81.982321,0.0
min,5.0,4.0,1.0
25%,5.0,8.0,1.0
50%,5.0,17.5,1.0
75%,5.0,40.5,1.0
max,5.0,671.0,1.0


In [16]:
like.head()

Unnamed: 0,Business_ID,Cuisine,Rating,Num_of_Reviews,Recommended
3621,MNNpsRXe7Wt5m3Jla9Wk0w,mexican,5.0,671.0,1
2936,VVe74-TKuROBvyS1XgVmvw,mexican,5.0,330.0,1
2322,UtknyDJ_DPut-FKRqs0wpQ,indian,5.0,245.0,1
4657,AFt1Qcec4_JNr6PWpkRYyw,sushi,5.0,204.0,1
2302,3rZwXDjkyDLAHNSdPUI5WA,indian,5.0,166.0,1


In [17]:
not_like = df.nsmallest(100,['Rating','Num_of_Reviews'])

In [18]:
testing = pd.concat([testing, not_like, not_like]).drop_duplicates(keep=False)

In [19]:
len(testing)

5378

In [20]:
not_like['Recommended'] = 0

In [21]:
not_like.describe()

Unnamed: 0,Rating,Num_of_Reviews,Recommended
count,100.0,100.0,100.0
mean,2.17,17.2,0.0
std,0.523296,19.626821,0.0
min,1.0,1.0,0.0
25%,2.0,3.0,0.0
50%,2.5,9.5,0.0
75%,2.5,23.0,0.0
max,2.5,82.0,0.0


In [22]:
not_like.head()

Unnamed: 0,Business_ID,Cuisine,Rating,Num_of_Reviews,Recommended
196,rHDfGuLVfI6WO4kyFIDwRw,chinese,1.0,1.0,0
2399,dgz6m-9WZhahheWVguwfBw,indian,2.0,1.0,0
2515,Hu-HQXpW1Dj0FxGsU8iLkA,indian,1.0,1.0,0
2524,2wkDxFATkqvhAojWtRdrQg,indian,1.0,1.0,0
2538,e_Ee9lzCrTTjghDepvbM9Q,indian,2.0,1.0,0


In [23]:
training = like.append(not_like, ignore_index=True)

In [24]:
training.head()

Unnamed: 0,Business_ID,Cuisine,Rating,Num_of_Reviews,Recommended
0,MNNpsRXe7Wt5m3Jla9Wk0w,mexican,5.0,671.0,1
1,VVe74-TKuROBvyS1XgVmvw,mexican,5.0,330.0,1
2,UtknyDJ_DPut-FKRqs0wpQ,indian,5.0,245.0,1
3,AFt1Qcec4_JNr6PWpkRYyw,sushi,5.0,204.0,1
4,3rZwXDjkyDLAHNSdPUI5WA,indian,5.0,166.0,1


In [25]:
training.describe()

Unnamed: 0,Rating,Num_of_Reviews,Recommended
count,200.0,200.0,200.0
mean,3.585,30.355,0.5
std,1.465782,60.903408,0.501255
min,1.0,1.0,0.0
25%,2.5,5.0,0.0
50%,3.75,13.5,0.5
75%,5.0,35.25,1.0
max,5.0,671.0,1.0


In [26]:
testing.describe()

Unnamed: 0,Rating,Num_of_Reviews
count,5377.0,5377.0
mean,3.897619,240.63753
std,0.449017,388.286558
min,2.0,1.0
25%,3.5,43.0
50%,4.0,126.0
75%,4.0,283.0
max,5.0,9134.0


In [27]:
#save data to csv
training.to_csv("FILE_2.csv",index=False)
testing.to_csv("FILE_1.csv", index=False)