## SENTIMENT ANALYSIS OF YELP REVIEWS DATASET
#### VISHAL BHARTI

* This notebook pre-processes the Yelp reviews dataset and extracts a subset of 300,000 reviews: the first 100,000 positive, 100,000 neutral and 100,000 negative reviews.

In [1]:
## Import libraries
import pandas as pd
from timeit import default_timer as timer

In [4]:
# Load the full Yelp reviews CSV and report how long the read took.
# yelp_review.csv is expected to be in the current working directory.
start_time = timer()
df = pd.read_csv("yelp_review.csv")
elapsed_seconds = round(timer() - start_time, 2)
print("Finished in : {} seconds".format(elapsed_seconds))

Finished in : 78.69 seconds


In [5]:
# Preview the first five reviews to check which columns were loaded
df.head(n=5)

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,0,0,0
1,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,Small unassuming place that changes their menu...,0,0,0
2,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28,Lester's is located in a beautiful neighborhoo...,0,0,0
3,IXvOzsEMYtiJI0CARmj77Q,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4,2016-05-28,Love coming here. Yes the place always needs t...,0,0,0
4,L_9BTb55X0GDtThi6GlZ6w,bv2nCi5Qv5vroFiqKGopiw,s2I_Ni76bjJNK9yG60iD-Q,4,2016-05-28,Had their chocolate almond croissant and it wa...,0,0,0


In [6]:
# Add a "rating" sentiment label derived from the "stars" column:
#   4 or 5 stars -> "positive", 3 stars -> "neutral", 1 or 2 stars -> "negative".
# An explicit lookup is used instead of defaulting every row to "positive", so a
# missing or out-of-range star value becomes NaN rather than being silently
# counted as a positive review. Such rows match none of the three labels.
STARS_TO_RATING = {
    1: "negative",
    2: "negative",
    3: "neutral",
    4: "positive",
    5: "positive",
}
df["rating"] = df["stars"].map(STARS_TO_RATING)

In [7]:
# Split the reviews into one DataFrame per sentiment label.
# The generator yields the subsets in the same order as the names on the left.
df_pos, df_neg, df_neut = (
    df.loc[df["rating"].eq(label)]
    for label in ("positive", "negative", "neutral")
)

In [8]:
# Stack the leading 100,000 rows of each sentiment class, in the order
# positive, neutral, negative, into a single frame.
df_final = pd.concat(
    [subset.head(100000) for subset in (df_pos, df_neut, df_neg)]
)

# Shrink the frame's memory footprint: store the three rating labels as a
# categorical column and parse the date strings into datetime values.
df_final = df_final.astype({"rating": "category"})
df_final = df_final.assign(date=pd.to_datetime(df_final["date"]))

In [32]:
# Load the Yelp business information CSV (expected in the current working
# directory) and preview its first five rows.
df_buss = pd.read_csv("yelp_business.csv")
df_buss.head(n=5)

Unnamed: 0,business_id,name,neighborhood,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories
0,FYWN1wneV18bWNgQjJ2GNg,"""Dental by Design""",,"""4855 E Warner Rd, Ste B9""",Ahwatukee,AZ,85044,33.33069,-111.978599,4.0,22,1,Dentists;General Dentistry;Health & Medical;Or...
1,He-G7vWjzVUysIKrfNbPUQ,"""Stephen Szabo Salon""",,"""3101 Washington Rd""",McMurray,PA,15317,40.291685,-80.1049,3.0,11,1,Hair Stylists;Hair Salons;Men's Hair Salons;Bl...
2,KQPW8lFf1y5BT2MxiSZ3QA,"""Western Motor Vehicle""",,"""6025 N 27th Ave, Ste 1""",Phoenix,AZ,85017,33.524903,-112.11531,1.5,18,1,Departments of Motor Vehicles;Public Services ...
3,8DShNS-LuFqpEWIp0HxijA,"""Sports Authority""",,"""5000 Arizona Mills Cr, Ste 435""",Tempe,AZ,85282,33.383147,-111.964725,3.0,9,0,Sporting Goods;Shopping
4,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",,"""581 Howe Ave""",Cuyahoga Falls,OH,44221,41.119535,-81.47569,3.5,116,1,American (New);Nightlife;Bars;Sandwiches;Ameri...


In [17]:
# Attach the business information to every sampled review. The columns from
# yelp_business.csv are not used further in this project, so the merge must
# never change which reviews are kept:
#   * how="left" keeps a review even when its business_id has no row in df_buss
#     (the default inner join would silently drop it);
#   * validate="many_to_one" raises pandas.errors.MergeError if df_buss repeats
#     a business_id, which would otherwise silently duplicate reviews.
# Columns present in both frames (e.g. "stars") keep pandas' default "_x"
# (review) and "_y" (business) suffixes.
final_df = df_final.merge(
    df_buss,
    on="business_id",
    how="left",
    validate="many_to_one",
)
# Order the rows by their rating label.
final_df = final_df.sort_values(by="rating")

In [21]:
# Persist the processed reviews as a pickle file in the current working
# directory so they can be reloaded without repeating the steps above.
final_df.to_pickle(path="final_df.pkl")