### ***Contributed By:-***

Divyanshi Bhojak

Tarushi Jat



### **Notebook II: Feature Extraction & Preprocessing of Yelp Dataset**

****

# **1. Necessary Library Import**

In [None]:
import numpy as np 
import pandas as pd
import nltk
nltk.download('vader_lexicon')
import re;
nltk.download('brown')
nltk.download('punkt')
from textblob import TextBlob
import warnings 

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# **2. Data Loading**

#  **Dataset: *YELP* Data**    

The Yelp dataset is a subset of Yelp's businesses, reviews, and user data, made available for personal, educational, and academic purposes.

We use the following data files from the Yelp data directory:


*   **metadata** file consists of attributes such as User id, Product id, Rating, Label and Date.

*   **Product Mapping file** consists of the attributes Product name and Product id.

*   **Review_Content** file consists of attributes such as User_id, Product_id, Date and Review.







2.1 Meta_data file

In [None]:
# Load the tab-separated metadata dump and name its five columns at read time.
# The previous version read with header=0 and then overwrote every column
# name, which discards the first line of the file whenever that line is data
# rather than a header.
# NOTE(review): assumes the raw YelpNYC metadata file has no header line --
# confirm against the file before relying on the extra first row.
metadata = pd.read_csv(
    '/content/drive/MyDrive/Projects/Data/YelpNYC/metadata',
    sep='\t',
    header=None,
    names=['User_Id', 'Prod_Id', 'Rating', 'Label', 'Date'],
)

 2.2 Product_Mapping file

In [None]:
# Load the tab-separated product-name -> product-id mapping and name its two
# columns at read time.
# The previous version read with header=0 and then overwrote both column
# names, which discards the first mapping line whenever that line is data
# rather than a header; every review of that product is then lost in the
# later inner merge on Prod_Id.
# NOTE(review): assumes the raw productIdMapping file has no header line --
# confirm against the file.
ProdIdMap = pd.read_csv(
    "/content/drive/MyDrive/Projects/Data/YelpNYC/productIdMapping",
    sep='\t',
    header=None,
    names=['Product_Name', 'Prod_Id'],
)

 2.3 Review_Content file

In [None]:
# Load the tab-separated review text dump and name its four columns at read
# time.
# The previous version read with header=0 and then overwrote every column
# name, which discards the first review whenever line 1 of the file is data
# rather than a header.
# NOTE(review): assumes the raw reviewContent file has no header line --
# confirm against the file.
review = pd.read_csv(
    "/content/drive/MyDrive/Projects/Data/YelpNYC/reviewContent",
    sep='\t',
    header=None,
    names=['User_Id', 'Prod_Id', 'Date', 'Review'],
)

# 3.**Data Pre-Processing**

3.1 Data Merging

In [None]:
# Attach rating/label metadata to every review, matching on the
# (user, product) pair. It is a left join, so every review row is kept.
# Both frames also carry a Date column that is not part of the key, so the
# result holds it twice, as Date_x (from review) and Date_y (from metadata).
review_columns = ['User_Id', 'Prod_Id']
merge = pd.merge(review, metadata, how='left', on=review_columns)

In [None]:
# Add the product name for each review (inner join on Prod_Id) and discard
# the duplicate date column that came from the metadata side.
yelp_load = merge.merge(ProdIdMap, on='Prod_Id').drop(columns='Date_y')
# Preview the first row of the combined table.
yelp_load.head(1)

Unnamed: 0,User_Id,Prod_Id,Date_x,Review,Rating,Label,Product_Name
0,1133,1,2013-12-21,My wife and I took my parents there for lunch ...,1.0,-1,Peppino’s


3.2 Calculating Average Rating per Product Basis

In [None]:
# Mean rating per product name, as a two-column frame:
# Product_Name | Avg_Prod_Rating.
avg = (
    yelp_load.groupby('Product_Name')['Rating']
    .mean()
    .rename('Avg_Prod_Rating')  # name the value column before flattening
    .reset_index()              # move Product_Name from the index to a column
)

In [None]:
# Preview the first row of the per-product average table.
avg.head(1)

Unnamed: 0,Product_Name,Avg_Prod_Rating
0,10 Devoe,4.863636


3.2.1 Merging Average Product Rating Column to "yelp_load.csv" file

In [None]:
# Copy each product's average rating onto all of its review rows
# (inner join keyed on Product_Name).
yelp_load = pd.merge(yelp_load, avg, on='Product_Name')
# Preview the first row with the new Avg_Prod_Rating column.
yelp_load.head(1)

Unnamed: 0,User_Id,Prod_Id,Date_x,Review,Rating,Label,Product_Name,Avg_Prod_Rating
0,1133,1,2013-12-21,My wife and I took my parents there for lunch ...,1.0,-1,Peppino’s,4.328063


3.3 Calculating Average Rating Per User Basis

In [None]:
# Mean rating per user, as a two-column frame: User_Id | Avg_user_rating.
# This rebinds `avg`, which previously held the per-product averages.
avg = (
    yelp_load.groupby('User_Id')['Rating']
    .mean()
    .rename('Avg_user_rating')  # name the value column before flattening
    .reset_index()              # move User_Id from the index to a column
)
# Preview the first row of the per-user average table.
avg.head(1)

Unnamed: 0,User_Id,Avg_user_rating
0,923,4.473684


3.3.1 Merging Average User Rating Column to "yelp_load.csv" file

In [None]:
# Copy each user's average rating onto all of that user's review rows
# (inner join keyed on User_Id).
yelp_load = pd.merge(yelp_load, avg, on='User_Id')
# Preview the first row with the new Avg_user_rating column.
yelp_load.head(1)

Unnamed: 0,User_Id,Prod_Id,Date_x,Review,Rating,Label,Product_Name,Avg_Prod_Rating,Avg_user_rating
0,1133,1,2013-12-21,My wife and I took my parents there for lunch ...,1.0,-1,Peppino’s,4.328063,1.0


3.4 Calculating Length of Reviews

In [None]:
# Review length as returned by len() on each Review value.
review_text = yelp_load['Review']
yelp_load['Review_Len'] = review_text.map(len)
# Preview the first row with the new Review_Len column.
yelp_load.head(1)

Unnamed: 0,User_Id,Prod_Id,Date_x,Review,Rating,Label,Product_Name,Avg_Prod_Rating,Avg_user_rating,Review_Len
0,1133,1,2013-12-21,My wife and I took my parents there for lunch ...,1.0,-1,Peppino’s,4.328063,1.0,553


3.5 Calculating the total number of reviews given by each user

In [None]:
# Number of reviews per user, counted as the non-null Label entries in each
# User_Id group. Result is a two-column frame: User_Id | user_total_reviews.
user_total_reviews = (
    yelp_load.groupby('User_Id')['Label']
    .count()
    .rename('user_total_reviews')  # name the value column before flattening
    .reset_index()                 # move User_Id from the index to a column
)

3.5.1 Merging the new column to "yelp_load.csv" file

In [None]:
# Copy each user's review count onto that user's review rows.
# No key is given, so pandas joins on the columns the two frames share;
# at this point that is User_Id only. The join type is a full outer join.
yelp_load = pd.merge(left=yelp_load, right=user_total_reviews, how='outer')
# Preview the first row with the new user_total_reviews column.
yelp_load.head(1)

Unnamed: 0,User_Id,Prod_Id,Date_x,Review,Rating,Label,Product_Name,Avg_Prod_Rating,Avg_user_rating,Review_Len,user_total_reviews
0,1133,1,2013-12-21,My wife and I took my parents there for lunch ...,1.0,-1,Peppino’s,4.328063,1.0,553,1


In [None]:
# Remove every row that has a missing value in any column and keep the
# result. dropna() returns a new frame, so calling it without assigning the
# result only displays the filtered view and leaves yelp_load unchanged.
# The index is rebuilt as 0..n-1 because the later sentiment step joins
# frames by row label and would misalign if dropped rows left gaps.
yelp_load = yelp_load.dropna().reset_index(drop=True)
# Display the cleaned table, as the bare expression did before.
yelp_load

Unnamed: 0,User_Id,Prod_Id,Date_x,Review,Rating,Label,Product_Name,Avg_Prod_Rating,Avg_user_rating,Review_Len,user_total_reviews
0,1133,1,2013-12-21,My wife and I took my parents there for lunch ...,1.0,-1,Peppino’s,4.328063,1.00,553,1
1,1134,1,2013-11-25,Got a margarita pizza. First off let me say ve...,1.0,-1,Peppino’s,4.328063,1.00,305,1
2,1135,1,2014-07-24,The food is average pizzeria and not cheap. A...,2.0,-1,Peppino’s,4.328063,3.25,157,8
3,1135,524,2013-10-20,Recent visit was very good. The servers don't ...,4.0,-1,Mighty Quinn’s Barbeque,4.135281,3.25,434,8
4,1135,542,2013-08-15,Going good!,4.0,-1,Lombardi’s Pizza,3.944656,3.25,11,8
...,...,...,...,...,...,...,...,...,...,...,...
358742,161143,349,2014-02-19,get the fried brussel sprouts. get the emily p...,5.0,1,Emily,4.185185,5.00,329,1
358743,161144,349,2014-02-11,The food was perfect. The wine was perfect. Th...,5.0,1,Emily,4.185185,5.00,411,1
358744,161145,349,2014-02-09,Came here for Sunday brunch. Everything we tas...,5.0,1,Emily,4.185185,5.00,252,1
358745,161146,349,2014-02-06,"I'm very spoiled with Pizza. Really, I have tr...",5.0,1,Emily,4.185185,5.00,1441,1


3.6 Extracting Sentiments from Reviews

In [None]:
def nltk_sentiment(sentence, analyzer=None):
    """Return the VADER polarity scores for one piece of text.

    Args:
        sentence: Text to score.
        analyzer: Optional object exposing ``polarity_scores(text)``. When
            omitted, a single ``SentimentIntensityAnalyzer`` is created on
            first use and reused by every later call.

    Returns:
        Whatever ``analyzer.polarity_scores(sentence)`` returns.
    """
    if analyzer is None:
        # Building an analyzer per call is wasted work when this function is
        # applied to every review, so the default instance is cached on the
        # function object after the first call.
        analyzer = getattr(nltk_sentiment, "_shared_analyzer", None)
        if analyzer is None:
            from nltk.sentiment.vader import SentimentIntensityAnalyzer

            analyzer = SentimentIntensityAnalyzer()
            nltk_sentiment._shared_analyzer = analyzer
    return analyzer.polarity_scores(sentence)

In [None]:
# Score every review with nltk_sentiment, in row order.
review_sentiment = yelp_load['Review']
# One score mapping per review, collected into a plain list.
nltk_result = review_sentiment.map(nltk_sentiment).tolist()
# One row per review, one column per score key, default 0..n-1 index.
result = pd.DataFrame(nltk_result)

3.6.1 Merging Sentiments with "yelp_load.csv" file

In [None]:
# Review text side by side with its scores, joined by row label.
# Nothing below uses it any more; it is kept as a lookup table for inspection.
nltk_r = review_sentiment.to_frame().join(result)

# Attach the score columns to yelp_load by row label.
# The previous yelp_load.merge(nltk_r) matched on the shared Review text
# column, so any two reviews with identical text matched each other and
# produced extra, duplicated rows. Joining on the index keeps exactly one
# output row per review.
if len(result) != len(yelp_load):
    raise ValueError(
        "sentiment scores and yelp_load have different row counts; "
        "re-run the scoring cell before merging"
    )
yelp_load = yelp_load.join(result)
# Preview the first row with the new sentiment columns.
yelp_load.head(1)

Unnamed: 0,User_Id,Prod_Id,Date_x,Review,Rating,Label,Product_Name,Avg_Prod_Rating,Avg_user_rating,Review_Len,user_total_reviews,neg,neu,pos,compound
0,1133,1,2013-12-21,My wife and I took my parents there for lunch ...,1.0,-1,Peppino’s,4.328063,1.0,553,1,0.106,0.722,0.172,0.8431


3.7 Count Regex Pattern from Review Column

In [None]:
def count_regex(pattern, row):
    """Return how many non-overlapping matches of `pattern` occur in `row`."""
    return sum(1 for _ in re.finditer(pattern, row))

# Number of all-uppercase words (two or more A-Z letters) in each review.
count_capital_words = yelp_load['Review'].map(
    lambda text: count_regex(r'\b[A-Z]{2,}\b', text)
)
yelp_load['number_Cap_Words'] = count_capital_words

# Number of standalone digit runs (two or more digits) in each review.
count_digit_words = yelp_load['Review'].map(
    lambda text: count_regex(r'\b[0-9]{2,}\b', text)
)
yelp_load['number_digit_Words'] = count_digit_words

# Preview the first row with both new count columns.
yelp_load.head(1)

Unnamed: 0,User_Id,Prod_Id,Date_x,Review,Rating,Label,Product_Name,Avg_Prod_Rating,Avg_user_rating,Review_Len,user_total_reviews,neg,neu,pos,compound,number_Cap_Words,number_digit_Words
0,1133,1,2013-12-21,My wife and I took my parents there for lunch ...,1.0,-1,Peppino’s,4.328063,1.0,553,1,0.106,0.722,0.172,0.8431,0,0


3.8 Counting Noun Phrases in Review Column

In [None]:
def identify_noun_count(sentence):
    """Return the number of noun phrases TextBlob extracts from `sentence`."""
    return len(TextBlob(sentence).noun_phrases)

# Noun-phrase count for every review, stored as a new column.
noun_counts = yelp_load['Review'].map(identify_noun_count)
yelp_load['noun_count'] = noun_counts
# Preview the first row with the new noun_count column.
yelp_load.head(1)

Unnamed: 0,User_Id,Prod_Id,Date_x,Review,Rating,Label,Product_Name,Avg_Prod_Rating,Avg_user_rating,Review_Len,user_total_reviews,neg,neu,pos,compound,number_Cap_Words,number_digit_Words,noun_count
0,1133,1,2013-12-21,My wife and I took my parents there for lunch ...,1.0,-1,Peppino’s,4.328063,1.0,553,1,0.106,0.722,0.172,0.8431,0,0,6


3.9 Saving the "yelp_load.csv" file to drive

In [None]:
# Write the finished feature table to yelp_load.csv in the current working
# directory. No index argument is passed, so pandas' default applies and the
# row index is written as the first, unnamed column of the file.
yelp_load.to_csv('yelp_load.csv')
# IPython shell escape (not plain Python): copy the CSV into the Drive folder.
# NOTE(review): the target is the relative path "drive/My Drive/", while the
# inputs above are read from "/content/drive/MyDrive/..." -- confirm this
# path resolves in the runtime where the notebook is executed.
!cp yelp_load.csv "drive/My Drive/"

**End of Feature Extraction**