In [5]:
import pandas as pd
import json
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from textblob import TextBlob
import string

In [6]:
data_file = open("data/yelp_academic_dataset_review.json")
data = []
c = 0
for line in data_file:
    data.append(json.loads(line))
    c+=1
    if c == 10000:
        break

df = pd.DataFrame(data)

data_file.close()

### Feature engineering

In [7]:
df

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15
...,...,...,...,...,...,...,...,...,...
9995,ZcBtCA9jGhLfakf1jJ2BAg,yab1cq5yzrTHzoyz8LYqYQ,1-z7wd860Rii4kbEMCT8DA,5.0,0,0,0,Excellent food and service. The place is funct...,2018-06-26 17:41:31
9996,UIkEO-10J6Y99IhRqUflvg,lYAmgL_l7A3MPFYe1DYKrw,EpREWeEpmR8f1qLHzzF0AA,5.0,0,1,0,Just about to get tucked into a meatloaf that ...,2018-01-09 20:26:13
9997,S-NQM3Axcg8JS3MXHUIvyw,rE2WwfgJbYfvDwBlgq__dQ,dvidzWEPgTQPeBc8CUV2OQ,5.0,0,0,0,Outstanding customer service! And my car is dr...,2015-04-01 21:50:28
9998,ME79YrEhm2xe4IQy_0zkGw,OnIklvzKDpk1BduC84TrTA,2XYPFRm7teCUr3eGsB2-qw,5.0,0,0,0,I and my husband went here for Dinner one day ...,2015-06-08 19:32:26


In [8]:
def split_time_to_feature(df):
    
    df['date'] =  pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S')

    df['year'] = pd.DatetimeIndex(df['date']).year
    df['month'] = pd.DatetimeIndex(df['date']).month
    df['day'] = pd.DatetimeIndex(df['date']).day
    df['hour'] = pd.DatetimeIndex(df['date']).hour
    df['min'] = pd.DatetimeIndex(df['date']).minute
    df['sec'] = pd.DatetimeIndex(df['date']).second

split_time_to_feature(df)

In [9]:
# Creating a new column in the dataset for the length of the reviews
df['length_of_reviews'] = df['text'].apply(len)

In [10]:
# Creating a new column in the dataset for the number of words in the reviews
df['num_of_words'] = df['text'].apply(lambda str:len(nltk.word_tokenize(str)))

In [11]:
# Creating a new column in the dataset for the number of sentences in the reviews
df['num_of_sentences'] = df['text'].apply(lambda paragraph:len(nltk.sent_tokenize(paragraph)))

In [12]:
df['capital_words'] = df['text'].apply(lambda sen:len(re.findall(r'\b[A-Z]+\b', sen)))
df['capital_words_ratio'] = df['capital_words']/df['num_of_words']
df.drop(columns='capital_words', inplace = True)

### Dealing with texts in reviews

In [13]:
# Cleaning the reviews - stemed, remove stopwords and punctuation
def clean_text(text):

    ps = PorterStemmer()
    stem_text = ps.stem(text)

    ## Remove puncuation
    nopunc = [char for char in stem_text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    
    ## Remove stop words
    nostop = [word for word in nopunc.split() if word.lower() not in stopwords.words('english') and len(word) >= 3]
    text = ' '.join(nostop)
   
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)    
    
    return text

df['cleaned_text'] = df['text'].apply(clean_text)

In [14]:
# Sentiment analysis using textblob 
df['sentiment_polarity'] = df['cleaned_text'].apply(lambda w:TextBlob(w).polarity)
df['sentiment_subjectivity'] = df['cleaned_text'].apply(lambda w:TextBlob(w).subjectivity)

In [15]:
# Convert words to vectors
df = pd.concat([df, df['cleaned_text'].apply(lambda str:nltk.word_tokenize(str)).str.join('|').str.get_dummies().add_prefix('tags_')], axis = 1)

In [16]:
df.to_csv('data/cleaned_review_data.csv')

In [17]:
df.columns[:22]

Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',
       'cool', 'text', 'date', 'year', 'month', 'day', 'hour', 'min', 'sec',
       'length_of_reviews', 'num_of_words', 'num_of_sentences',
       'capital_words_ratio', 'cleaned_text', 'sentiment_polarity',
       'sentiment_subjectivity'],
      dtype='object')

In [18]:
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,year,...,tags_zorba,tags_zorbas,tags_zot,tags_zucchini,tags_zuchini,tags_zuckerberg,tags_zumba,tags_zuniga,tags_zurich,tags_zydeco
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,2018,...,0,0,0,0,0,0,0,0,0,0
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18,2012,...,0,0,0,0,0,0,0,0,0,0
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30,2014,...,0,0,0,0,0,0,0,0,0,0
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,2015,...,0,0,0,0,0,0,0,0,0,0
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,2017,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# TODO
# 結合business 篩出michigan的所有餐廳