In [1]:
import math
import string, re
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import spacy
from spacy.lang.en import English, STOP_WORDS
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
import matplotlib as plt
%matplotlib inline

from tqdm import tqdm, tqdm_pandas

In [2]:
nlp = spacy.load("en_core_web_sm")
stop = STOP_WORDS
punct = {p for p in string.punctuation}

## 1.Load reviews and user data

In [3]:
df = pd.read_json('yelp_academic_dataset_review.json',lines=True)

In [4]:
user = pd.read_json('yelp_academic_dataset_user.json',lines=True)

## 2. Pharse reviews data

features include:\
num of word\
num of sentences\
num of paragraphs\
num of letters\
if mentions pirce\
num of word in all caps\
num of exclamation marks

In [5]:
def decode(text):
    try:
        return text.decode('utf8')
    except:
        return text

def get_num_words(text):
    return float(len(text.split()))

def get_num_sents(text):
    return text.count('. ') + text.count('! ') + text.count('? ') + text.count(') ') + \
            text.count('.\n') + text.count('!\n') + text.count('?\n') + text.count(')\n') + 1.0

def get_num_para(text):
    return text.count('\n\n') + 1.0

def mentions_price(text):
    return 1 if '$' in text else 0

def get_allcaps(text):
    text = re.sub("[^a-zA-z]", " ", text)
    return len([word for word in text.split() if word.isupper() and len(word) > 2])

def get_exclamations(text):
    return text.count('!')

def get_num_chars(text):
    return float(len([char for char in text if char != ' ' and char not in punct]))

def get_clean_tokens(text):  
    """Return tokens for each review; exclude stop words and lemmatize."""
    letters_only = re.sub("[^a-zA-Z]", " ", text) 
    words = ' '.join(letters_only.lower().split())
    tokens = [token.lemma_ for token in nlp(words)]
    filtered = [t for t in tokens if t not in stop and t != '' and t != ' ' and t != '\n' and t != '\n\n']
    return ' '.join(filtered)

def tokenize(df):
    tokens = []
    for i in tqdm(range(len(df.text.values))):
        tokens.append(get_clean_tokens(df.text.values[i]))
    return tokens

In [6]:
def get_features(df):
    # decode
    df.loc[:, 'text'] = df.loc[:, 'text'].apply(decode)
    
    #get num of words in single review
    df.loc[:, 'review_len_wrds'] = df.loc[:, 'text'].apply(get_num_words)
    
    #get num of sentences in single review
    df.loc[:, 'review_len_sent'] = df.loc[:, 'text'].apply(get_num_sents)
    
    #get average number of words per sentence
    df.loc[:, 'avg_wrd_in_sent'] = df.loc[:, 'review_len_wrds']/df.loc[:, 'review_len_sent']
    
    #get num of paragraphs
    df.loc[:, 'num_para'] = df.loc[:, 'text'].apply(get_num_para)
    
    # check if price is mentioned
    df.loc[:, 'mentions_price'] = df.loc[:, 'text'].apply(mentions_price)
    
    # get number of all caps words
    df.loc[:, 'num_allcaps'] = df.loc[:, 'text'].apply(get_allcaps)
    
    # get number of exclamation marks
    df.loc[:, 'num_exclamations'] = df.loc[:, 'text'].apply(get_exclamations)
    
    # get number of characters
    df.loc[:,'num_chars'] = df.loc[:,'text'].apply(get_num_chars)
    
    # calculate ARI score (automatic readability index) for each review 
    df.loc[:,'ari_score'] = df.apply(
        lambda row: 4.71 * (row.num_chars/float(row.review_len_wrds)) \
        + 0.5 * (row.review_len_wrds/float(row.review_len_sent)) - 21.43, 
        axis = 1)
    
    # get characters per word
    df['avg_chars_per_word'] = df.loc[:,'num_chars'] / df.loc[:,'review_len_wrds']
    

In [7]:
clntkns = tokenize(df)
df['tokens'] = clntkns
get_features(df)

100%|█████████████████████████████████████████████████████████████████████| 2685066/2685066 [16:56:52<00:00, 44.01it/s]


Lable each user by elite or not

In [11]:
# add elite yes/no
user['is_elite'] = user.elite.apply(lambda x: 1 if len(x) > 0 else 0)

In [14]:
df

Unnamed: 0,votes,user_id,review_id,stars,date,text,type,business_id,tokens,review_len_wrds,review_len_sent,avg_wrd_in_sent,num_para,mentions_price,num_allcaps,num_exclamations,num_chars,ari_score,avg_chars_per_word,is_elite
0,"{'funny': 0, 'useful': 0, 'cool': 0}",PUFPaY9KxDAcGqfsorJp3Q,Ya85v4eqdd6k9Od8HbQjyA,4,2012-08-01,"Mr Hoagie is an institution. Walking in, it do...",review,5UmKMjUEUNdYWqANhGckJw,mr hoagie institution walk like throwback year...,83.0,5.0,16.600000,1.0,0,0,0,357.0,7.128675,4.301205,0
1,"{'funny': 0, 'useful': 0, 'cool': 0}",Iu6AxdBYGR4A0wspR9BYHA,KPvLNJ21_4wbYNctrOwWdQ,5,2014-02-13,Excellent food. Superb customer service. I mis...,review,5UmKMjUEUNdYWqANhGckJw,excellent food superb customer service I miss ...,23.0,3.0,7.666667,1.0,0,0,0,109.0,4.724638,4.739130,0
2,"{'funny': 1, 'useful': 1, 'cool': 0}",auESFwWvW42h6alXgFxAXQ,fFSoGV46Yxuwbr3fHNuZig,5,2015-10-31,Yes this place is a little out dated and not o...,review,5UmKMjUEUNdYWqANhGckJw,yes place little date open weekend staff pleas...,73.0,6.0,12.166667,1.0,0,0,0,302.0,4.138539,4.136986,0
3,"{'funny': 0, 'useful': 0, 'cool': 0}",qiczib2fO_1VBG8IoCGvVg,pVMIt0a_QsKtuDfWVfSk2A,3,2015-12-26,PROS: Italian hoagie was delicious. Friendly ...,review,5UmKMjUEUNdYWqANhGckJw,pro italian hoagie delicious friendly counter ...,51.0,8.0,6.375000,3.0,0,7,0,226.0,2.629265,4.431373,0
4,"{'funny': 0, 'useful': 1, 'cool': 0}",qEE5EvV-f-s7yHC0Z4ydJQ,AEyiQ_Y44isJmNbMTyoMKQ,2,2016-04-08,First the only reason this place could possibl...,review,5UmKMjUEUNdYWqANhGckJw,reason place possibly win good hoagie s compet...,192.0,15.0,12.800000,6.0,1,1,2,803.0,4.668594,4.182292,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2685061,"{'funny': 0, 'useful': 0, 'cool': 0}",kONznNes89LWlc1jcZtD0A,5-pv7M86ZdrXjfHPkPsZug,1,2015-11-23,Still sick. Do not eat here unless you want to...,review,DH2Ujt_hwcMBIz8VvCb0Lg,sick eat want puke brain hour later food look ...,34.0,3.0,11.333333,1.0,0,0,0,129.0,2.106961,3.794118,0
2685062,"{'funny': 0, 'useful': 0, 'cool': 0}",6jXm3mrRGAPRENujxhlRpw,MjGrqy30haStX4Q6SWsdcg,1,2015-11-24,This place sucks especially the white manager ...,review,DH2Ujt_hwcMBIz8VvCb0Lg,place suck especially white manager guy charge...,43.0,1.0,43.000000,1.0,0,0,0,196.0,21.538837,4.558140,0
2685063,"{'funny': 0, 'useful': 0, 'cool': 0}",D8AR0UYdlHClqcjARPEr8Q,7ZfVeWubWTleBJUXXMPl_w,3,2016-02-13,Not a bad stop for airport food. I got the chi...,review,DH2Ujt_hwcMBIz8VvCb0Lg,bad stop airport food I chicken bowl filling s...,55.0,5.0,11.000000,1.0,0,0,0,212.0,2.224909,3.854545,0
2685064,"{'funny': 5, 'useful': 4, 'cool': 4}",nELVJlkX8T0mUAArSPSJxw,vwmqHxxmy9rEAwhbkLXmnQ,3,2016-04-30,"He stood in the face of a 2.5 star biz, and br...",review,DH2Ujt_hwcMBIz8VvCb0Lg,stand face star biz brave salsarita s overpric...,156.0,12.0,13.000000,4.0,1,0,1,658.0,4.936538,4.217949,1


In [13]:
user.set_index('user_id', inplace = True)
df['is_elite'] = df.user_id.apply(lambda x: user.loc[x, 'is_elite'])

In [15]:
train, test = train_test_split(df, test_size = .25)
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None, \
                             max_features = 500) 

In [16]:
elite_reviews = train[train.is_elite == 1]
nonelite_reviews = train[train.is_elite == 0]

In [20]:
elite_features = vectorizer.fit_transform(elite_reviews.tokens)
elite_words = vectorizer.get_feature_names_out()
elite_features = elite_features.toarray()
elite_dist = np.sum(elite_features, axis = 0)
elite_sorted = sorted(zip(elite_words, elite_dist), key = lambda x: x[1], reverse = True)
elite_wrds_dict = dict(elite_sorted)

In [21]:
nonelite_features = vectorizer.fit_transform(nonelite_reviews.tokens)
nonelite_words = vectorizer.get_feature_names_out()
nonelite_features = nonelite_features.toarray()
nonelite_dist = np.sum(nonelite_features, axis = 0)
nonelite_sorted = sorted(zip(nonelite_words, nonelite_dist), key = lambda x: x[1], reverse = True)
nonelite_wrds_dict = dict(nonelite_sorted)

In [22]:
# get unique words (not present in intersection)
onlyelite_words = {
    word : elite_wrds_dict[word] 
    for word in elite_wrds_dict 
    if word not in nonelite_wrds_dict}

onlynonelite_words = {
    word : nonelite_wrds_dict[word] 
    for word in nonelite_wrds_dict 
    if word not in elite_wrds_dict}
elite_top50 = sorted(onlyelite_words.items(), key = lambda x: x[1], reverse = True)[:50]
ne_top50 = sorted(onlynonelite_words.items(), key = lambda x: x[1], reverse = True)[:50]

In [62]:
a = np.array(elite_top50)
np.savetxt("elite_top50.csv", a, delimiter=",")

TypeError: Mismatch between array dtype ('<U21') and format specifier ('%.18e,%.18e')

In [63]:
a

array([['de', '30347'],
       ['la', '23394'],
       ['locate', '21937'],
       ['space', '21215'],
       ['tomato', '19821'],
       ['crispy', '19242'],
       ['butter', '17795'],
       ['mix', '17705'],
       ['le', '17427'],
       ['city', '17035'],
       ['pepper', '16975'],
       ['brunch', '16704'],
       ['grab', '16390'],
       ['et', '16328'],
       ['tender', '16159'],
       ['seating', '16155'],
       ['mushroom', '15777'],
       ['soft', '15620'],
       ['sound', '15494'],
       ['toast', '15456'],
       ['standard', '15293'],
       ['note', '15247'],
       ['mall', '15205'],
       ['black', '15016'],
       ['dance', '14989'],
       ['salmon', '14783'],
       ['ton', '14730'],
       ['center', '14722'],
       ['flavorful', '14711'],
       ['cute', '14648'],
       ['event', '14544'],
       ['seafood', '14468'],
       ['interesting', '14385'],
       ['lobster', '14315'],
       ['pour', '14249'],
       ['cafe', '14148'],
       ['sausage', '1

In [59]:
a = np.array(ne_top50)
numpy.savetxt("ne_top50.csv", a, delimiter=",")

array([['hair', '62560'],
       ['company', '54726'],
       ['phone', '54234'],
       ['nail', '51605'],
       ['receive', '49894'],
       ['rude', '47274'],
       ['professional', '47112'],
       ['fix', '46332'],
       ['appointment', '43114'],
       ['horrible', '42363'],
       ['dr', '42285'],
       ['office', '41020'],
       ['min', '41015'],
       ['speak', '38091'],
       ['explain', '38066'],
       ['terrible', '35512'],
       ['question', '35463'],
       ['send', '34376'],
       ['twice', '34182'],
       ['understand', '33984'],
       ['purchase', '33961'],
       ['salon', '33890'],
       ['completely', '32042'],
       ['provide', '31132'],
       ['massage', '30706'],
       ['break', '30533'],
       ['greet', '29674'],
       ['answer', '28834'],
       ['save', '28628'],
       ['product', '28389'],
       ['desk', '28346'],
       ['deliver', '28142'],
       ['class', '28012'],
       ['die', '27976'],
       ['complaint', '27725'],
       ['boyfri

In [42]:
elite_words_list = elite_words.tolist()
nonelite_words_list = nonelite_words.tolist()

## 5. Export data for use in classification notebook and viz

In [47]:
def get_elite_words(tokens):
    try:
        tokens = tokens.split()
        return len(set(elite_words_list).intersection(tokens))
    except:
        return 0

def get_nonelite_words(tokens):
    try:
        tokens = tokens.split()
        return len(set(nonelite_words_list).intersection(tokens))
    except:
        return 0

def add_content_counts(df):
    df['num_elite_words'] = df.text.apply(get_elite_words)
    df['num_ne_words'] = df.text.apply(get_nonelite_words)

add_content_counts(df)

In [51]:
byuser = df.groupby('user_id')
user_avgs = byuser.mean().loc[:, 'review_len_wrds':]

  user_avgs = byuser.mean().loc[:, 'review_len_wrds':]


In [53]:
user_avgs

Unnamed: 0_level_0,review_len_wrds,review_len_sent,avg_wrd_in_sent,num_para,mentions_price,num_allcaps,num_exclamations,num_chars,ari_score,avg_chars_per_word,is_elite,num_elite_words,num_ne_words
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
---teJGnwK07UO6_oJfbRw,36.000000,5.000000,7.200000,1.000000,0.000000,0.000000,0.000000,153.000000,2.187500,4.250000,0.0,4.000000,4.000000
--0HEXd4W6bJI8k7E0RxTA,19.000000,3.500000,6.500000,1.000000,0.000000,0.000000,0.000000,92.500000,4.702750,4.858333,0.0,3.500000,3.500000
--0KsjlAThNWua2Pr4HStQ,128.000000,8.333333,14.477183,3.833333,0.000000,0.500000,0.166667,548.166667,5.596217,4.201194,1.0,17.500000,18.000000
--0mI_q_0D1CdU4P_hoImQ,28.000000,3.000000,9.333333,1.000000,0.000000,0.000000,4.000000,115.000000,2.581310,4.107143,0.0,4.000000,3.000000
--106arHH4D3fLenTl3YZA,92.000000,8.000000,11.500000,2.000000,0.000000,0.000000,0.000000,397.000000,4.644674,4.315217,0.0,11.000000,11.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
zzx_41wDUNxpcgrdOtERvw,79.923077,6.538462,11.890354,1.000000,0.153846,0.000000,0.692308,357.538462,5.805732,4.520288,0.0,10.461538,11.153846
zzyeArRv6I5HpEJlOCOPAQ,206.666667,11.000000,19.444444,1.666667,0.666667,5.333333,6.666667,822.000000,7.421741,4.061469,0.0,17.000000,18.333333
zzyoUJV5QTUEuuVoICcdYQ,74.500000,4.500000,14.083333,1.000000,0.000000,0.000000,0.000000,317.500000,5.163643,4.151163,0.0,7.000000,8.500000
zzytqT9s0WS-d4WgLYmLoA,183.000000,12.000000,15.250000,2.000000,0.000000,0.000000,1.000000,710.000000,4.468770,3.879781,0.0,16.000000,17.000000


In [55]:
user_avgs.to_pickle('pickled/user_avgs.pkl')

In [60]:
df.to_pickle('pickled/all_text_feat.pkl')