In [2]:
import os
import csv
import pandas as pd
import numpy as np
import sklearn
import string
import statsmodels.api as sm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from nltk import tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [3]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Each row of ``components_`` is treated as one topic's term weights.
    feature_names : sequence of str
        Term for each column index of ``components_``.
    n_top_words : int
        How many of the largest-weight terms to list per topic.

    Prints one ``Topic #<k>: w1 w2 ...`` line per topic, followed by a
    single blank line. Returns nothing.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so slice it backwards from the end to get the
        # n_top_words largest weights, largest first.
        top_indices = weights.argsort()[:-n_top_words - 1:-1]
        top_terms = [feature_names[i] for i in top_indices]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))
    print()

In [4]:
def display_topics(model, feature_names, num_topics, no_top_words):
    """Print weight-annotated top terms for the first ``num_topics`` topics.

    Parameters
    ----------
    model : object exposing ``components_``
        Each row of ``components_`` is treated as one topic's term weights.
    feature_names : sequence of str
        Term for each column index of ``components_``.
    num_topics : int
        Only topics whose index is below this value are printed.
    no_top_words : int
        How many of the largest-weight terms to list per topic.

    Each printed line is a left-aligned ``Topic <k>:`` label padded to 11
    characters, followed by comma-separated ``<weight>*<term>`` entries with
    the weight shown to three decimals.
    """
    for topic_idx, weights in enumerate(model.components_):
        if topic_idx >= num_topics:
            continue
        print("{:11}".format("Topic %d:" % topic_idx), end='')
        # Ascending argsort sliced backwards: largest weights first.
        top_indices = weights.argsort()[:-no_top_words-1:-1]
        weighted_terms = ['{:04.3f}*'.format(weights[i]) + feature_names[i]
                          for i in top_indices]
        print(", ".join(weighted_terms))

### Read in Data

In [5]:
# Read in Data
# Load vegas.csv and append a 'userid' column that duplicates 'Unnamed: 0'.
# Note that 'userid' is therefore a copy of that unnamed column, not of the
# file's separate 'user_id' column.
df1 = pd.read_csv('vegas.csv').assign(userid=lambda frame: frame['Unnamed: 0'])
df1.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0.1,Unnamed: 0,business_id,name,city,state,postal_code,review_count,stars_x,categories,cool,date,funny,review_id,stars_y,text,useful,user_id,userid
0,0,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,Las Vegas,NV,89119.0,40.0,4.0,"Restaurants, Italian",0.0,2011-06-29 02:55:07,0.0,klcF45wKIOpJW_BhJslOJg,5.0,"We went there for dinner the other night, bein...",1.0,-Yz2wIcsdJxUOFMbTgoKQA,0
1,1,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,Las Vegas,NV,89119.0,40.0,4.0,"Restaurants, Italian",0.0,2010-10-06 18:20:13,0.0,Li-pQG6A7p5gbgZHTMeDSQ,4.0,i had the best Chicken Marcela ever. The spagh...,1.0,jYcf_e5p0UG0S-9gJq_tNA,1
2,2,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,Las Vegas,NV,89119.0,40.0,4.0,"Restaurants, Italian",0.0,2010-01-13 00:35:45,0.0,iRLX3dJ3ONvncIxPnXy1cw,5.0,Basically the best Italian in town for the pri...,1.0,nQC0JiPIk_jCooRDxpuw5A,2
3,3,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,Las Vegas,NV,89119.0,40.0,4.0,"Restaurants, Italian",0.0,2008-08-23 20:30:33,0.0,rklteWf9xnTU3fAtMFBRRw,3.0,Mmmmm delicious food and a little history. Mr....,1.0,Gv_-mtOKhWFtCjn9xFe0SQ,3
4,4,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,Las Vegas,NV,89119.0,40.0,4.0,"Restaurants, Italian",0.0,2009-06-01 20:02:55,0.0,UfRqM0RGdZa86hFcFEAnjw,3.0,"This is old Vegas, this atmosphere is old scho...",1.0,pabMYegF28KjHQ5hybAJ0A,4


In [6]:
# Keep only the rows whose postal_code equals 89169 and renumber them from 0.
data = df1.loc[df1['postal_code'].eq(89169)].reset_index(drop=True)
data.head()

Unnamed: 0.1,Unnamed: 0,business_id,name,city,state,postal_code,review_count,stars_x,categories,cool,date,funny,review_id,stars_y,text,useful,user_id,userid
0,1111,4n81G-pmC3rfhmaPsbwYKg,"Vince Neil's Tatuado | Eat, Drink, Party",Las Vegas,NV,89169.0,3.0,4.5,"Bars, Sports Bars, Nightlife, Beer, Wine & Spi...",0.0,2018-10-07 21:51:30,0.0,8fcClVeEvil6G06wdD1Guw,5.0,A must see new spot!! Food and service is exce...,0.0,KMSVx7jmEPlrrit3Qh-6Og,1111
1,1112,4n81G-pmC3rfhmaPsbwYKg,"Vince Neil's Tatuado | Eat, Drink, Party",Las Vegas,NV,89169.0,3.0,4.5,"Bars, Sports Bars, Nightlife, Beer, Wine & Spi...",1.0,2018-10-27 03:32:19,0.0,g-xFr2ENx26CCLpPv9iOYA,4.0,I was pleasantly surprised by the quality of f...,0.0,ObQl16Vuc4sHPGKtzSGNUw,1112
2,1113,4n81G-pmC3rfhmaPsbwYKg,"Vince Neil's Tatuado | Eat, Drink, Party",Las Vegas,NV,89169.0,3.0,4.5,"Bars, Sports Bars, Nightlife, Beer, Wine & Spi...",0.0,2018-10-21 17:33:20,0.0,0SOcLtormS4ZCBtZW0dhFQ,5.0,Great prices and service! Everything we ate wa...,0.0,hYbKCWUX5aSJsS68a8u5_Q,1113
3,2358,H2Chxto2e6dHTDJ8-s3-pQ,Roberto's Taco Shop,Las Vegas,NV,89169.0,64.0,3.0,"Mexican, Fast Food, Restaurants",0.0,2018-09-25 19:53:25,0.0,I_LpuhR3aeHJI5Adgsdivg,1.0,I've been eating here for 4 years and everythi...,0.0,M5UjqBcQajQx1fmM_EGcyQ,2358
4,2359,H2Chxto2e6dHTDJ8-s3-pQ,Roberto's Taco Shop,Las Vegas,NV,89169.0,64.0,3.0,"Mexican, Fast Food, Restaurants",0.0,2012-07-16 20:18:36,0.0,rtwmNtE6XFv0Vl6a6ezXlA,2.0,"I like Roberto's, been to several other locati...",0.0,KMH0-vcV0atzuNN0LABsRQ,2359


In [7]:
# Number of rows currently in `data`.
len(data)

39039

In [8]:
# Remove the listed columns from `data`.
# This cell rebinds `data`, so running it a second time fails because the
# columns are already gone.
data = data.drop(
    columns=[
        'Unnamed: 0', 'categories', 'date', 'stars_x', 'cool', 'review_id',
        'funny', 'business_id', 'city', 'state', 'postal_code',
        'review_count', 'useful', 'user_id',
    ]
)
data.head()

Unnamed: 0,name,stars_y,text,userid
0,"Vince Neil's Tatuado | Eat, Drink, Party",5.0,A must see new spot!! Food and service is exce...,1111
1,"Vince Neil's Tatuado | Eat, Drink, Party",4.0,I was pleasantly surprised by the quality of f...,1112
2,"Vince Neil's Tatuado | Eat, Drink, Party",5.0,Great prices and service! Everything we ate wa...,1113
3,Roberto's Taco Shop,1.0,I've been eating here for 4 years and everythi...,2358
4,Roberto's Taco Shop,2.0,"I like Roberto's, been to several other locati...",2359


### Number of Topics

In [22]:
# Split reviews into individual sentences
# Rows are collected in a plain list and the DataFrame is built once at the
# end: DataFrame.append copied the whole frame on every call (quadratic in
# the number of sentences) and no longer exists in pandas 2.0+.
# NOTE(review): only rows 0-9999 of `data` are processed, yet four files
# (89169_1..4.csv) are written from `df` in later cells -- presumably this
# cell was re-run with a different range for each file; confirm.
records = []
for i in range(0, 10000, 1):
    for sentence in tokenize.sent_tokenize(data.text[i]):
        records.append({
            'userid': data.userid[i],
            'sentence': sentence,
            'stars': data.stars_y[i],
        })
# Passing `columns` keeps the column order (and the schema when `records`
# is empty) identical to the previous empty-frame initialisation.
df = pd.DataFrame(records, columns=['userid', 'sentence', 'stars'])

In [20]:
# Create Corpus for TFIDF
# Plain Python list of the sentences currently held in `df`, in row order.
corpus_4 = list(df.sentence)

In [23]:
# Chunk 1: write the current sentence frame `df` to 89169_1.csv in the working directory.
df.to_csv('89169_1.csv')
# NOTE(review): presumably the matching sentence list is corpus_1, which no visible cell assigns -- confirm.

In [14]:
# Chunk 2: write the current sentence frame `df` to 89169_2.csv in the working directory.
df.to_csv('89169_2.csv')
# NOTE(review): presumably the matching sentence list is corpus_2, which no visible cell assigns -- confirm.

In [17]:
# Chunk 3: write the current sentence frame `df` to 89169_3.csv in the working directory.
df.to_csv('89169_3.csv')
# NOTE(review): presumably the matching sentence list is corpus_3, which no visible cell assigns -- confirm.

In [21]:
# Chunk 4: write the current sentence frame `df` to 89169_4.csv in the working directory.
df.to_csv('89169_4.csv')
# NOTE(review): presumably the matching sentence list is corpus_4 -- confirm it was built from this same `df`.

In [24]:
# Create Corpus for TFIDF
# Concatenate the four per-chunk sentence lists, in chunk order, into one corpus.
# NOTE(review): corpus_1, corpus_2 and corpus_3 are not assigned in any
# visible cell, so this line appears to rely on variables left in the kernel
# from earlier runs -- confirm before a Restart & Run All.
corpus = corpus_1 + corpus_2 + corpus_3 + corpus_4

### 7 Topics

In [25]:
n_components = 7
n_top_words = 15

# TFIDF Vectorizer
# Builds the sentence-by-term TF-IDF matrix with English stop words removed.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(corpus)

# NMF reduction
# fit_transform fits the model and returns the per-sentence topic weights in
# one pass; a separate .fit() beforehand would just do the same work twice.
# The seed is pinned because later cells refer to topics by their index.
nmf = NMF(n_components=n_components, random_state=1)
W_pos = nmf.fit_transform(tfidf)

# Output Topics
# NMF is used with its default loss (Frobenius norm), so the banner says so.
print("\nTopics in NMF model (Frobenius norm):")
# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); use whichever this installation provides.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: great experience atmosphere overall prices price meal staff beer selection happy drinks hour value dinner
Topic #1: good really pretty overall price just experience beer meat chicken quality prices salad pizza steak
Topic #2: place love recommend amazing awesome really nice try highly like fun eat loved stars strip
Topic #3: food amazing excellent quality awesome indian drinks just better came ok tasty atmosphere fresh price
Topic #4: service excellent friendly customer attentive staff fast awesome slow nice quick fantastic bad horrible outstanding
Topic #5: definitely vegas time come best try las restaurant just ve recommend eat like visit coming
Topic #6: delicious ordered fresh absolutely really chicken meat salad steak cheese super pizza sauce shrimp got



- Topic #0: Overall experience
- Topic #1: Price
- Topic #2: "Worth it"
- Topic #3: Food
- Topic #4: Service
- Topic #5: (unlabeled — no clear theme)
- Topic #6: Food

### Label Sentences

In [26]:
import glob
# NOTE(review): absolute, machine-specific folder; the earlier to_csv cells
# write 89169_*.csv with relative names, so presumably the files were moved
# here by hand -- confirm. Every *.csv in this folder is loaded.
path =r'C:\Users\xinro\Downloads\89169'
# Sort the matches: glob.glob does not guarantee any order, but the topic
# labels assigned later are matched to rows by position against `corpus`
# (chunks 1, 2, 3, 4), so the files must be read in that same order.
allFiles = sorted(glob.glob(path + "/*.csv"))
list_ = []
for file_ in allFiles:
    list_.append(pd.read_csv(file_, index_col=None, header=0))
# ignore_index gives the combined frame one unique 0..n-1 index instead of
# repeating each file's own row numbers.
frame = pd.concat(list_, ignore_index=True)
df = frame

In [27]:
# Append Topic with highest score
# Each sentence gets the index of its largest NMF weight; sentences shorter
# than 10 characters get the sentinel topic -1 instead.
array = []
for i, weights in enumerate(W_pos):
    if len(corpus[i]) < 10:
        array.append(-1)
        continue
    # max() keeps the first index when several weights tie for the maximum.
    array.append(max(range(len(weights)), key=weights.__getitem__))
# Create new column in df for topic
df['Topic'] = array

In [28]:
# Preview the first rows of `df`.
df.head()

Unnamed: 0.1,Unnamed: 0,userid,sentence,stars,Topic
0,0,1111,A must see new spot!!,5.0,5
1,1,1111,Food and service is excellent!,5.0,4
2,2,1111,They play fun music and the happy hour is awes...,5.0,5
3,3,1111,Try out your luck on the machines too they pay...,5.0,5
4,4,1111,!,5.0,-1


### Vader Sentiment Analysis

In [29]:
# Create the VADER SentimentIntensityAnalyzer instance (imported from vaderSentiment).
analyser = SentimentIntensityAnalyzer()

In [30]:
# Append Sentiment Intensity Scores for each sentence
# Only the 'compound' entry of each polarity_scores() result is kept.
array = [analyser.polarity_scores(sentence)['compound'] for sentence in df.sentence]
# Create new column in df for sentiment intensity score
df['sentiment'] = array

In [31]:
# Build the final frame of intensity scores: one row per userid holding the
# mean sentiment of that userid's sentences for each topic, plus the star
# rating. This replaces a triple nested Python loop that also grew df_scores
# with DataFrame.append on every iteration (quadratic, and no longer
# available in pandas 2.0+).
topic_ids = list(range(7))
topic_columns = [str(topic) for topic in topic_ids]

# First row of every userid, in order of first appearance; this fixes both
# the output row order and the star rating taken for each userid.
first_rows = df.drop_duplicates(subset='userid')
user_ids = first_rows['userid'].values

# Mean sentiment per (userid, topic). Rows whose Topic is outside 0-6
# (e.g. the -1 sentinel) are left out of the averages.
topic_means = (
    df[df['Topic'].isin(topic_ids)]
    .groupby(['userid', 'Topic'])['sentiment']
    .mean()
    .unstack('Topic')
    # Restore first-appearance row order and make sure all seven topic
    # columns exist even if a topic was never assigned.
    .reindex(index=user_ids, columns=topic_ids)
    # A userid with no sentence for a topic scores 0 for that topic.
    .fillna(0)
)

# Assemble with the same column names and order as before:
# userid, '0'..'6', stars.
df_scores = pd.DataFrame(topic_means.values, columns=topic_columns)
df_scores.insert(0, 'userid', user_ids)
df_scores['stars'] = first_rows['stars'].values

In [32]:
# Preview the first rows of the per-userid topic sentiment table.
df_scores.head()

Unnamed: 0,userid,0,1,2,3,4,5,6,stars
0,1111.0,0.0,0.0,0.0,0.0,0.6114,0.513767,0.0,5.0
1,1112.0,-0.5096,0.409033,0.0,0.6588,0.7048,0.364225,0.3802,4.0
2,1113.0,0.0,0.0,0.0,0.0,0.6588,0.4215,0.0,5.0
3,2358.0,0.9169,0.0,0.0,-0.5848,0.0,-0.1746,0.0,1.0
4,2359.0,0.0,0.0,0.0,0.0,0.0,0.036844,0.0683,2.0


In [33]:
# Column-wise means of df_scores.
# NOTE(review): the userid entry is an average of identifiers and presumably carries no meaning -- confirm.
df_scores.mean()

userid    665789.171828
0              0.143677
1              0.190374
2              0.152640
3              0.144773
4              0.173888
5              0.183514
6              0.123678
stars          3.910116
dtype: float64

### Linear Regression

In [34]:
# Split into predictors and target
# Predictors are every df_scores column except userid and stars, as floats.
X = df_scores.drop(columns=['userid', 'stars']).astype(float)
y = df_scores.stars.astype(float)
# Split Train vs Test
# 80% train / 20% hold-out, stratified on the star rating.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1)
# Split Test set into validation & test set
# The hold-out is halved, again stratified: 10% test (X_test2) and 10%
# validation (X_val) of the full data.
X_test2, X_val, y_test2, y_val = train_test_split(
    X_test, y_test, test_size=0.5, stratify=y_test, random_state=1)

#### Simple Linear Regression

In [35]:
# Fit OLS with an intercept on the training split and report the validation
# MSE and the model's AIC.
# has_constant='add' forces the intercept column to be added. With the
# default ('skip'), add_constant silently adds nothing when the frame
# already contains a constant column (e.g. a topic score that is 0 for every
# row of the split), which would leave the training and validation design
# matrices with different columns.
train_design = sm.add_constant(X_train, has_constant='add')
val_design = sm.add_constant(X_val, has_constant='add')
model = sm.OLS(y_train, train_design).fit()
print(model.params)
print()
print('Mean Squared Error: ',mean_squared_error(y_val,model.predict(val_design)))
print('AIC: ',model.aic)

const    2.963996
0        0.594262
1        0.355163
2        0.683801
3        0.861802
4        0.943995
5        1.662487
6        0.766491
dtype: float64

Mean Squared Error:  1.131410523405917
AIC:  92970.6831784657


In [None]:
- Topic #0: Overall experience
- Topic #1: Price
- Topic #2: "Worth it"
- Topic #3: Food
- Topic #4: Service
- Topic #5: (unlabeled — no clear theme)
- Topic #6: Food

### $\hat{y} = 2.96 + 0.59\,\text{Overall} + 0.36\,\text{Price} + 0.68\,\text{Worth} + 0.86\,\text{Food}_{3} + 0.94\,\text{Service} + 1.66\,\text{Topic5} + 0.77\,\text{Food}_{6}$

#### Removed Intercept and Non-Topics

In [36]:
# Split into predictors and target
# NOTE(review): the column removed here is '4', which the topic list labels
# "Service", while the section heading says the non-topics were removed and
# the only unlabelled topic is #5 -- confirm which column was meant.
X = df_scores.drop(columns=['userid', 'stars', '4']).astype(float)
y = df_scores.stars.astype(float)
# Split Train vs Test
# Same 80/10/10 stratified split and seed as the model with an intercept.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1)
# Split Test set into validation & test set
X_test2, X_val, y_test2, y_val = train_test_split(
    X_test, y_test, test_size=0.5, stratify=y_test, random_state=1)

# No constant column is added, so this regression is fitted without an
# intercept.
model = sm.OLS(y_train, X_train).fit()
print(model.params)
print()
val_predictions = model.predict(X_val)
print('Mean Squared Error: ', mean_squared_error(y_val, val_predictions))
print('AIC: ', model.aic)

0    2.242862
1    2.690612
2    2.275193
3    2.524436
5    4.624144
6    2.439048
dtype: float64

Mean Squared Error:  4.98868224487254
AIC:  138779.42999644772
