In [1]:
import os
import csv
import pandas as pd
import numpy as np
import sklearn
import string
import statsmodels.api as sm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from nltk import tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [3]:
def display_topics(model, feature_names, num_topics, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        if topic_idx < num_topics:
            print("{:11}".format("Topic %d:" %(topic_idx)), end='')
            print(", ".join(['{:04.3f}*'.format(topic[i])+feature_names[i] \
                             for i in topic.argsort()[:-no_top_words-1:-1]]))

### Read in Data

In [4]:
# Read in Data
df1 = pd.read_csv('vegas.csv')
df1['userid'] = df1['Unnamed: 0']
df1.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0.1,Unnamed: 0,business_id,name,city,state,postal_code,review_count,stars_x,categories,cool,date,funny,review_id,stars_y,text,useful,user_id,userid
0,0,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,Las Vegas,NV,89119.0,40.0,4.0,"Restaurants, Italian",0.0,2011-06-29 02:55:07,0.0,klcF45wKIOpJW_BhJslOJg,5.0,"We went there for dinner the other night, bein...",1.0,-Yz2wIcsdJxUOFMbTgoKQA,0
1,1,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,Las Vegas,NV,89119.0,40.0,4.0,"Restaurants, Italian",0.0,2010-10-06 18:20:13,0.0,Li-pQG6A7p5gbgZHTMeDSQ,4.0,i had the best Chicken Marcela ever. The spagh...,1.0,jYcf_e5p0UG0S-9gJq_tNA,1
2,2,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,Las Vegas,NV,89119.0,40.0,4.0,"Restaurants, Italian",0.0,2010-01-13 00:35:45,0.0,iRLX3dJ3ONvncIxPnXy1cw,5.0,Basically the best Italian in town for the pri...,1.0,nQC0JiPIk_jCooRDxpuw5A,2
3,3,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,Las Vegas,NV,89119.0,40.0,4.0,"Restaurants, Italian",0.0,2008-08-23 20:30:33,0.0,rklteWf9xnTU3fAtMFBRRw,3.0,Mmmmm delicious food and a little history. Mr....,1.0,Gv_-mtOKhWFtCjn9xFe0SQ,3
4,4,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,Las Vegas,NV,89119.0,40.0,4.0,"Restaurants, Italian",0.0,2009-06-01 20:02:55,0.0,UfRqM0RGdZa86hFcFEAnjw,3.0,"This is old Vegas, this atmosphere is old scho...",1.0,pabMYegF28KjHQ5hybAJ0A,4


In [24]:
data = df1[df1['postal_code']==89119].reset_index(drop=True)
data.head()

Unnamed: 0.1,Unnamed: 0,business_id,name,city,state,postal_code,review_count,stars_x,categories,cool,date,funny,review_id,stars_y,text,useful,user_id,userid
0,0,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,Las Vegas,NV,89119.0,40.0,4.0,"Restaurants, Italian",0.0,2011-06-29 02:55:07,0.0,klcF45wKIOpJW_BhJslOJg,5.0,"We went there for dinner the other night, bein...",1.0,-Yz2wIcsdJxUOFMbTgoKQA,0
1,1,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,Las Vegas,NV,89119.0,40.0,4.0,"Restaurants, Italian",0.0,2010-10-06 18:20:13,0.0,Li-pQG6A7p5gbgZHTMeDSQ,4.0,i had the best Chicken Marcela ever. The spagh...,1.0,jYcf_e5p0UG0S-9gJq_tNA,1
2,2,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,Las Vegas,NV,89119.0,40.0,4.0,"Restaurants, Italian",0.0,2010-01-13 00:35:45,0.0,iRLX3dJ3ONvncIxPnXy1cw,5.0,Basically the best Italian in town for the pri...,1.0,nQC0JiPIk_jCooRDxpuw5A,2
3,3,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,Las Vegas,NV,89119.0,40.0,4.0,"Restaurants, Italian",0.0,2008-08-23 20:30:33,0.0,rklteWf9xnTU3fAtMFBRRw,3.0,Mmmmm delicious food and a little history. Mr....,1.0,Gv_-mtOKhWFtCjn9xFe0SQ,3
4,4,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,Las Vegas,NV,89119.0,40.0,4.0,"Restaurants, Italian",0.0,2009-06-01 20:02:55,0.0,UfRqM0RGdZa86hFcFEAnjw,3.0,"This is old Vegas, this atmosphere is old scho...",1.0,pabMYegF28KjHQ5hybAJ0A,4


In [25]:
len(data)

79907

In [26]:
data = data.drop(['Unnamed: 0','categories','date','stars_x','cool','review_id','funny','business_id','city','state','postal_code','review_count','useful','user_id'],axis=1)
data.head()

Unnamed: 0,name,stars_y,text,userid
0,Carluccio's Tivoli Gardens,5.0,"We went there for dinner the other night, bein...",0
1,Carluccio's Tivoli Gardens,4.0,i had the best Chicken Marcela ever. The spagh...,1
2,Carluccio's Tivoli Gardens,5.0,Basically the best Italian in town for the pri...,2
3,Carluccio's Tivoli Gardens,3.0,Mmmmm delicious food and a little history. Mr....,3
4,Carluccio's Tivoli Gardens,3.0,"This is old Vegas, this atmosphere is old scho...",4


### Number of Topics

In [27]:
# Split reviews into individual sentences 
df = pd.DataFrame(columns=['userid','sentence','stars'])
for i in range(0,len(data),1):
    sentences = tokenize.sent_tokenize(data.text[i])
    for j in sentences:
        df = df.append({'userid':data.userid[i],'sentence':j,'stars':data.stars_y[i]},ignore_index=True)

In [28]:
df.head()

Unnamed: 0,userid,sentence,stars
0,0,"We went there for dinner the other night, bein...",5.0
1,0,"We had driven by a couple of times, and had it...",5.0
2,0,Putting it simply; the food was incredible!,5.0
3,0,"Being from NY, I am an Italian food snob, and ...",5.0
4,0,"I had the veal scallopini, my wife had the chi...",5.0


In [29]:
# Create Corpus for TFIDF
corpus = []
for i in df.sentence:
        corpus.append(i)

### 7 Topics

In [30]:
n_components = 7
n_top_words = 15

# TFIDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(corpus)

# NMF reduction
nmf = NMF(n_components=n_components).fit(tfidf)
W_pos = nmf.fit_transform(tfidf)

# Output Topics
print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: great experience atmosphere staff prices overall selection drinks price meal spot happy beer lunch hour
Topic #1: good really pretty overall price fries pizza prices experience flavor chicken selection thing taste tasted
Topic #2: place love recommend amazing awesome try clean stars nice eat sushi highly favorite looking vegas
Topic #3: food delicious amazing excellent quality fresh fast awesome drinks price ok decent indian tasty mexican
Topic #4: service friendly excellent customer slow fast staff attentive amazing horrible nice quick awesome terrible bad
Topic #5: time just like vegas best really order ve ordered restaurant came got eat don try
Topic #6: definitely come recommend coming vegas try ll worth return highly going visit soon town returning



- Topic #0: Overall experience
- Topic #1: Price
- Topic #2: "Worth it"
- Topic #3: Food
- Topic #4: Service
- Topic #5: 
- Topic #6: "Worth it"

### Label Sentences

In [31]:
# Append Topic with highest score
array = []
# For all NMF array
for i in range(0,len(W_pos),1):
    # Create dictionary with Topics and its NMF scores for each sentence
    topic_dict = {}
    # Drop sentences that have length less than 10 by setting topic to -1
    if len(corpus[i])>=10:
        for ind, w in enumerate(W_pos[i]):
            topic_dict[ind] = w
        # Classify sentence to the topic with highest score
        array.append(max(topic_dict, key=topic_dict.get))
    else:
        array.append(-1)
# Create new column in df for topic
df['Topic'] = array

In [32]:
df.head()

Unnamed: 0,userid,sentence,stars,Topic
0,0,"We went there for dinner the other night, bein...",5.0,5
1,0,"We had driven by a couple of times, and had it...",5.0,5
2,0,Putting it simply; the food was incredible!,5.0,3
3,0,"Being from NY, I am an Italian food snob, and ...",5.0,3
4,0,"I had the veal scallopini, my wife had the chi...",5.0,5


### Vader Sentiment Analysis

In [33]:
# Initialize Sentiment Intensity Analyzer
analyser = SentimentIntensityAnalyzer()

In [34]:
# Append Sentiment Intensity Scores for each sentence
array = []
for i in df.sentence:
    # Generate Sentiment Intensity Scores and store in array
    score = analyser.polarity_scores(i)
    array.append(score['compound'])
# Create new column in df for sentiment intensity score
df['sentiment'] = array

In [70]:
# Initialize Final df of intensity scores
df_scores = pd.DataFrame(columns=['userid','0','1','2','3','4','5','6','stars'])
# For every user aggregate the sentiment scores by topic
for i in df.userid.unique():
    # Create df of scores from same user
    temp_df = df[df.userid==i].reset_index(drop=True)
    # For every topic
    topic_score = []
    for j in range(0,7,1):
        score = 0
        count = 0
        for k in range(0,len(temp_df),1):
            # If topic equal to current topic
            if temp_df.Topic[k] == j:
                # Add sentiment score
                score = score + temp_df.sentiment[k]
                # Increase count
                count = count + 1
        # If count = 0 then no score for topic
        if count==0:
            topic_score.append(0)
        # Else append average score for topic
        else:
            topic_score.append(score/count)
    # Insert UserId and Star Rating 
    topic_score.insert(0,temp_df.userid[0])
    topic_score.insert(len(topic_score),temp_df.stars[0])
    # Transform and Append into main df
    temp = pd.DataFrame(pd.Series(topic_score))
    temp = temp.transpose()
    temp.columns = df_scores.columns
    df_scores = df_scores.append(temp,ignore_index=True)

In [71]:
df_scores.head()

Unnamed: 0,userid,0,1,2,3,4,5,6,stars
0,0,0.0,0.0,0,0.148,0.7938,0.019875,0.4019,5
1,1,0.6249,0.0,0,0.0,0.0,0.31845,0.0,4
2,2,0.5499,0.4404,0,0.0,0.5719,0.6369,0.0,5
3,3,0.0,0.0,0,0.5719,0.0,0.339,0.0,3
4,4,0.0,0.8102,0,0.0,0.0,0.7096,0.0,3


In [72]:
df_scores.mean()

Series([], dtype: float64)

### Linear Regression

In [90]:
# Split into predictors and target
X = df_scores.drop(['userid','stars'],axis=1)
X = X.astype(float)
y = df_scores.stars
y = y.astype(float)
# Split Train vs Test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=1)
# Split Test set into validation & test set
X_test2, X_val, y_test2, y_val = train_test_split(X_test,y_test,test_size=0.5,stratify=y_test,random_state=1)

#### Simple Linear Regression

In [91]:
model = sm.OLS(y_train,sm.add_constant(X_train)).fit()
print(model.params)
print()
print('Mean Squared Error: ',mean_squared_error(y_val,model.predict(sm.add_constant(X_val))))
print('AIC: ',model.aic)

const    2.785095
0        0.693463
1        0.386584
2        0.841435
3        0.951460
4        1.016849
5        1.838966
6        0.614875
dtype: float64

Mean Squared Error:  1.2120485747138852
AIC:  192618.51175536733


In [None]:
- Topic #0: Overall experience
- Topic #1: Price
- Topic #2: "Worth it"
- Topic #3: Food
- Topic #4: Service
- Topic #5: 
- Topic #6: "Worth it"

### 𝑦ℎ𝑎𝑡=2.96+0.59∗Overall+0.36∗Price+0.68∗𝑊𝑜𝑟𝑡ℎ+0.86∗𝐹𝑜𝑜𝑑+0.94∗Service+1.66∗Topic5+0.77∗𝑊orth

#### Removed Intercept and Non-Topics

In [93]:
# Split into predictors and target
X = df_scores.drop(['userid','stars','4'],axis=1)
X = X.astype(float)
y = df_scores.stars
y = y.astype(float)
# Split Train vs Test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=1)
# Split Test set into validation & test set
X_test2, X_val, y_test2, y_val = train_test_split(X_test,y_test,test_size=0.5,stratify=y_test,random_state=1)

model = sm.OLS(y_train,X_train).fit()
print(model.params)
print()
print('Mean Squared Error: ',mean_squared_error(y_val,model.predict(X_val)))
print('AIC: ',model.aic)

0    2.238934
1    2.746020
2    2.346163
3    2.527677
5    4.596729
6    2.134483
dtype: float64

Mean Squared Error:  4.8720812615817515
AIC:  283305.64214061084
