In [1]:
import os
import csv
import pandas as pd
import numpy as np
import sklearn
import string
import statsmodels.api as sm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from nltk import tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
def print_top_words(model, feature_names, n_top_words):
    """Print one line per topic listing its highest-weighted terms.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Each row of ``components_`` holds the per-term weights of one topic.
    feature_names : sequence of str
        Term belonging to each column index of ``components_``.
    n_top_words : int
        Number of terms to list per topic, strongest first.

    Output goes to stdout; an empty line is printed after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries
        strongest = weights.argsort()[::-1][:n_top_words]
        terms = [feature_names[i] for i in strongest]
        print("Topic #%d: " % topic_idx + " ".join(terms))
    print()

In [3]:
def display_topics(model, feature_names, num_topics, no_top_words):
    """Print the first ``num_topics`` topics as comma-separated ``weight*term`` pairs.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Each row of ``components_`` holds the per-term weights of one topic.
    feature_names : sequence of str
        Term belonging to each column index of ``components_``.
    num_topics : int
        Topics with an index at or above this value are not printed.
    no_top_words : int
        Number of terms shown per topic, strongest first.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Indices only grow, so nothing after the first skipped topic is printed
        if topic_idx >= num_topics:
            break
        strongest = weights.argsort()[::-1][:no_top_words]
        # Left-aligned label padded to 11 characters so the term lists line up
        print("{:11}".format("Topic %d:" % topic_idx), end='')
        weighted_terms = ['{:04.3f}*'.format(weights[i]) + feature_names[i]
                          for i in strongest]
        print(", ".join(weighted_terms))

### Read in Data

In [4]:
# Read the review table and copy its 'Unnamed: 0' column into a column named 'userid'
df1 = pd.read_csv('vegas.csv').assign(userid=lambda reviews: reviews['Unnamed: 0'])
df1.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0.1,Unnamed: 0,business_id,name,city,state,postal_code,review_count,stars_x,categories,cool,date,funny,review_id,stars_y,text,useful,user_id,userid
0,0,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,Las Vegas,NV,89119.0,40.0,4.0,"Restaurants, Italian",0.0,2011-06-29 02:55:07,0.0,klcF45wKIOpJW_BhJslOJg,5.0,"We went there for dinner the other night, bein...",1.0,-Yz2wIcsdJxUOFMbTgoKQA,0
1,1,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,Las Vegas,NV,89119.0,40.0,4.0,"Restaurants, Italian",0.0,2010-10-06 18:20:13,0.0,Li-pQG6A7p5gbgZHTMeDSQ,4.0,i had the best Chicken Marcela ever. The spagh...,1.0,jYcf_e5p0UG0S-9gJq_tNA,1
2,2,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,Las Vegas,NV,89119.0,40.0,4.0,"Restaurants, Italian",0.0,2010-01-13 00:35:45,0.0,iRLX3dJ3ONvncIxPnXy1cw,5.0,Basically the best Italian in town for the pri...,1.0,nQC0JiPIk_jCooRDxpuw5A,2
3,3,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,Las Vegas,NV,89119.0,40.0,4.0,"Restaurants, Italian",0.0,2008-08-23 20:30:33,0.0,rklteWf9xnTU3fAtMFBRRw,3.0,Mmmmm delicious food and a little history. Mr....,1.0,Gv_-mtOKhWFtCjn9xFe0SQ,3
4,4,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,Las Vegas,NV,89119.0,40.0,4.0,"Restaurants, Italian",0.0,2009-06-01 20:02:55,0.0,UfRqM0RGdZa86hFcFEAnjw,3.0,"This is old Vegas, this atmosphere is old scho...",1.0,pabMYegF28KjHQ5hybAJ0A,4


In [5]:
# Keep only the rows whose postal_code equals 89118 and renumber them from 0
in_89118 = df1['postal_code'].eq(89118)
data = df1.loc[in_89118].reset_index(drop=True)
data.head()

Unnamed: 0.1,Unnamed: 0,business_id,name,city,state,postal_code,review_count,stars_x,categories,cool,date,funny,review_id,stars_y,text,useful,user_id,userid
0,22853,aNe8ofTYrealxqv7VtFTuw,Sierra Gold,Las Vegas,NV,89118.0,231.0,3.5,"Pubs, Sports Bars, Pizza, Nightlife, Food, Bre...",0.0,2015-07-16 07:31:28,2.0,XAm4_-lUxzF_Y449cWMHLQ,1.0,Th service here is very hit or miss... Sometim...,0.0,ABUt9wCHRsSpa8i7rtNAuQ,22853
1,22854,aNe8ofTYrealxqv7VtFTuw,Sierra Gold,Las Vegas,NV,89118.0,231.0,3.5,"Pubs, Sports Bars, Pizza, Nightlife, Food, Bre...",1.0,2015-10-08 01:13:50,1.0,yHeBEu-QQAWb0Mdq4SXEDg,3.0,Been going here the last couple Sundays to wat...,2.0,wXZS42D0L8hoCiSh2Krc_A,22854
2,22855,aNe8ofTYrealxqv7VtFTuw,Sierra Gold,Las Vegas,NV,89118.0,231.0,3.5,"Pubs, Sports Bars, Pizza, Nightlife, Food, Bre...",0.0,2013-08-15 16:06:29,0.0,1-9Mtjrnpk0cifMRyPgRwg,2.0,I know everyone has their favorites in pizza c...,0.0,48Ip5iJtWNvxo9QrdX07Og,22855
3,22856,aNe8ofTYrealxqv7VtFTuw,Sierra Gold,Las Vegas,NV,89118.0,231.0,3.5,"Pubs, Sports Bars, Pizza, Nightlife, Food, Bre...",1.0,2018-05-19 02:00:55,1.0,zMfx2xEWRDcO4n3w0BYpNw,5.0,"Good atmosphere, nice staff and menu is really...",1.0,Uj3qpOtr6Kr7QGLfiawMLA,22856
4,22857,aNe8ofTYrealxqv7VtFTuw,Sierra Gold,Las Vegas,NV,89118.0,231.0,3.5,"Pubs, Sports Bars, Pizza, Nightlife, Food, Bre...",2.0,2012-12-05 03:40:55,2.0,f50mHUfaAUTAGD5EfNanSA,4.0,"I've been coming here for lunch A LOT lately, ...",1.0,uj4iopBWA0RjpqoJ5xz_vQ,22857


In [6]:
# Number of rows left after the postal-code filter
len(data)

30956

In [7]:
# Remove every column the sentence-level analysis does not read.
# NOTE: `data` is overwritten, so running this cell a second time raises KeyError
# because the listed columns are already gone.
unused_columns = [
    'Unnamed: 0', 'categories', 'date', 'stars_x', 'cool', 'review_id', 'funny',
    'business_id', 'city', 'state', 'postal_code', 'review_count', 'useful', 'user_id',
]
data = data.drop(columns=unused_columns)
data.head()

Unnamed: 0,name,stars_y,text,userid
0,Sierra Gold,1.0,Th service here is very hit or miss... Sometim...,22853
1,Sierra Gold,3.0,Been going here the last couple Sundays to wat...,22854
2,Sierra Gold,2.0,I know everyone has their favorites in pizza c...,22855
3,Sierra Gold,5.0,"Good atmosphere, nice staff and menu is really...",22856
4,Sierra Gold,4.0,"I've been coming here for lunch A LOT lately, ...",22857


### Number of Topics

In [16]:
# Split reviews into individual sentences.
# Rows are collected in a plain list and turned into a DataFrame once at the end:
# DataFrame.append copied the whole frame on every call (quadratic in the number of
# sentences) and was removed in pandas 2.0.
# NOTE(review): only rows 20000..30955 of `data` are processed here; the ranges used for
# the other chunks are not visible in this notebook — confirm them before re-running.
sentence_rows = []
for i in range(20000, 30956):
    for sentence in tokenize.sent_tokenize(data.text[i]):
        sentence_rows.append(
            {'userid': data.userid[i], 'sentence': sentence, 'stars': data.stars_y[i]})
# Passing `columns` keeps the column order fixed and yields the same empty frame
# as before when no sentence was produced.
df = pd.DataFrame(sentence_rows, columns=['userid', 'sentence', 'stars'])

In [17]:
# Create Corpus for TFIDF: the sentence strings of the current chunk, in row order
corpus_3 = list(df.sentence)

In [11]:
# Chunk 1: write the sentence table to CSV (the row index is written as an extra first column).
# NOTE(review): this cell ran as In [11], before the visible sentence-splitting cell
# (In [16]). On a top-to-bottom run `df` holds whatever that cell last produced, so this
# line would not write chunk 1 — confirm the row range that belongs to this file.
df.to_csv('89118_1.csv')
# corpus_1 — presumably the sentence list built from this chunk; not defined in any visible cell

In [14]:
# Chunk 2: write the sentence table to CSV (the row index is written as an extra first column).
# NOTE(review): this cell ran as In [14], before the visible sentence-splitting cell
# (In [16]). On a top-to-bottom run `df` holds whatever that cell last produced, so this
# line would not write chunk 2 — confirm the row range that belongs to this file.
df.to_csv('89118_2.csv')
# corpus_2 — presumably the sentence list built from this chunk; not defined in any visible cell

In [21]:
# Chunk 3: write the sentence table to CSV (the row index is written as an extra first column).
# This cell ran as In [21], after the sentence-splitting cell (In [16]) and the
# corpus_3 cell (In [17]).
df.to_csv('89118_3.csv')
# corpus_3 is the sentence list built from this chunk

In [19]:
# Create Corpus for TFIDF by joining the per-chunk sentence lists in chunk order.
# The earlier `corpus = []` was overwritten immediately and has been removed.
# NOTE(review): corpus_1 and corpus_2 are not defined in any visible cell, so this line
# raises NameError on a fresh kernel — presumably they were built by re-running the
# sentence-splitting cell with other row ranges; confirm and restore those cells.
corpus = corpus_1 + corpus_2 + corpus_3

### 7 Topics

In [20]:
n_components = 7
n_top_words = 15

# TFIDF Vectorizer: one row per sentence, English stop words removed
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(corpus)

# NMF reduction.
# fit_transform fits the model and returns the sentence-by-topic weights in one pass;
# calling .fit() first only repeated the whole factorisation.
# random_state is fixed because later cells refer to topics by their index.
nmf = NMF(n_components=n_components, random_state=1)
W_pos = nmf.fit_transform(tfidf)

# Output Topics
# No beta_loss is passed, so the model minimises NMF's default Frobenius-norm objective;
# the header used to claim Kullback-Leibler divergence.
print("\nTopics in NMF model (Frobenius norm):")
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when present
# and fall back to the old name on older releases.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: great experience breakfast prices service atmosphere price overall pizza lunch spot menu coffee flavor selection
Topic #1: good really pretty pizza prices overall just chicken breakfast fries coffee price sauce taste experience
Topic #2: place love recommend amazing awesome breakfast really like sushi clean try nice favorite highly eat
Topic #3: food amazing fresh excellent awesome quality fast came just best tasty like order fantastic price
Topic #4: service friendly customer staff excellent fast amazing quick nice super awesome attentive clean slow helpful
Topic #5: definitely time come try vegas wait order just best coming like ll ve recommend eat
Topic #6: delicious fresh absolutely pizza chicken ordered super fries salad sauce breakfast hot menu healthy fried



- Topic #0: Overall experience
- Topic #1: Price
- Topic #2: "Worth it"
- Topic #3: Food
- Topic #4: Service
- Topic #5: (unlabeled)
- Topic #6: Food

### Label Sentences

In [22]:
# Reload the per-chunk sentence CSVs and stack them into a single table.
# NOTE(review): `path` is an absolute, machine-specific folder, while the CSVs above are
# written to the working directory — confirm where the files live and prefer a relative path.
import glob
path =r'C:\Users\xinro\Downloads\89118'
# glob returns matches in arbitrary order; sorting makes the row order deterministic,
# which matters because the next cell assigns topics to rows by position.
allFiles = sorted(glob.glob(path + "/*.csv"))
list_ = [pd.read_csv(file_,index_col=None, header=0) for file_ in allFiles]
# ignore_index gives one continuous 0..n-1 index instead of repeating each file's row numbers
frame = pd.concat(list_, ignore_index=True)
df = frame

In [23]:
# Label every sentence with the topic that has its largest NMF weight.
# Sentences shorter than 10 characters are marked as dropped with topic -1.
# On equal weights the lowest topic index wins.
array = [
    int(np.argmax(W_pos[i])) if len(corpus[i]) >= 10 else -1
    for i in range(len(W_pos))
]
# Create new column in df for topic
df['Topic'] = array

In [24]:
# Preview the sentence table, including the Topic column
df.head()

Unnamed: 0.1,Unnamed: 0,userid,sentence,stars,Topic
0,0,22853,Th service here is very hit or miss...,1.0,4
1,1,22853,Sometimes the staff is personable and attentive.,1.0,4
2,2,22853,"Other times I ask myself, ""Why do I come here???""",1.0,5
3,3,22853,"The staff often has the ""GTFOH"" stamp placed ...",1.0,4
4,4,22853,I would say that my continuos visits were for ...,1.0,3


### Vader Sentiment Analysis

In [25]:
# Initialize the VADER Sentiment Intensity Analyzer; this one instance scores every sentence below
analyser = SentimentIntensityAnalyzer()

In [26]:
# Score each sentence with the analyser and keep only its 'compound' value
compound_scores = [analyser.polarity_scores(text)['compound'] for text in df.sentence]
# Create new column in df for sentiment intensity score
df['sentiment'] = compound_scores

In [27]:
# Build one row per userid: mean sentiment for each topic plus the star rating.
# Done with groupby instead of nested Python loops and DataFrame.append, which copied
# the growing frame on every call and was removed in pandas 2.0.
N_TOPICS = 7  # topic ids 0..6; keep equal to the number of NMF components
topic_ids = list(range(N_TOPICS))

# First row of every userid, in order of first appearance (same order as df.userid.unique())
first_rows = df.drop_duplicates(subset='userid')

# Mean sentiment per (userid, topic); sentences labelled -1 are left out
topic_means = (
    df[df.Topic.isin(topic_ids)]
    .groupby(['userid', 'Topic'])['sentiment']
    .mean()
    .unstack('Topic')
    # One row per userid in first-appearance order, one column per topic id
    .reindex(index=first_rows['userid'].to_numpy(), columns=topic_ids)
    # A topic that never occurs for a userid scores 0
    .fillna(0.0)
)

# Same layout as before: userid, '0'..'6', stars — all stored as floats
df_scores = pd.DataFrame(topic_means.to_numpy(), columns=[str(t) for t in topic_ids])
df_scores.insert(0, 'userid', first_rows['userid'].to_numpy(dtype=float))
# Star rating is taken from the first sentence row of each userid
df_scores['stars'] = first_rows['stars'].to_numpy(dtype=float)

In [28]:
# Preview the table of per-topic sentiment scores
df_scores.head()

Unnamed: 0,userid,0,1,2,3,4,5,6,stars
0,22853.0,0.0,0.0,0.0,0.3716,0.0,0.0,0.0,1.0
1,22854.0,0.0,0.1531,0.0,0.0,0.2726,0.097375,0.0,3.0
2,22855.0,0.0,-0.4854,0.8402,0.0,0.164633,-0.02181,0.0,2.0
3,22856.0,0.0,0.924,0.0,0.0,0.0,0.0,0.0,5.0
4,22857.0,0.8074,0.2882,0.6249,0.0,0.0,0.316991,0.0,4.0


In [29]:
# Column means: average sentiment per topic and average star rating
# (the userid mean is just the mean of an identifier column)
df_scores.mean()

userid    554972.560085
0              0.122626
1              0.191292
2              0.172795
3              0.142847
4              0.187049
5              0.172974
6              0.128607
stars          3.972800
dtype: float64

### Linear Regression

In [30]:
# Predictors are the per-topic sentiment columns; the target is the star rating
X = df_scores.drop(columns=['userid', 'stars']).astype(float)
y = df_scores.stars.astype(float)
# Hold out 20% of the rows, stratified on the star rating
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1)
# Halve the held-out rows into a validation set and a final test set
X_test2, X_val, y_test2, y_val = train_test_split(
    X_test, y_test, test_size=0.5, stratify=y_test, random_state=1)

#### Simple Linear Regression

In [31]:
# Ordinary least squares on the training rows, with an intercept column added
train_design = sm.add_constant(X_train)
model = sm.OLS(y_train, train_design).fit()
print(model.params)
print()
# The validation rows get the same intercept column before predicting
val_predictions = model.predict(sm.add_constant(X_val))
print('Mean Squared Error: ', mean_squared_error(y_val, val_predictions))
print('AIC: ', model.aic)

const    2.992714
0        0.565678
1        0.477530
2        0.751501
3        0.877763
4        0.947501
5        1.703245
6        0.712843
dtype: float64

Mean Squared Error:  1.132343704733619
AIC:  73315.223981093


In [None]:
# Topic labels (kept as comments: this is a code cell, and the bare markdown list
# was a SyntaxError that stopped a full run of the notebook)
# - Topic #0: Overall experience
# - Topic #1: Price
# - Topic #2: "Worth it"
# - Topic #3: Food
# - Topic #4: Service
# - Topic #5: (unlabeled)
# - Topic #6: Food

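### $\hat{y} = 2.99 + 0.57\cdot\text{Overall} + 0.48\cdot\text{Price} + 0.75\cdot\text{Worth} + 0.88\cdot\text{Food}_{3} + 0.95\cdot\text{Service} + 1.70\cdot\text{Topic5} + 0.71\cdot\text{Food}_{6}$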

#### Removed Intercept and Non-Topics

In [32]:
# Refit without an intercept and with one topic column removed.
# NOTE(review): the section heading mentions removing non-topics, but the column dropped
# here is '4', which the topic list labels "Service"; the topic without a label is '5' —
# confirm which column was meant.
X = df_scores.drop(columns=['userid', 'stars', '4']).astype(float)
y = df_scores.stars.astype(float)
# Hold out 20% of the rows, stratified on the star rating
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1)
# Halve the held-out rows into a validation set and a final test set
X_test2, X_val, y_test2, y_val = train_test_split(
    X_test, y_test, test_size=0.5, stratify=y_test, random_state=1)

# No constant column is added, so the fitted model has no intercept term
model = sm.OLS(y_train, X_train).fit()
print(model.params)
print()
val_predictions = model.predict(X_val)
print('Mean Squared Error: ', mean_squared_error(y_val, val_predictions))
print('AIC: ', model.aic)

0    2.165249
1    2.895583
2    2.570985
3    2.605992
5    4.573152
6    2.414053
dtype: float64

Mean Squared Error:  4.940037126306319
AIC:  110465.07289288376
