In [1]:
import os
import csv
import pandas as pd
import numpy as np
import sklearn
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from nltk import tokenize

In [2]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [3]:
def display_topics(model, feature_names, num_topics, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        if topic_idx < num_topics:
            print("{:11}".format("Topic %d:" %(topic_idx)), end='')
            print(", ".join(['{:04.3f}*'.format(topic[i])+feature_names[i] \
                             for i in topic.argsort()[:-no_top_words-1:-1]]))

In [4]:
# Read in Data
data = pd.read_csv('data_89109.csv')
data['userid'] = data['Unnamed: 0']
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,business_id,name,city,state,postal_code,review_count,stars_x,categories,cool,date,funny,review_id,stars_y,text,useful,user_id,userid
0,0,273,6fPQJq4f_yiq1NHn0fd11Q,La Creperie,Las Vegas,NV,89109.0,535.0,3.5,"French, Restaurants, Creperies",0.0,2011-10-27 17:24:21,0.0,jByDg2ZFV0rtPabku-unxw,4.0,This is always a must visit on vegas trips. I...,0.0,6e0khvHCOJU1YTCx8gDfSw,0
1,1,274,6fPQJq4f_yiq1NHn0fd11Q,La Creperie,Las Vegas,NV,89109.0,535.0,3.5,"French, Restaurants, Creperies",0.0,2011-08-24 22:41:59,0.0,M8ebYJfjl6MkqfMGxywv-A,4.0,AWESOME! Im not even a crepe person.... which ...,1.0,m2j1IYqreZKF1crpx5-7Cg,1
2,2,275,6fPQJq4f_yiq1NHn0fd11Q,La Creperie,Las Vegas,NV,89109.0,535.0,3.5,"French, Restaurants, Creperies",0.0,2017-02-19 08:00:59,0.0,Db3RMWo2sSg22norLqeVGQ,4.0,"God selection of crepes! Not much of s wait, r...",0.0,Z2FuxpUUQ1pTbolsCxHPXw,2
3,3,276,6fPQJq4f_yiq1NHn0fd11Q,La Creperie,Las Vegas,NV,89109.0,535.0,3.5,"French, Restaurants, Creperies",0.0,2016-04-11 16:42:57,0.0,iAd4kyeMsNtmveg86zmzkg,4.0,Decent quality - they're super fast to get you...,0.0,rytnXVNx7NJMx6BGz0vsqw,3
4,4,277,6fPQJq4f_yiq1NHn0fd11Q,La Creperie,Las Vegas,NV,89109.0,535.0,3.5,"French, Restaurants, Creperies",0.0,2017-01-14 21:43:08,0.0,UhUH7iZzNumvgj-UVQGzyA,4.0,"Fast, friendly service and a little crepe stan...",2.0,Zohjr4ZPl76vryPqiQLJ4A,4


In [5]:
len(data)

414764

In [6]:
data = data.drop(['Unnamed: 0','categories','date','stars_x','cool','review_id','funny','business_id','city','state','postal_code','review_count','useful','user_id'],axis=1)
data.head()

Unnamed: 0,Unnamed: 0.1,name,stars_y,text,userid
0,273,La Creperie,4.0,This is always a must visit on vegas trips. I...,0
1,274,La Creperie,4.0,AWESOME! Im not even a crepe person.... which ...,1
2,275,La Creperie,4.0,"God selection of crepes! Not much of s wait, r...",2
3,276,La Creperie,4.0,Decent quality - they're super fast to get you...,3
4,277,La Creperie,4.0,"Fast, friendly service and a little crepe stan...",4


### Number of Topics

In [7]:
# Split reviews into individual sentences 
df = pd.DataFrame(columns=['userid','sentence','stars'])
for i in range(0,60000,1):
    sentences = tokenize.sent_tokenize(data.text[i])
    for j in sentences:
        df = df.append({'userid':data.userid[i],'sentence':j,'stars':data.stars_y[i]},ignore_index=True)

In [15]:
# Create Corpus for TFIDF choose the first 60000 rows
corpus_3 = []
for i in df.sentence:
        corpus_3.append(i)

In [9]:
#1
df.to_csv('89109_1.csv')
#corpus_1

In [11]:
#2
df.to_csv('89109_2.csv')
#corpus_2

In [16]:
#3
df.to_csv('89109_3.csv')
#corpus_3

In [25]:
df_3 = pd.read_csv('89109_3.csv')
corpus_3 = []
for i in df_3.sentence:
        corpus_3.append(i)

In [26]:
# Create Corpus for TFIDF
corpus = []
corpus = corpus_1+corpus_2+corpus_3

In [42]:
n_components = 10
n_top_words = 15

# TFIDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(corpus)

# NMF reduction
nmf = NMF(n_components=n_components).fit(tfidf)
W_pos = nmf.fit_transform(tfidf)

# Output Topics
print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: great experience atmosphere overall staff drinks time meal selection location server restaurant spot ambiance breakfast
Topic #1: good pretty really pizza overall experience drinks selection thing buffet sushi price fries desserts flavor
Topic #2: vegas best las buffet time ve favorite trip visit strip restaurant buffets hotel eat bellagio
Topic #3: food quality ok decent excellent price average better awesome mediocre selection drinks portions chinese okay
Topic #4: service excellent friendly customer slow attentive fast quick horrible staff terrible stars bad awesome nice
Topic #5: place love recommend try eat awesome loved fun stars highly nice looking strip cool overall
Topic #6: just really like time wait don didn got came nice ordered chicken order pizza went
Topic #7: amazing chicken absolutely experience pizza fried cheese burger staff server mac simply waffles sushi ordered
Topic #8: delicious pizza chic

In [27]:
n_components = 7
n_top_words = 15

# TFIDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(corpus)

# NMF reduction
nmf = NMF(n_components=n_components).fit(tfidf)
W_pos = nmf.fit_transform(tfidf)

# Output Topics
print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: great experience atmosphere overall time staff drinks meal selection location server restaurant ambiance spot breakfast
Topic #1: good pretty really pizza overall experience drinks selection buffet thing sushi price desserts prices fries
Topic #2: vegas definitely time best come las buffet ve favorite visit trip try recommend restaurant eat
Topic #3: food amazing quality delicious ok decent excellent average price better mediocre drinks awesome selection portions
Topic #4: service excellent friendly customer amazing slow attentive fast quick staff horrible terrible stars bad awesome
Topic #5: place love recommend amazing try highly eat loved fun awesome stars nice looking definitely overall
Topic #6: just really like delicious wait chicken got don didn pizza ordered nice worth came order



- Topic #0: Overall experience
- Topic #1: Price
- Topic #2: "Worth it"
- Topic #3: Food
- Topic #4: Service
- Topic #5: 
- Topic #6: Food

In [28]:
import glob
path =r'C:\Users\xinro\Downloads\89109'
allFiles = glob.glob(path + "/*.csv")
frame = pd.DataFrame()
list_ = []
for file_ in allFiles:
    df = pd.read_csv(file_,index_col=None, header=0)
    list_.append(df)
frame = pd.concat(list_)
df = frame

In [29]:
# Append Topic with highest score
array = []
# For all NMF array
for i in range(0,len(W_pos),1):
    # Create dictionary with Topics and its NMF scores for each sentence
    topic_dict = {}
    # Drop sentences that have length less than 10 by setting topic to -1
    if len(corpus[i])>=10:
        for ind, w in enumerate(W_pos[i]):
            topic_dict[ind] = w
        # Classify sentence to the topic with highest score
        array.append(max(topic_dict, key=topic_dict.get))
    else:
        array.append(-1)
# Create new column in df for topic
df['Topic'] = array

In [31]:
# Initialize Sentiment Intensity Analyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [32]:
# Append Sentiment Intensity Scores for each sentence
array = []
for i in df.sentence:
    # Generate Sentiment Intensity Scores and store in array
    score = analyser.polarity_scores(i)
    array.append(score['compound'])
# Create new column in df for sentiment intensity score
df['sentiment'] = array

In [33]:
# Initialize Final df of intensity scores
df_scores = pd.DataFrame(columns=['userid','0','1','2','3','4','5','6','stars'])
# For every user aggregate the sentiment scores by topic
for i in df.userid.unique():
    # Create df of scores from same user
    temp_df = df[df.userid==i].reset_index(drop=True)
    # For every topic
    topic_score = []
    for j in range(0,7,1):
        score = 0
        count = 0
        for k in range(0,len(temp_df),1):
            # If topic equal to current topic
            if temp_df.Topic[k] == j:
                # Add sentiment score
                score = score + temp_df.sentiment[k]
                # Increase count
                count = count + 1
        # If count = 0 then no score for topic
        if count==0:
            topic_score.append(0)
        # Else append average score for topic
        else:
            topic_score.append(score/count)
    # Insert UserId and Star Rating 
    topic_score.insert(0,temp_df.userid[0])
    topic_score.insert(len(topic_score),temp_df.stars[0])
    # Transform and Append into main df
    temp = pd.DataFrame(pd.Series(topic_score))
    temp = temp.transpose()
    temp.columns = df_scores.columns
    df_scores = df_scores.append(temp,ignore_index=True)

In [38]:
import statsmodels.api as sm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [39]:
# Split into predictors and target
X = df_scores.drop(['userid','stars'],axis=1)
X = X.astype(float)
y = df_scores.stars
y = y.astype(float)
# Split Train vs Test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=1)
# Split Test set into validation & test set
X_test2, X_val, y_test2, y_val = train_test_split(X_test,y_test,test_size=0.5,stratify=y_test,random_state=1)

In [40]:
model = sm.OLS(y_train,sm.add_constant(X_train)).fit()
print(model.params)
print()
print('Mean Squared Error: ',mean_squared_error(y_val,model.predict(sm.add_constant(X_val))))
print('AIC: ',model.aic)

const    2.871471
0        0.628335
1        0.315986
2        0.850981
3        0.838317
4        0.896340
5        0.753629
6        1.653815
dtype: float64

Mean Squared Error:  1.2035580251310127
AIC:  144970.80224183755


In [None]:
- Topic #0: Overall experience
- Topic #1: Price
- Topic #2: "Worth it"
- Topic #3: Food
- Topic #4: Service
- Topic #5: 
- Topic #6: Food

### 𝑦ℎ𝑎𝑡=2.87+0.63∗Overall+0.32∗Price+0.85∗𝑊𝑜𝑟𝑡ℎ+0.84∗𝐹𝑜𝑜𝑑+0.90∗Service+0.75Topic5+1.65∗Food

In [41]:
# Split into predictors and target
X = df_scores.drop(['userid','stars','4'],axis=1)
X = X.astype(float)
y = df_scores.stars
y = y.astype(float)
# Split Train vs Test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=1)
# Split Test set into validation & test set
X_test2, X_val, y_test2, y_val = train_test_split(X_test,y_test,test_size=0.5,stratify=y_test,random_state=1)

model = sm.OLS(y_train,X_train).fit()
print(model.params)
print()
print('Mean Squared Error: ',mean_squared_error(y_val,model.predict(X_val)))
print('AIC: ',model.aic)

0    2.163695
1    2.749672
2    2.628318
3    2.197884
5    2.299617
6    4.525110
dtype: float64

Mean Squared Error:  5.030062303354737
AIC:  213708.88908207347
