## TEXT MINING C. Sc. 83040 : FINAL PROJECT

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from timeit import default_timer as timer
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
import warnings
from scipy.sparse import vstack
warnings.filterwarnings('ignore')

In [27]:
# read the pre-processed data
df_final = pd.read_pickle("D:\\Text Mining\\Final Project\\final_df10.pkl")

In [28]:
df_final.head()

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool,rating,name,latitude,longitude,city,state
299999,vXMQhG5JtNwSRb6iql7iig,GK07iEy8UllYo113DlNnww,HrG_BxmOMPbqstycmzORzw,1,2013-01-09,They did some transmission work for me that in...,3,2,1,negative,"""Ken's Transmission""",33.640787,-112.02529,Phoenix,AZ
254005,W_rauRWpM3ok_diREwEM2A,sjzv-c1k_HGGT9vZbfimWw,HONwpNQ2fmwMTOIZu0VI1A,1,2014-03-15,Poor service when it comes to scheduling. Base...,0,0,0,negative,"""Integrative Family Medicine Limited""",36.078796,-115.242975,Las Vegas,NV
110374,h2F3EgNUdcggV8XrW2VQdg,KL-JE4VkGW02LzeSlW3e6Q,NWlNMG_eBIvDjCcHK46eDQ,2,2015-10-14,"Went on a Tuesday night, and it was really emp...",2,1,1,negative,"""The Haymaker Restaurant""",33.640232,-111.979545,Phoenix,AZ
110373,L3r_OGsUqObVUEyP9uR_Bw,AlYZFOW_Xqi0qXelUrrHVw,NWlNMG_eBIvDjCcHK46eDQ,2,2015-12-13,"Meh\n\nI can imagine going here for the game, ...",3,2,0,negative,"""The Haymaker Restaurant""",33.640232,-111.979545,Phoenix,AZ
110372,u0LWSgqpthGe3R5YlWR6mw,hNY3RdZK7dT43dznSxiA5A,NWlNMG_eBIvDjCcHK46eDQ,2,2016-11-19,Went here last night for dinner. Service was l...,0,0,0,negative,"""The Haymaker Restaurant""",33.640232,-111.979545,Phoenix,AZ


In [36]:
# Function that runs naive bayes algorithm on terms by 
# considering the top "MaxFeatures" high frequency terms
def runMNNaiveBayes(MaxFeatures):
    count_vect = CountVectorizer(stop_words="english", max_features = MaxFeatures)
    tokens = count_vect.fit_transform(df_final["text"].tolist())
    labels = df_final["rating"].tolist()

    trainX = vstack([tokens[0:70000], tokens[100000:170000], tokens[200000:270000]])
    testX = vstack([tokens[70000:100000], tokens[170000:200000], tokens[270000:]])
    trainY = labels[0:70000] + labels[100000:170000] + labels[200000:270000] 
    testY = labels[70000:100000] + labels[170000:200000] + labels[270000:]
    
    clf = MultinomialNB().fit(trainX, trainY)
    predictedY = clf.predict(testX)
    
    test_y = pd.Series(testY, name='Actual')
    pred_y = pd.Series(predictedY, name='Predicted')
    df_confusion = pd.crosstab(test_y, pred_y)
    print("Accuracy: {}".format(round(sklearn.metrics.accuracy_score(testY, predictedY),3)))
    print("Macro averaged precision score: {}".format(round(sklearn.metrics.precision_score(testY, predictedY, average='macro'),3)))
    print("Macro averaged recall score: {}".format(round(sklearn.metrics.recall_score(testY, predictedY, average='macro'),3)))
    print("Macro averaged f-1 score: {}".format(round(sklearn.metrics.f1_score(testY, predictedY, average='macro'),3)))
    return df_confusion

In [30]:
start_time = timer()
runMNNaiveBayes(5000)
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Accuracy: 0.715
Macro averaged precision score: 0.718
Macro averaged recall score: 0.715
Macro averaged f-1 score: 0.716
Finished in : 32.39 seconds


In [37]:
# Function that runs linear SVM algorithm on terms by 
# considering the top "MaxFeatures" high frequency terms
def runLinearSVM(MaxFeatures):
    count_vect = CountVectorizer(stop_words="english", max_features = MaxFeatures)
    tokens = count_vect.fit_transform(df_final["text"].tolist())
    labels = df_final["rating"].tolist()

    trainX = vstack([tokens[0:70000], tokens[100000:170000], tokens[200000:270000]])
    testX = vstack([tokens[70000:100000], tokens[170000:200000], tokens[270000:]])
    trainY = labels[0:70000] + labels[100000:170000] + labels[200000:270000] 
    testY = labels[70000:100000] + labels[170000:200000] + labels[270000:]

    # Train and test a multinomial naive bayes algorithm
    clf = SGDClassifier().fit(trainX, trainY)
    predictedY = clf.predict(testX)
    
    test_y = pd.Series(testY, name='Actual')
    pred_y = pd.Series(predictedY, name='Predicted')
    df_confusion = pd.crosstab(test_y, pred_y)
    print("Accuracy: {}".format(round(sklearn.metrics.accuracy_score(testY, predictedY),3)))
    print("Macro averaged precision score: {}".format(round(sklearn.metrics.precision_score(testY, predictedY, average='macro'),3)))
    print("Macro averaged recall score: {}".format(round(sklearn.metrics.recall_score(testY, predictedY, average='macro'),3)))
    print("Macro averaged f-1 score: {}".format(round(sklearn.metrics.f1_score(testY, predictedY, average='macro'),3)))
    return df_confusion

In [32]:
start_time = timer()
runLinearSVM(5000)
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Accuracy: 0.735
Macro averaged precision score: 0.731
Macro averaged recall score: 0.735
Macro averaged f-1 score: 0.727
Finished in : 33.97 seconds


In [38]:
# Function that runs logistic regression algorithm on terms by 
# considering the top "MaxFeatures" high frequency terms
def runLogReg(MaxFeatures):
    count_vect = CountVectorizer(stop_words="english", max_features = MaxFeatures)
    tokens = count_vect.fit_transform(df_final["text"].tolist())
    labels = df_final["rating"].tolist()

    trainX = vstack([tokens[0:70000], tokens[100000:170000], tokens[200000:270000]])
    testX = vstack([tokens[70000:100000], tokens[170000:200000], tokens[270000:]])
    trainY = labels[0:70000] + labels[100000:170000] + labels[200000:270000] 
    testY = labels[70000:100000] + labels[170000:200000] + labels[270000:]

    # Train and test a multinomial naive bayes algorithm
    clf = SGDClassifier(loss='log').fit(trainX, trainY)
    predictedY = clf.predict(testX)
    
    test_y = pd.Series(testY, name='Actual')
    pred_y = pd.Series(predictedY, name='Predicted')
    df_confusion = pd.crosstab(test_y, pred_y)
    print("Accuracy: {}".format(round(sklearn.metrics.accuracy_score(testY, predictedY),3)))
    print("Macro averaged precision score: {}".format(round(sklearn.metrics.precision_score(testY, predictedY, average='macro'),3)))
    print("Macro averaged recall score: {}".format(round(sklearn.metrics.recall_score(testY, predictedY, average='macro'),3)))
    print("Macro averaged f-1 score: {}".format(round(sklearn.metrics.f1_score(testY, predictedY, average='macro'),3)))
    return df_confusion

In [35]:
start_time = timer()
conMat = runLogReg(5000)
print("Finished in : {} seconds".format(round(timer()-start_time,2)))
conMat

Accuracy: 0.752
Macro averaged precision score: 0.75
Macro averaged recall score: 0.752
Macro averaged f-1 score: 0.751
Finished in : 34.28 seconds


Predicted,negative,neutral,positive
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,23997,4657,1346
neutral,5198,19238,5564
positive,957,4573,24470


In [56]:
test_y = pd.Series(testY, name='Actual')
pred_y = pd.Series(predictedY, name='Predicted')
df_confusion = pd.crosstab(test_y, pred_y)
print("Accuracy: {}".format(round(sklearn.metrics.accuracy_score(testY, predictedY),3)))
print("Macro averaged precision score: {}".format(round(sklearn.metrics.precision_score(testY, predictedY, average='macro'),3)))
print("Macro averaged recall score: {}".format(round(sklearn.metrics.recall_score(testY, predictedY, average='macro'),3)))
print("Macro averaged f-1 score: {}".format(round(sklearn.metrics.f1_score(testY, predictedY, average='macro'),3)))
df_confusion

Accuracy: 0.696
Macro averaged precision score: 0.705
Macro averaged recall score: 0.696
Macro averaged f-1 score: 0.698


Predicted,negative,neutral,positive
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,10248,4351,401
neutral,2542,10162,2296
positive,1348,2759,10893


In [49]:
trainX = vstack([tokens[0:35000], tokens[50000:85000], tokens[100000:135000]])

In [16]:
df_buss = pd.read_csv('D:\\Text Mining\\Final Project\\yelp_business.csv')

In [19]:
tokens[0]

(<1x500 sparse matrix of type '<class 'numpy.int64'>'
 	with 12 stored elements in Compressed Sparse Row format>, 'positive')

In [18]:
df_buss = df_buss[['business_id', 'name', 'latitude', 'longitude', 'city', 'state']]

In [19]:
df_buss.head()

Unnamed: 0,business_id,name,latitude,longitude,city,state
0,FYWN1wneV18bWNgQjJ2GNg,"""Dental by Design""",33.33069,-111.978599,Ahwatukee,AZ
1,He-G7vWjzVUysIKrfNbPUQ,"""Stephen Szabo Salon""",40.291685,-80.1049,McMurray,PA
2,KQPW8lFf1y5BT2MxiSZ3QA,"""Western Motor Vehicle""",33.524903,-112.11531,Phoenix,AZ
3,8DShNS-LuFqpEWIp0HxijA,"""Sports Authority""",33.383147,-111.964725,Tempe,AZ
4,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",41.119535,-81.47569,Cuyahoga Falls,OH


In [20]:
df_neut.head()

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool,rating
8,w41ZS9shepfO3uEyhXEWuQ,u0LXt3Uea_GidxRW1xcsfg,5r6-G9C4YLbC7Ziz57l3rQ,3,2013-02-09,"Not bad!! Love that there is a gluten-free, ve...",1,0,0,neutral
11,PdZ_uFjbbkjtm3SCY_KrZw,u0LXt3Uea_GidxRW1xcsfg,XWTPNfskXoUL-Lf32wSk0Q,3,2011-09-28,Server was a little rude.\n\nOrdered the calam...,5,0,1,neutral
13,lsoSqIrrDbQvWpMvsSj2xw,u0LXt3Uea_GidxRW1xcsfg,RtUvSWO_UZ8V3Wpj0n077w,3,2012-12-03,Wanted to check out this place due to all the ...,2,1,1,neutral
17,xdu8nXrbNKeaywCX79KZSw,u0LXt3Uea_GidxRW1xcsfg,PFPUMF38-lraKzLcTiz5gQ,3,2010-09-15,Came here with my girlfriends one Sunday after...,2,0,0,neutral
18,K7o5jDInfmX3cY5oH6ATNw,u0LXt3Uea_GidxRW1xcsfg,oWTn2IzrprsRkPfULtjZtQ,3,2012-09-23,Came here for a burger as one of my friends sa...,4,0,0,neutral


In [21]:
df_neutN = pd.merge(df_neut, df_buss, left_on = "business_id", right_on="business_id")

In [24]:
df_neutN.shape

(615481, 15)

In [41]:
df_posN = pd.merge(df_pos, df_buss, left_on = "business_id", right_on="business_id")

In [27]:
df_final.memory_usage(deep=True)

Index            12000000
review_id       118500000
user_id         118500000
business_id     118500000
stars            12000000
date             12000000
text           1098617679
useful           12000000
funny            12000000
cool             12000000
rating            1500274
dtype: int64

In [42]:
df_posN["date"] = pd.to_datetime(df_posN["date"])

In [43]:
df_posN["city"] = df_posN.city.astype('category')

In [44]:
df_posN["city"].head()

0    Montréal
1    Montréal
2    Montréal
3    Montréal
4    Montréal
Name: city, dtype: category
Categories (1071, object): [110 Las Vegas, AGINCOURT, Aberdour, Aberlady, ..., peoria, toronto, Île des Soeurs, Île-des-Soeurs]

In [50]:
pd.unique(df_buss["state"])

array(['AZ', 'PA', 'OH', 'BW', 'NV', 'ON', 'NC', 'WI', 'SC', 'QC', 'IL',
       'MLN', 'EDH', 'FIF', 'CHE', 'CO', 'NYK', 'NE', 'HLD', 'WLN', 'ABE',
       'WA', 'SCB', 'KHL', 'ESX', 'CMA', 'FAL', 'ELN', 'NY', 'PKN', 'ST',
       '01', 'CA', 'NLK', 'GLG', 'C', 'VS', 'IN', 'XGL', 'AL', 'STG', 'VT',
       'CS', '30', 'AR', 'SL', 'NI', 'BY', 'AK', 'FLN', nan, 'FL', 'MN',
       '6', 'NTH', 'MT', 'B', 'GA', 'VA', 'DE', 'AB', 'HU', 'KY', '3',
       'TAM', 'ZET', 'RCC', 'WHT'], dtype=object)

In [16]:
%%HTML
<div class='tableauPlaceholder' id='viz1525575724929' style='position: relative'><noscript><a href='#'><img alt=' ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;31&#47;311workbook&#47;CompDash&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='311workbook&#47;CompDash' /><param name='tabs' value='yes' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;31&#47;311workbook&#47;CompDash&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1525575724929');                    var vizElement = divElement.getElementsByTagName('object')[0];                    vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.75)+'px';                    var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>