# Text to Sentiment value converter

## Read file

In [1]:
import os
import numpy as np
import pandas as pd

inputFile = 'reg_reviews_02.csv'        # enter original file name that include review text column
outputFile = 'reg_reviews_NLP_02.csv'   # enter oupput file name that will add new sentiment features

In [2]:
# Add empty column columns
df = pd.read_csv(inputFile)
header_list_new = ['numSentence', 'numWords', 'totSentiment', 'avgSentiment', 'Sfreq0','Sfreq1','Sfreq2','Sfreq3','Sfreq4','Sfreq5']
for i, name in enumerate(header_list_new):
    df[name] = 0

In [3]:
df.head()

Unnamed: 0,cool,date,friends,funny,has_photo,localtion,photos,rating,restaurant_id,review,...,numSentence,numWords,totSentiment,avgSentiment,Sfreq0,Sfreq1,Sfreq2,Sfreq3,Sfreq4,Sfreq5
0,0,8/26/2013,84,0,True,"Edgewater, NJ",34,4,350,Nearly all the restaurants on Fisherman's Whar...,...,0,0,0,0,0,0,0,0,0,0
1,0,11/11/2014,281,0,True,"Silicon Valley, CA",76,4,350,I've been going here since I was a little kid ...,...,0,0,0,0,0,0,0,0,0,0
2,4,6/2/2014,84,12,True,"Campbell, CA",14,2,350,"Um, no. I think as we headed up Fisherman's Wh...",...,0,0,0,0,0,0,0,0,0,0
3,0,12/28/2015,19,0,True,"Rockville, MD",303,3,350,Went here after reading great reviews on yelp ...,...,0,0,0,0,0,0,0,0,0,0
4,0,5/4/2015,97,0,True,"Sacramento, CA",1,5,350,You cannot go wrong with eating here. Clam Ch...,...,0,0,0,0,0,0,0,0,0,0


## Requirements to run Stanford Core NLP

#### Running Stanford Core NLP server
-Download NLP https://stanfordnlp.github.io/CoreNLP/index.html#license, and unzip

-Install Java

-Run Stanford Core NLP Server by typing below on a command prompt (Anaconda prompt) from the unzipped directory

cd Documents\Python Scripts\stanford-corenlp-full-2018-01-31

java -mx10g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 50000

!! java heap size can be adjusted by changing the number in the above cmd, -mx10g (this means allocated memory is 10GB)

#### Package required
Install a wrapper, e.g. stanfordcorenlp https://pypi.python.org/pypi/stanfordcorenlp

(base) C:\Users\ML\Documents\Python Scripts>pip install stanfordcorenlp-3.8.0.1-py2.py3-none-any.whl

Reference: https://www.khalidalnajjar.com/setup-use-stanford-corenlp-server-python/

## Sentiment analysis function

In [14]:
from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://localhost:9000')

# Function; Output = # sentence, # words, avg.sentimentValue, sentimentHist
def stanford_sentiment(text_str):
    res = nlp.annotate(text_str,
                   properties={
                       'annotators': 'sentiment',
                       'outputFormat': 'json',
                       'timeout': 50000,
                   })
    numSentence = len(res["sentences"])
    numWords = len(text_str.split())
    
    # data arrangement
    arraySentVal = np.zeros(numSentence)

    for i, s in enumerate(res["sentences"]):
        arraySentVal[i] = int(s["sentimentValue"])

    # sum of sentiment values for all sentences in a text/review
    totSentiment = sum(arraySentVal)

    # avg. of sentiment values for all sentences in a text/review
    avgSentiment = np.mean(arraySentVal)

    # frequency of sentimentValue in a text/review; {1 : Negative, 2 : Neutral, 3 : Positive}
    bins = [0,1,2,3,4,5,6]
    freq = np.histogram(arraySentVal, bins)[0]    # getting freq. only w/o bins

    return(numSentence, numWords, totSentiment, avgSentiment, freq)   

## Text to Sentiment Score conversion

In [5]:
%%time

# sentiment score calculation
# input = review text w/ removing '\n' characters in text (which does not affect the sentiment analysis much)
# output = number of sentences and words, 
#          sum of all sentiment score from each sentence in a review
#          avg of sentiment scores
#          hist.frequency values of sentiment score (0 to 5); {1 : Negative, 2 : Neutral, 3 : Positive}

dfLength = len(df)

for i in range(dfLength):
    try:
        numSentence, numWords, totSentiment, avgSentiment, freq = stanford_sentiment(df.review[i].replace('\n'," "))
        df.loc[i,'numSentence'] = numSentence
        df.loc[i,'numWords'] = numWords
        df.loc[i,'totSentiment'] = totSentiment
        df.loc[i,'avgSentiment'] = avgSentiment
        df.loc[i,'Sfreq0'] = freq[0]
        df.loc[i,'Sfreq1'] = freq[1]
        df.loc[i,'Sfreq2'] = freq[2]
        df.loc[i,'Sfreq3'] = freq[3]
        df.loc[i,'Sfreq4'] = freq[4]
        df.loc[i,'Sfreq5'] = freq[5]
    except:
        print("error where i =", i)

error where i = 3624
error where i = 3962
error where i = 28396
error where i = 31377
error where i = 31378
error where i = 31379
error where i = 31380
error where i = 41907
error where i = 48645
error where i = 49231
error where i = 56679
error where i = 57989
error where i = 62787
error where i = 66789
Wall time: 12h 21min 19s


In [6]:
# df.review[1229]

In [10]:
df.iloc[31377,:]

cool                                                            10
date                                                      6/9/2017
friends                                                        280
funny                                                           10
has_photo                                                     True
localtion                                             Torrance, CA
photos                                                        3375
rating                                                           4
restaurant_id                                                  214
review           P A S T A\r\n\r\n- B R E A K D O W N (# of + s...
reviews                                                        489
useful                                                          10
user_name                                               Stephie L.
numSentence                                                      0
numWords                                                      

In [11]:
df.review[31377]

'P A S T A\r\n\r\n- B R E A K D O W N (# of + stars = rating)\r\n* A T M O S P H E R E/C O N C E P T: -1 star\r\n* F O O D/P R E S E N T A T I O N: +1 star\r\n* P O R T I O N/C O S T: +1 star\r\n* S E R V I C E: +1 star\r\n* R E V I S I T: +1 star\r\n\r\nThe atmosphere is a bit unassuming with plain tables and utensils in a basket. Very casual looking with simple menus. I was happy that they served bread and salad with their pasta! && luckily their food was pretty good. The chef served it to us which was a nice touch. Their pasta was cooked perfectly in heaping portions on the plate! We tried the mentaiko and the uni pasta! Both were delicious! It had hints of flavors that were different from other Japanese Italian spots but it worked. It gave it a complex flavor. While my favorite uni pasta is still Cafe Hiro, I would still recommend this spot for good pasta! Delicious!\r\n\r\nP A R K I N G\r\n\r\nIt was tough, but since we parked a little further away it was free on Sunday. Most of t

In [15]:
# manually calculated NLP outputs // server was restarted due to memory leak  // missed 4 recording conversion

numSentence, numWords, totSentiment, avgSentiment, freq = stanford_sentiment(df.review[31377].replace('\n'," "))
out = [numSentence, numWords, totSentiment, avgSentiment, freq[0], freq[1], freq[2], freq[3], freq[4]]
print(out)

[14, 246, 29.0, 2.0714285714285716, 1, 4, 2, 7, 0]


In [17]:
df.review[31378]

"Pasta heaven!!!! We shared 4 dishes and the were all delicious. We were so surprised how we didn't feel super heavy after our meal. The noodles were the right consistently.\r\n\r\nMy favorite was the garlic pasta. It was really flavorful and light!  Oh my I'm drooling I need to stop by soon!\r\n\r\nYou can call and make reservations, it's not a huge restaurant. They also validate parking for the Honda plaza lot."

In [16]:
numSentence, numWords, totSentiment, avgSentiment, freq = stanford_sentiment(df.review[31378].replace('\n'," "))
out = [numSentence, numWords, totSentiment, avgSentiment, freq[0], freq[1], freq[2], freq[3], freq[4]]
print(out)

[9, 72, 16.0, 1.7777777777777777, 0, 6, 0, 2, 1]


In [18]:
numSentence, numWords, totSentiment, avgSentiment, freq = stanford_sentiment(df.review[31379].replace('\n'," "))
out = [numSentence, numWords, totSentiment, avgSentiment, freq[0], freq[1], freq[2], freq[3], freq[4]]
print(out)

[7, 85, 12.0, 1.7142857142857142, 0, 4, 1, 2, 0]


In [19]:
numSentence, numWords, totSentiment, avgSentiment, freq = stanford_sentiment(df.review[31380].replace('\n'," "))
out = [numSentence, numWords, totSentiment, avgSentiment, freq[0], freq[1], freq[2], freq[3], freq[4]]
print(out)

[9, 104, 21.0, 2.3333333333333335, 0, 2, 2, 5, 0]


In [20]:
df.review[3624]

"I give this place 3.5 stars. If dining in the evening (which I did), I would recommend sitting outside in the patio. It's quaintly decorated and sets a nice dinner mood, perfect for catching up with friends, family, or even your significant other.\r\n\r\nOur waitress recommended we order everything family style so that we could try a bit of everything, which ended up being a good idea b/c some of the dishes came out smaller than expected, whereas other dishes were of more appropriate dinner entree size. This place is beer and wine only. Also, the menu changes depending on what is in season and what the chef decides to prepare for the day.\r\n\r\nThe dishes I thought were note-worthy:\r\n1) arugula salad -- very fresh with a wonderful vinaigrette\r\n2) the country ham -- sliced thin, served with a creamy cheese spread, cantaloupe melons, and sliced/toasted bread -- more of a dish to share than have on your own as an entree in my opinion\r\n3) grilled hanger steak -- cooked appropriatel

retried the failed NLP analysis and failed again... 

In [21]:
numSentence, numWords, totSentiment, avgSentiment, freq = stanford_sentiment(df.review[3624].replace('\n'," "))
out = [numSentence, numWords, totSentiment, avgSentiment, freq[0], freq[1], freq[2], freq[3], freq[4]]
print(out)

TypeError: string indices must be integers

## Write output into a csv file

In [8]:
df.to_csv(outputFile, encoding='utf-8', index=False )