# Text to Sentiment value converter

## Read file

In [1]:
import os
import numpy as np
import pandas as pd

inputFile = 'reg_reviews_01.csv'        # name of the original CSV file; it must include the `review` text column
outputFile = 'reg_reviews_NLP_01.csv'   # name of the output CSV file, which adds the new sentiment feature columns

In [2]:
# Load the reviews and add the sentiment feature columns, each initialised to 0
df = pd.read_csv(inputFile)
header_list_new = ['numSentence', 'numWords', 'totSentiment', 'avgSentiment', 'Sfreq0','Sfreq1','Sfreq2','Sfreq3','Sfreq4','Sfreq5']
for name in header_list_new:   # plain iteration: the enumerate() index was never used
    df[name] = 0

In [3]:
# Preview the first rows to confirm the new zero-initialised feature columns were added
df.head()

Unnamed: 0,cool,date,friends,funny,has_photo,localtion,photos,rating,restaurant_id,review,...,numSentence,numWords,totSentiment,avgSentiment,Sfreq0,Sfreq1,Sfreq2,Sfreq3,Sfreq4,Sfreq5
0,0,10/8/2017,0,0,True,"Bradenton, FL",2,4,617,Stopped in for lunch on a Friday for the first...,...,0,0,0,0,0,0,0,0,0,0
1,0,3/30/2018,0,0,True,"River Falls, WI",1,5,617,I was looking for a place to eat that made me ...,...,0,0,0,0,0,0,0,0,0,0
2,0,3/29/2018,1,0,True,"Tampa, FL",19,3,617,It was a busy early evening. The hostess worke...,...,0,0,0,0,0,0,0,0,0,0
3,6,4/15/2017,769,5,True,"Las Vegas, NV",4735,3,617,Located right near the terminal 2 cruise port ...,...,0,0,0,0,0,0,0,0,0,0
4,1,3/11/2017,515,1,True,"Tampa, FL",643,2,617,im sorry for the poor pizza Samantha. Im not s...,...,0,0,0,0,0,0,0,0,0,0


## Requirements to run Stanford Core NLP

#### Running Stanford Core NLP server
- Download Stanford CoreNLP from https://stanfordnlp.github.io/CoreNLP/index.html#license and unzip it

- Install Java

- Start the Stanford CoreNLP server by typing the commands below at a command prompt (e.g. Anaconda Prompt), run from the unzipped directory:

cd Documents\Python Scripts\stanford-corenlp-full-2018-01-31

java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 50000


#### Package required
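Install a Python wrapper for the CoreNLP server. The code in this notebook imports `pycorenlp` (`pip install pycorenlp`); another wrapper, e.g. stanfordcorenlp https://pypi.python.org/pypi/stanfordcorenlp, can be installed as shown below, but its API differs from the one used here.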

(base) C:\Users\ML\Documents\Python Scripts>pip install stanfordcorenlp-3.8.0.1-py2.py3-none-any.whl

Reference: https://www.khalidalnajjar.com/setup-use-stanford-corenlp-server-python/

## Sentiment analysis function

In [4]:
from pycorenlp import StanfordCoreNLP
# Client for the CoreNLP server; the server described above must already be listening on localhost:9000
nlp = StanfordCoreNLP('http://localhost:9000')

# Function; Output = # sentence, # words, avg.sentimentValue, sentimentHist
def stanford_sentiment(text_str, client=None):
    """Score one text (review) with the CoreNLP sentiment annotator.

    Parameters
    ----------
    text_str : str
        Text to analyse.
    client : object, optional
        Any object exposing ``annotate(text, properties=...)``. When omitted,
        the module-level ``nlp`` client is used, so existing calls are unchanged.

    Returns
    -------
    tuple
        ``(numSentence, numWords, totSentiment, avgSentiment, freq)``:

        - numSentence  : number of sentences reported by the server
        - numWords     : number of whitespace-separated tokens in ``text_str``
        - totSentiment : sum of the per-sentence ``sentimentValue`` scores
        - avgSentiment : mean of those scores (NaN when there are no sentences)
        - freq         : length-6 array; ``freq[k]`` counts sentences scored ``k``

    Raises
    ------
    RuntimeError
        If the reply is not a parsed JSON object containing "sentences".
    """
    if client is None:
        client = nlp

    res = client.annotate(text_str,
                          properties={
                              'annotators': 'sentiment',
                              'outputFormat': 'json',
                              'timeout': 50000,
                          })

    # NOTE(review): on a server-side failure the wrapper presumably hands back the raw
    # response text instead of parsed JSON; fail with a readable message rather than
    # an opaque indexing error on res["sentences"].
    if not isinstance(res, dict) or "sentences" not in res:
        raise RuntimeError(
            "CoreNLP did not return a JSON sentiment result: %r" % (str(res)[:200],))

    sentences = res["sentences"]
    numSentence = len(sentences)
    numWords = len(text_str.split())

    # one score per sentence, kept as floats so the sum/mean are numpy floats
    arraySentVal = np.array([int(s["sentimentValue"]) for s in sentences], dtype=float)

    # sum of sentiment values for all sentences in a text/review
    totSentiment = arraySentVal.sum()

    # avg. of sentiment values; NaN (without an empty-slice warning) when there are no sentences
    avgSentiment = arraySentVal.mean() if numSentence else np.nan

    # frequency of each sentimentValue in a text/review.
    # NOTE(review): per the CoreNLP sentiment docs the scale is presumably
    # 0 = very negative, 1 = negative, 2 = neutral, 3 = positive, 4 = very positive,
    # in which case the last bin (5) stays empty -- confirm for your server version.
    bins = [0, 1, 2, 3, 4, 5, 6]
    freq = np.histogram(arraySentVal, bins)[0]    # counts only, bin edges dropped

    return (numSentence, numWords, totSentiment, avgSentiment, freq)

## Text to Sentiment Score conversion

In [5]:
%%time

# Sentiment score calculation
# input  = review text with '\n' characters replaced by spaces (this has little effect on the sentiment analysis)
# output = number of sentences and words,
#          sum of the sentiment scores of all sentences in a review,
#          average of those sentiment scores,
#          histogram counts of the per-sentence scores, stored in Sfreq0..Sfreq5
#          (e.g. 1 : Negative, 2 : Neutral, 3 : Positive)

dfLength = len(df)

# avgSentiment is usually fractional but the column was created as integer zeros;
# make it float up front so the per-row writes below never have to change the column dtype.
df['avgSentiment'] = df['avgSentiment'].astype(float)

# (row, error description) for every review that could not be scored; inspect after the run
failed_rows = []

for i in range(dfLength):
    try:
        numSentence, numWords, totSentiment, avgSentiment, freq = stanford_sentiment(df.review[i].replace('\n', " "))
        df.loc[i, 'numSentence'] = numSentence
        df.loc[i, 'numWords'] = numWords
        df.loc[i, 'totSentiment'] = int(totSentiment)   # a sum of integer scores, so int() is exact
        df.loc[i, 'avgSentiment'] = avgSentiment
        for k in range(6):
            df.loc[i, 'Sfreq' + str(k)] = freq[k]
    except Exception as exc:
        # `except Exception` (not a bare `except:`) so that interrupting the kernel
        # still stops this multi-hour loop; the row keeps its default values.
        failed_rows.append((i, repr(exc)))
        print("error where i =", i)

error where i = 7425
error where i = 8254
error where i = 14014
error where i = 14852
error where i = 18967
error where i = 20471
error where i = 34744
error where i = 41192
error where i = 47756
error where i = 66044
error where i = 66763
error where i = 66992
error where i = 68939
Wall time: 9h 34min 40s


In [6]:
# df.review[1229]

In [7]:
# df.review[7118]

## Write output into a csv file

In [8]:
# Save the reviews together with the new sentiment columns; the DataFrame index is not written
df.to_csv(outputFile, index=False, encoding='utf-8')