In [6]:
import os
import numpy as np
import pandas as pd
from pycorenlp import StanfordCoreNLP


In [None]:

# This notebook requires a Stanford CoreNLP server running locally (it is reached at http://localhost:9000 below)

# wget https://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
# unzip stanford-corenlp-full-2018-10-05.zip
# cd stanford-corenlp-full-2018-10-05
# java -mx6g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -timeout 5000

In [4]:

# Path to the Excel workbook that holds the dataset (relative to this notebook)
inputFile = '../output.xls'


# Load the workbook into a DataFrame and preview its first five rows
df = pd.read_excel(inputFile)
df.head(5)

Unnamed: 0.1,Unnamed: 0,original_id,span1,span2,signal,context,idx,label,direction,source,ann_file,split,global_id
0,0,1,['configuration'],['elements'],[],The system as described above has its greatest...,span1 73:86\nspan2 98:106\nsignal,0,1,2,,0,1
1,1,2,['child'],['cradle'],[],The child was carefully wrapped and bound into...,span1 4:9\nspan2 51:57\nsignal,0,-1,2,,0,2
2,2,3,['author'],['disassembler'],[],The author of a keygen uses a disassembler to ...,span1 4:10\nspan2 30:42\nsignal,0,1,2,,0,3
3,3,4,['ridge'],['surge'],[],A misty ridge uprises from the surge.,span1 8:13\nspan2 31:36\nsignal,0,-1,2,,0,4
4,4,5,['student'],['association'],[],The student association is the voice of the un...,span1 4:11\nspan2 12:23\nsignal,0,0,2,,0,5


In [7]:

# Client for the CoreNLP server; assumes a server is already listening on localhost:9000
nlp = StanfordCoreNLP('http://localhost:9000')

# Function; Output = # sentences, # words, total / avg. sentimentValue, sentimentValue histogram
def stanford_sentiment(text_str, client=None):
    """Annotate ``text_str`` with the ``sentiment`` annotator and summarise the result.

    Parameters
    ----------
    text_str : str
        Text to send to the annotation server.
    client : object, optional
        Any object exposing ``annotate(text, properties=...)``. When omitted,
        the notebook-level ``nlp`` client is used, so existing one-argument
        calls behave as before.

    Returns
    -------
    tuple
        ``(numSentence, numWords, totSentiment, avgSentiment, freq)`` where

        * ``numSentence`` is the number of sentences in the response,
        * ``numWords`` is the number of whitespace-separated tokens in ``text_str``,
        * ``totSentiment`` / ``avgSentiment`` are the sum / mean of the
          per-sentence ``sentimentValue`` (``avgSentiment`` is NaN when the
          response contains no sentences),
        * ``freq`` is a length-6 array counting sentences per integer
          sentiment value 0..5.

    Raises
    ------
    TypeError
        If the server reply is not a parsed JSON object (a dict).
    KeyError
        If the reply lacks ``"sentences"`` or a sentence lacks ``"sentimentValue"``.
    """
    if client is None:
        client = nlp

    res = client.annotate(text_str,
                          properties={
                              'annotators': 'sentiment',
                              'outputFormat': 'json',
                              'timeout': 40000,
                          })

    # NOTE(review): the client appears to hand back the raw response text when
    # the reply is not valid JSON (e.g. a server-side timeout message). Indexing
    # that string would raise an opaque "string indices" TypeError, so fail with
    # the same exception type but a message that shows what the server said.
    if not isinstance(res, dict):
        raise TypeError(
            "CoreNLP did not return a JSON object: {!r}".format(str(res)[:200]))

    sentences = res["sentences"]
    numSentence = len(sentences)
    numWords = len(text_str.split())

    # One sentiment value per sentence, kept as floats like the original np.zeros buffer.
    arraySentVal = np.array([int(s["sentimentValue"]) for s in sentences],
                            dtype=float)

    # sum of sentiment values
    totSentiment = arraySentVal.sum()

    # avg. of sentiment values; skip np.mean on an empty array (NaN plus a RuntimeWarning)
    avgSentiment = arraySentVal.mean() if numSentence else np.nan

    # frequency of sentimentValue.
    # Seven edges -> six bins, one per integer value 0..5. Six bins are kept
    # because the notebook's scoring loop reads freq[0]..freq[5].
    bins = [0, 1, 2, 3, 4, 5, 6]
    freq = np.histogram(arraySentVal, bins)[0]    # counts only, bin edges discarded

    return (numSentence, numWords, totSentiment, avgSentiment, freq)

In [9]:
%%time
# Score only the first 10 rows as a quick trial; widen the range (e.g. to
# len(df)) to process the whole dataset.
for i in range(10):
    try:
        # Newlines are flattened to spaces before the text is sent for annotation.
        numSentence, numWords, totSentiment, avgSentiment, freq = stanford_sentiment(
            df.context[i].replace('\n', " "))

        # Scalar summaries, written under the column name matching each value.
        for column, value in (('numSentence', numSentence),
                              ('numWords', numWords),
                              ('totSentiment', totSentiment),
                              ('avgSentiment', avgSentiment)):
            df.loc[i, column] = value

        # Histogram counts go to Sfreq0 .. Sfreq5.
        for k in range(6):
            df.loc[i, 'Sfreq{}'.format(k)] = freq[k]
        print(freq)
    except Exception as err:
        # Best-effort per row: report which row failed and why, then carry on.
        # Catching Exception (not a bare except) lets Ctrl-C / kernel interrupt
        # stop the loop.
        print("error where i =", i, "->", repr(err))

outputFile = 'senti_analysis.xls'
# The `encoding` keyword was dropped: pandas deprecated it as unused and later
# removed it from to_excel.
# NOTE(review): writing the legacy .xls format depends on the installed pandas
# still supporting an .xls writer - confirm, or switch to .xlsx.
df.to_excel(outputFile, index=False)

[0 0 0 1 0 0]
[0 0 0 1 0 0]
[0 0 1 0 0 0]
[0 0 0 1 0 0]
[0 1 0 0 0 0]
[0 0 0 1 0 0]
[0 1 0 0 0 0]
[0 0 1 0 0 0]
[0 1 0 0 0 0]
[0 1 0 0 0 0]
CPU times: user 4.22 s, sys: 34.8 ms, total: 4.25 s
Wall time: 5.04 s


In [10]:
# Display the frame, which now includes the sentiment columns written by the scoring loop
df

Unnamed: 0.1,Unnamed: 0,original_id,span1,span2,signal,context,idx,label,direction,source,...,numSentence,numWords,totSentiment,avgSentiment,Sfreq0,Sfreq1,Sfreq2,Sfreq3,Sfreq4,Sfreq5
0,0,1,['configuration'],['elements'],[],The system as described above has its greatest...,span1 73:86\nspan2 98:106\nsignal,0,1,2,...,1.0,16.0,3.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1,2,['child'],['cradle'],[],The child was carefully wrapped and bound into...,span1 4:9\nspan2 51:57\nsignal,0,-1,2,...,1.0,15.0,3.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2,3,['author'],['disassembler'],[],The author of a keygen uses a disassembler to ...,span1 4:10\nspan2 30:42\nsignal,0,1,2,...,1.0,15.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0
3,3,4,['ridge'],['surge'],[],A misty ridge uprises from the surge.,span1 8:13\nspan2 31:36\nsignal,0,-1,2,...,1.0,7.0,3.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4,5,['student'],['association'],[],The student association is the voice of the un...,span1 4:11\nspan2 12:23\nsignal,0,0,2,...,1.0,20.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10712,10712,10713,['king'],['broom'],[],"After seating all the idols, which itself take...",span1 71:75\nspan2 109:114\nsignal,0,1,2,...,,,,,,,,,,
10713,10713,10714,['materials'],['industries'],[],The minister attributed the slow production of...,span1 51:60\nspan2 74:84\nsignal,0,0,2,...,,,,,,,,,,
10714,10714,10715,['umbrella'],['frame'],[],The umbrella frame is provided with a movable ...,span1 4:12\nspan2 13:18\nsignal,0,1,2,...,,,,,,,,,,
10715,10715,10716,['film'],['salesman'],[],Manos: The Hands of Fate is a low-budget horro...,span1 48:52\nspan2 80:88\nsignal,0,0,2,...,,,,,,,,,,


In [11]:
# Annotate the row labelled 1 with tokenisation, sentence splitting, POS
# tagging and both parsers, then show the whole JSON response.
output = nlp.annotate(
    df.context[1],
    properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse',
        'outputFormat': 'json',
    },
)
output


{'sentences': [{'index': 0,
   'parse': '(ROOT\n  (S\n    (NP (DT The) (NN child))\n    (VP (VBD was)\n      (VP\n        (ADVP (RB carefully))\n        (VBN wrapped)\n        (CC and)\n        (VBN bound)\n        (PP (IN into)\n          (NP (DT the) (NN cradle)))\n        (PP (IN by)\n          (NP\n            (NP (NNS means))\n            (PP (IN of)\n              (NP (DT a) (NN cord)))))))\n    (. .)))',
   'basicDependencies': [{'dep': 'ROOT',
     'governor': 0,
     'governorGloss': 'ROOT',
     'dependent': 5,
     'dependentGloss': 'wrapped'},
    {'dep': 'det',
     'governor': 2,
     'governorGloss': 'child',
     'dependent': 1,
     'dependentGloss': 'The'},
    {'dep': 'nsubjpass',
     'governor': 5,
     'governorGloss': 'wrapped',
     'dependent': 2,
     'dependentGloss': 'child'},
    {'dep': 'auxpass',
     'governor': 5,
     'governorGloss': 'wrapped',
     'dependent': 3,
     'dependentGloss': 'was'},
    {'dep': 'advmod',
     'governor': 5,
     'governor

In [12]:
# Same row, requesting only the `parse` annotator; show just the per-sentence
# part of the JSON response.
output = nlp.annotate(
    df.context[1],
    properties={
        'annotators': 'parse',
        'outputFormat': 'json',
    },
)
output['sentences']

[{'index': 0,
  'parse': '(ROOT\n  (S\n    (NP (DT The) (NN child))\n    (VP (VBD was)\n      (VP\n        (ADVP (RB carefully))\n        (VBN wrapped)\n        (CC and)\n        (VBN bound)\n        (PP (IN into)\n          (NP (DT the) (NN cradle)))\n        (PP (IN by)\n          (NP\n            (NP (NNS means))\n            (PP (IN of)\n              (NP (DT a) (NN cord)))))))\n    (. .)))',
  'basicDependencies': [{'dep': 'ROOT',
    'governor': 0,
    'governorGloss': 'ROOT',
    'dependent': 5,
    'dependentGloss': 'wrapped'},
   {'dep': 'det',
    'governor': 2,
    'governorGloss': 'child',
    'dependent': 1,
    'dependentGloss': 'The'},
   {'dep': 'nsubjpass',
    'governor': 5,
    'governorGloss': 'wrapped',
    'dependent': 2,
    'dependentGloss': 'child'},
   {'dep': 'auxpass',
    'governor': 5,
    'governorGloss': 'wrapped',
    'dependent': 3,
    'dependentGloss': 'was'},
   {'dep': 'advmod',
    'governor': 5,
    'governorGloss': 'wrapped',
    'dependent': 4,