## Positive vs. negative words in books over time

* Based on words in over 15 million books scanned by the Google Books project.
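* Uses the Harvard IV-4 dictionary, with 1619 positive words and 1989 negative words.
* We calculate indices of positive and negative words as $$\text{PosWords}_t = \sum_{w\in \mathscr{P}} \text{WordCount}(w,t)$$ and $$\text{NegWords}_t = \sum_{w\in \mathscr{N}} \text{WordCount}(w,t),$$ where $\mathscr{P}$ and $\mathscr{N}$ are the sets of positive and negative dictionary words and $\text{WordCount}(w,t)$ is the number of occurrences of word $w$ in period $t$.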


![Happiness in books](https://kelley.iu.edu/nstoffma/da/graph_am_fic.jpg?1)

## The Harvard IV-4 dictionary

http://www.wjh.harvard.edu/~inquirer/homecat.htm

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Raw GitHub URL of the Harvard IV-4 word list (HIV-4.csv) shipped in the pysentiment repository
floc = 'https://raw.githubusercontent.com/hanzhichao2000/pysentiment/master/pysentiment/static/HIV-4.csv'

# Read the CSV straight from the URL into a DataFrame (requires network access)
h4 = pd.read_csv(floc)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
h4

Unnamed: 0,Entry,Source,Positiv,Negativ,Pstv,Affil,Ngtv,Hostile,Strong,Power,...,Anomie,NegAff,PosAff,SureLw,If,NotLw,TimeSpc,FormLw,Othtags,Defined
0,A,H4Lvd,,,,,,,,,...,,,,,,,,,DET ART,| article: Indefinite singular article--some o...
1,ABANDON,H4Lvd,,Negativ,,,Ngtv,,,,...,,,,,,,,,SUPV,|
2,ABANDONMENT,H4,,Negativ,,,,,,,...,,,,,,,,,Noun,|
3,ABATE,H4Lvd,,Negativ,,,,,,,...,,,,,,,,,SUPV,|
4,ABATEMENT,Lvd,,,,,,,,,...,,,,,,,,,Noun,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11783,ZENITH,H4,Positiv,,,,,,,,...,,,,,,,,,Noun,|
11784,ZERO,H4Lvd,,,,,,,,,...,,,,,,,,,DET,|
11785,ZEST,H4,Positiv,,,,,,Strong,,...,,,,,,,,,Noun,|
11786,ZINC,H4Lvd,,,,,,,,,...,,,,,,,,,Noun,|


In [7]:
h4[h4['Entry']=='ABANDON'].T.dropna()

Unnamed: 0,1
Entry,ABANDON
Source,H4Lvd
Negativ,Negativ
Ngtv,Ngtv
Weak,Weak
Fail,Fail
IAV,IAV
AffLoss,AffLoss
AffTot,AffTot
Othtags,SUPV


In [9]:
h4[h4['Entry']=='GROWTH'].T.dropna()

Unnamed: 0,4761
Entry,GROWTH
Source,H4Lvd
Strong,Strong
Passive,Passive
Increas,Increas
EndsLw,EndsLw
Othtags,Noun
Defined,| noun: An increase in size or extent


In [10]:
h4[h4['Entry']=='CHEER'].T.dropna()

Unnamed: 0,1696
Entry,CHEER
Source,H4Lvd
Positiv,Positiv
Affil,Affil
Active,Active
ComForm,ComForm
IAV,IAV
WlbGain,WlbGain
WlbTot,WlbTot
Othtags,SUPV


In [11]:
h4[['Entry', 'Positiv', 'Negativ']].dropna(subset=['Positiv', 'Negativ'], how='all')

Unnamed: 0,Entry,Positiv,Negativ
1,ABANDON,,Negativ
2,ABANDONMENT,,Negativ
3,ABATE,,Negativ
5,ABDICATE,,Negativ
6,ABHOR,,Negativ
...,...,...,...
11739,YAWN,,Negativ
11745,YEARN,,Negativ
11749,YELP,,Negativ
11783,ZENITH,Positiv,


In [12]:
# Keep only the word and its two sentiment columns, and drop every entry that is
# tagged neither Positiv nor Negativ (how='all' removes a row only when both are missing).
# Note: this rebinds h4, so the full dictionary loaded earlier is no longer available.
h4 = h4[['Entry', 'Positiv', 'Negativ']].dropna(subset=['Positiv', 'Negativ'], how='all')

In [13]:
len(h4['Positiv'].dropna())

1915

In [14]:
len(h4['Negativ'].dropna())

2291

In [15]:
# Encode sentiment numerically: start every row at +1, then overwrite with -1
# wherever the Negativ column is set.
# NOTE(review): an entry carrying both a Positiv and a Negativ tag would end up as -1 — confirm none exist.
h4['tag'] = 1
h4.loc[h4['Negativ']=='Negativ', 'tag'] = -1

h4.head(10)

Unnamed: 0,Entry,Positiv,Negativ,tag
1,ABANDON,,Negativ,-1
2,ABANDONMENT,,Negativ,-1
3,ABATE,,Negativ,-1
5,ABDICATE,,Negativ,-1
6,ABHOR,,Negativ,-1
7,ABIDE,Positiv,,1
8,ABILITY,Positiv,,1
9,ABJECT,,Negativ,-1
10,ABLE,Positiv,,1
11,ABNORMAL,,Negativ,-1


In [16]:
# The numeric 'tag' column now carries the sentiment, so the two text columns can go.
# Rebind instead of mutating in place (inplace=True has no performance benefit and
# hides the state change from the reader).
h4 = h4.drop(columns=['Positiv', 'Negativ'])

In [17]:
# Look for entries containing a non-word character, i.e. anything other than a
# letter, digit or underscore (that is what [^\w] matches).

# Fixed random_state so the same 10 example rows are shown on every run.
h4[h4['Entry'].str.contains(r'[^\w]')].sample(10, random_state=0)

Unnamed: 0,Entry,tag
4092,FINE#4,1
5843,KICK#4,-1
7773,POINT#6,-1
5684,INVITE#2,1
1443,BUY#2,1
5884,KNOW#4,1
549,APPALL#1,-1
9131,SECURITY#2,1
5719,ISOLATE#1,-1
4832,HAND#6,-1


In [22]:
# Strip the word-sense suffix ('#' + optional '_' + digits, e.g. FINE#4 -> FINE).
# regex=True is required: the pattern is a regular expression, and pandas >= 2.0
# treats the pattern as a literal string by default, which would leave every
# suffix in place without raising an error.
h4['Entry'] = h4['Entry'].str.replace(r'#_?\d+', '', regex=True)

  """Entry point for launching an IPython kernel.


In [23]:
# Once the '#n' sense suffixes are gone, several senses of a word that share the
# same tag become identical (Entry, tag) rows; keep a single copy of each.
# Words that still appear more than once after this have conflicting tags.
h4 = h4.drop_duplicates()

In [24]:
# Number of remaining rows per word. Because exact duplicates were already
# removed, a count above 1 means the word is listed with both a +1 and a -1 tag.
cnts = h4['Entry'].value_counts()
cnts

MIND           2
MATTER         2
FINE           2
ARREST         2
HAND           2
              ..
EAGER          1
EAGERNESS      1
EARNEST        1
EARNESTNESS    1
ZEST           1
Name: Entry, Length: 3626, dtype: int64

In [25]:
cnts[cnts==1].index

Index(['PATHETIC', 'PASSE', 'PARTNERSHIP', 'PASSIONATE', 'PARTNER', 'ABANDON',
       'PATIENCE', 'PATRIOT', 'PATRIOTIC', 'PATRON',
       ...
       'DURABLE', 'DUTY', 'DWINDLE', 'DYING', 'DYNAMIC', 'EAGER', 'EAGERNESS',
       'EARNEST', 'EARNESTNESS', 'ZEST'],
      dtype='object', length=3610)

In [26]:
# Keep only unambiguous words: those that occur exactly once and therefore have a single tag.
h4 = h4[h4['Entry'].isin(cnts[cnts==1].index)]

In [27]:
len(h4)

3610

In [28]:
h4['Entry'] = h4['Entry'].str.lower()

In [29]:
# Index by word; with only the 'tag' column left, squeeze() collapses the
# one-column frame into a Series so that h4['word'] returns that word's tag.
h4 = h4.set_index('Entry').squeeze()

In [30]:
h4['trust']

1

In [31]:
h4['happy']

1

In [32]:
# Convert the Series to a plain dict {word: tag} for lookups with .get().
# From here on h4 is a dict, no longer a pandas object.
h4 = h4.to_dict()

**An alternative dictionary for financial applications**

Loughran-McDonald Dictionary

https://sraf.nd.edu/loughranmcdonald-master-dictionary/

## Measuring sentiment in text

In [34]:
text = '''It was the best of times, it was the worst of times, it was the age of wisdom, 
it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, 
it was the season of Light, it was the season of Darkness, it was the spring of hope, it was 
the winter of despair, we had everything before us, we had nothing before us, we were all 
going direct to Heaven, we were all going direct the other way – in short, the period was so
far like the present period, that some of its noisiest authorities insisted on its being 
received, for good or for evil, in the superlative degree of comparison only.'''

In [35]:
text.split()

['It',
 'was',
 'the',
 'best',
 'of',
 'times,',
 'it',
 'was',
 'the',
 'worst',
 'of',
 'times,',
 'it',
 'was',
 'the',
 'age',
 'of',
 'wisdom,',
 'it',
 'was',
 'the',
 'age',
 'of',
 'foolishness,',
 'it',
 'was',
 'the',
 'epoch',
 'of',
 'belief,',
 'it',
 'was',
 'the',
 'epoch',
 'of',
 'incredulity,',
 'it',
 'was',
 'the',
 'season',
 'of',
 'Light,',
 'it',
 'was',
 'the',
 'season',
 'of',
 'Darkness,',
 'it',
 'was',
 'the',
 'spring',
 'of',
 'hope,',
 'it',
 'was',
 'the',
 'winter',
 'of',
 'despair,',
 'we',
 'had',
 'everything',
 'before',
 'us,',
 'we',
 'had',
 'nothing',
 'before',
 'us,',
 'we',
 'were',
 'all',
 'going',
 'direct',
 'to',
 'Heaven,',
 'we',
 'were',
 'all',
 'going',
 'direct',
 'the',
 'other',
 'way',
 '–',
 'in',
 'short,',
 'the',
 'period',
 'was',
 'so',
 'far',
 'like',
 'the',
 'present',
 'period,',
 'that',
 'some',
 'of',
 'its',
 'noisiest',
 'authorities',
 'insisted',
 'on',
 'its',
 'being',
 'received,',
 'for',
 'good',
 'or'

In [37]:
h4['light']

1

In [38]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [40]:
# Build a translation table that deletes every character in string.punctuation
# (the third argument of str.maketrans lists the characters to remove).
translator = str.maketrans('', '', string.punctuation)

# string.punctuation is ASCII-only, so the en dash in the passage is not removed.
print(text.translate(translator))

It was the best of times it was the worst of times it was the age of wisdom 
it was the age of foolishness it was the epoch of belief it was the epoch of incredulity 
it was the season of Light it was the season of Darkness it was the spring of hope it was 
the winter of despair we had everything before us we had nothing before us we were all 
going direct to Heaven we were all going direct the other way – in short the period was so
far like the present period that some of its noisiest authorities insisted on its being 
received for good or for evil in the superlative degree of comparison only


In [41]:
# Normalise the passage: remove punctuation and lower-case it so that the tokens
# match the lower-case dictionary keys. This overwrites the original text.
text = text.translate(translator).lower()

In [42]:
text.split()

['it',
 'was',
 'the',
 'best',
 'of',
 'times',
 'it',
 'was',
 'the',
 'worst',
 'of',
 'times',
 'it',
 'was',
 'the',
 'age',
 'of',
 'wisdom',
 'it',
 'was',
 'the',
 'age',
 'of',
 'foolishness',
 'it',
 'was',
 'the',
 'epoch',
 'of',
 'belief',
 'it',
 'was',
 'the',
 'epoch',
 'of',
 'incredulity',
 'it',
 'was',
 'the',
 'season',
 'of',
 'light',
 'it',
 'was',
 'the',
 'season',
 'of',
 'darkness',
 'it',
 'was',
 'the',
 'spring',
 'of',
 'hope',
 'it',
 'was',
 'the',
 'winter',
 'of',
 'despair',
 'we',
 'had',
 'everything',
 'before',
 'us',
 'we',
 'had',
 'nothing',
 'before',
 'us',
 'we',
 'were',
 'all',
 'going',
 'direct',
 'to',
 'heaven',
 'we',
 'were',
 'all',
 'going',
 'direct',
 'the',
 'other',
 'way',
 '–',
 'in',
 'short',
 'the',
 'period',
 'was',
 'so',
 'far',
 'like',
 'the',
 'present',
 'period',
 'that',
 'some',
 'of',
 'its',
 'noisiest',
 'authorities',
 'insisted',
 'on',
 'its',
 'being',
 'received',
 'for',
 'good',
 'or',
 'for',
 'evil

In [45]:
h4['abhor']

-1

In [46]:
h4['mellifluous']

KeyError: ignored

In [48]:
h4.get('mellifluous', 0)

0

In [44]:
[h4.get(w, 0) for w in text.split()]

[0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 -1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 -1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 -1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 -1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 -1,
 0,
 0,
 1,
 0,
 0,
 0,
 0]

In [49]:
# Net sentiment of the passage: +1 per positive word, -1 per negative word,
# 0 for any word that is not in the dictionary.
sum(h4.get(w, 0) for w in text.split())

2

In [50]:
def sent_score(s, dic=None, punc=None):
    '''Count the positive and negative dictionary words in a string.

    The characters in `punc` are deleted from `s`, the result is lower-cased
    and split on whitespace, and every resulting word is looked up in `dic`.

    Parameters
    ----------
    s : str
        Text to score.
    dic : dict
        Maps lower-case words to 1 (positive) or -1 (negative). Words that are
        missing, or mapped to any other value, only count toward the total.
    punc : str, optional
        Characters to delete before splitting. Defaults to
        string.punctuation, which is ASCII-only, so a character such as an
        en dash is kept and counted as a word.

    Returns
    -------
    tuple of int
        (number of positive words, number of negative words, total words).

    Raises
    ------
    ValueError
        If `dic` is None.
    '''
    import string
    if dic is None:
        raise ValueError('Must supply a dictionary of positive/negative words')
    if punc is None:
        punc = string.punctuation
    translator = str.maketrans('', '', punc)
    # Tokenise once and reuse the list for both the loop and the total.
    words = s.translate(translator).lower().split()
    poscnt, negcnt = 0, 0
    for w in words:
        tag = dic.get(w, 0)  # single lookup per word; 0 means "not in dictionary"
        if tag == 1:
            poscnt += 1
        elif tag == -1:
            negcnt += 1
    return (poscnt, negcnt, len(words))

In [51]:
sent_score(text, h4)

(8, 6, 120)