# Installation

Go to terminal, type in "pip install NRCLex"

# Library

In [16]:
from nrclex import NRCLex

import numpy as np
import pandas as pd

# Detect emotions: sentence

In [6]:
# Assign emotion
text = 'your website is horrible'
  
# Create object
emotion = NRCLex(text)
  
# Using methods to classigy emotion
print('\n', emotion.words)
print('\n', emotion.affect_dict)


 ['your', 'website', 'is', 'horrible']

 {'horrible': ['anger', 'disgust', 'fear', 'negative']}


# "Raw emotion scores" output how many words triggered each emotion

The word "horrible" is the only emotion word, so we just have "1"

In [4]:
print('\n', emotion.raw_emotion_scores)


 {'anger': 1, 'disgust': 1, 'fear': 1, 'negative': 1}


# "Top emotions" standardize "raw emotion scores"

Assuming each sentence can only have a total score of 1, which emotions weigh more and which emotions weigh less?

In [5]:
print('\n', emotion.top_emotions)


 [('fear', 0.25), ('anger', 0.25), ('negative', 0.25), ('disgust', 0.25)]


# Why do we need to standardize? Without standardization, longer sentences can give us more false positives

In [14]:
text = 'It is unclear what brands Amazon will offer in the stores, although the company’s private-label goods are expected to feature prominently, the people said. Amazon sells scores of products including clothes, furniture, batteries and electronic devices through many of its own labels. The plans arent yet final and could change, these people said.'
emotion = NRCLex(text)
print('\n', emotion.affect_dict)
print('\n', emotion.raw_emotion_scores)



 {'offer': ['positive'], 'goods': ['positive'], 'expected': ['anticipation'], 'feature': ['positive'], 'prominently': ['positive'], 'including': ['positive'], 'change': ['fear']}

 {'positive': 5, 'anticipation': 1, 'fear': 1}


# Let's look at a longer example

In [10]:
text = 'I am humbled and honored to be surrounded by colleagues who challenge, support and encourage me at each stage. Through them, I’ve learned work isn’t just about the tasks we set out to do, but the experiences, growth and friendships we build along the way.'
emotion = NRCLex(text)
print('\n', emotion.affect_dict)
print('\n', emotion.raw_emotion_scores)


 {'humbled': ['positive', 'sadness'], 'challenge': ['anger', 'fear', 'negative'], 'encourage': ['joy', 'positive', 'trust'], 'growth': ['positive'], 'build': ['positive']}

 {'positive': 4, 'sadness': 1, 'anger': 1, 'fear': 1, 'negative': 1, 'joy': 1, 'trust': 1}


# Detect emotions: dataframe

In [22]:
df = pd.read_csv('test.csv')
df.head()

Unnamed: 0,id,text
0,1,your website is very easy to use!
1,2,your website is not good
2,3,is this refundable?
3,4,someone needs to be fired
4,5,"Way too big for a 3, 4, & 5 year old..... disa..."


## Make sure you process the data: lemmatization etc (not shown here)

In [23]:
df['emotions'] = df['text'].apply(lambda x: NRCLex(x).affect_frequencies)
df.head()

Unnamed: 0,id,text,emotions
0,1,your website is very easy to use!,"{'fear': 0.0, 'anger': 0.0, 'anticip': 0.0, 't..."
1,2,your website is not good,"{'fear': 0.0, 'anger': 0.0, 'anticip': 0.0, 't..."
2,3,is this refundable?,"{'fear': 0.0, 'anger': 0.0, 'anticip': 0.0, 't..."
3,4,someone needs to be fired,"{'fear': 0.0, 'anger': 0.0, 'anticip': 0.0, 't..."
4,5,"Way too big for a 3, 4, & 5 year old..... disa...","{'fear': 0.0, 'anger': 0.25, 'anticip': 0.0, '..."


In [24]:
df = pd.concat([df.drop(['emotions'], axis = 1), df['emotions'].apply(pd.Series)], axis = 1)
df.head()

Unnamed: 0,id,text,fear,anger,anticip,trust,surprise,positive,negative,sadness,disgust,joy,anticipation
0,1,your website is very easy to use!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,2,your website is not good,0.0,0.0,0.0,0.2,0.2,0.2,0.0,0.0,0.0,0.2,0.2
2,3,is this refundable?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,4,someone needs to be fired,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,5,"Way too big for a 3, 4, & 5 year old..... disa...",0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.25,0.25,0.0,


# Limitation: NRCLex can't handle negations

# Limitation: NRCLex can't recognize words it doesn't know

# References

- https://github.com/metalcorebear/NRCLex
- https://www.geeksforgeeks.org/emotion-classification-using-nrc-lexicon-in-python/

In [1]:
import os
from os import path
from pathlib import Path
import numpy as np
import pandas as pd

In [2]:
from datetime import datetime
date = datetime.today().strftime('%y%m%d')
print ('Last modified: ' + date)

Last modified: 210524


In [3]:
df = pd.read_csv('Investment Decision Survey (Mturk) 5.13.2019_May 28, 2020_09.38.csv', encoding = "ISO-8859-1")
df = df.drop(df.index[[0,1]])
df_duplicate = pd.read_csv('mturk_duplicate_participants.csv')
df_bad = pd.read_csv('mturk_subjects_log.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
cols = df.columns.drop('ResponseId')
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

# Version

In [5]:
#aarp or sec version? Dummy coding: 1 = the longer, SEC version.
df['version'] = 1
for i in df.index:
    if df.at[i, 'treatment_AARP_verif'] == 4:
        df.at[i, 'version'] = 0
        print('here')


here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here


# Comprehension

In [6]:
#reverse scoring:
#comprehension_pre and comprehension_post
#if the correct answer is BD, it becomes (100-x)
#if the answer is BOTH, and x>=50, it becomes (100-(x-50))
#if the answer is BOTH, and x<50, it becomes (100-(50-x))
#comp_pre_9_3 and comp_pre_9_4 will each get 50 points

In [7]:
#columns to reverse score because the answer is BD:
cols = ["comp_pre_1_1",'comp_pre_6_1','comp_post_1_1','comp_post_6_1']
df[cols] = 100 - df[cols]

In [8]:
#columns to reward an answer of "50" - item 2, 7, 8, and 4*
#for item 4, the longer version and the shorter version disagree. 
#“Both” was coded as the answer for the longer version
#“Investment advisor” the answer for the shorter version.

df["comp_pre_4_raw"] = df["comp_pre_4_1"]
df["comp_post_4_raw"] = df["comp_post_4_1"]

#the special case of item 4:
for i in df.index:
    if df.at[i, 'comp_pre_4_1'] < 50 and df.at[i, 'version'] == 1:
        df.at[i, 'comp_pre_4_1'] = 100 - (50 - df.at[i, 'comp_pre_4_1'])
    elif df.at[i, 'comp_pre_4_1'] >= 50 and df.at[i, 'version'] == 1:
        df.at[i, 'comp_pre_4_1'] =100 - (df.at[i, 'comp_pre_4_1'] - 50)

for i in df.index:
    if df.at[i, 'comp_post_4_1'] < 50 and df.at[i, 'version'] == 1:
        df.at[i, 'comp_post_4_1'] = 100 - (50 - df.at[i, 'comp_post_4_1'])
    elif df.at[i, 'comp_post_4_1'] >= 50 and df.at[i, 'version'] == 1:
        df.at[i, 'comp_post_4_1'] =100 - (df.at[i, 'comp_post_4_1'] - 50)

        
#the other cases:

for i in df.index:
    if df.at[i, 'comp_pre_2_1'] < 50:
        df.at[i, 'comp_pre_2_1'] = 100 - (50 - df.at[i, 'comp_pre_2_1'])
    else:
        df.at[i, 'comp_pre_2_1'] =100 - (df.at[i, 'comp_pre_2_1'] - 50)

for i in df.index:
    if df.at[i, 'comp_pre_7_1'] < 50:
        df.at[i, 'comp_pre_7_1'] = 100 - (50 - df.at[i, 'comp_pre_7_1'])
    else:
        df.at[i, 'comp_pre_7_1'] =100 - (df.at[i, 'comp_pre_7_1'] - 50)

for i in df.index:
    if df.at[i, 'comp_pre_8_1'] < 50:
        df.at[i, 'comp_pre_8_1'] = 100 - (50 - df.at[i, 'comp_pre_8_1'])
    else:
        df.at[i, 'comp_pre_8_1'] =100 - (df.at[i, 'comp_pre_8_1'] - 50)
        
for i in df.index:
    if df.at[i, 'comp_post_2_1'] < 50:
        df.at[i, 'comp_post_2_1'] = 100 - (50 - df.at[i, 'comp_post_2_1'])
    else:
        df.at[i, 'comp_post_2_1'] =100 - (df.at[i, 'comp_post_2_1'] - 50)


for i in df.index:
    if df.at[i, 'comp_post_7_1'] < 50:
        df.at[i, 'comp_post_7_1'] = 100 - (50 - df.at[i, 'comp_post_7_1'])
    else:
        df.at[i, 'comp_post_7_1'] =100 - (df.at[i, 'comp_post_7_1'] - 50)

for i in df.index:
    if df.at[i, 'comp_post_8_1'] < 50:
        df.at[i, 'comp_post_8_1'] = 100 - (50 - df.at[i, 'comp_post_8_1'])
    else:
        df.at[i, 'comp_post_8_1'] =100 - (df.at[i, 'comp_post_8_1'] - 50)
        
    


In [9]:
#sum up a final comprehension pre and post score
df['comp_pre_score'] = (df['comp_pre_1_1'] + df['comp_pre_2_1'] + df['comp_pre_3_1'] + df['comp_pre_4_1'] + df['comp_pre_5_1'] + df['comp_pre_6_1'] + df['comp_pre_7_1'] + df['comp_pre_8_1'])/8
df['comp_post_score'] = (df['comp_post_1_1'] + df['comp_post_2_1'] + df['comp_post_3_1'] + df['comp_post_4_1'] + df['comp_post_5_1'] + df['comp_post_6_1'] + df['comp_post_7_1'] + df['comp_post_8_1'])/8

### Comprehension change

In [10]:
df['comp_change_1'] = df['comp_post_1_1'] - df['comp_pre_1_1']
df['comp_change_2'] = df['comp_post_2_1'] - df['comp_pre_2_1']
df['comp_change_3'] = df['comp_post_3_1'] - df['comp_pre_3_1']
df['comp_change_4'] = df['comp_post_4_1'] - df['comp_pre_4_1']
df['comp_change_5'] = df['comp_post_5_1'] - df['comp_pre_5_1']
df['comp_change_6'] = df['comp_post_6_1'] - df['comp_pre_6_1']
df['comp_change_7'] = df['comp_post_7_1'] - df['comp_pre_7_1']
df['comp_change_8'] = df['comp_post_8_1'] - df['comp_pre_8_1']

df['comprehension_change'] = df['comp_post_score'] - df['comp_pre_score']

# Decision

In [11]:
#decision scores ranges from -100 to 100. -100 means definitely choose BD, 0 means weak preference, 100 means definitely choose IA.
#decision scores are the effects coding of the decision multiplied with the strength of the decision
df['binary_decision_pre'] = df['decision_pre_1'] - 1
df['binary_decision_post'] = df['decision_post_1'] - 1

for i in df.index:
    if df.at[i, 'decision_pre_1'] == 1:
        df.at[i, 'decision_pre_1'] = -1
    else:
        df.at[i, 'decision_pre_1'] = 1
        
#Decision ranges from -100 to 100

df['decision_pre'] =  df['decision_pre_1'] * df['decision_pre_2_44'] 

for i in df.index:
    if df.at[i, 'decision_post_1'] == 1:
        df.at[i, 'decision_post_1'] = -1
    else:
        df.at[i, 'decision_post_1'] = 1
       
df['decision_post'] =  df['decision_post_1'] * df['decision_post_2_1']


### Decisions change

In [12]:
df['decision_change'] = df['decision_post'] - df['decision_pre']

### Exclusion

In [13]:
#exclude 1). duplicate data, 2.)bad data, 3). progress incomplete data, 
# 4). failing language/residency/age
df['exclude_python'] = 0
for i in df.index:
    for k in df_bad.index:
        if df.at[i, 'Random ID'] == df_bad.at[k, 'survey_code']:
            df.at[i, 'exclude_python'] = 2   #this order ensures that I can count these people as participated
    for j in df_duplicate.index:
        if df.at[i, 'Random ID'] == df_duplicate.at[j, 'survey_code']:
            df.at[i, 'exclude_python'] = 1
    if df.at[i, 'Progress']!= 100:
            df.at[i, 'exclude_python'] = 3          
    elif df.at[i, 'lang_3'] != 1 or df.at[i, 'residence'] != 5:
            df.at[i, 'exclude_python'] = 4      


print ('For publication purpose, here are the counts of participants excluded for analysis:')            
print ('younger:', len(df[(df['exclude_python'] == 2) & (df['age'] <= 35)]), ' ,one of them is xiaoqing, so it should be 68')
print ('older:', len(df[(df['exclude_python'] == 2) & (df['age'] >= 60)]))
print ('overall:', len(df[df['exclude_python'] == 2]),' ,one of them is xiaoqing, so it should be 77')



For publication purpose, here are the counts of participants excluded for analysis:
younger: 69  ,one of them is xiaoqing, so it should be 68
older: 2
overall: 78  ,one of them is xiaoqing, so it should be 77


### Age

In [14]:
### Decisions change#create age variables
for i in df.index:
    if 17 < df.at[i, 'age'] < 36:
        df.at[i, 'age_bracket'] = 1
    elif 35 < df.at[i, 'age'] < 60:
        df.at[i, 'age_bracket'] = 2
    elif 59 < df.at[i, 'age'] < 91:
        df.at[i, 'age_bracket'] = 3
        
# df['age_bracket'] = 0
# mask = (df['age'] < 60)
# df['age_bracket'][mask] = 1

### Reading Time

In [15]:
#how much time spent on reading form crs?

df['time_form_sec'] = df['treatment_tm_RAND_1_Page Submit'] + df['treatment_tm_RAND_2_Page Submit'] + df['treatment_tm_RAND_3_Page Submit'] + df['treatment_tm_RAND_4_Page Submit'] + df['treatment_tm_RAND_5_Page Submit'] + df['treatment_tm_RAND_6_Page Submit'] + df['treatment_tm_RAND_7_Page Submit'] + df['treatment_tm_RAND_8_Page Submit']
df['time_form_aarp'] = df['treatment_tm_AARP_1_Page Submit'] + df['treatment_tm_AARP_2_Page Submit'] + df['treatment_tm_AARP_3_Page Submit'] + df['treatment_tm_AARP_4_Page Submit'] + df['treatment_tm_AARP_5_Page Submit'] + df['treatment_tm_AARP_6_Page Submit'] + df['treatment_tm_AARP_7_Page Submit']

df['reading_time'] = df.fillna(0)['time_form_sec'] + df.fillna(0)['time_form_aarp']

### Income

In [16]:
#income?
x = [1,2,3,4,5,6,12,13,9,10,11]
y = [15,20,30,42.5,62.5,87.5,112.5,137.5,162.5,187.5,200]
df['income'] = (df['income_1'].replace(x, y))

### Race

In [17]:
#race
x = [1,2,3,4,5,6]
y = [0,1,0,0,0,0]
df['race'] = (df['race'].replace(x, y))

### Experience

In [18]:
#subjective vs objective effectiveness
#comprehension

# df['comp_relationship'] = (df['comp_post_1_1'] + df['comp_post_2_1']) / 2
# df['comp_obligations'] = (df['comp_post_3_1'] + df['comp_post_4_1']) / 2
# df['comp_fees'] = (df['comp_post_5_1'] + df['comp_post_6_1']) / 2
# df['comp_conflicts'] = (df['comp_post_7_1'] + df['comp_post_8_1']) / 2
# df ['comp_additionalinfo'] = df['comp_post_9']

# df['subj_relationship'] = df['crs_RAND_3']
# df['subj_obligations'] = df['crs_RAND_4']
# df['subj_fees'] = df['crs_RAND_5']
# df['subj_conflicts'] = df['crs_RAND_6']
# df ['subj_additionalinfo'] = df['crs_RAND_7']

# df['subj_obligations'] = df['subj_obligations'].replace([4,5,6,7], [2,3,4,5])
# df['subj_fees'] = df['subj_fees'].replace([4,5,6,7], [2,3,4,5])
# df['subj_conflicts'] = df['subj_conflicts'].replace([4,5,6,7], [2,3,4,5])
# df ['subj_additionalinfo'] = df['subj_additionalinfo'].replace([4,5,6,7], [2,3,4,5])

In [19]:
#experience level
df['experience_1'] = df['experience_1'].replace([4], [0])
df['experience_2'] = df['experience_2'].replace([1,2,3], [1,1,0])
df['experience_3'] = df['experience_3'].replace([4], [0])
df['experience_4'] = df['experience_4'].replace([2], [0])
df['experience_6'] = df['experience_6'].replace([2], [0])
df['experience_7'] = df['experience_7'].replace([2], [0])
df['rand_experience_sum'] = df['experience_1'] + df['experience_2'] + df['experience_3'] + df['experience_4'] + df['experience_6'] + df['experience_7']

df['experience'] = 0
mask = (df['rand_experience_sum'] > 2)
df['experience'][mask] = 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


### Financial Literacy

In [20]:
#financial_lit_sum
df['financial_lit_sum'] = df['financial_lit_1'] + df['financial_lit_2'] + df['financial_lit_3'] 

df['finlit'] = 0
mask = (df['financial_lit_sum'] > 1)
df['finlit'][mask] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Vocab

In [21]:
#vocab### Decisions change
filter_col = [col for col in df if col.startswith('vocab')]
df['vocab'] = df[filter_col].sum(axis=1)

In [22]:
#rename Duration (in seconds)
df = df.rename({'Duration (in seconds)': 'duration'}, axis=1)
df = df.rename({'Random ID': 'random_id'}, axis=1)
df = df.rename({'helpful_1': 'helpful'}, axis=1)
df = df.rename({'interesting_1': 'interesting'}, axis=1)

In [23]:
df = df[df.exclude_python == 0]

In [24]:
df1 = df

In [25]:
subset = ['ResponseId','random_id','age','gender','race','education','duration','income',
          'comp_pre_score','comp_post_score','comprehension_change', 'helpful', 'interesting',
          'binary_decision_pre','binary_decision_post','decision_pre', 'decision_post','decision_change', 'age_bracket',
          'version','rand_experience_sum','experience','financial_lit_sum','finlit', 'reading_time', 'vocab',
          'comp_pre_1_1', 'comp_pre_2_1', 'comp_pre_3_1', 'comp_pre_4_1', 'comp_pre_5_1', 'comp_pre_6_1', 'comp_pre_7_1', 'comp_pre_8_1',
          'comp_post_1_1', 'comp_post_2_1', 'comp_post_3_1', 'comp_post_4_1', 'comp_post_5_1', 'comp_post_6_1', 'comp_post_7_1', 'comp_post_8_1', 
          'comp_change_1','comp_change_2','comp_change_3','comp_change_4','comp_change_5','comp_change_6','comp_change_7','comp_change_8',
         'comp_pre_4_raw', 'comp_post_4_raw']

df1 = df1[subset]

# Save 

In [26]:
df1.to_csv(('mturk_wide_'+ date + '.csv'),index = False)