In [1]:
from py_lex import EmoLex
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
import nltk
from nltk.tokenize.casual import TweetTokenizer
import numpy as np
import pandas as pd
from twitter_preprocessor import TwitterPreprocessor
import os
import glob
from collections import Counter

In [2]:
lexicon = EmoLex('./NRC-Emotion-Lexicon-Wordlevel-v0.92-headless.csv')
lexicon.dump('./lexicon.pickle')

In [3]:
lexicon = EmoLex()
lexicon.load('./lexicon.pickle')

lexicon.keys()

{'anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'negative',
 'positive',
 'sadness',
 'surprise',
 'trust'}

In [4]:
len(lexicon)

10

In [5]:
# Where stores the total data
soldierEmoDict = {}

In [6]:
# file list
soldierCSVs = []
files = os.listdir(os.getcwd() + "/data/soldiers")
for file in files:
     if not os.path.isdir(file) and file.endswith(".csv"):
            soldierCSVs.append(file)

In [7]:
# useless columns for analyzing
uselessColumns = [
    "id",
    "conversation_id",
    "place",
    "photos",
    "video",
    "near",
    "geo",
    "source",
    "user_rt_id",
    "user_rt",
    "retweet_id",
    "reply_to",
    "retweet_date",
    "translate",
    "trans_src",
    "trans_dest",
    "link"
]

In [8]:
def calEmoForTweet(tweet):
    tempTweet = tweet
    tempTweetCleaned = TwitterPreprocessor(tempTweet).fully_preprocess().text
    
#     tempTokens = TweetTokenizer.tokenize(tempTweetCleaned)
    tempTokens = nltk.word_tokenize(tempTweetCleaned)
#     tempTokens = [tempToken.lower() for tempToken in tempTokens if len(tempToken)>2]
#     tempTokens = [tempToken.lower() for tempToken in tempTokens if tempToken.lower() not in stopset and len(tempToken)>2]
    
    summary = lexicon.summarize_doc(tempTokens)
    
    return summary

In [None]:
for soldierCSVFile in soldierCSVs:
    if soldierCSVFile == 'cleaned_vet_tweet_df.csv': continue
    print('Calculating... ' + soldierCSVFile)
    
    tempDF = pd.read_csv(os.getcwd() + "/data/soldiers/" + soldierCSVFile, encoding='utf8')
    tempDF.dropna(subset=['tweet'], inplace=True) # clear empty tweets
    tempDF.drop(uselessColumns, axis=1, inplace=True)
    
    
    singleResult = {}
    count = 0.0
    for id, row in tempDF.iterrows():
        emo = calEmoForTweet(row['tweet'])
        count += 1
        if (id % 1000 ) == 0: print(id)
#         if (soldierCSVFile == 'cNikonphoto.csv'): print(id)
        singleResult = dict(Counter(emo) + Counter(singleResult))
    
    singleResult['valid_tweet_count'] = count
    
    for key in lexicon.keys():
        if key in singleResult:
            singleResult[key] = singleResult[key] / count
        else:
            singleResult[key] = 0.0
    
    soldierEmoDict[soldierCSVFile] = singleResult
    print(singleResult)

Calculating... fcharles81.csv
0
1000
2000
{'positive': 0.08077758103602568, 'sadness': 0.02006569630885572, 'trust': 0.05014820855863482, 'anticipation': 0.048726328784548245, 'negative': 0.04552184172069993, 'fear': 0.03088518842587695, 'joy': 0.04432112301476929, 'surprise': 0.028664386003705234, 'disgust': 0.017283581615450107, 'anger': 0.02841112320039621, 'valid_tweet_count': 2483.0}
Calculating... GeoffMillard.csv
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
{'positive': 0.08850023371534158, 'sadness': 0.033235892276753554, 'trust': 0.058780233562243746, 'anticipation': 0.044306846162893355, 'negative': 0.06464355161982124, 'anger': 0.03555015322514268, 'fear': 0.035744380868346096, 'joy': 0.042416665869899506, 'surprise': 0.022545487813523384, 'disgust': 0.023644540377432504, 'valid_tweet_count': 10766.0}
Calculating... cNikonphoto.csv
0
1000
2000
3000
4000
5000
6000
{'sadness': 0.009221199458654405, 'trust': 0.023883385007700503, 'anticipation': 0.02244573451250814, 'ne

  interactivity=interactivity, compiler=compiler, result=result)


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
{'positive': 0.062065477519859864, 'joy': 0.03617771193790886, 'trust': 0.03804327985624282, 'surprise': 0.019855951470228183, 'sadness': 0.027013335035117086, 'negative': 0.05621970560847217, 'anger': 0.028939420545983273, 'fear': 0.026555534103392755, 'disgust': 0.02368124288565753, 'anticipation': 0.03392467649561844, 'valid_tweet_count': 16417.0}
Calculating... StMarthasTable.csv
0
1000
{'sadness': 0.02357625683706724, 'negative': 0.050351123640822906, 'anger': 0.016152197361477227, 'fear': 0.03733262298775152, 'surprise': 0.016694616843452893, 'disgust': 0.012193887601799942, 'positive': 0.14723397184623138, 'joy': 0.06824359269352323, 'anticipation': 0.05623005139247291, 'trust': 0.09085786831673945, 'valid_tweet_count': 1280.0}
Calculating... amwyatt.csv
0
{'sadness': 0.012396406799391876, 'trust': 0.03924733842043578, 'negative': 0.02663343095585905, 'anger': 0.017225214333712092, 'fear': 0.0

In [None]:
pd.DataFrame.from_dict(soldierEmoDict, orient="index").to_csv("soldierEmo.csv")