In [1]:
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
import nltk
from nltk.tokenize.casual import TweetTokenizer
import numpy as np
import pandas as pd
from twitter_preprocessor import TwitterPreprocessor
import os
import glob
# from textblob import TextBlob
# from textblob import Blobber
# from textblob.sentiments import NaiveBayesAnalyzer


In [2]:
# Preparation: make sure every NLTK resource this notebook relies on is
# available locally, then build the English stop-word set.
for nltkResource in ('punkt', 'averaged_perceptron_tagger', 'sentiwordnet', 'stopwords'):
    nltk.download(nltkResource)
stopset = set(stopwords.words('english'))
# blobSenti = Blobber(analyzer = NaiveBayesAnalyzer())

[nltk_data] Downloading package punkt to /Users/cary/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/cary/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/cary/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/cary/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Holds the summary statistics of every processed CSV file, keyed by file name.
soldierCalDict = dict()

In [4]:
# File list: names of the CSV files located directly in data/civilians.
# The directory test is done on the full path; a bare file name would be
# resolved against the current working directory instead of the data folder.
civiliansDir = os.path.join(os.getcwd(), "data", "civilians")
files = os.listdir(civiliansDir)
soldierCSVs = [
    file for file in files
    if file.endswith(".csv") and not os.path.isdir(os.path.join(civiliansDir, file))
]

In [5]:
# Columns that are not needed for the analysis; they are dropped from every
# tweet DataFrame right after it is loaded.
uselessColumns = [
    "id", "conversation_id", "place", "photos", "video", "near",
    "geo", "source", "user_rt_id", "user_rt", "retweet_id", "reply_to",
    "retweet_date", "translate", "trans_src", "trans_dest", "link",
]


In [6]:
# convert pos from nltk.pos_tag to SentiWordNet
# n - NOUN 
# v - VERB 
# a - ADJECTIVE 
# s - ADJECTIVE SATELLITE 
# r - ADVERB
# '' (empty string) - others
# a list of the Penn Treebank tags returned by nltk.pos_tag is here
# (or run nltk.help.upenn_tagset() locally):
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
def sentiWordNetPOSconvetor(pos):
    """Translate an nltk.pos_tag tag into a SentiWordNet part-of-speech code.

    pos: tag string as produced by nltk.pos_tag.

    Returns 'n' for tags starting with 'NN', 'a' for 'JJ', 'v' for 'V',
    'r' for 'R', and the empty string for every other tag.
    """
    # Checked in order; the first matching prefix decides the result.
    prefixToCode = (('NN', 'n'), ('JJ', 'a'), ('V', 'v'), ('R', 'r'))
    for prefix, code in prefixToCode:
        if pos.startswith(prefix):
            return code
    return ''

In [7]:
# word: single word
# pos: part of speech from nltk.pos_tag
def calSentiForWord(word, pos):
    """Average the SentiWordNet scores over every sense found for a word.

    word: single word
    pos: part of speech from nltk.pos_tag (converted with
         sentiWordNetPOSconvetor before the lookup)

    Returns a tuple (positive, negative, objective). Each value is the mean
    of the corresponding score over all synsets returned by
    swn.senti_synsets; (0, 0, 0) is returned when no synset is found.
    """
    sentis = list(swn.senti_synsets(word, sentiWordNetPOSconvetor(pos)))

    # Nothing known about this word/POS pair: it contributes no sentiment.
    if not sentis:
        return 0, 0, 0

    # The list is non-empty here, so the division below is always safe.
    count = len(sentis)
    positive = sum(senti.pos_score() for senti in sentis) / count
    negative = sum(senti.neg_score() for senti in sentis) / count
    objective = sum(senti.obj_score() for senti in sentis) / count

    return positive, negative, objective

In [8]:
def calSentiForTweet(tweet):
    """Compute the average SentiWordNet scores of a single tweet.

    tweet: raw tweet text.

    The text goes through the TwitterPreprocessor chain (remove_urls,
    remove_mentions, remove_hashtags, remove_twitter_reserved_words,
    remove_single_letter_words, remove_numbers(preserve_years=True),
    remove_blank_spaces), is tokenized with nltk.word_tokenize and tagged
    with nltk.pos_tag. Every tagged token is scored with calSentiForWord and
    the scores are averaged over all tokens, including tokens that score
    (0, 0, 0).

    Returns (count, positive, negative, objective), where count is the
    number of tagged tokens as a float. (0, 0, 0, 0) is returned when the
    cleaned text is empty or yields no tokens.
    """
    cleanedText = (
        TwitterPreprocessor(tweet)
        .remove_urls()
        .remove_mentions()
        .remove_hashtags()
        .remove_twitter_reserved_words()
        .remove_single_letter_words()
        .remove_numbers(preserve_years=True)
        .remove_blank_spaces()
        .text
    )
    if len(cleanedText) == 0:
        return 0, 0, 0, 0

    taggedTokens = nltk.pos_tag(nltk.word_tokenize(cleanedText))
    if len(taggedTokens) == 0:
        return 0, 0, 0, 0

    # Running totals in the order: positive, negative, objective.
    totals = [0.0, 0.0, 0.0]
    for token, tag in taggedTokens:
        for slot, score in enumerate(calSentiForWord(token, tag)):
            totals[slot] += score

    tokenCount = float(len(taggedTokens))
    positive, negative, objective = (total / tokenCount for total in totals)

    return tokenCount, positive, negative, objective

In [9]:
# Score every tweet of every CSV file and store per-file summary statistics
# (mean / standard deviation of token count and sentiment scores) in
# soldierCalDict under the file name.
for soldierCSVFile in soldierCSVs:
    # These two files are explicitly excluded from the per-file statistics.
    if soldierCSVFile in ('cleaned_vet_tweet_df.csv', 'combined_civilians.csv'):
        continue

    print('Calculating... ' + soldierCSVFile)

    csvPath = os.path.join(os.getcwd(), "data", "civilians", soldierCSVFile)
    tempDF = (
        pd.read_csv(csvPath, encoding='utf8')
        .dropna(subset=['tweet'])  # clear empty tweets
        .drop(uselessColumns, axis=1)
    )

    positives = []
    negatives = []
    objectives = []
    tweetCleanedLengths = []

    # Only the 'tweet' column is read. rowLabel is the row's index label as
    # loaded from the CSV (not renumbered after dropna) and is used for the
    # progress output only. The name avoids overwriting the builtin `id`.
    for rowLabel, tweetText in tempDF['tweet'].items():
        tokens, po, ne, ob = calSentiForTweet(tweetText)
        tweetCleanedLengths.append(tokens)
        positives.append(po)
        negatives.append(ne)
        objectives.append(ob)

        if (rowLabel % 2000) == 0:
            print(rowLabel)

    count = len(tweetCleanedLengths)

    # Key order is significant: it becomes the column order of the exported
    # table. With no tweets at all, every statistic is reported as 0.
    result = {'count': count}
    for label, values in (
        ('tweet cleaned length', tweetCleanedLengths),
        ('positive', positives),
        ('negative', negatives),
        ('objective', objectives),
    ):
        if count > 0:
            result[label + ' mean'] = np.mean(values)
            # Computing the standard deviation in float64 is more accurate:
            # np.std([1, 2, 3, 4], dtype=np.float64)
            result[label + ' std'] = np.std(values, dtype=np.float64)
        else:
            result[label + ' mean'] = 0
            result[label + ' std'] = 0

    soldierCalDict[soldierCSVFile] = result

    print(result)


Calculating... myrna99.csv
0
{'count': 940, 'tweet cleaned length mean': 9.89468085106383, 'tweet cleaned length std': 6.867946935650414, 'positive mean': 0.027931138997599022, 'positive std': 0.038165877539349156, 'negative mean': 0.017497955867138034, 'negative std': 0.02402043197706413, 'objective mean': 0.29069217664181707, 'objective std': 0.18645256273106292}
.csvulating... KBDeSalvo
0
{'count': 1659, 'tweet cleaned length mean': 13.661844484629295, 'tweet cleaned length std': 8.49597623996019, 'positive mean': 0.03757807488631795, 'positive std': 0.037370376545760496, 'negative mean': 0.01200715890828403, 'negative std': 0.014143419446218254, 'objective mean': 0.3524741646958704, 'objective std': 0.1563323320743209}
Calculating... HTracyDavido.csv
0
2000
{'count': 2588, 'tweet cleaned length mean': 11.049072642967543, 'tweet cleaned length std': 8.1219160780163, 'positive mean': 0.03859826901153744, 'positive std': 0.04595910913322617, 'negative mean': 0.025928000952483838, 'neg

  interactivity=interactivity, compiler=compiler, result=result)


0
2000
4000
6000
8000
10000
12000
14000
16000
18000
20000
22000
24000
26000
28000
30000
32000
34000
36000
38000
40000
42000
44000
{'count': 44534, 'tweet cleaned length mean': 9.342120626936723, 'tweet cleaned length std': 7.674596749394667, 'positive mean': 0.027067336507071712, 'positive std': 0.04130522878024155, 'negative mean': 0.018037914886257164, 'negative std': 0.03049760168125871, 'objective mean': 0.33895783584615313, 'objective std': 0.19466879009626253}
Calculating... lindseywasson.csv
0
2000
4000
6000
8000
10000
12000
{'count': 13060, 'tweet cleaned length mean': 11.693415007656968, 'tweet cleaned length std': 9.557452748462895, 'positive mean': 0.027094259211508813, 'positive std': 0.03914342105302621, 'negative mean': 0.021154153443863132, 'negative std': 0.032011030239237126, 'objective mean': 0.3329643200893287, 'objective std': 0.18952864678960712}
Calculating... mattdoty.csv
0
{'count': 41, 'tweet cleaned length mean': 15.097560975609756, 'tweet cleaned length std':

  interactivity=interactivity, compiler=compiler, result=result)


0
2000
4000
6000
8000
10000
12000
14000
16000
18000
20000
22000
24000
26000
28000
30000
32000
34000
36000
38000
40000
{'count': 40986, 'tweet cleaned length mean': 18.51929927292246, 'tweet cleaned length std': 8.569385832654381, 'positive mean': 0.0347686420069634, 'positive std': 0.031164249804709134, 'negative mean': 0.023732930936780982, 'negative std': 0.024684178948143815, 'objective mean': 0.39463607033218623, 'objective std': 0.11529211618764153}
Calculating... carioke.csv
0
{'count': 474, 'tweet cleaned length mean': 10.767932489451477, 'tweet cleaned length std': 10.68186607358507, 'positive mean': 0.024652724486203835, 'positive std': 0.04004052815718582, 'negative mean': 0.018993017428755987, 'negative std': 0.03179841914477705, 'objective mean': 0.2985055875761057, 'objective std': 0.20185329710656993}
.csvulating... williamready
0
{'count': 937, 'tweet cleaned length mean': 17.558164354322304, 'tweet cleaned length std': 8.93598006040804, 'positive mean': 0.02940768988213

In [10]:
# Export the collected statistics: one row per input CSV file (the file name
# becomes the index), one column per statistic.
summaryTable = pd.DataFrame.from_dict(soldierCalDict, orient="index")
# summaryTable.to_csv("soldierCal.csv")
summaryTable.to_csv("civilianCal.csv")