In [1]:
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
import nltk
from nltk.tokenize.casual import TweetTokenizer
import numpy as np
import pandas as pd
from twitter_preprocessor import TwitterPreprocessor
import os
import glob

In [2]:
# preperation
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('sentiwordnet')
nltk.download('stopwords')
stopset = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /Users/cary/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/cary/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/cary/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/cary/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Where stores the total data
soldierCalDict = {}

In [4]:
# file list
soldierCSVs = []
files = os.listdir(os.getcwd() + "/data/soldiers")
for file in files:
     if not os.path.isdir(file) and file.endswith(".csv"):
            soldierCSVs.append(file)

In [5]:
# useless columns for analyzing
uselessColumns = [
    "id",
    "conversation_id",
    "place",
    "photos",
    "video",
    "near",
    "geo",
    "source",
    "user_rt_id",
    "user_rt",
    "retweet_id",
    "reply_to",
    "retweet_date",
    "translate",
    "trans_src",
    "trans_dest",
    "link"
]


In [6]:
# convert pos from nltk.pos_tag to SentiWordNet
# n - NOUN 
# v - VERB 
# a - ADJECTIVE 
# s - ADJECTIVE SATELLITE 
# r - ADVERB
# ' ' - others
# a list of nltk tags is here:
# https://www.techrepublic.com/article/the-6-laws-every-cloud-architect-should-know-according-to-werner-vogels/
def sentiWordNetPOSconvetor(pos):
    newtag = ''
    if pos.startswith('NN'):
        newtag='n'
    elif pos.startswith('JJ'):
        newtag='a'
    elif pos.startswith('V'):
        newtag='v'
    elif pos.startswith('R'):
        newtag='r'
    return newtag

In [7]:
# word: single word
# pos: part of speech from nltk.pos_tag
def calSentiForWord(word, pos):
    
    sentis = list(swn.senti_synsets(word, sentiWordNetPOSconvetor(pos)))
    
    if len(sentis) <= 0:
        return 0, 0, 0
    
    # Getting average of all possible sentiments
    positive = 0.0
    negative = 0.0
    objective = 0.0
    count = 0.0
    
    for senti in sentis :
        positive += senti.pos_score()
        negative += senti.neg_score()
        objective += senti.obj_score()
        count += 1
        
    if count <= 0.1:
        return 0, 0, 0
    
    positive /= count
    negative /= count
    objective /= count
    
    return positive, negative, objective

In [8]:
def calSentiForTweet(tweet):
    tempTweet = tweet
    tempTweetCleaned = TwitterPreprocessor(tempTweet).fully_preprocess().text
    
    if len(tempTweetCleaned) <= 0:
        return 0, 0, 0, 0
    
#     tempTokens = TweetTokenizer.tokenize(tempTweetCleaned)
    tempTokens = nltk.word_tokenize(tempTweetCleaned)
#     tempTokens = [tempToken.lower() for tempToken in tempTokens if len(tempToken)>2]
#     tempTokens = [tempToken.lower() for tempToken in tempTokens if tempToken.lower() not in stopset and len(tempToken)>2]
    tempTagged = nltk.pos_tag(tempTokens)
    
    positive = 0.0
    negative = 0.0
    objective = 0.0
    count = 0.0
    
#     print(tempTagged)
    
    for w, p in tempTagged:
        po, ne, ob = calSentiForWord(w, p)
        positive += po
        negative += ne
        objective += ob
        count += 1
        
    if count <= 0.1:
        return 0, 0, 0, 0
        
    positive /= count
    negative /= count
    objective /= count
    
    return count, positive, negative, objective

In [10]:
for soldierCSVFile in soldierCSVs:
    if soldierCSVFile == 'cleaned_vet_tweet_df.csv': continue
    print('Calculating... ' + soldierCSVFile)
    
    tempDF = pd.read_csv(os.getcwd() + "/data/soldiers/" + soldierCSVFile, encoding='utf8')
    tempDF.dropna(subset=['tweet'], inplace=True) # clear empty tweets
    tempDF.drop(uselessColumns, axis=1, inplace=True)

    positives = []
    negatives = []
    objectives = []
    tweetCleanedLengths = []
    
    for id, row in tempDF.iterrows():
        tokens, po, ne, ob = calSentiForTweet(row['tweet'])
        tweetCleanedLengths.append(tokens)
        positives.append(po)
        negatives.append(ne)
        objectives.append(ob)
        
        if (id % 1000 ) == 0: print(id)
#         if (soldierCSVFile == 'cNikonphoto.csv'): print(id)
    
    count = len(tweetCleanedLengths)
    
    # Computing the standard deviation in float64 is more accurate:
    # np.std([1, 2, 3, 4], dtype=np.float64)
    if count > 0:
        result = {
            'count': count,
            'tweet cleaned length mean': np.mean(tweetCleanedLengths),
            'tweet cleaned length std': np.std(tweetCleanedLengths, dtype=np.float64),
            'positive mean': np.mean(positives),
            'positive std': np.std(positives, dtype=np.float64),
            'negative mean': np.mean(negatives),
            'negative std': np.std(negatives, dtype=np.float64),
            'objective mean': np.mean(objectives),
            'objective std': np.std(objectives, dtype=np.float64),
        }
    else:
        result = {
            'count': 0,
            'tweet cleaned length mean': 0,
            'tweet cleaned length std': 0,
            'positive mean': 0,
            'positive std': 0,
            'negative mean': 0,
            'negative std': 0,
            'objective mean': 0,
            'objective std': 0
        }
    
    soldierCalDict[soldierCSVFile] = result
    
    print(result)


Calculating... fcharles81.csv
0
1000
2000
{'count': 2483, 'tweet cleaned length mean': 5.596053161498188, 'tweet cleaned length std': 4.40233166540416, 'positive mean': 0.051373189613490695, 'positive std': 0.07769411273468534, 'negative mean': 0.034686168598558914, 'negative std': 0.055133425743599106, 'objective mean': 0.5049516242442927, 'objective std': 0.2855088751543331}
Calculating... GeoffMillard.csv
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
{'count': 10766, 'tweet cleaned length mean': 8.590284228125581, 'tweet cleaned length std': 5.928264459297859, 'positive mean': 0.05267640003227673, 'positive std': 0.057892454032504675, 'negative mean': 0.038604878086912445, 'negative std': 0.04375903479787348, 'objective mean': 0.5905271985280819, 'objective std': 0.23708012094020903}
Calculating... cNikonphoto.csv
0
1000
2000
3000
4000
5000
6000
{'count': 6981, 'tweet cleaned length mean': 4.385618106288497, 'tweet cleaned length std': 4.1662540874296665, 'positive mean': 0.0

In [11]:
# pd.DataFrame.from_dict(soldierCalDict, orient="index").to_csv("soldierCal.csv")