In [1]:
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
import nltk
from nltk.tokenize.casual import TweetTokenizer
import numpy as np
import pandas as pd
from twitter_preprocessor import TwitterPreprocessor
import os
import glob
# from textblob import TextBlob
# from textblob import Blobber
# from textblob.sentiments import NaiveBayesAnalyzer


In [2]:
# preperation
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('sentiwordnet')
nltk.download('stopwords')
stopset = set(stopwords.words('english'))
# blobSenti = Blobber(analyzer = NaiveBayesAnalyzer())

[nltk_data] Downloading package punkt to /Users/cary/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/cary/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/cary/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/cary/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Where stores the total data
soldierCalDict = {}

In [4]:
# file list
soldierCSVs = []
files = os.listdir(os.getcwd() + "/data/soldiers")
for file in files:
     if not os.path.isdir(file) and file.endswith(".csv"):
            soldierCSVs.append(file)

In [5]:
# useless columns for analyzing
uselessColumns = [
    "id",
    "conversation_id",
    "place",
    "photos",
    "video",
    "near",
    "geo",
    "source",
    "user_rt_id",
    "user_rt",
    "retweet_id",
    "reply_to",
    "retweet_date",
    "translate",
    "trans_src",
    "trans_dest",
    "link"
]


In [6]:
# convert pos from nltk.pos_tag to SentiWordNet
# n - NOUN 
# v - VERB 
# a - ADJECTIVE 
# s - ADJECTIVE SATELLITE 
# r - ADVERB
# ' ' - others
# a list of nltk tags is here:
# https://www.techrepublic.com/article/the-6-laws-every-cloud-architect-should-know-according-to-werner-vogels/
def sentiWordNetPOSconvetor(pos):
    newtag = ''
    if pos.startswith('NN'):
        newtag='n'
    elif pos.startswith('JJ'):
        newtag='a'
    elif pos.startswith('V'):
        newtag='v'
    elif pos.startswith('R'):
        newtag='r'
    return newtag

In [7]:
# word: single word
# pos: part of speech from nltk.pos_tag
def calSentiForWord(word, pos):
    
    sentis = list(swn.senti_synsets(word, sentiWordNetPOSconvetor(pos)))
    
    if len(sentis) <= 0:
        return 0, 0, 0
    
    # Getting average of all possible sentiments
    positive = 0.0
    negative = 0.0
    objective = 0.0
    count = 0.0
    
    for senti in sentis :
        positive += senti.pos_score()
        negative += senti.neg_score()
        objective += senti.obj_score()
        count += 1
        
    if count <= 0.1:
        return 0, 0, 0
    
    positive /= count
    negative /= count
    objective /= count
    
    return positive, negative, objective

In [8]:
def calSentiForTweet(tweet):
    tempTweet = tweet
    tempTweetCleaned = TwitterPreprocessor(tempTweet).remove_urls().remove_mentions().remove_hashtags().remove_twitter_reserved_words().remove_single_letter_words().remove_numbers(preserve_years=True).remove_blank_spaces().text
    if len(tempTweetCleaned) <= 0:
        return 0, 0, 0, 0
    # miss spelling
#     tempTweetCleaned = str(blobSenti(tempTweetCleaned).correct())
    
#     tempTokens = TweetTokenizer.tokenize(tempTweetCleaned)
    tempTokens = nltk.word_tokenize(tempTweetCleaned)
#     tempTokens = [tempToken.lower() for tempToken in tempTokens if len(tempToken)>2]
#     tempTokens = [tempToken.lower() for tempToken in tempTokens if tempToken.lower() not in stopset and len(tempToken)>2]
    tempTagged = nltk.pos_tag(tempTokens)
    
    positive = 0.0
    negative = 0.0
    objective = 0.0
    count = 0.0
    
#     print(tempTagged)
    
    for w, p in tempTagged:
        po, ne, ob = calSentiForWord(w, p)
        positive += po
        negative += ne
        objective += ob
        count += 1
        
    if count <= 0.1:
        return 0, 0, 0, 0
        
    positive /= count
    negative /= count
    objective /= count
    
#     tb = blobSenti(tempTweetCleaned)
    
    return count, positive, negative, objective

In [9]:
for soldierCSVFile in soldierCSVs:
    if soldierCSVFile == 'cleaned_vet_tweet_df.csv': continue
    print('Calculating... ' + soldierCSVFile)
    
    tempDF = pd.read_csv(os.getcwd() + "/data/soldiers/" + soldierCSVFile, encoding='utf8')
    tempDF.dropna(subset=['tweet'], inplace=True) # clear empty tweets
    tempDF.drop(uselessColumns, axis=1, inplace=True)
    
    positives = []
    negatives = []
    objectives = []
#     polarities = []
#     subjectivities = []
    tweetCleanedLengths = []
    
    for id, row in tempDF.iterrows():
        tokens, po, ne, ob = calSentiForTweet(row['tweet'])
        tweetCleanedLengths.append(tokens)
        positives.append(po)
        negatives.append(ne)
        objectives.append(ob)
#         polarities.append(pola)
#         subjectivities.append(sub)
        
        if (id % 10000) == 0: print(id)
#         if (soldierCSVFile == 'cNikonphoto.csv'): print(id)
    
    count = len(tweetCleanedLengths)
    
    # Computing the standard deviation in float64 is more accurate:
    # np.std([1, 2, 3, 4], dtype=np.float64)
    if count > 0:
        result = {
            'count': count,
            'tweet cleaned length mean': np.mean(tweetCleanedLengths),
            'tweet cleaned length std': np.std(tweetCleanedLengths, dtype=np.float64),
            'positive mean': np.mean(positives),
            'positive std': np.std(positives, dtype=np.float64),
            'negative mean': np.mean(negatives),
            'negative std': np.std(negatives, dtype=np.float64),
            'objective mean': np.mean(objectives),
            'objective std': np.std(objectives, dtype=np.float64),
#             'polarity mean': np.mean(polarities),
#             'polarity std': np.std(polarities, dtype=np.float64),
#             'subjectivity mean': np.mean(subjectivities),
#             'subjectivity std': np.std(subjectivities, dtype=np.float64),
        }
    else:
        result = {
            'count': 0,
            'tweet cleaned length mean': 0,
            'tweet cleaned length std': 0,
            'positive mean': 0,
            'positive std': 0,
            'negative mean': 0,
            'negative std': 0,
            'objective mean': 0,
            'objective std': 0,
#             'polarity mean': 0,
#             'polarity std': 0,
#             'subjectivity mean': 0,
#             'subjectivity std': 0
        }
    
    soldierCalDict[soldierCSVFile] = result
    
    print(result)


Calculating... fcharles81.csv
0
{'count': 2483, 'tweet cleaned length mean': 11.342327829238824, 'tweet cleaned length std': 9.236260411089148, 'positive mean': 0.029115606186199367, 'positive std': 0.044574041712805025, 'negative mean': 0.0204517327577071, 'negative std': 0.03073654911977145, 'objective mean': 0.3455796287686272, 'objective std': 0.1944905499641884}
Calculating... GeoffMillard.csv
0
10000
{'count': 10766, 'tweet cleaned length mean': 18.373304848597435, 'tweet cleaned length std': 12.57766025202359, 'positive mean': 0.027207293018808303, 'positive std': 0.02966980835039068, 'negative mean': 0.02408279559047247, 'negative std': 0.024487573547413592, 'objective mean': 0.37189436913867535, 'objective std': 0.14925432920760728}
Calculating... cNikonphoto.csv
0
{'count': 6981, 'tweet cleaned length mean': 8.220312276178198, 'tweet cleaned length std': 8.356399759989467, 'positive mean': 0.011385928194043776, 'positive std': 0.02311930927853645, 'negative mean': 0.007050791

  interactivity=interactivity, compiler=compiler, result=result)


10000
{'count': 16417, 'tweet cleaned length mean': 10.751903514649449, 'tweet cleaned length std': 7.331309342062428, 'positive mean': 0.026173947393283156, 'positive std': 0.03783633939927451, 'negative mean': 0.021171061727956717, 'negative std': 0.031035165530135838, 'objective mean': 0.302284369918384, 'objective std': 0.18510830129581737}
Calculating... StMarthasTable.csv
0
{'count': 1280, 'tweet cleaned length mean': 12.0484375, 'tweet cleaned length std': 7.301487266892531, 'positive mean': 0.02360683004674229, 'positive std': 0.03202365928570955, 'negative mean': 0.016999550830168193, 'negative std': 0.02520255150151766, 'objective mean': 0.3999501758036044, 'objective std': 0.15975653671223597}
Calculating... amwyatt.csv
0
{'count': 201, 'tweet cleaned length mean': 24.73134328358209, 'tweet cleaned length std': 17.440373549292826, 'positive mean': 0.023606384425817785, 'positive std': 0.023152791656148784, 'negative mean': 0.01514800491871138, 'negative std': 0.0146275382776

In [10]:
pd.DataFrame.from_dict(soldierCalDict, orient="index").to_csv("soldierCal.csv")