In [1]:
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
import nltk
from nltk.tokenize.casual import TweetTokenizer
import numpy as np
import pandas as pd
from twitter_preprocessor import TwitterPreprocessor
import os
import glob
# from textblob import TextBlob
# from textblob import Blobber
# from textblob.sentiments import NaiveBayesAnalyzer


In [2]:
# preperation
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('sentiwordnet')
nltk.download('stopwords')
stopset = set(stopwords.words('english'))
# blobSenti = Blobber(analyzer = NaiveBayesAnalyzer())

[nltk_data] Downloading package punkt to /Users/cary/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/cary/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/cary/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/cary/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Where stores the total data
soldierCalDict = {}

In [4]:
# file list
soldierCSVs = []
files = os.listdir(os.getcwd() + "/data/civilians")
for file in files:
     if not os.path.isdir(file) and file.endswith(".csv"):
            soldierCSVs.append(file)

In [5]:
# useless columns for analyzing
uselessColumns = [
    "id",
    "conversation_id",
    "place",
    "photos",
    "video",
    "near",
    "geo",
    "source",
    "user_rt_id",
    "user_rt",
    "retweet_id",
    "reply_to",
    "retweet_date",
    "translate",
    "trans_src",
    "trans_dest",
    "link"
]


In [6]:
# convert pos from nltk.pos_tag to SentiWordNet
# n - NOUN 
# v - VERB 
# a - ADJECTIVE 
# s - ADJECTIVE SATELLITE 
# r - ADVERB
# ' ' - others
# a list of nltk tags is here:
# https://www.techrepublic.com/article/the-6-laws-every-cloud-architect-should-know-according-to-werner-vogels/
def sentiWordNetPOSconvetor(pos):
    newtag = ''
    if pos.startswith('NN'):
        newtag='n'
    elif pos.startswith('JJ'):
        newtag='a'
    elif pos.startswith('V'):
        newtag='v'
    elif pos.startswith('R'):
        newtag='r'
    return newtag

In [7]:
# word: single word
# pos: part of speech from nltk.pos_tag
def calSentiForWord(word, pos):
    
    sentis = list(swn.senti_synsets(word, sentiWordNetPOSconvetor(pos)))
    
    if len(sentis) <= 0:
        return 0, 0, 0
    
    # Getting average of all possible sentiments
    positive = 0.0
    negative = 0.0
    objective = 0.0
    count = 0.0
    
    for senti in sentis :
        positive += senti.pos_score()
        negative += senti.neg_score()
        objective += senti.obj_score()
        count += 1
        
    if count <= 0.1:
        return 0, 0, 0
    
    positive /= count
    negative /= count
    objective /= count
    
    return positive, negative, objective

In [8]:
def calSentiForTweet(tweet):
    tempTweet = tweet
    tempTweetCleaned = TwitterPreprocessor(tempTweet).remove_urls().remove_mentions().remove_hashtags().remove_twitter_reserved_words().remove_single_letter_words().remove_numbers(preserve_years=True).remove_blank_spaces().text
    if len(tempTweetCleaned) <= 0:
        return 0, 0, 0, 0
    # miss spelling
#     tempTweetCleaned = str(blobSenti(tempTweetCleaned).correct())
    
#     tempTokens = TweetTokenizer.tokenize(tempTweetCleaned)
    tempTokens = nltk.word_tokenize(tempTweetCleaned)
#     tempTokens = [tempToken.lower() for tempToken in tempTokens if len(tempToken)>2]
#     tempTokens = [tempToken.lower() for tempToken in tempTokens if tempToken.lower() not in stopset and len(tempToken)>2]
    tempTagged = nltk.pos_tag(tempTokens)
    
    positive = 0.0
    negative = 0.0
    objective = 0.0
    count = 0.0
    
#     print(tempTagged)
    
    for w, p in tempTagged:
        w = w.lower()
        po, ne, ob = calSentiForWord(w, p)
        positive += po
        negative += ne
        objective += ob
        count += 1
        
    if count <= 0.1:
        return 0, 0, 0, 0
        
    positive /= count
    negative /= count
    objective /= count
    
#     tb = blobSenti(tempTweetCleaned)
    
    return count, positive, negative, objective

In [9]:
filecont = 0
for soldierCSVFile in soldierCSVs:
    if soldierCSVFile == 'cleaned_vet_tweet_df.csv': continue
    if soldierCSVFile == 'combined_civilians.csv': continue

    print('Calculating... ' + soldierCSVFile)
    
    tempDF = pd.read_csv(os.getcwd() + "/data/civilians/" + soldierCSVFile, encoding='utf8')
    tempDF.dropna(subset=['tweet'], inplace=True) # clear empty tweets
    tempDF.drop(uselessColumns, axis=1, inplace=True)
    
    positives = []
    negatives = []
    objectives = []
#     polarities = []
#     subjectivities = []
    tweetCleanedLengths = []
    
    
    
    for id, row in tempDF.iterrows():
        tokens, po, ne, ob = calSentiForTweet(row['tweet'])
        tweetCleanedLengths.append(tokens)
        positives.append(po)
        negatives.append(ne)
        objectives.append(ob)
#         polarities.append(pola)
#         subjectivities.append(sub)
        
        if (id % 2000) == 0: print(id)
#         if (soldierCSVFile == 'cNikonphoto.csv'): print(id)
    
    count = len(tweetCleanedLengths)
    
    # Computing the standard deviation in float64 is more accurate:
    # np.std([1, 2, 3, 4], dtype=np.float64)
    if count > 0:
        result = {
            'count': count,
            'tweet cleaned length mean': np.mean(tweetCleanedLengths),
            'tweet cleaned length std': np.std(tweetCleanedLengths, dtype=np.float64),
            'positive mean': np.mean(positives),
            'positive std': np.std(positives, dtype=np.float64),
            'negative mean': np.mean(negatives),
            'negative std': np.std(negatives, dtype=np.float64),
            'objective mean': np.mean(objectives),
            'objective std': np.std(objectives, dtype=np.float64),
#             'polarity mean': np.mean(polarities),
#             'polarity std': np.std(polarities, dtype=np.float64),
#             'subjectivity mean': np.mean(subjectivities),
#             'subjectivity std': np.std(subjectivities, dtype=np.float64),
        }
    else:
        result = {
            'count': 0,
            'tweet cleaned length mean': 0,
            'tweet cleaned length std': 0,
            'positive mean': 0,
            'positive std': 0,
            'negative mean': 0,
            'negative std': 0,
            'objective mean': 0,
            'objective std': 0,
#             'polarity mean': 0,
#             'polarity std': 0,
#             'subjectivity mean': 0,
#             'subjectivity std': 0
        }
    
    soldierCalDict[filecont] = result
    filecont += 1
#     print(result)


Calculating... myrna99.csv
0
.csvulating... KBDeSalvo
0
Calculating... HTracyDavido.csv
0
2000
Calculating... CenterdinOaklnd.csv
0
2000
4000
Calculating... satyanadella.csv
0
Calculating... jeffrreyford.csv
0
Calculating... CarolynRife.csv
0
Calculating... bexsmith2303.csv
0
.csvulating... VivianaLongo
0
Calculating... HanninenRaija.csv
0
Calculating... JoshPickett22_.csv
0
Calculating... sabema11_.csv
0
2000
4000
6000
Calculating... msteGeorge_.csv
0
Calculating... amandasilveir.csv
0
Calculating... courageousgirl2_.csv
0
2000
4000
6000
Calculating... ysik696_yesica.csv
0
Calculating... TheBagLadyBaySt.csv
0
Calculating... bootleg55_.csv
0
.csvulating... sventennis
0
2000
4000
6000
8000
Calculating... only1lovelylady_.csv
0
Calculating... Fold_Alot.csv
0
Calculating... srininad.csv
0
Calculating... dootsonlady.csv
0
Calculating... winecc.csv
0
Calculating... smodisette.csv
0
Calculating... poopthought.csv
0
Calculating... amityron112.csv
0
Calculating... ShowalterMG.csv
0
.csvulating

  interactivity=interactivity, compiler=compiler, result=result)


0
2000
4000
6000
8000
10000
12000
14000
16000
18000
20000
22000
24000
26000
28000
30000
32000
34000
36000
38000
40000
42000
44000
Calculating... lindseywasson.csv
0
2000
4000
6000
8000
10000
12000
Calculating... mattdoty.csv
0
Calculating... blessings4life.csv
0
Calculating... thedrillsgt_.csv
0
Calculating... mamalium.csv
0
Calculating... whatlisacooks.csv
0
2000
Calculating... marissssa_mcbri.csv
0
Calculating... Forest_Theater_.csv
0
Calculating... Car_o_lina.csv
0
2000
Calculating... Blargers.csv
0
Calculating... ChrisVossPodcas.csv
0
2000
4000
6000
8000
10000
12000
Calculating... thejoecardamone_.csv
0
Calculating... AlkieshaK.csv
0
.csvulating... jsoltero
0
2000
4000
Calculating... Tarraccas.csv
0
Calculating... LUVVAJ.csv
0
2000
4000
6000
Calculating... GerrickB_.csv
0
Calculating... OldTomYoung_.csv
0
Calculating... nabe1.csv
0
.csvulating... HussnainMahroof
0
Calculating... justinbernier_.csv
0
Calculating... tomglanz.csv
0
2000
4000
Calculating... Cliu00.csv
0
Calculating... 

  interactivity=interactivity, compiler=compiler, result=result)


0
2000
4000
6000
8000
10000
12000
14000
16000
18000
20000
22000
24000
26000
28000
30000
32000
34000
36000
38000
40000
Calculating... carioke.csv
0
.csvulating... williamready
0
Calculating... TahoeRCD.csv
0
Calculating... CArecycler.csv
0
Calculating... AleBritoFlores.csv
0
Calculating... hergus_.csv
0
Calculating... mizzslim5.csv
0
Calculating... UdiSch.csv
0
Calculating... renegoscinny.csv
0
Calculating... Christa_Belle.csv
0
Calculating... Gimmie18USC2381.csv
0
2000
Calculating... JuanPierreBowly.csv
0
Calculating... lilgde90_.csv
0
2000
Calculating... SpilledInkRepU.csv
0
Calculating... sundarpichai.csv
0
Calculating... mongabay.csv
0
2000
4000
6000
8000
Calculating... AmandaK0812.csv
0
Calculating... rajeshsawhney.csv
0
2000
4000
6000
8000
10000
12000
Calculating... jpaul237_.csv
0
2000
Calculating... OCSportsXchange.csv
0
Calculating... BPositive104.csv
0
Calculating... dwalden0726.csv
0
.csvulating... DohertyShannen
0
2000
4000
6000
8000
Calculating... urbanchillage_.csv
0
2000


In [10]:
# pd.DataFrame.from_dict(soldierCalDict, orient="index").to_csv("soldierCal.csv")
pd.DataFrame.from_dict(soldierCalDict, orient="index").to_csv("civilianCal.csv")