In [1]:
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
import nltk
import numpy as np
import pandas as pd
from twitter_preprocessor import TwitterPreprocessor
import os
import glob
from collections import Counter

In [2]:
# Preparation: fetch the NLTK resources listed below (a no-op when they are
# already up to date) and cache the English stop-word list as a set.
for _nltkPackage in ('punkt', 'averaged_perceptron_tagger', 'sentiwordnet', 'stopwords'):
    nltk.download(_nltkPackage)

stopset = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /Users/cary/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/cary/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/cary/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/cary/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Accumulators filled in by the per-file loop further down:
#   wordFrqSol - adjective key -> count, summed over every processed CSV
#   results    - CSV file name -> summary dict produced for that file
wordFrqSol, results = {}, {}

In [4]:
# File list: names of the CSV files that sit directly inside ./data/soldiers.
soldierDir = os.path.join(os.getcwd(), "data", "soldiers")

# os.listdir returns names in arbitrary order; sorting makes every run process
# the files (and therefore emit the result rows) in the same order.
files = sorted(os.listdir(soldierDir))

# os.listdir yields bare names, so each one is joined with the directory
# before the isdir check. Testing the bare name would resolve it against the
# current working directory rather than the data directory.
soldierCSVs = [
    name for name in files
    if name.endswith(".csv") and not os.path.isdir(os.path.join(soldierDir, name))
]

In [5]:
# Columns that the frequency analysis does not need; they are dropped from
# every soldier CSV right after it is loaded.
uselessColumns = [
    "id", "conversation_id",
    "place", "photos", "video", "near", "geo", "source",
    "user_rt_id", "user_rt", "retweet_id", "reply_to", "retweet_date",
    "translate", "trans_src", "trans_dest",
    "link",
]

In [6]:
def calFrqForTweet(tweet):
    """Count the tokens and the adjective occurrences of a single tweet.

    The tweet is cleaned with ``TwitterPreprocessor(...).fully_preprocess()``,
    tokenized with NLTK, lower-cased, stripped of English stop words and of
    tokens that are two characters or shorter, and finally POS-tagged.

    Returns a ``(count, frq)`` tuple:
      * ``count`` - number of tokens that survived the filtering
      * ``frq``   - dict mapping ``"<word>_<tag>"`` to its number of
                    occurrences, for every token whose tag starts with ``J``

    A tweet whose cleaned text is empty yields ``(0, {})``.
    """
    cleanedText = TwitterPreprocessor(tweet).fully_preprocess().text
    if len(cleanedText) == 0:
        return 0, {}

    # NOTE(review): tagging runs on the already filtered, lower-cased tokens,
    # so the tagger sees less context than the cleaned sentence had - confirm
    # that this is intended.
    keptTokens = [
        rawToken.lower()
        for rawToken in nltk.word_tokenize(cleanedText)
        if len(rawToken) > 2 and rawToken.lower() not in stopset
    ]
    taggedTokens = nltk.pos_tag(keptTokens)

    # Only tags beginning with 'J' are tallied; every tagged token still
    # contributes to the returned token count.
    adjectiveCounts = Counter(
        word + "_" + tag for word, tag in taggedTokens if tag.startswith('J')
    )
    return len(taggedTokens), dict(adjectiveCounts)

In [7]:
def genTable(res, frqList):
    """Flatten a ranked frequency list into ``res`` and return ``res``.

    Parameters:
        res     - dict that is updated in place; its ``'ttlLen'`` entry is the
                  denominator used for the relative frequencies
        frqList - iterable of ``(word, count)`` pairs, already in rank order

    For the N-th pair (1-based) two entries are written:
        ``'list-N-w'`` - the word
        ``'list-N-f'`` - count divided by ``res['ttlLen']``

    The raw ``frqList`` is printed before it is processed.
    """
    print(frqList)
    for rank, (word, occurrences) in enumerate(frqList, start=1):
        prefix = 'list-{}'.format(rank)
        res[prefix + '-w'] = word
        # 'ttlLen' is looked up per entry, so an empty frqList never touches it.
        res[prefix + '-f'] = occurrences * 1.0 / res['ttlLen']

    return res

In [8]:
# Start the corpus-wide total from scratch so that re-running this cell does
# not add the same files to it a second time.
wordFrqSol = {}

for soldierCSVFile in soldierCSVs:
    # This one file is excluded from the per-file statistics.
    if soldierCSVFile == 'cleaned_vet_tweet_df.csv':
        continue
    print('Calculating... ' + soldierCSVFile)

    tempDF = (
        pd.read_csv(os.path.join(os.getcwd(), "data", "soldiers", soldierCSVFile), encoding='utf8')
        .dropna(subset=['tweet'])  # clear empty tweets
        # errors='ignore': a CSV that lacks some of these columns is still usable.
        .drop(columns=uselessColumns, errors='ignore')
    )

    # Per-file tallies: adjective key -> count, total kept tokens, tweet count.
    singleFRQ = Counter()
    ttlLen = 0
    tCount = 0

    for rowLabel, tweetText in tempDF['tweet'].items():
        count, freq = calFrqForTweet(tweetText)
        # In-place update instead of rebuilding two Counters for every tweet.
        singleFRQ.update(freq)
        ttlLen += count
        tCount += 1
        # Progress marker; rowLabel is the original CSV row label, so labels of
        # rows removed by dropna are simply never printed.
        if (rowLabel % 1000) == 0:
            print(rowLabel)

    # Fold this file's counts into the corpus-wide total.
    wordFrqSol = dict(singleFRQ + Counter(wordFrqSol))

    # Ten most frequent adjective keys of this file (ties keep first-seen order).
    frqList = sorted(singleFRQ.items(), key=lambda x: x[1], reverse=True)[:10]

    res = genTable({'count': tCount, 'ttlLen': ttlLen}, frqList)

    results[soldierCSVFile] = res

    print(res)
    

Calculating... fcharles81.csv
0
1000
2000
[('good_JJ', 93), ('great_JJ', 57), ('nice_JJ', 54), ('new_JJ', 45), ('bad_JJ', 42), ('real_JJ', 36), ('dont_JJ', 33), ('cant_JJ', 33), ('white_JJ', 33), ('awesome_JJ', 30)]
{'count': 2483, 'ttlLen': 13048, 'list-1-w': 'good_JJ', 'list-1-f': 0.007127529123237278, 'list-2-w': 'great_JJ', 'list-2-f': 0.0043684855916615575, 'list-3-w': 'nice_JJ', 'list-3-f': 0.00413856529736358, 'list-4-w': 'new_JJ', 'list-4-f': 0.0034488044144696506, 'list-5-w': 'bad_JJ', 'list-5-f': 0.003218884120171674, 'list-6-w': 'real_JJ', 'list-6-f': 0.0027590435315757206, 'list-7-w': 'dont_JJ', 'list-7-f': 0.0025291232372777438, 'list-8-w': 'cant_JJ', 'list-8-f': 0.0025291232372777438, 'list-9-w': 'white_JJ', 'list-9-f': 0.0025291232372777438, 'list-10-w': 'awesome_JJ', 'list-10-f': 0.002299202942979767}
Calculating... GeoffMillard.csv
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
[('good_JJ', 447), ('great_JJ', 330), ('white_JJ', 297), ('little_JJ', 207), ('new_JJ'

  interactivity=interactivity, compiler=compiler, result=result)


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
[('good_JJ', 559), ('real_JJ', 212), ('new_JJ', 200), ('bad_JJ', 191), ('little_JJ', 181), ('dont_JJ', 179), ('cant_JJ', 164), ('nice_JJ', 161), ('many_JJ', 131), ('lol_JJ', 119)]
{'count': 16417, 'ttlLen': 77107, 'list-1-w': 'good_JJ', 'list-1-f': 0.007249666048478089, 'list-2-w': 'real_JJ', 'list-2-f': 0.0027494261221419586, 'list-3-w': 'new_JJ', 'list-3-f': 0.00259379822843581, 'list-4-w': 'bad_JJ', 'list-4-f': 0.0024770773081561983, 'list-5-w': 'little_JJ', 'list-5-f': 0.002347387396734408, 'list-6-w': 'dont_JJ', 'list-6-f': 0.00232144941445005, 'list-7-w': 'cant_JJ', 'list-7-f': 0.0021269145473173643, 'list-8-w': 'nice_JJ', 'list-8-f': 0.002088007573890827, 'list-9-w': 'many_JJ', 'list-9-f': 0.0016989378396254555, 'list-10-w': 'lol_JJ', 'list-10-f': 0.001543309945919307}
Calculating... StMarthasTable.csv
0
1000
[('impact_JJ', 78), ('real_JJ', 66), ('live_JJ', 60), ('vocal_JJ', 48), ('good_JJ', 2

In [9]:
# One row per processed CSV file (the dict keys become the index), written to
# soldierFreqCal.csv in the current working directory.
resultsTable = pd.DataFrame.from_dict(results, orient="index")
resultsTable.to_csv("soldierFreqCal.csv")

In [14]:
# The 50 highest-count entries of the corpus-wide tally as (key, count) pairs,
# largest first; equal counts keep the order in which they were first stored.
topList = Counter(wordFrqSol).most_common(50)

In [15]:
# Display the (key, count) pairs selected above, highest count first.
topList

[('good_JJ', 18196),
 ('great_JJ', 12757),
 ('new_JJ', 11980),
 ('many_JJ', 8516),
 ('much_JJ', 7989),
 ('right_JJ', 6863),
 ('last_JJ', 6832),
 ('happy_JJ', 6380),
 ('real_JJ', 6374),
 ('military_JJ', 6115),
 ('best_JJS', 6027),
 ('sure_JJ', 5979),
 ('bad_JJ', 5895),
 ('american_JJ', 5305),
 ('big_JJ', 5088),
 ('true_JJ', 4976),
 ('little_JJ', 4904),
 ('white_JJ', 4673),
 ('wrong_JJ', 4629),
 ('first_JJ', 4483),
 ('free_JJ', 4467),
 ('old_JJ', 4392),
 ('next_JJ', 4217),
 ('cant_JJ', 4112),
 ('dont_JJ', 3799),
 ('black_JJ', 3338),
 ('live_JJ', 3326),
 ('ive_JJ', 3242),
 ('hard_JJ', 3221),
 ('open_JJ', 3187),
 ('awesome_JJ', 2924),
 ('nice_JJ', 2833),
 ('national_JJ', 2832),
 ('full_JJ', 2802),
 ('trump_JJ', 2761),
 ('least_JJS', 2712),
 ('better_JJR', 2601),
 ('public_JJ', 2478),
 ('want_JJ', 2465),
 ('political_JJ', 2465),
 ('different_JJ', 2422),
 ('stupid_JJ', 2397),
 ('know_JJ', 2351),
 ('ready_JJ', 2347),
 ('high_JJ', 2342),
 ('medical_JJ', 2277),
 ('federal_JJ', 2152),
 ('long_JJ