In [1]:
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
import nltk
import numpy as np
import pandas as pd
from twitter_preprocessor import TwitterPreprocessor
import os
import glob
from collections import Counter

In [2]:
# preperation
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('sentiwordnet')
nltk.download('stopwords')
stopset = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /Users/cary/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/cary/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/cary/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/cary/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
wordFrqSol= {}
results = {}

In [4]:
# file list
soldierCSVs = []
files = os.listdir(os.getcwd() + "/data/civilians")
for file in files:
     if not os.path.isdir(file) and file.endswith(".csv"):
            soldierCSVs.append(file)

In [5]:
# useless columns for analyzing
uselessColumns = [
    "id",
    "conversation_id",
    "place",
    "photos",
    "video",
    "near",
    "geo",
    "source",
    "user_rt_id",
    "user_rt",
    "retweet_id",
    "reply_to",
    "retweet_date",
    "translate",
    "trans_src",
    "trans_dest",
    "link"
]

In [6]:
def calFrqForTweet(tweet):
    tempTweet = tweet
    tempTweetCleaned = TwitterPreprocessor(tempTweet).remove_urls().remove_mentions().remove_hashtags().remove_twitter_reserved_words().remove_single_letter_words().remove_numbers(preserve_years=True).remove_blank_spaces().text    
    frq = {}
    
    if len(tempTweetCleaned) <= 0:
        return 0, frq
    
    tempTokens = nltk.word_tokenize(tempTweetCleaned)
#     tempTokens = [tempToken.lower() for tempToken in tempTokens if len(tempToken)>2]
#     tempTokens = [tempToken.lower() for tempToken in tempTokens if tempToken.lower() not in stopset and len(tempToken)>2]
    tempTagged = nltk.pos_tag(tempTokens)
    
    count = 0
    for w, p in tempTagged:
        count += 1
        if p.startswith('J'):
            key = w.lower() # !!!!!!!!!
            if key in frq:
                frq[key] += 1
            else:
                frq[key] = 1
    
    return count, frq

In [7]:
def genTable(res, frqList):
    print(frqList)
    num = 0
    for w, c in frqList:
        num += 1
        key = 'list-' + str(num)
        res[key + '-w'] = w
        res[key + '-f'] = c * 1.0 / res['ttlLen']
    
    return res

In [8]:
for soldierCSVFile in soldierCSVs:
    if soldierCSVFile == 'cleaned_vet_tweet_df.csv': continue
    if soldierCSVFile == 'combined_civilians.csv': continue

    print('Calculating... ' + soldierCSVFile)
    
    tempDF = pd.read_csv(os.getcwd() + "/data/civilians/" + soldierCSVFile, encoding='utf8')
    tempDF.dropna(subset=['tweet'], inplace=True) # clear empty tweets
    tempDF.drop(uselessColumns, axis=1, inplace=True)
    
    singleFRQ = {}
    ttlLen = 0
    tCount = 0

    for id, row in tempDF.iterrows():
        count, freq = calFrqForTweet(row['tweet'])
        singleFRQ = dict(Counter(singleFRQ) + Counter(freq))
        ttlLen += count
        tCount += 1
        if (id % 1000 ) == 0: print(id)
    
    wordFrqSol = dict(Counter(singleFRQ) + Counter(wordFrqSol))
    
    frqList = sorted(singleFRQ.items(), key = lambda x: x[1], reverse=True)[:10]
        
    res = {
        'count': tCount,
        'ttlLen': ttlLen,
    }
    
    res = genTable(res, frqList)
    
    results[soldierCSVFile] = res
    
#     print(res)
    

Calculating... myrna99.csv
0
[('limited', 43), ('next', 34), ('happy', 26), ('other', 26), ('good', 16), ('’', 16), ('sorry', 12), ('great', 10), ('cute', 8), ('best', 7)]
.csvulating... KBDeSalvo
0
1000
[('great', 146), ('public', 69), ('happy', 44), ('such', 42), ('more', 36), ('good', 30), ('new', 29), ('impoant', 28), ('proud', 27), ('social', 24)]
Calculating... HTracyDavido.csv
0
1000
2000
[('great', 92), ('orite', 50), ('marvelous', 39), ('fantastic', 37), ('good', 36), ('fabulous', 36), ('next', 32), ('beautiful', 32), ('true', 30), ('more', 29)]
Calculating... CenterdinOaklnd.csv
0
1000
2000
3000
4000
5000
[('italian', 326), ('fine', 319), ('personal', 319), ('*', 161), ('non-invasive', 161), ('one-year', 132), ('share-a-little', 120), ('good', 99), ('1-hour', 87), ('on-the-road', 82)]
Calculating... satyanadella.csv
0
[('new', 74), ('more', 39), ('great', 37), ('re', 32), ('many', 23), ('digital', 22), ('incredible', 22), ('powerful', 17), ('next', 15), ('intelligent', 15)]
C

  interactivity=interactivity, compiler=compiler, result=result)


0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
[('new', 1210), ('good', 844), ('live', 485), ('next', 446), ('more', 434), ('first', 411), ('great', 368), ('best', 367), ('ready', 362), ('awesome', 355)]
Calculating... lindseywasson.csv
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
[('good', 273), ('more', 221), ('’', 209), ('sure', 155), ('much', 143), ('last', 137), ('other', 132), ('bad', 132), ('great', 126), ('many', 123)]
Calculating... mattdoty.csv
0
[('’', 2), ('bad', 2), ('good', 1), ('such', 1), ('current', 1), ('garbage', 1), ('orite', 1), ('buttermilk-based', 1), ('salad', 1), ('old', 1)]
Calculating... blessings4life.csv
0
[('new', 9), ('first', 4), ('great', 4), ('possible', 3), ('more', 3), ('worst', 3), ('top', 3), ('high', 3), ('4-5', 3),

  interactivity=interactivity, compiler=compiler, result=result)


0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
[('good', 3535), ('many', 1535), ('last', 1087), ('more', 1078), ('great', 1048), ('big', 958), ('best', 947), ('indian', 945), ('much', 919), ('first', 672)]
Calculating... carioke.csv
0
[('more', 7), ('white', 7), ('same', 6), ('first', 6), ('great', 5), ('last', 5), ('many', 5), ('human', 5), ('good', 5), ('new', 5)]
.csvulating... williamready
0
[('great', 81), ('mobile', 61), ('more', 54), ('new', 46), ('/', 25), ('good', 24), ('big', 22), ('small', 20), ('…', 19), ('top', 17)]
Calculating... TahoeRCD.csv
0
[('invasive', 53), ('great', 49), ('aquatic', 40), ('more', 37), ('new', 36), ('free', 28), ('due', 24), ('beautiful', 17), ('last', 15), ('local', 15)]
Calculating... CArecycler.csv
0
[('free', 402), ('available', 392), ('new', 11), ('least', 6), (

In [9]:
# pd.DataFrame.from_dict(results, orient="index").to_csv("soldierFreqCal.csv")
pd.DataFrame.from_dict(results, orient="index").to_csv("civilianFreqCal.csv")

In [10]:
topList = sorted(wordFrqSol.items(), key = lambda x: x[1], reverse=True)[:100]

In [11]:
topList

[('good', 18344),
 ('more', 15394),
 ('new', 15153),
 ('great', 12745),
 ('last', 9785),
 ('happy', 8338),
 ('best', 7950),
 ('many', 7240),
 ('next', 7215),
 ('much', 6352),
 ('first', 5734),
 ('other', 5450),
 ('big', 5375),
 ('’', 4661),
 ('free', 4259),
 ('sure', 4088),
 ('such', 3740),
 ('better', 3701),
 ('same', 3658),
 ('few', 3443),
 ('bad', 3392),
 ('little', 3315),
 ('real', 3260),
 ('old', 3182),
 ('special', 2912),
 ('nice', 2892),
 ('right', 2863),
 ('ready', 2769),
 ('true', 2753),
 ('live', 2620),
 ('own', 2605),
 ('beautiful', 2559),
 ('awesome', 2541),
 ('most', 2455),
 ('>', 2455),
 ('indian', 2434),
 ('wonderful', 2393),
 ('top', 2349),
 ('..', 2334),
 ('“', 2195),
 ('different', 2144),
 ('wrong', 2031),
 ('hard', 2028),
 ('full', 2017),
 ('long', 2008),
 ('latest', 2008),
 ('orite', 1932),
 ('…', 1928),
 ('least', 1871),
 ('amazing', 1841),
 ('only', 1824),
 ('young', 1726),
 ('daily', 1696),
 ('high', 1675),
 ('re', 1672),
 ('*', 1660),
 ('cool', 1632),
 ('social'

In [12]:
# results