In [1]:
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
import nltk
import numpy as np
import pandas as pd
from twitter_preprocessor import TwitterPreprocessor
import os
import glob
from collections import Counter

In [2]:
# preperation
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('sentiwordnet')
nltk.download('stopwords')
stopset = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /Users/cary/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/cary/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/cary/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/cary/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
wordFrqSol= {}
results = {}

In [4]:
# file list
soldierCSVs = []
files = os.listdir(os.getcwd() + "/data/soldiers")
for file in files:
     if not os.path.isdir(file) and file.endswith(".csv"):
            soldierCSVs.append(file)

In [5]:
# useless columns for analyzing
uselessColumns = [
    "id",
    "conversation_id",
    "place",
    "photos",
    "video",
    "near",
    "geo",
    "source",
    "user_rt_id",
    "user_rt",
    "retweet_id",
    "reply_to",
    "retweet_date",
    "translate",
    "trans_src",
    "trans_dest",
    "link"
]

In [6]:
def calFrqForTweet(tweet):
    tempTweet = tweet
    tempTweetCleaned = TwitterPreprocessor(tempTweet).remove_urls().remove_mentions().remove_hashtags().remove_twitter_reserved_words().remove_single_letter_words().remove_numbers(preserve_years=True).remove_blank_spaces().text    
    frq = {}
    
    if len(tempTweetCleaned) <= 0:
        return 0, frq
    
    tempTokens = nltk.word_tokenize(tempTweetCleaned)
#     tempTokens = [tempToken.lower() for tempToken in tempTokens if len(tempToken)>2]
#     tempTokens = [tempToken.lower() for tempToken in tempTokens if tempToken.lower() not in stopset and len(tempToken)>2]
    tempTagged = nltk.pos_tag(tempTokens)
    
    count = 0
    for w, p in tempTagged:
        count += 1
        if p.startswith('J'):
            key = w
            if key in frq:
                frq[key] += 1
            else:
                frq[key] = 1
    
    return count, frq

In [7]:
def genTable(res, frqList):
    print(frqList)
    num = 0
    for w, c in frqList:
        num += 1
        key = 'list-' + str(num)
        res[key + '-w'] = w
        res[key + '-f'] = c * 1.0 / res['ttlLen']
    
    return res

In [None]:
for soldierCSVFile in soldierCSVs:
    if soldierCSVFile == 'cleaned_vet_tweet_df.csv': continue
    print('Calculating... ' + soldierCSVFile)
    
    tempDF = pd.read_csv(os.getcwd() + "/data/soldiers/" + soldierCSVFile, encoding='utf8')
    tempDF.dropna(subset=['tweet'], inplace=True) # clear empty tweets
    tempDF.drop(uselessColumns, axis=1, inplace=True)
    
    singleFRQ = {}
    ttlLen = 0
    tCount = 0

    for id, row in tempDF.iterrows():
        count, freq = calFrqForTweet(row['tweet'])
        singleFRQ = dict(Counter(singleFRQ) + Counter(freq))
        ttlLen += count
        tCount += 1
        if (id % 1000 ) == 0: print(id)
    
    wordFrqSol = dict(Counter(singleFRQ) + Counter(wordFrqSol))
    
    frqList = sorted(singleFRQ.items(), key = lambda x: x[1], reverse=True)[:10]
        
    res = {
        'count': tCount,
        'ttlLen': ttlLen,
    }
    
    res = genTable(res, frqList)
    
    results[soldierCSVFile] = res
    
    print(res)
    

Calculating... fcharles81.csv
0
1000
2000
[('good', 63), ('nice', 51), ('new', 42), ('bad', 39), ('more', 39), ('Good', 33), ('real', 33), ('great', 30), ('big', 27), ('white', 27)]
{'count': 2483, 'ttlLen': 28163, 'list-1-w': 'good', 'list-1-f': 0.002236977594716472, 'list-2-w': 'nice', 'list-2-f': 0.0018108866242942869, 'list-3-w': 'new', 'list-3-f': 0.001491318396477648, 'list-4-w': 'bad', 'list-4-f': 0.0013847956538721017, 'list-5-w': 'more', 'list-5-f': 0.0013847956538721017, 'list-6-w': 'Good', 'list-6-f': 0.001171750168661009, 'list-7-w': 'real', 'list-7-f': 0.001171750168661009, 'list-8-w': 'great', 'list-8-f': 0.0010652274260554628, 'list-9-w': 'big', 'list-9-f': 0.0009587046834499166, 'list-10-w': 'white', 'list-10-f': 0.0009587046834499166}
Calculating... GeoffMillard.csv
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
[('’', 466), ('good', 420), ('white', 273), ('great', 264), ('little', 204), ('more', 186), ('black', 177), ('new', 174), ('late', 168), ('other', 165)]


  interactivity=interactivity, compiler=compiler, result=result)


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
[('good', 522), ('ur', 518), ('real', 198), ('bad', 185), ('new', 179), ('little', 171), ('nice', 167), ('....', 154), ('other', 153), ('sexy', 149)]
{'count': 16417, 'ttlLen': 176514, 'list-1-w': 'good', 'list-1-f': 0.0029572725109623032, 'list-2-w': 'ur', 'list-2-f': 0.002934611418924278, 'list-3-w': 'real', 'list-3-f': 0.001121724055882253, 'list-4-w': 'bad', 'list-4-f': 0.0010480755067586707, 'list-5-w': 'new', 'list-5-f': 0.0010140838687016328, 'list-6-w': 'little', 'list-6-f': 0.0009687616846255821, 'list-7-w': 'nice', 'list-7-f': 0.0009461005925875568, 'list-8-w': '....', 'list-8-f': 0.0008724520434639745, 'list-9-w': 'other', 'list-9-f': 0.0008667867704544682, 'list-10-w': 'sexy', 'list-10-f': 0.0008441256784164429}
Calculating... StMarthasTable.csv
0
1000
[('impact', 78), ('real', 61), ('good', 18), ('great', 11), ('Happy', 11), ('Christian', 9), ('..', 8), ('Live', 8), ('Amazing', 8), ('bes

In [None]:
pd.DataFrame.from_dict(results, orient="index").to_csv("soldierFreqCal.csv")

In [None]:
topList = sorted(wordFrqSol.items(), key = lambda x: x[1], reverse=True)[:50]

In [None]:
topList