# Word Counts, Bigrams, and Hashtags

In [1]:
import operator
import re
import string
from collections import Counter
from collections import defaultdict
from itertools import combinations

import vincent
from nltk import bigrams
from nltk.corpus import stopwords

In [2]:
# Every ASCII punctuation character as its own list entry.
punctuation = list(string.punctuation)
# Tokens to discard when building the "clean" term lists: English stop
# words, punctuation, and the project-specific markers 'RT' and 'via'.
stop = [*stopwords.words('english'), *punctuation, 'RT', 'via']
# Co-occurrence table: com[a][b] is an int that starts at 0 on first access.
com = defaultdict(lambda: defaultdict(int))

In [3]:
# Path of the text file that the counting loop opens and reads line by line.
# NOTE(review): presumably one tweet's text per line — confirm against the
# step that writes this file.
fname = '../out/tweet_content_orig.txt'

In [4]:
# Emoticon pattern, written for re.VERBOSE: the layout whitespace and the
# trailing "# ..." notes inside the string are ignored by the regex engine.
# NOTE(review): the mouth class used to list `\]` twice; the redundant entry
# is dropped (matching is unchanged). If `\[` was the intent, confirm and add it.
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(/\\OpP] # Mouth
    )"""

# Token patterns, most specific first. They are joined with '|' when the
# tokenizer regex is compiled, so an earlier entry wins over a later one at
# the same position.
# The URL character class previously contained the HTML-entity residue
# `&amp;`; it is restored to a plain `&`. The extra characters were already
# covered by `[a-z]` and the `$-_` range, so matching is unchanged.
regex_str = [
    emoticons_str,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)',  # anything else
]

In [5]:
# Tokenizer: one capturing group wrapping every alternative in regex_str.
# VERBOSE is required because emoticons_str carries layout whitespace and
# inline notes; matching ignores case.
tokens_re = re.compile(
    '({})'.format('|'.join(regex_str)),
    re.VERBOSE | re.IGNORECASE,
)
# Anchored variant used to test whether a whole token is an emoticon.
emoticon_re = re.compile(
    '^{}$'.format(emoticons_str),
    re.VERBOSE | re.IGNORECASE,
)
# Term whose co-occurring terms are tallied separately by the counting loop.
search_word = 'Lupus'

In [6]:
def tokenize(s):
    """Return all non-overlapping matches of ``tokens_re`` in ``s``, in order."""
    matches = tokens_re.findall(s)
    return matches

In [7]:
def preprocess(s, lowercase=False):
    """Tokenize ``s`` and optionally lowercase the resulting tokens.

    When ``lowercase`` is true, every token is lowercased except those that
    match ``emoticon_re``, which are kept unchanged. Otherwise the tokens are
    returned exactly as ``tokenize`` produced them.
    """
    tokens = tokenize(s)
    if not lowercase:
        return tokens

    normalized = []
    for token in tokens:
        if emoticon_re.search(token):
            # Keep emoticons as written.
            normalized.append(token)
        else:
            normalized.append(token.lower())
    return normalized

In [8]:
# Single pass over the input file. For every line it tallies: raw tokens,
# hashtags, filtered ("clean") terms, bigrams of the clean terms, pairwise
# co-occurrences of clean terms (into `com`), and the clean terms of lines
# that contain `search_word`.
count_all = Counter()          # every token, unfiltered
count_hash = Counter()         # tokens starting with '#'
count_terms_only = Counter()   # tokens left after stop/hashtag/mention filtering
terms_only_bigram = Counter()  # adjacent pairs of filtered tokens
count_search = Counter()       # filtered tokens of lines containing search_word

# Build the lookup set once: testing membership in the `stop` list for every
# token is a linear scan.
stop_set = set(stop)

# NOTE(review): the stop-word test is an exact, case-sensitive match and the
# tokens keep their original case here (preprocess is called without
# lowercase=True), so capitalised forms are not removed by lowercase stop
# entries — confirm whether that is intended.

# The data contains emoji and typographic quotes, so decode explicitly as
# UTF-8 instead of relying on the platform default encoding.
with open(fname, 'r', encoding='utf-8') as f:
    for line in f:
        # Plain-text input: each line is tokenized as-is. (For JSON input the
        # line would have to be parsed with json.loads first.)
        # Tokenize once and derive every view from the same token list.
        terms_all = preprocess(line)
        count_all.update(terms_all)

        # Hashtags only.
        terms_hash = [term for term in terms_all if term.startswith('#')]
        count_hash.update(terms_hash)

        # Terms only: no stop words, no hashtags, no mentions.
        # startswith() takes a tuple (not a list) of alternative prefixes.
        terms_only = [
            term for term in terms_all
            if term not in stop_set and not term.startswith(('#', '@'))
        ]
        count_terms_only.update(terms_only)
        terms_only_bigram.update(bigrams(terms_only))

        # All co-occurrences: every unordered pair of positions in the line,
        # stored under the alphabetically sorted key so (a, b) and (b, a)
        # share one cell. A term paired with itself is skipped.
        for first, second in combinations(terms_only, 2):
            if first != second:
                w1, w2 = sorted((first, second))
                com[w1][w2] += 1

        # Co-occurrences with the search word.
        if search_word in terms_only:
            count_search.update(terms_only)

In [9]:
# Report the 20 most frequent terms on lines that contain the search word.
header = "Co-occurrence for %s:" % search_word
top_search_terms = count_search.most_common(20)
print(header)
print(top_search_terms)
print()

Co-occurrence for Lupus:
[('Lupus', 45515), ('I', 17155), ('’', 7627), ('amp', 4387), ('The', 2845), ('Selena', 2415), ('💜', 1991), ('like', 1963), ('disease', 1898), ('My', 1883), ('know', 1811), ('lupus', 1696), ('It', 1657), ('Gomez', 1637), ('Research', 1609), ("I'm", 1569), ('get', 1567), ('…', 1522), ('Awareness', 1431), ('2', 1369)]



## Co-occurrences

In [10]:
# Sort key shared by both rankings: the count stored at index 1 of each pair.
by_count = operator.itemgetter(1)

# For each term keep its five most frequent partners, as ((t1, t2), count).
com_max = []
for t1, partners in com.items():
    strongest = sorted(partners.items(), key=by_count, reverse=True)[:5]
    com_max.extend(((t1, t2), t2_count) for t2, t2_count in strongest)

# Rank the collected pairs by count and show the top five.
terms_max = sorted(com_max, key=by_count, reverse=True)
print(terms_max[:5])

[(('I', '’'), 30242), (('I', 'lupus'), 24070), (('I', 'Lupus'), 18332), (('lupus', '’'), 14680), (('I', 'amp'), 9933)]


## Most Frequent Terms (not clean)

(Includes punctuation, etc.)

In [11]:
# Raw token frequencies: nothing is filtered, so punctuation and stop words
# are still included.
top_raw_tokens = count_all.most_common(5)
print(top_raw_tokens)

[('.', 171840), (',', 80526), ('to', 58935), ('I', 56467), ('the', 53308)]


## Most Frequent Terms (clean)

In [12]:
# Frequencies after removing stop words, hashtags and mentions.
top_clean_terms = count_terms_only.most_common(25)
print(top_clean_terms)

[('I', 56467), ('lupus', 47252), ('Lupus', 45515), ('’', 28171), ('amp', 13871), ('like', 8155), ('The', 6962), ('know', 6449), ('get', 6129), ('💜', 5943), ("I'm", 5370), ('disease', 5365), ('people', 5364), ('My', 5204), ('Selena', 4887), ('It', 4781), ('one', 4605), ('help', 4408), ('pain', 4383), ('You', 4349), ('️', 3872), ('LUPUS', 3751), ('…', 3732), ('2', 3693), ('time', 3629)]


## Bigram (clean)

In [13]:
# The 25 most frequent bigrams of filtered terms, one (bigram, count) per line.
freq_bigrams = terms_only_bigram.most_common(25)
for bigram_with_count in freq_bigrams:
    print(bigram_with_count)

(('I', '’'), 7239)
(('Selena', 'Gomez'), 2321)
(('💜', '💜'), 1830)
(('It', '’'), 1748)
(('I', 'know'), 1615)
(('lupus', 'erythematosus'), 1555)
(('lupus', 'I'), 1549)
(('I', 'lupus'), 1527)
(('Lupus', 'Research'), 1377)
(('😂', '😂'), 1279)
(('I', 'think'), 1198)
(('Lupus', 'Awareness'), 1181)
(('❤', '️'), 1176)
(('systemic', 'lupus'), 1143)
(('I', "don't"), 1117)
(('SHARE', '4'), 1064)
(('I', 'Lupus'), 1019)
(('Lupus', 'I'), 996)
(('Research', 'Alliance'), 939)
(('lupus', '’'), 898)
(('’', 'lupus'), 890)
(('I', 'love'), 887)
(('4', 'https://t.co/TClwcK0izG'), 874)
(('Lupus', 'Foundation'), 849)
(('autoimmune', 'disease'), 790)


## Most Frequent Hashtags

In [14]:
# The 35 most frequent hashtags, one (hashtag, count) per line.
most_common = count_hash.most_common(35)
for hashtag_with_count in most_common:
    print(hashtag_with_count)

('#lupus', 12653)
('#Lupus', 12106)
('#LupusChat', 3615)
('#lupusawareness', 2431)
('#spoonie', 1209)
('#CelebritySaturday', 1116)
('#chronicpain', 1056)
('#chronicillness', 1039)
('#autoimmune', 1010)
('#LupusAwareness', 939)
('#lupussurvivors', 938)
('#LupusAwarenessMonth', 881)
('#LUPUS', 844)
('#Fibromyalgia', 802)
('#SLE', 727)
('#fibromyalgia', 651)
('#ChronicPain', 523)
('#RA', 514)
('#MS', 472)
('#LupusWarrior', 469)
('#lupuswarrior', 466)
('#health', 453)
('#LupusInColor', 442)
('#arthritis', 429)
('#CRPS', 404)
('#autoimmunedisease', 388)
('#', 385)
('#cancer', 384)
('#KidneyDisease', 384)
('#migraine', 362)
('#Fibro', 350)
('#LupusSurvivors', 336)
('#lupuschat', 333)
('#fibro', 306)
('#ACR17', 273)
