## Configuring settings to tokenize a tweet

In [None]:
# import libraries
from nltk.tokenize import word_tokenize
import re

In [None]:
# define regular expressions
#
# Every pattern below is compiled with re.VERBOSE, so whitespace and
# "# ..." comments inside a pattern are ignored and a literal '#' must be
# escaped (see the hash-tag pattern).

emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\[/\\OpP] # Mouth
    )"""

# Alternatives are tried left to right, so the more specific patterns
# (emoticons, tags, mentions, hash-tags, URLs) come before the generic ones.
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs

    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

# One capturing group around the whole alternation: findall() then returns
# the matched token text for every match.
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
# Anchored variant used to test whether a whole token is an emoticon.
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)

In [None]:
# functions which help in tokenizing a tweet

def tokenize(s):
    """Return every token that ``tokens_re`` finds in the string ``s``.

    The result is the list produced by ``tokens_re.findall``, in the order
    the matches occur in ``s``.
    """
    matches = tokens_re.findall(s)
    return matches
 
def preprocess(s, lowercase=False):
    """Tokenize ``s`` and optionally lower-case the resulting tokens.

    Parameters:
        s: the text to tokenize.
        lowercase: when true, every token is lower-cased except those that
            ``emoticon_re`` matches, which are returned unchanged.

    Returns:
        The list of tokens from ``tokenize(s)``, lower-cased as described
        above when ``lowercase`` is true.
    """
    tokens = tokenize(s)
    if not lowercase:
        return tokens

    normalised = []
    for token in tokens:
        if emoticon_re.search(token):
            # emoticons keep their case (':D' must not become ':d')
            normalised.append(token)
        else:
            normalised.append(token.lower())
    return normalised

In [None]:
# example: tokenize a sample tweet and show the resulting tokens

tweet = "RT @marcobonzanini: just an example! :D http://example.com #NLP"
example_tokens = preprocess(tweet)
print(example_tokens)
# expected output:
# ['RT', '@marcobonzanini', ':', 'just', 'an', 'example', '!', ':D', 'http://example.com', '#NLP']

## Configuring settings for the word count

In [None]:
# import additional libraries for word counting
import operator
import json
from collections import Counter
from nltk.corpus import stopwords
import string

In [None]:
# build the list of terms to exclude from the word count: the NLTK English
# stop words, every punctuation character, and the terms 'rt' and 'via'

punctuation = list(string.punctuation)
stop = [*stopwords.words('english'), *punctuation, 'rt', 'via']

## Performing the word count

In [None]:
# carry out word count on UFC 197 original tweets dataset

fname = 'ufc197_original.json'

count_all = Counter()

# the file is read line by line, one JSON document per line
with open(fname, 'r', encoding='utf-8') as f:
    for line in f:
        # skip blank lines, which json.loads would reject
        if not line.strip():
            continue

        tweet = json.loads(line)

        if 'text' in tweet:
            # create a list with all the terms that are not stop words.
            # preprocess() is called without lowercase=True, so tokens keep
            # their original case ('RT', 'Via'); the stop list holds
            # lower-case entries such as 'rt' and 'via', so the lower-cased
            # token is what gets compared. Counted terms keep their case.
            terms_all = [term for term in preprocess(tweet['text'])
                         if term.lower() not in stop]

            # update counter
            count_all.update(terms_all)

In [None]:
# print the n most frequent words

n = 50

top_terms = count_all.most_common(n)
print(top_terms)

## Storing the top 50 words into CSV file

In [None]:
# import csv library
import csv

In [None]:
# write each word and its count to a CSV file, one row per word,
# preceded by a header row

with open('ufc197wordcount.csv', 'w', newline='', encoding='utf-8') as csvfile:
    columnNames = ['word', 'count']
    writer = csv.writer(csvfile, delimiter = ',', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(columnNames)

    # export the same top-n list that was printed above (n is set in that
    # cell) instead of repeating the literal 50
    writer.writerows(count_all.most_common(n))