<h4><u>This experiment considers one tweet as one document.</u></h4>

In [15]:
%matplotlib inline
import pandas as pd
import pyLDAvis.gensim
import warnings
import numpy as np
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from functools32 import lru_cache
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import re
import gensim
from gensim import corpora, models


# turn on pyLDAvis' notebook integration for this session
pyLDAvis.enable_notebook()
# suppress every Python warning from here on (keeps cell outputs clean,
# but also hides deprecation notices from the libraries imported above)
warnings.filterwarnings('ignore')

In [16]:
# initializing lemmatizer
# NOTE(review): `stemmer` is not referenced by any of the visible cells -
# confirm whether it is still needed.
stemmer = SnowballStemmer("english")
wordnet_lemmatizer = WordNetLemmatizer()
# memoized wrapper around WordNetLemmatizer.lemmatize: repeated calls with
# the same arguments are answered from an LRU cache of up to 50000 entries
lemmatize = lru_cache(maxsize=50000)(wordnet_lemmatizer.lemmatize)
 
# ===========helper methods ========================================


def remove_non_ascii(s):
    """
    Return a copy of ``s`` with every character whose code point is 128 or
    above dropped; the remaining characters keep their original order.
    """
    ascii_chars = []
    for ch in s:
        if ord(ch) <= 127:
            ascii_chars.append(ch)
    return "".join(ascii_chars)


def stop_words_list():
    """
        Build the set of words to discard during preprocessing: NLTK's
        English stop words plus a hand-picked list of noisy words seen in
        the observed timelines.
        The hand-picked list would change for a different set of timelines.
    """
    timeline_noise = [
        'bc', 'http', 'https', 'co', 'com', 'rt', 'one', 'us', 'new',
        'lol', 'may', 'get', 'want', 'like', 'love', 'no', 'thank', 'would',
        'thanks', 'good', 'much', 'low', 'roger',
    ]
    english_stop_words = nltk.corpus.stopwords.words("english")
    return set(english_stop_words).union(timeline_noise)


def remove_urls(text):
    """
    Strip @mentions and http/https URLs from ``text``.

    Every run of non-whitespace characters that starts with ``@``,
    ``http://`` or ``https://`` is deleted; surrounding whitespace is left
    untouched.

    :param text: the raw tweet text
    :return: ``text`` with mentions and URLs removed
    """
    # A single pass is enough: "https?" covers both schemes.  The previous
    # first pass used "http?" (optional "p"), which only duplicated the
    # http:// case and additionally matched the bogus scheme "htt://".
    return re.sub(r"(?:@|https?://)\S+", "", text)


def get_wordnet_pos(treebank_tag):
    """
    Translate a POS tag into the single-letter code passed to the
    lemmatizer, based on the tag's first letter:
    J -> 'a', V -> 'v', N -> 'n', R -> 'r'.

    :param treebank_tag: POS tag string (e.g. 'NN', 'VBD')
    :return: 'a', 'v', 'n' or 'r'; None when the tag starts with anything else
    """
    prefix_to_pos = (('J', 'a'), ('V', 'v'), ('N', 'n'), ('R', 'r'))
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return None


def tokenize(text):
    """
    Clean a tweet and split it into word tokens.

    Mentions/URLs, non-ASCII characters and single/double quotes are removed
    first; the remaining text is then scanned for word matches.

    :param text: the raw tweet text
    :return: list of matched token strings, in order of appearance
    """
    cleaned = remove_non_ascii(remove_urls(text))
    cleaned = re.sub(r"""[\'\"]""", '', cleaned)
    # NOTE(review): alternatives are tried left to right and anything the
    # second pattern could start on ([a-z]) is also matched by the first
    # (\w), so the second pattern never wins - dashed words are split at
    # the dash.
    word_patterns = (
        r"""(?:[\w_]+)""",                          # regular word
        r"""(?:[a-z][a-z'\-_]+[a-z])""",            # word with an apostrophe or a dash
    )
    matcher = re.compile(r"""(%s)""" % "|".join(word_patterns),
                         re.VERBOSE | re.I | re.UNICODE)
    return matcher.findall(cleaned)


def replace_acronym(tokens, slang_dict):
    """
    Expand slang/acronym tokens using ``slang_dict``.

    A token found as a key in ``slang_dict`` is replaced by the
    whitespace-separated words of its mapped value; every other token is
    kept unchanged.

    :param tokens: iterable of token strings
    :param slang_dict: mapping from slang token to its expansion string
    :return: new list of tokens with expansions spliced in place
    """
    return [
        word
        for token in tokens
        for word in (slang_dict[token].split() if token in slang_dict else [token])
    ]


def tokenize_and_lemmatize(text, slang_dict, stop_words):
    """
    Turn one tweet into a list of lemmatized content words.

    The text is tokenized, lowercased and slang-expanded, then POS-tagged.
    Only tokens tagged as adjective, verb, noun or adverb that are not in
    ``stop_words`` are kept, each replaced by its lemma.

    :param text: the raw tweet text
    :param slang_dict: mapping from slang token to its expansion string
    :param stop_words: collection of tokens to discard
    :return: list of lemmas, in order of appearance
    """
    lowered_tokens = [raw.lower() for raw in tokenize(text)]
    expanded_tokens = replace_acronym(lowered_tokens, slang_dict)

    lemmas = []
    for word, tag in pos_tag(expanded_tokens):
        wordnet_pos = get_wordnet_pos(tag)
        # skip anything that is not a verb/noun/adjective/adverb, and stop
        # words (checked on the token itself, before lemmatization)
        if wordnet_pos is None or word in stop_words:
            continue
        try:
            lemmas.append(lemmatize(word, wordnet_pos))
        except UnicodeDecodeError:
            # best effort: a token that cannot be decoded is left out
            continue
    return lemmas

def read_in_dict(filename):
    """
    Load a ``key: value`` mapping from a text file, one entry per line.

    Each line is split at its first ``:``; the key and value are stripped of
    surrounding whitespace.  Blank lines and lines without a ``:`` are
    skipped - previously they became keys mapped to an empty string, which
    made the slang expansion silently delete any token equal to that line.
    If a key occurs more than once, the last occurrence wins.

    :param filename: path of the mapping file
    :return: dict mapping each key to its value string
    """
    mapping = {}
    with open(filename) as f:
        # iterate the file lazily instead of loading every line up front
        for line in f:
            key, separator, value = line.partition(":")
            key = key.strip()
            if not separator or not key:
                continue
            mapping[key] = value.strip()
    return mapping

In [17]:
# read the tweet csv (one row per tweet)
df = pd.read_csv('LDA/data/single_tweet_doc_AmericanCrime_1000.csv')
# peek at the first five rows; the call form of print matches the rest of
# the notebook and prints the same text as the old print statement
print(df[:5])

   Unnamed: 0   user_id       username  \
0           0  17312100   FitYourStyle   
1           1  17312100   FitYourStyle   
2           2  17312100   FitYourStyle   
3           3  17312100   FitYourStyle   
4           4  17312100   FitYourStyle   

                                      old_tweet_list  
0  Yes! Love, love, love ! Enjoy the day @FitYour...  
1  Hi, @FitYourStyle I hope u are fine. I just lo...  
2                 . @LoriRMixson Hello from Toronto!  
3  @FitYourStyle Thank you Jennifer - Hi from #Te...  
4  Love from #Aurora #TheHip pic.twitter.com/4A2L...  


In [18]:
# get the tweets document: one tweet text per row.
# Select the column by its label rather than by position so the cell keeps
# working if the CSV gains, loses or reorders columns.
list_of_single_documents = df['old_tweet_list']
print(list_of_single_documents[:2])

0    Yes! Love, love, love ! Enjoy the day @FitYour...
1    Hi, @FitYourStyle I hope u are fine. I just lo...
Name: old_tweet_list, dtype: object


In [19]:
# Preprocess the documents
def get_preprocessed_docs(list_of_doc, slang_path="LDA/data/out_slang_map.csv"):
    """
    Lazily preprocess each document into a list of lemmatized tokens.

    This is a generator: the stop-word set and the slang map are loaded when
    iteration starts, and one token list is produced per input document, in
    input order.

    :param list_of_doc: iterable of raw document strings (one tweet each)
    :param slang_path: path of the ``slang: expansion`` mapping file
    :return: generator yielding one list of tokens per document
    """
    stop_words = stop_words_list()
    slang_dict = read_in_dict(slang_path)
    for each_doc in list_of_doc:
        # A missing value (e.g. an empty CSV cell read as NaN) cannot be
        # tokenized; emit an empty document so positions still line up with
        # the input rows.
        if pd.isnull(each_doc):
            yield []
            continue
        yield tokenize_and_lemmatize(each_doc, slang_dict, stop_words)


documents = get_preprocessed_docs(list_of_single_documents)

In [20]:
# run the preprocessing generator to completion and keep the result as a
# list: it is iterated more than once below (to build the dictionary and
# again to build the bag-of-words corpus)
list_doc = list(documents)
print(list_doc[:5])

[['yes', 'enjoy', 'day', 'proud', 'frenchcanadian', u'represent', 'boston', 'twitter', 'fityourstyle', u'tatu'], ['hi', 'hope', 'u', 'acronym', 'rich', 'environment', 'fine', 'see', u'book', u'author', 'reader', 'hopr', 'learn', 'lot', u'tweet'], ['hello', 'toronto'], ['jennifer', 'texas', 'kiss'], ['aurora', 'thehip', 'picture', 'twitter']]


In [21]:
# create a Gensim dictionary from documents
dictionary = corpora.Dictionary(list_doc)

# filter extremes with explicit thresholds no_below=3, no_above=0.9
# (these override gensim's documented defaults of no_below=5, no_above=0.5;
# keep_n is not passed and so stays at its default)
dictionary.filter_extremes(no_below=3, no_above=0.9)

# save the dictionary
dictionary.save('LDA/data/single_tweet_doc_AmericanCrime_1000_dict.dict')

# size of the dictionary after filtering
token_count = len(dictionary)

In [22]:
# convert each tokenized document to its bag-of-words form using the
# dictionary: a list of (token id, count) pairs per document
corpus = [dictionary.doc2bow(doc) for doc in list_doc]

# save corpus to the disk
corpora.MmCorpus.serialize('LDA/data/single_tweet_doc_AmericanCrime_1000_corpus.mm', corpus) 
corpus[:5]

[[(363, 1),
  (2008, 1),
  (6060, 1),
  (6596, 1),
  (7401, 1),
  (8526, 1),
  (9645, 1),
  (10878, 1),
  (11201, 1)],
 [(447, 1),
  (1055, 1),
  (1835, 1),
  (2145, 1),
  (2931, 1),
  (4984, 1),
  (6675, 1),
  (7027, 1),
  (8518, 1),
  (10253, 1),
  (11126, 1),
  (11705, 1),
  (13527, 1),
  (13619, 1)],
 [(2503, 1), (3694, 1)],
 [(2732, 1), (7868, 1), (10938, 1)],
 [(6596, 1), (11087, 1)]]