# Topic Model Example

In [5]:
%matplotlib inline
import pandas as pd
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

  and should_run_async(code)


In [6]:
import seaborn as sns

In [7]:
from gensim import corpora, models
import pyLDAvis
import string
pyLDAvis.enable_notebook()

In [24]:
from pyLDAvis import gensim_models

## Loading data

In [8]:
tweets = pd.read_csv('~/OneDrive - UvA/DigitalSociety/TeachTheTeacher-Python/TeachTheTeacherPython-Datasets/tcat_TheoAraujo-20200101-20200124------------fullExport--09043db5e1.csv')

# Some Unsupervised Machine Learning (Topic Models)


## Initial processing of the data

Note: this is a very informal review of the textual data - just so we have a general view of what a topic model is. This is not part of the course - but if you are interested, we can point you to some resources :-)

In [43]:
def tokenizer(text, stopwords = []):
    text = str(text)
    text = text.translate(str.maketrans('', '', string.punctuation)) # removing punctuation
    text = text.lower() # turning the text into lower case
    text = text.split(' ') # splitting the text into a list (with space serving as the delimiter)
    text = [item for item in text if item not in stopwords] # removing words that may not be informative
    text = [item for item in text if len(item) > 2] # removing tokens with less than 3 characters
    text = [item for item in text if 'http' not in item] #removing urls
    return text

In [44]:
# List of stopwords retrieved from https://gist.github.com/sebleier/554280
stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", 
             "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", 
             "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", 
             "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", 
             "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", 
             "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", 
             "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", 
             "against", "between", "into", "through", "during", "before", "after", "above", "below", 
             "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", 
             "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", 
             "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", 
             "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", 
             "don", "should", "now", 'rt', "don't", 'algorithm']

In [45]:
tweets['tokens'] = tweets['text'].apply(tokenizer, args=(stopwords,))

In [46]:
tweets['tokens'].head()

0    [techreview, deepmind’s, new, uncovered, new, ...
1    [bankillerrobots, don’t, machine, sort, apprec...
2    [topdown, approach, uses, parametrized, algori...
3    [techreview, deepmind’s, new, uncovered, new, ...
4    [gbpusd, eurusd, ipda, interbank, price, deliv...
Name: tokens, dtype: object

### Topic Model for Reasons for Choosing the Course

In [47]:
dict_tweets = corpora.Dictionary(tweets['tokens'].values.tolist())
corpus_tweets = [dict_tweets.doc2bow(doc) for doc in tweets['tokens'].values.tolist()]

In [48]:
corpus_tweets[101:102]

[[(263, 1), (752, 1), (753, 1), (754, 1), (755, 1), (756, 1), (757, 1)]]

In [49]:
lda = models.LdaModel(corpus_tweets, id2word=dict_tweets,
                        num_topics=5,
                        passes=10,
                        alpha = 'auto')

In [50]:
lda.print_topics()

[(0,
  '0.017*"like" + 0.011*"youtube" + 0.010*"would" + 0.010*"dont" + 0.008*"thread" + 0.007*"best" + 0.007*"tweets" + 0.007*"used" + 0.007*"pound" + 0.006*"facebook"'),
 (1,
  '0.015*"want" + 0.014*"twitter" + 0.013*"like" + 0.010*"day" + 0.009*"got" + 0.008*"youtube" + 0.008*"really" + 0.007*"know" + 0.007*"get" + 0.007*"ranking"'),
 (2,
  '0.007*"google" + 0.007*"good" + 0.007*"much" + 0.006*"still" + 0.006*"make" + 0.005*"search" + 0.005*"system" + 0.005*"also" + 0.005*"sometimes" + 0.005*"data"'),
 (3,
  '0.040*"new" + 0.010*"going" + 0.010*"learning" + 0.010*"learn" + 0.009*"people" + 0.009*"techreview" + 0.008*"find" + 0.007*"solved" + 0.007*"perspective" + 0.007*"helps"'),
 (4,
  '0.007*"using" + 0.007*"google" + 0.007*"music" + 0.006*"search" + 0.006*"2pm" + 0.005*"youtube" + 0.005*"years" + 0.004*"via" + 0.004*"updates" + 0.004*"breaking"')]

In [51]:
gensim_models.prepare(lda, corpus_tweets, dict_tweets)