## Latent Dirichlet allocation

### Load Data

In [10]:
import pandas as pd
train_data_raw = pd.read_csv("./data/train.csv", encoding="ISO-8859-1") #utf-8 doesn't play nice
test_data_raw = pd.read_csv("./data/test.csv", encoding="ISO-8859-1")
train_data_raw.head()

Unnamed: 0,tweetid,text,disaster_type,disaster,Unnamed: 4
0,10001,@TheEllenShow Please check into Salt River hor...,,0,
1,10002,"As for the hurricane, it's already category 1 ...",hurricane,1,
2,10003,So it looks like my @SoundCloud profile shall ...,,0,
3,10004,@SushmaSwaraj Am sure background check of the ...,,0,
4,10005,Open forex detonation indicator is irretrievab...,,0,


### Clean Corpus

In [18]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in test_data_raw["text"]]
print("\n".join(", ".join(i) for i in doc_clean[:15]))

themaine, soon, start, flooding, comment, section, come, brazil, comment, bc
little, weak, as, earthquake
ooh, girl, actually, meeting, boy, im, devastated, congrats, winner, otrabaltimore
graceville, underground, car, park, flooding, rapidly, excyclone, debbie, dump, large, rainfall, se, queensland, httpstco3uju1n7ns1
next, man, upah, screw, this, im, tired, injury, happened, camp, cupcake, like, camp, cramp, break
dying, debt, costly, survivor
oh, god, rosie, oûªdonnell, foster, screamed, name, fit, shock, awe
found, fish, skeleton, along, path, walk, probably, got, stuck, puddle, flood, stage, httpstcopafc9jk3ty
afrojazz, ill, add, i, dont, even, know, talking, about
ð¿ñð, ñtropical, cyclone, debbie, cause, widespread, damage, north, queensland, ñ, httpstcodfvn3pb0zn
mikeparractor, devastated, longer, emmerdale, best, character, much, give, superbactor, going, missed
ear, gonna, explode, smh
two, can, explode, wanted, drink, rest, kaldi, coffee, stout, httptcou6isxv2f3v, photo


### Prepare Document-Term Matrix


In [20]:
from gensim import corpora

# Creating the term dictionary of our corpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
print("\n".join(", ".join(str(i)) for i in doc_term_matrix[:15]))


[, (, 0, ,,  , 1, ), ,,  , (, 1, ,,  , 1, ), ,,  , (, 2, ,,  , 1, ), ,,  , (, 3, ,,  , 2, ), ,,  , (, 4, ,,  , 1, ), ,,  , (, 5, ,,  , 1, ), ,,  , (, 6, ,,  , 1, ), ,,  , (, 7, ,,  , 1, ), ,,  , (, 8, ,,  , 1, ), ]
[, (, 9, ,,  , 1, ), ,,  , (, 1, 0, ,,  , 1, ), ,,  , (, 1, 1, ,,  , 1, ), ,,  , (, 1, 2, ,,  , 1, ), ]
[, (, 1, 3, ,,  , 1, ), ,,  , (, 1, 4, ,,  , 1, ), ,,  , (, 1, 5, ,,  , 1, ), ,,  , (, 1, 6, ,,  , 1, ), ,,  , (, 1, 7, ,,  , 1, ), ,,  , (, 1, 8, ,,  , 1, ), ,,  , (, 1, 9, ,,  , 1, ), ,,  , (, 2, 0, ,,  , 1, ), ,,  , (, 2, 1, ,,  , 1, ), ,,  , (, 2, 2, ,,  , 1, ), ]
[, (, 4, ,,  , 1, ), ,,  , (, 2, 3, ,,  , 1, ), ,,  , (, 2, 4, ,,  , 1, ), ,,  , (, 2, 5, ,,  , 1, ), ,,  , (, 2, 6, ,,  , 1, ), ,,  , (, 2, 7, ,,  , 1, ), ,,  , (, 2, 8, ,,  , 1, ), ,,  , (, 2, 9, ,,  , 1, ), ,,  , (, 3, 0, ,,  , 1, ), ,,  , (, 3, 1, ,,  , 1, ), ,,  , (, 3, 2, ,,  , 1, ), ,,  , (, 3, 3, ,,  , 1, ), ,,  , (, 3, 4, ,,  , 1, ), ,,  , (, 3, 5, ,,  , 1, ), ]
[, (, 1, 8, ,,  , 1, ), ,,  , (, 3, 6,

### Run LDA Model

In [35]:
### # Creating the object for LDA model using gensim library
from gensim.models.ldamodel import LdaModel as Lda
from time import time

# Running and Training LDA model on the document term matrix.
start = time()
ldaModel = Lda(doc_term_matrix, num_topics=2, id2word = dictionary, passes=50)
end = time()

print(f"Lda model trained in {end - start}")


Lda model trained in 83.69358396530151


In [36]:
print("\n".join(str(i) for i in ldaModel.print_topics(num_topics=2, num_words=5)))

(0, '0.005*"new" + 0.003*"bag" + 0.003*"via" + 0.003*"amp" + 0.003*"body"')
(1, '0.008*"earthquake" + 0.007*"like" + 0.007*"im" + 0.006*"debbie" + 0.006*"cyclone"')
