# Latent Dirichlet Allocation

This is a Latent Dirichlet Allocation (LDA) program. I use it for learning purposes.

Thanks to Alfan Farizki Wicaksono from Fasilkom, Universitas Indonesia.
This is actually Alfan's code; I simply copied it from the Pusilkom UI website.

In [2]:
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import re
import numpy as np
import pandas as pd
import gensim
import nltk

from nltk.stem.snowball import SnowballStemmer

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import ldamodel

from sklearn.cluster import KMeans
from sklearn.manifold import MDS
from sklearn.decomposition import PCA

Now, define a function to load our dataset.

In [3]:
def load_dataset(filename, encoding=None):
    """Load account names and tweets from a ``###``-delimited text file.

    Every non-blank line is expected to look like ``<account>###<tweet>``.
    Only the first two ``###``-separated fields of a line are used.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding handed to ``open``. The default ``None``
            keeps the platform's default encoding.

    Returns:
        A tuple ``(acc_names, tweets)`` of two parallel lists of strings.

    Raises:
        IndexError: If a non-blank line contains no ``###`` separator.
    """
    acc_names = []
    tweets = []

    # The context manager guarantees the file is closed, even when a
    # malformed line raises part-way through.
    with open(filename, 'r', encoding=encoding) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) hold no record.
                continue
            parts = line.split('###')
            acc_names.append(parts[0])
            tweets.append(parts[1])

    return acc_names, tweets

Define the number of topics.

In [7]:
# Configuration: how many topics to use
# (presumably consumed by the LDA model later in the notebook).
num_topics = 4

In [9]:
# Define the helpers needed for pre-processing. The pre-processing steps are:
# 1. lowercasing
# 2. stopword removal
# 3. stemming

# English Snowball stemmer and NLTK's English stopword list; both are read
# by preprocess() below.
stemmer = SnowballStemmer("english")
stopwords = nltk.corpus.stopwords.words('english')

def preprocess(text):
    """Turn a raw text string into a list of stemmed tokens.

    The text is split on whitespace and lowercased. Tokens are kept only if
    they contain at least one ASCII letter and are not in the module-level
    ``stopwords`` list; each kept token is then passed through the
    module-level ``stemmer``. No punctuation stripping is done here.

    Args:
        text: The text to process.

    Returns:
        The stemmed tokens, in their original order.
    """
    # Tokenize on whitespace and lowercase.
    lowered = (piece.lower() for piece in text.split())

    # Discard tokens without any alphabetic character, and stopwords.
    kept = [
        tok for tok in lowered
        if re.search('[a-zA-Z]', tok) and tok not in stopwords
    ]

    # Stem what is left with the Snowball stemmer.
    return [stemmer.stem(tok) for tok in kept]

Then we can load our dataset. I got the dataset from Alfan. According to his website, he collected it from Twitter.

In [10]:
# Load the Twitter documents, then pre-process every tweet that was loaded.
acc_names, tweets = load_dataset("twitter.txt")

# Apply preprocess to each tweet in the "tweets" collection, replacing the
# raw strings with their token lists.
tweets = list(map(preprocess, tweets))

In [11]:
# Display the pre-processed tweets (one token list per tweet).
tweets

[['kardashian',
  'yr',
  'anniversary,',
  'iphon',
  'yr',
  'anniversary,',
  'so,',
  'kardashian',
  'made',
  'iphon'],
 ['iphon',
  'year',
  'old.',
  'appl',
  'watch',
  'year',
  'old.',
  'feel',
  'peopl',
  'forget',
  'small',
  'fact.'],
 ["can't",
  'save',
  'make,',
  'live',
  'beyond',
  'means.',
  'ditch',
  'starbucks,',
  'eat',
  'less,',
  'need',
  'new',
  'iphone,',
  'save',
  'money!'],
 ['time', 'year!', 'iphon', 'vs.', 'samsung', 'galaxi', 's8', 'smackdown:'],
 ['sell',
  'yeezi',
  'samsung',
  'galaxi',
  's8',
  'anyon',
  'interest',
  'show',
  'proof',
  'trust',
  '@devilishrt',
  '@alienrt',
  '@bear_retweet',
  '@flyrt'],
 ['iphon',
  '16gb',
  'spacegray',
  'peso',
  'only!',
  'complet',
  'full',
  'packag',
  'guys!',
  'dm'],
 ['swear',
  'even',
  'iphon',
  'dress',
  'clown,',
  'reach',
  'pillow',
  '&choke',
  'slept,',
  'still',
  'buy',
  'samsung'],
 ['iphon',
  '8',
  'a11',
  'bionic',
  'chip',
  'lost',
  'samsung',
  'gala

Build a dictionary of the words in our documents (the `tweets` variable). Each unique word is added to this dictionary.

In [15]:
# Build the term dictionary from our corpus; every unique word gets an index.
dictionary = Dictionary(tweets)

# Drop terms that:
# 1. appear in fewer than 2 documents
# 2. appear in more than 0.9 * (total documents) documents
dictionary.filter_extremes(no_below=2, no_above=0.9)

# Convert each tweet into its bag-of-words form against the dictionary.
# Remember that LDA assumes documents follow the bag-of-words model.
corpus = list(map(dictionary.doc2bow, tweets))

In [14]:
# Display the bag-of-words corpus built by dictionary.doc2bow (one entry per tweet).
corpus

[[(0, 2), (1, 1)],
 [(0, 1), (2, 1), (3, 2)],
 [(4, 1)],
 [(0, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
 [(5, 1), (6, 1), (7, 1)],
 [(0, 1), (9, 1), (10, 1)],
 [(0, 1), (7, 1), (11, 1), (12, 1)],
 [(0, 1), (5, 1), (7, 1), (8, 1), (13, 1), (14, 1)],
 [(5, 1), (7, 1), (10, 1), (14, 1), (15, 1), (16, 1)],
 [(5, 1), (7, 1), (9, 1)],
 [(0, 1)],
 [(0, 1)],
 [(5, 1), (7, 1), (11, 1), (12, 1), (14, 1), (17, 1), (18, 1)],
 [(1, 1), (3, 1), (5, 1), (7, 1), (14, 1)],
 [(0, 1)],
 [(5, 1), (7, 2), (14, 2)],
 [(0, 1), (5, 1), (7, 1), (19, 1)],
 [(0, 1), (4, 1)],
 [(20, 1), (21, 1)],
 [(20, 1), (21, 1), (22, 1), (23, 1)],
 [(13, 1), (20, 1), (24, 1)],
 [(25, 1), (26, 1), (27, 1)],
 [(20, 1), (21, 1), (23, 1)],
 [(20, 1), (21, 1), (26, 1)],
 [(20, 1), (28, 1)],
 [(20, 1), (21, 1), (29, 1)],
 [(20, 1), (21, 1), (30, 1), (31, 1), (32, 1), (33, 1)],
 [(20, 1), (21, 1), (30, 1), (31, 1)],
 [(3, 1), (18, 1), (20, 1), (21, 1)],
 [(19, 1), (20, 1), (21, 2), (33, 1), (34, 1)],
 [(20, 1), (21, 1), (22, 1), (29, 1)