In [1]:
import pandas as pd
import numpy as np

In [2]:
document = ["liburan keluarga di ancol",
            "lagi nonton konser dengan keluarga",
            "ancol ramai lagi konser"]
len(document)

3

# Preprocessing

In [3]:
# import
from preprocess.normalize import Normalize
from preprocess.tokenize import Tokenize
from preprocess.symspell import SymSpell

In [4]:
# initialize
normalizer = Normalize()
tokenizer = Tokenize()
symspell = SymSpell(max_dictionary_edit_distance=3)
symspell.load_complete_model_from_json('preprocess\data\corpus_complete_model.json', encoding="ISO-8859-1")

Loading dictionary...
Processing dictionary...
Copied 94811 words to master dictionary...
Copied 679534 hashes to master dictionary...


In [5]:
# do process
doc_preprocessed = []

for tweet in document:
    # normalize
    tweet_norm = normalizer.remove_ascii_unicode(tweet)
    tweet_norm = normalizer.remove_rt_fav(tweet_norm)
    tweet_norm = normalizer.lower_text(tweet_norm)
    tweet_norm = normalizer.remove_newline(tweet_norm)
    tweet_norm = normalizer.remove_url(tweet_norm)
    tweet_norm = normalizer.remove_emoticon(tweet_norm)
    tweet_norm = normalizer.remove_hashtag_mention(tweet_norm)
    tweet_norm = normalizer.remove_punctuation(tweet_norm)
    
    # tokenize
    tweet_tok = tokenizer.WordTokenize(tweet_norm, removepunct=True)
    
    # spell correction
    temp = []
    for token in tweet_tok:
        suggestion = symspell.lookup(phrase=token, verbosity=1, max_edit_distance=3)

        # option if there is no suggestion
        if len(suggestion) > 0:
            get_suggestion = str(suggestion[0]).split(':')[0]
            temp.append(get_suggestion)
        else:
            temp.append(token)
    tweet_prepared = ' '.join(temp)
    
    doc_preprocessed.append(tweet_prepared)

In [6]:
for _ in doc_preprocessed:
    print(_)

liburan keluarga di ancol
lagi nonton konser dengan keluarga
ancol ramai lagi konser


# HMM POS TAGGER

In [7]:
# import
from hmmtagger.tagger import MainTagger
from tokenization import *

In [8]:
# initialize
tagger = MainTagger("resource/Lexicon.trn", "resource/Ngram.trn", 0, 3, 3, 0, 0, False, 0.2, 0, 500.0, 1)

In [9]:
# do process
doc_tagged = []

for tweet in doc_preprocessed:
    if len(tweet) == 0: continue
    out = sentence_extraction(cleaning(tweet))

    join_token = []
    for o in out:
        strtag = " ".join(tokenisasi_kalimat(o)).strip()
        join_token.extend(tagger.taggingStr(strtag))
    doc_tagged.append(' '.join(join_token))

In [10]:
doc_tagged

['liburan/NN keluarga/NN di/IN ancol/NN',
 'lagi/RB nonton/NN konser/NN dengan/IN keluarga/NN',
 'ancol/NN ramai/JJ lagi/RB konser/NN']

# Tag Class

In [11]:
# define
Ccon = ['JJ', 'NN','NNP', 'NNG', 'VBI', 'VBT']
Cfnc = ['OP', 'CP', 'GM', ';', ':', '"', '.',
         ',', '-', '...', 'RB', 'IN', 'MD', 'CC',
         'SC', 'DT', 'UH', 'CDO', 'CDC', 'CDP', 'CDI',
         'PRP', 'WP', 'PRN', 'PRL', 'NEG', 'SYM', 'RP', 'FW']

In [12]:
# do process
doc_classified = []

for tweet in doc_tagged:
    tweet_split = tweet.split(' ')
    
    temp = {"Content": [], "Function": []}
    con = []
    fnc = []
    
    for token in tweet_split:
        word = token.split('/', 1)[0]
        tag = token.split('/', 1)[1]
        
        if tag in Ccon:
            con.append(token)
        elif tag in Cfnc:
            fnc.append(token)
            
    temp["Content"].append(' '.join(con))
    temp["Function"].append(' '.join(fnc))
    
    doc_classified.append(temp)

In [13]:
doc_classified

[{'Content': ['liburan/NN keluarga/NN ancol/NN'], 'Function': ['di/IN']},
 {'Content': ['nonton/NN konser/NN keluarga/NN'],
  'Function': ['lagi/RB dengan/IN']},
 {'Content': ['ancol/NN ramai/JJ konser/NN'], 'Function': ['lagi/RB']}]

In [14]:
# split document content and function
doc_content = []
for tweet in doc_classified:
    doc_content.append(''.join(tweet['Content']))

print(doc_content)
print()

# split tag and word
doc_prepared = []
for tweet in doc_content:
    tweet_split = tweet.split(' ')
    
    temp = []
    for token in tweet_split:
        word = token.split('/', 1)[0]
        temp.append(word)
    
    doc_prepared.append(temp)

print(doc_prepared)

['liburan/NN keluarga/NN ancol/NN', 'nonton/NN konser/NN keluarga/NN', 'ancol/NN ramai/JJ konser/NN']

[['liburan', 'keluarga', 'ancol'], ['nonton', 'konser', 'keluarga'], ['ancol', 'ramai', 'konser']]


# LDA

In [15]:
# import
from lda.ldamodel import LdaModel

In [16]:
%%time
# iniatialize
k = 2
alpha = 0.01
beta = 0.01
iterations = 100

# do process
lda = LdaModel(doc_prepared, k, alpha, beta, iterations)

{'ancol': {0: 1, 1: 0, 2: 1}, 'keluarga': {0: 1, 1: 1, 2: 0}, 'konser': {0: 0, 1: 1, 2: 1}, 'liburan': {0: 1, 1: 0, 2: 0}, 'nonton': {0: 0, 1: 1, 2: 0}, 'ramai': {0: 0, 1: 0, 2: 1}}

{'ancol': {0: 0.39723320158102765, 1: 0}, 'keluarga': {0: 0.19960474308300397, 1: 0.24876847290640397}, 'konser': {0: 0, 1: 0.49507389162561577}, 'liburan': {0: 0.19960474308300397, 1: 0}, 'nonton': {0: 0, 1: 0.24876847290640397}, 'ramai': {0: 0.19960474308300397, 1: 0}}

{0: {0: 0.9966887417218542, 1: 0}, 1: {0: 0, 1: 0.9966887417218542}, 2: {0: 0.6655629139072847, 1: 0.3344370860927152}}

Wall time: 54 ms


In [17]:
result = lda.get_topic_word_pwz(doc_content)

df_lda = pd.DataFrame(result, columns=['Topik', 'Kata', 'PWZ'])

In [18]:
df_lda

Unnamed: 0,Topik,Kata,PWZ
0,0,ancol/NN,0.397233
1,0,liburan/NN,0.199605
2,0,ramai/JJ,0.199605
3,0,keluarga/NN,0.199605
4,1,konser/NN,0.495074
5,1,keluarga/NN,0.248768
6,1,nonton/NN,0.248768


In [19]:
lda.perplexity()

1.854475187535565

In [20]:
q = np.array([[0.202, 0.00249],
              [0.202, 0.252],
              [0.401, 0.00249],
              [0.002, 0.252],
              [0.002, 0.501],
              [0.202, 0.00249]])
r = np.array([[1, 0.00332, 0.668],
              [0.00332, 1, 0.336]])
p = np.array([[1, 0, 0],
              [1, 1, 0],
              [1, 0, 1],
              [0, 1, 0],
              [0, 1, 1],
              [0, 0, 1]])

In [21]:
s = np.matmul(q, r)
s

array([[0.20200827, 0.00316064, 0.13577264],
       [0.20283664, 0.25267064, 0.219608  ],
       [0.40100827, 0.00382132, 0.26870464],
       [0.00283664, 0.25200664, 0.086008  ],
       [0.00366332, 0.50100664, 0.169672  ],
       [0.20200827, 0.00316064, 0.13577264]])

In [22]:
log_s = np.log10(s, out=np.zeros_like(s), where=(s!=0))
log_s

array([[-0.69463086, -2.50022497, -0.86718774],
       [-0.69285359, -0.59744522, -0.65835184],
       [-0.39684667, -2.41778659, -0.57072483],
       [-2.54719578, -0.59858802, -1.06546115],
       [-2.43612514, -0.30015652, -0.77038982],
       [-0.69463086, -2.50022497, -0.86718774]])

In [23]:
p_log_s = np.multiply(p, log_s)
p_log_s

array([[-0.69463086, -0.        , -0.        ],
       [-0.69285359, -0.59744522, -0.        ],
       [-0.39684667, -0.        , -0.57072483],
       [-0.        , -0.59858802, -0.        ],
       [-0.        , -0.30015652, -0.77038982],
       [-0.        , -0.        , -0.86718774]])

In [24]:
sum_p = np.sum(p)
sum_p

9

In [25]:
sum_p_log_s = np.sum(p_log_s)
sum_p_log_s

-5.488823270271718

In [26]:
perplexity = np.exp(-(sum_p_log_s/sum_p))
perplexity

1.8401907822520358

In [27]:
list_perplexity = [0 for x in range(10)]