In [None]:
import pandas as pd
import numpy as np
import tqdm
import psycopg2
import sys
import csv
import matplotlib.pyplot as plt
%matplotlib inline
from gensim.models import Phrases
import gensim
from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
import pyLDAvis
import pyLDAvis.gensim
from ProcessingLDA import LDAparamater
from Database import database
import Database
from Connection import connection

In [None]:
def topic_distribution(ldamodel, corpus, texts):
    """Build a per-document table describing each document's dominant topic.

    Args:
        ldamodel: Trained topic model. It must support ``ldamodel[corpus]``
            (yielding, per document, ``(topic_id, probability)`` pairs) and
            ``ldamodel.show_topic(topic_id)`` (returning ``(word, weight)``
            pairs).
        corpus: Corpus handed to ``ldamodel[...]``.
        texts: Original documents, positionally aligned with ``corpus``.

    Returns:
        pandas.DataFrame with the columns ``Dominant_Topic``,
        ``Perc_Contribution`` (rounded to 3 decimals) and ``Topic_Keywords``,
        followed by one unnamed column holding ``texts``. A document for which
        the model yields no topic gets missing values in the three topic
        columns so that rows stay aligned with ``texts``.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Keywords depend only on the topic, so look them up once per topic
    # instead of once per document.
    keywords_by_topic = {}

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    records = []
    for doc_topics in ldamodel[corpus]:
        doc_topics = list(doc_topics)
        if not doc_topics:
            # Keep a placeholder row; skipping it would shift every following
            # row against `texts` in the concat below.
            records.append((None, None, None))
            continue

        # max() returns the first maximal pair, the same element a stable
        # descending sort would place first.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_id = int(topic_num)
        if topic_id not in keywords_by_topic:
            keywords_by_topic[topic_id] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append((topic_id, round(float(prop_topic), 3), keywords_by_topic[topic_id]))

    sent_topics_df = pd.DataFrame(records, columns=columns)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)

In [None]:
# Load the preprocessed comments from PostgreSQL into a DataFrame.
# The names below are applied to the result of `select *`, so they must stay
# in the same order as the table's columns.
column_names = ["document_no","user_name", "comments_user", "video_title", "channel", "views", "posted", "comments_clean",
                "comments_tokenize", "comments_slangword", "comments_stemmed", "comments_stopwords", "text_string"]

con = connection.connect(Database.params_dic)
try:
    df = database.postgresql_to_dataframe(con, "select * from comments_preprocessing", column_names)
finally:
    # Release the connection even when the query raises.
    con.close()

# Order the rows by document number and rebuild a clean 0..n-1 index so that
# positional lookups in later cells follow document order.
df = df.sort_values(by=['document_no']).reset_index(drop=True)

In [None]:
# Tokenize the cleaned comments on whitespace.
text = df['text_string']
text_list = [document.split() for document in text]

# Train the phrase detectors. The trigram model is trained on the
# bigram-transformed corpus, so it must also be applied to bigram output.
bigram = Phrases(text_list, min_count=10)
trigram = Phrases(bigram[text_list])

# Append detected phrases to each document (the unigrams are kept).
for tokens in text_list:
    bigram_tokens = bigram[tokens]
    trigram_tokens = trigram[bigram_tokens]

    # Tokens that contain '_' after the bigram pass are detected bigrams.
    detected_bigrams = [token for token in bigram_tokens if '_' in token]

    # Only tokens newly merged by the trigram pass are added here. Feeding the
    # already-extended token list to `trigram` (as before) appended every
    # bigram a second time and never produced real trigrams.
    bigram_vocab = set(bigram_tokens)
    detected_trigrams = [
        token for token in trigram_tokens
        if '_' in token and token not in bigram_vocab
    ]

    tokens.extend(detected_bigrams)
    tokens.extend(detected_trigrams)

In [None]:
# Build the dictionary: map tokens to ids, drop tokens that occur in fewer
# than 10 documents or in more than half of them, keep at most 20000 tokens,
# then persist it to disk.
dictionary = corpora.Dictionary(text_list)
dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=20000)
dictionary.save('dictionary_comments.dict')

# Build the corpus: one bag-of-words vector per document, stored on disk in
# Matrix Market format.
bow_corpus = list(map(dictionary.doc2bow, text_list))
gensim.corpora.MmCorpus.serialize('bow_corpus.mm', bow_corpus)

In [None]:
# Determine the optimal number of topics: one coherence score is plotted and
# printed for every topic count in range(start, limit, step).
start=1
limit=21
step=1
model_list, coherence_values = LDAparamater.compute_coherence_values_numtopics(bow_corpus, dictionary, text_list,
                                                                                limit, start, step)

x = range(start, limit, step)
# Label the line itself: ("coherence_values") is a plain string, not a tuple,
# so passing it to legend() labelled the curve with only its first character.
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()

for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, "has Coherence Value of", round(cv, 3))

In [None]:
# Determine alpha and beta: grid search over 0.01, 0.11, ..., 0.91 plus 1
# for both hyperparameters at a fixed number of topics.
alpha = list(np.arange(0.01, 1, 0.1))
alpha.append(1)
beta = list(np.arange(0.01, 1, 0.1))
beta.append(1)

# Number of topics used for every run of the grid. It was previously
# hard-coded in the call while the results referenced an undefined `k`.
k = 6

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# The context manager closes the progress bar even if a run fails; the total
# is derived from the grid instead of being hard-coded.
with tqdm.tqdm(total=len(alpha) * len(beta)) as pbar:
    for a in alpha:
        for b in beta:
            cv = LDAparamater.compute_coherence_values_hyperparameters(bow_corpus, dictionary, k, a, b)

            model_results['Topics'].append(k)
            model_results['Alpha'].append(a)
            model_results['Beta'].append(b)
            model_results['Coherence'].append(cv)

            pbar.update(1)

pd.DataFrame(model_results).to_csv('lda_tuning_hyperparameters_results_using_bow.csv', index=False)

In [None]:
# Determine the optimal number of passes: sweep 5, 10, ..., 50 with the
# number of topics, alpha and beta held fixed.
passes = list(np.arange(5, 51, 5))

# Fixed settings for this sweep. Defining them here makes the recorded values
# match the ones actually passed to the model; before, `k` was undefined and
# `a`/`b` were whatever an earlier cell's loops had left behind.
k, a, b = 6, 0.3, 0.5

passes_model_results = {'Topics': [],
                        'Alpha': [],
                        'Beta': [],
                        'Passes': [],
                        'Coherence': []}

# The context manager closes the progress bar even if a run fails.
with tqdm.tqdm(total=len(passes)) as pbar:
    for p in passes:
        cv = LDAparamater.compute_coherence_values_passes(bow_corpus, dictionary, k, a, b, p)
        passes_model_results['Topics'].append(k)
        passes_model_results['Alpha'].append(a)
        passes_model_results['Beta'].append(b)
        passes_model_results['Passes'].append(p)
        passes_model_results['Coherence'].append(cv)

        pbar.update(1)

# Written to its own file: the previous name was identical to the alpha/beta
# grid's output file, so this cell overwrote those results.
pd.DataFrame(passes_model_results).to_csv('lda_tuning_passes_results_using_bow.csv', index=False)

In [None]:
# FINAL MODEL
# Train the final LDA model with the selected settings (6 topics, alpha 0.3,
# eta 0.5, 30 passes, 2 workers) and prepare its c_v coherence evaluator.
final_lda_settings = dict(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=6,
    alpha=0.3,
    eta=0.5,
    passes=30,
    workers=2,
)
lda_model_final = gensim.models.LdaMulticore(**final_lda_settings)
coherence_model_lda_final = CoherenceModel(
    model=lda_model_final,
    texts=text_list,
    dictionary=dictionary,
    coherence='c_v',
)

# Print every topic (-1 = all topics) with its weighted terms.
for topic_id, topic_terms in lda_model_final.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

In [None]:
# Report the coherence score of the final model.
coherence_lda_final = coherence_model_lda_final.get_coherence()
print('\nCoherence Score: ', coherence_lda_final)

# Persist the trained model to disk.
final_model_path = 'lda_model_final.model'
lda_model_final.save(final_model_path)

In [None]:
# Visualize the final model with pyLDAvis.
pyLDAvis.enable_notebook()

LDAvis = pyLDAvis.gensim.prepare(lda_model_final, bow_corpus, dictionary)

# Export the per-topic term table that backs the visualization.
LDAvis.topic_info.to_csv(Database.path_data + 'LDA_term_detail.csv', index=True)

# A notebook only renders the value of a cell's last expression, so the
# visualization object has to come last; in the middle of the cell it was
# evaluated and silently discarded.
LDAvis

In [None]:
# Predict the dominant topic of every comment and export the result.
df_topic_sents_keywords = topic_distribution(lda_model_final, bow_corpus, text_list)

# Turn the positional index into a column, then give all five columns their
# export names.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

distribution_csv_path = Database.path_data + 'Distribution Topics Result.csv'
df_dominant_topic.to_csv(distribution_csv_path, index=False)

In [None]:
# Load into the database.
# NOTE(review): both helpers live in the project's Database module and take no
# arguments; presumably the first creates the topic-distribution table and the
# second fills it from the CSV exported in the previous cell — confirm against
# Database.py. The order of the two calls must be preserved.
database.createTableDatabaseTD()
database.intoDatabaseTD()