In [38]:
import re
import string
from pyvi import ViTokenizer
import os
import numpy as np
from scipy.stats import entropy
import sqlite3
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.utils import simple_preprocess

con = sqlite3.connect('../../instance/recommend_system.sqlite')
db_env = con.cursor()

stopwords_path = './vietnamese-stopword-dash.txt'
save_path = './models'
PATH_LDA_MODEL = f'{save_path}/LDA.model'
PATH_CORPUS = f'{save_path}/CORPUS.mm'
PATH_TOPICS_DOCS_DIST = f'{save_path}/topics_docs_dist.dat'
PATH_DICTIONARY = f'{save_path}/id2word.dictionary'

def jensen_shannon(doc_distribute, matrix_distribute):
    """
    This function implements a Jensen-Shannon similarity
    between the input query (an LDA topic distribution for a document)
    and the entire corpus of topic distributions.
    It returns an array of length M (the number of documents in the corpus)
    """
    p = doc_distribute[None, :].T
    q = matrix_distribute.T
    m = .5 * (p + q)
    return np.sqrt(.5 * (entropy(q, m) + entropy(p, m)))

def get_most_similar_news(doc_distribute, matrix_distribute, k=10):
    """
    This function implements the Jensen-Shannon distance above
    and returns the top k indices of the smallest jensen shannon distances
    """
    # List of jensen-shannon distances
    sims = jensen_shannon(doc_distribute=doc_distribute, matrix_distribute=matrix_distribute)

    # return index of most K similar distribution from list
    return np.argsort(sims)[:k]

def stopwords(text_file_path=stopwords_path):
    _stopwords = list(open(text_file_path, encoding='utf8').read().split())
    return _stopwords

_stopwords_default = stopwords()
def remove_stopwords(text, stopwords=_stopwords_default):
    return [word for word in text.split() if word not in stopwords]

def remove_numeric(text):
    table = str.maketrans({key: None for key in string.digits})
    return text.translate(table)

def remove_emails(text):
    return re.sub('\S*@\S*\s?', '', text)

def remove_links(text):
    return re.sub(r"http\S+", "", text)

def remove_multiple_whitespace(text):
    return re.sub("\s\s+", " ", text)

def remove_newline_characters(text):
    return re.sub('\n', ' ', text)

def remove_punctuation(text):
    """https://stackoverflow.com/a/37221663"""
    table = str.maketrans({key: None for key in string.punctuation})
    return text.translate(table)

def vi_tokenizer(text):
    return ViTokenizer.tokenize(text)

def simple_preprocessing(text):
    _text = remove_newline_characters(text.lower())
    _text = remove_emails(_text)
    _text = remove_links(_text)
    _text = remove_numeric(_text)
    _text = remove_punctuation(_text)
    _text = remove_multiple_whitespace(_text)
    _text = vi_tokenizer(_text)
    _text = remove_stopwords(_text)
    return _text

In [34]:
db_env.execute('SELECT content FROM NEWS')
data = db_env.fetchall()
# print(data)

In [35]:
docs_token = [simple_preprocessing(text[0]) for text in data]
# print(docs_token)
print(len(docs_token))

6764


In [36]:
id2word = Dictionary(docs_token)
corpus = [id2word.doc2bow(doc_token) for doc_token in docs_token]
lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=50, passes=10)
if os.path.isdir(save_path):
    lda_model.save(PATH_LDA_MODEL)
else:
    os.mkdir(save_path)
    lda_model.save(PATH_LDA_MODEL)

In [37]:
lda_model.print_topics()

[(31,
  '0.054*"câu" + 0.033*"kính" + 0.021*"tường" + 0.019*"cửa" + 0.018*"đáp_án" + 0.017*"phòng" + 0.016*"gạch" + 0.013*"hình" + 0.013*"độc_giả" + 0.012*"cửa_sổ"'),
 (35,
  '0.045*"nga" + 0.042*"ukraine" + 0.024*"tên_lửa" + 0.015*"mỹ" + 0.012*"lan" + 0.010*"vụ" + 0.009*"lực_lượng" + 0.008*"tổng_thống" + 0.008*"quân_sự" + 0.007*"triều_tiên"'),
 (4,
  '0.040*"quang" + 0.034*"liêm" + 0.033*"thanh_tra" + 0.030*"lý" + 0.027*"ván" + 0.025*"đóng" + 0.019*"linh" + 0.015*"phi" + 0.015*"hong" + 0.014*"kong"'),
 (47,
  '0.144*"tim" + 0.038*"nhịp" + 0.027*"huyền" + 0.020*"dubai" + 0.018*"suy" + 0.018*"tim_mạch" + 0.016*"huyết_áp" + 0.013*"động_mạch" + 0.013*"băng" + 0.011*"ngực"'),
 (20,
  '0.033*"lao_động" + 0.024*"châu" + 0.021*"công_ty" + 0.021*"usd" + 0.020*"âu" + 0.017*"triệu" + 0.016*"hàng" + 0.014*"nga" + 0.013*"công_nhân" + 0.013*"giá"'),
 (0,
  '0.038*"nghiên_cứu" + 0.037*"khoa_học" + 0.035*"công_nghệ" + 0.023*"vaccine" + 0.021*"phát_triển" + 0.019*"sản_phẩm" + 0.018*"tiêm" + 0.015*"sản

In [39]:
id2word.save(PATH_DICTIONARY)

In [41]:
import gensim
import joblib
# gensim.corpora.MmCorpus.serialize(PATH_CORPUS, corpus=corpus)
doc_topic_dist = np.array(
            [[tup[1] for tup in lst] for lst in lda_model[corpus]]
        )
# save documents-topics matrix
joblib.dump(doc_topic_dist, PATH_TOPICS_DOCS_DIST)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (6764,) + inhomogeneous part.