In [30]:
import re
import string
from pyvi import ViTokenizer
import os
import numpy as np
from scipy.stats import entropy
import sqlite3
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.utils import simple_preprocess

# Open the SQLite database that holds the news articles. The path is relative,
# so it is resolved against the kernel's current working directory.
con = sqlite3.connect('../../instance/recommend_system.sqlite')
# Cursor shared by the later cells that query the NEWS table.
db_env = con.cursor()

# Whitespace-separated stopword file read by stopwords().
stopwords_path = './vietnamese-stopword-dash.txt'
# Directory that receives every artifact written by the training cell.
save_path = './models'
# Trained LDA model (written with lda_model.save).
PATH_LDA_MODEL = f'{save_path}/LDA.model'
# Bag-of-words corpus (written with MmCorpus.serialize).
PATH_CORPUS = f'{save_path}/CORPUS.mm'
# Documents-by-topics probability matrix (written with joblib.dump).
PATH_TOPICS_DOCS_DIST = f'{save_path}/topics_docs_dist.dat'
# gensim Dictionary mapping token ids to words (written with id2word.save).
PATH_DICTIONARY = f'{save_path}/id2word.dictionary'

def jensen_shannon(doc_distribute, matrix_distribute):
    """
    Jensen-Shannon distance between one distribution and every row of a matrix.

    Note this is a *distance*: 0 means the two distributions are identical and
    larger values mean less similar (upper bound sqrt(ln 2) with natural logs).

    Parameters
    ----------
    doc_distribute : array-like, shape (K,)
        Query distribution (e.g. the LDA topic distribution of one document).
    matrix_distribute : array-like, shape (M, K)
        One distribution per row (e.g. the topic distributions of the corpus).

    Returns
    -------
    numpy.ndarray, shape (M,)
        Distance between the query and each of the M rows.
    """
    # Column vector (K, 1) so that it broadcasts against the (K, M) matrix.
    p = np.asarray(doc_distribute)[None, :].T
    q = np.asarray(matrix_distribute).T
    # Mixture distribution used by the Jensen-Shannon divergence.
    m = .5 * (p + q)
    divergence = .5 * (entropy(q, m) + entropy(p, m))
    # The divergence is non-negative in theory, but floating-point rounding can
    # yield a tiny negative number for near-identical distributions. sqrt would
    # then return NaN, and NaN sorts last in argsort, pushing the best match to
    # the bottom of the ranking. Clamp at zero before taking the root.
    return np.sqrt(np.maximum(divergence, 0.0))

def get_most_similar_news(doc_distribute, matrix_distribute, k=10):
    """
    Rank the rows of ``matrix_distribute`` by closeness to ``doc_distribute``.

    Closeness is measured with ``jensen_shannon``; the result holds the row
    indices of the ``k`` smallest distances, nearest first. If the query
    itself is one of the rows, its own index is part of the result.
    """
    # One distance per row of the matrix.
    distances = jensen_shannon(
        doc_distribute=doc_distribute,
        matrix_distribute=matrix_distribute,
    )

    # Ascending sort puts the closest rows first; keep the leading k indices.
    ranked_indices = np.argsort(distances)
    return ranked_indices[:k]

def stopwords(text_file_path=stopwords_path):
    """
    Load stopwords from a UTF-8 text file.

    Parameters
    ----------
    text_file_path : str
        File whose whitespace-separated tokens are the stopwords.

    Returns
    -------
    list of str
        Every whitespace-separated token in the file, in file order.
    """
    # The context manager closes the handle deterministically; a bare open()
    # leaves it open until the garbage collector gets to it.
    with open(text_file_path, encoding='utf8') as stopwords_file:
        return stopwords_file.read().split()

# Loaded once when this cell runs and reused as the default stopword list.
_stopwords_default = stopwords()


def remove_stopwords(text, stopwords=_stopwords_default):
    """
    Split ``text`` on whitespace and drop every token that is a stopword.

    Parameters
    ----------
    text : str
        Text whose whitespace-separated tokens are filtered.
    stopwords : iterable of str
        Tokens to discard. Inside this function the parameter shadows the
        module-level ``stopwords()`` loader.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Membership tests against a list scan the whole list for every word;
    # a frozenset makes each lookup constant-time.
    blocked = frozenset(stopwords)
    return [word for word in text.split() if word not in blocked]

def remove_numeric(text):
    """Return ``text`` with every ASCII digit (0-9) deleted."""
    # Three-argument maketrans: the third argument lists characters to delete.
    digit_deleter = str.maketrans('', '', string.digits)
    return text.translate(digit_deleter)

def remove_emails(text):
    """
    Delete every whitespace-delimited token that contains ``@``.

    The match also swallows one trailing whitespace character, so the
    neighbouring words do not end up separated by a double space.
    """
    # Raw string: in a normal string literal '\S' is an invalid escape sequence
    # that recent Python versions warn about.
    return re.sub(r'\S*@\S*\s?', '', text)

def remove_links(text):
    """Delete every run of non-whitespace characters that begins with ``http``."""
    link_pattern = re.compile(r"http\S+")
    return link_pattern.sub("", text)

def remove_multiple_whitespace(text):
    """
    Collapse each run of two or more whitespace characters into one space.

    A single whitespace character on its own is left unchanged.
    """
    # Raw string: in a normal string literal "\s" is an invalid escape sequence
    # that recent Python versions warn about.
    return re.sub(r"\s\s+", " ", text)

def remove_newline_characters(text):
    """Replace each line-feed character in ``text`` with a single space."""
    newline_pattern = re.compile('\n')
    return newline_pattern.sub(' ', text)

def remove_punctuation(text):
    """
    Return ``text`` with every character of ``string.punctuation`` deleted.

    Only ASCII punctuation is affected; other characters pass through.
    Approach taken from https://stackoverflow.com/a/37221663
    """
    # Three-argument maketrans: the third argument lists characters to delete.
    punctuation_deleter = str.maketrans('', '', string.punctuation)
    return text.translate(punctuation_deleter)

def vi_tokenizer(text):
    """
    Run pyvi's ``ViTokenizer.tokenize`` on ``text`` and return its result.

    NOTE(review): the topic words printed later in this notebook contain
    underscores (e.g. "chỉ_trích"), so the tokenizer presumably joins the
    syllables of a multi-syllable word with "_" -- confirm against pyvi.
    """
    segmented = ViTokenizer.tokenize(text)
    return segmented

def simple_preprocessing(text):
    """
    Turn one raw document into a list of cleaned tokens.

    The text is lower-cased, passed through the cleaning steps below in order,
    and finally split into tokens with the stopwords removed. The return value
    is therefore a list of tokens, not a string.
    """
    # Order matters: punctuation is stripped before tokenisation, so anything
    # the tokenizer inserts afterwards is left in place.
    cleaning_steps = (
        remove_newline_characters,
        remove_emails,
        remove_links,
        remove_numeric,
        remove_punctuation,
        remove_multiple_whitespace,
        vi_tokenizer,
    )
    cleaned = text.lower()
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return remove_stopwords(cleaned)

In [31]:
# Read the content column of every NEWS row; each fetched row is a tuple.
data = db_env.execute('SELECT content FROM NEWS').fetchall()

In [32]:
# Preprocess the first (and only selected) column of each row into a token list.
docs_token = list(map(simple_preprocessing, (row[0] for row in data)))
print(len(docs_token))

6764


In [33]:
import gensim
import joblib

# Build the vocabulary and the bag-of-words representation of every document.
id2word = Dictionary(docs_token)
corpus = [id2word.doc2bow(doc_token) for doc_token in docs_token]

# Train the topic model. A fixed random_state makes training and inference
# repeatable, so a fresh Restart & Run All reproduces the same topics.
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=50,
    passes=10,
    chunksize=100,
    random_state=42,
)

# Create the output directory if needed, then persist each artifact once
# (the original repeated the three save calls in both branches of an if/else).
os.makedirs(save_path, exist_ok=True)
# save corpus
gensim.corpora.MmCorpus.serialize(PATH_CORPUS, corpus)
# save dictionary
id2word.save(PATH_DICTIONARY)
# save lda model
lda_model.save(PATH_LDA_MODEL)

# Per-document topic distribution as sparse (topic_id, probability) pairs.
dt_dist = lda_model.get_document_topics(corpus, minimum_probability=0.0)

# Fill a dense (documents x topics) matrix by topic id. gensim may omit
# near-zero topics from a document's pairs even with minimum_probability=0.0,
# which would make a plain list-of-lists conversion ragged.
docs_topics_dist = np.zeros((len(corpus), lda_model.num_topics), dtype=np.float32)
for doc_idx, topic_probs in enumerate(dt_dist):
    for topic_id, prob in topic_probs:
        docs_topics_dist[doc_idx, topic_id] = prob
print(docs_topics_dist)
# save documents-topics matrix
joblib.dump(docs_topics_dist, PATH_TOPICS_DOCS_DIST)


[[1.8532420e-02 1.8347186e-01 1.8357347e-04 ... 1.8357347e-04
  1.8357347e-04 1.8357347e-04]
 [1.2830396e-04 4.3324664e-02 1.2830396e-04 ... 1.2830396e-04
  1.2830396e-04 1.2830396e-04]
 [1.0262377e-04 2.0616885e-01 1.0262377e-04 ... 1.0262377e-04
  1.0262377e-04 1.0262377e-04]
 ...
 [2.0836425e-04 3.4758214e-02 2.0836425e-04 ... 6.3464053e-02
  2.0836425e-04 4.0619001e-02]
 [8.3349145e-04 8.3349145e-04 8.3349145e-04 ... 8.3349145e-04
  8.3349145e-04 1.5501770e-01]
 [5.2518455e-05 5.2518455e-05 5.2518455e-05 ... 5.2518455e-05
  4.1588023e-02 5.2518455e-05]]


['./models/topics_docs_dist.dat']

In [34]:
# Write the bag-of-words corpus to PATH_CORPUS in Matrix Market format.
gensim.corpora.MmCorpus.serialize(fname=PATH_CORPUS, corpus=corpus)

In [35]:
# Display (topic id, weighted top words) pairs for the trained model; no
# arguments are passed, so the library defaults decide how many are shown.
lda_model.print_topics()

[(1,
  '0.045*"ảnh" + 0.028*"đi" + 0.028*"chụp" + 0.020*"video" + 0.016*"đầu" + 0.016*"đồ" + 0.015*"ván" + 0.013*"chỉ_trích" + 0.012*"đăng" + 0.012*"giày"'),
 (10,
  '0.030*"nguyễn" + 0.022*"tỉnh" + 0.018*"bình" + 0.018*"đại_biểu" + 0.017*"văn" + 0.017*"quy_định" + 0.016*"nhà_nước" + 0.014*"phú" + 0.014*"ảnh" + 0.013*"đề_nghị"'),
 (36,
  '0.037*"hệ_thống" + 0.035*"điện" + 0.020*"thiết_bị" + 0.018*"máy" + 0.016*"năng_lượng" + 0.015*"an_toàn" + 0.015*"giúp" + 0.015*"c" + 0.014*"hoạt_động" + 0.014*"độ"'),
 (41,
  '0.041*"nga" + 0.029*"quốc_hội" + 0.028*"kinh_tế" + 0.022*"phát_triển" + 0.020*"xã_hội" + 0.019*"bộ_trưởng" + 0.018*"lãnh_đạo" + 0.017*"cơ_sở" + 0.016*"xây_dựng" + 0.014*"luật"'),
 (4,
  '0.129*"trường" + 0.061*"ngành" + 0.061*"học" + 0.057*"giáo_dục" + 0.050*"học_sinh" + 0.042*"đào_tạo" + 0.042*"giáo_viên" + 0.034*"lớp" + 0.024*"đại_học" + 0.023*"thpt"'),
 (32,
  '0.166*"cháy" + 0.131*"cá" + 0.102*"lửa" + 0.057*"đàn" + 0.049*"dập" + 0.034*"chó" + 0.034*"bùng" + 0.031*"tiêu" + 0.

In [36]:
import joblib

# Per-document topic distribution as sparse (topic_id, probability) pairs.
dt_dist = lda_model.get_document_topics(corpus, minimum_probability=0.0)

# Fill a dense (documents x topics) matrix by topic id. gensim may omit
# near-zero topics from a document's pairs even with minimum_probability=0.0,
# which would make a plain list-of-lists conversion ragged.
docs_topics_dist = np.zeros((len(corpus), lda_model.num_topics), dtype=np.float32)
for doc_idx, topic_probs in enumerate(dt_dist):
    for topic_id, prob in topic_probs:
        docs_topics_dist[doc_idx, topic_id] = prob
print(docs_topics_dist)

# The matrix is intentionally not written to disk in this cell.


[[1.85324233e-02 1.83476999e-01 1.83573502e-04 ... 1.83573502e-04
  1.83573502e-04 1.83573502e-04]
 [1.28303946e-04 4.33120355e-02 1.28303946e-04 ... 1.28303946e-04
  1.28303946e-04 1.28303946e-04]
 [1.02623766e-04 2.06170991e-01 1.02623766e-04 ... 1.02623766e-04
  1.02623766e-04 1.02623766e-04]
 ...
 [2.08364232e-04 3.47809456e-02 2.08364232e-04 ... 6.34641126e-02
  2.08364232e-04 4.06194739e-02]
 [8.33491504e-04 8.33491504e-04 8.33491504e-04 ... 8.33491504e-04
  8.33491504e-04 1.55016467e-01]
 [5.25184587e-05 5.25184587e-05 5.25184587e-05 ... 5.25184587e-05
  4.15883213e-02 5.25184587e-05]]


In [27]:
# Keep only the probability from each (topic_id, probability) pair that
# lda_model[corpus] yields per document; rows may differ in length.
item_dist = [[prob for _topic_id, prob in doc_dist] for doc_dist in lda_model[corpus]]

In [19]:
# Number of topic probabilities kept for each document.
list(map(len, item_dist))

[9,
 8,
 7,
 9,
 12,
 5,
 7,
 12,
 8,
 10,
 3,
 11,
 6,
 14,
 12,
 14,
 8,
 10,
 15,
 14,
 6,
 7,
 3,
 7,
 9,
 17,
 6,
 8,
 6,
 11,
 4,
 7,
 8,
 8,
 16,
 8,
 13,
 10,
 7,
 7,
 8,
 12,
 8,
 7,
 8,
 12,
 11,
 9,
 14,
 9,
 8,
 8,
 12,
 4,
 9,
 9,
 8,
 11,
 14,
 9,
 7,
 13,
 12,
 10,
 5,
 14,
 7,
 8,
 10,
 7,
 12,
 5,
 8,
 5,
 2,
 3,
 7,
 7,
 8,
 5,
 11,
 10,
 13,
 15,
 8,
 5,
 6,
 13,
 12,
 10,
 16,
 14,
 9,
 1,
 14,
 10,
 11,
 13,
 8,
 5,
 50,
 2,
 8,
 10,
 6,
 15,
 8,
 9,
 3,
 4,
 8,
 7,
 10,
 8,
 12,
 11,
 13,
 6,
 8,
 7,
 13,
 10,
 11,
 5,
 7,
 5,
 14,
 8,
 8,
 9,
 10,
 7,
 3,
 6,
 8,
 6,
 5,
 13,
 6,
 8,
 8,
 6,
 10,
 11,
 7,
 5,
 12,
 7,
 9,
 4,
 12,
 9,
 6,
 8,
 11,
 7,
 8,
 10,
 3,
 13,
 9,
 5,
 9,
 17,
 7,
 9,
 16,
 5,
 10,
 17,
 7,
 8,
 11,
 6,
 7,
 10,
 2,
 5,
 10,
 4,
 13,
 13,
 6,
 13,
 7,
 4,
 11,
 9,
 15,
 9,
 2,
 5,
 14,
 12,
 13,
 10,
 7,
 4,
 11,
 6,
 3,
 6,
 10,
 10,
 12,
 10,
 9,
 8,
 4,
 7,
 9,
 4,
 6,
 3,
 7,
 7,
 7,
 7,
 10,
 4,
 2,
 6,
 7,
 7,
 7,
 8,
 7,
 6,
 11,


In [23]:
# NOTE: this rebinds docs_topics_dist (a NumPy matrix in earlier cells) to the
# object returned by get_document_topics, which is indexed per document here.
docs_topics_dist = lda_model.get_document_topics(bow=corpus, minimum_probability=0.0)
first_doc_topics = docs_topics_dist[0]
print(first_doc_topics)

[(0, 0.00015664446), (1, 0.00015664446), (2, 0.00015664446), (3, 0.14611824), (4, 0.00015664446), (5, 0.00015664446), (6, 0.00015664446), (7, 0.068384126), (8, 0.00015664446), (9, 0.00015664446), (10, 0.00015664446), (11, 0.00015664446), (12, 0.00015664446), (13, 0.00015664446), (14, 0.00015664446), (15, 0.00015664446), (16, 0.00015664446), (17, 0.00015664446), (18, 0.00015664446), (19, 0.018986873), (20, 0.00015664446), (21, 0.00015664446), (22, 0.020576175), (23, 0.00015664446), (24, 0.00015664446), (25, 0.00015664446), (26, 0.07081053), (27, 0.00015664446), (28, 0.00015664446), (29, 0.00015664446), (30, 0.55614895), (31, 0.00015664446), (32, 0.00015664446), (33, 0.00015664446), (34, 0.00015664446), (35, 0.03936303), (36, 0.05692358), (37, 0.00015664446), (38, 0.00015664446), (39, 0.00015664446), (40, 0.00015664446), (41, 0.00015664446), (42, 0.00015664446), (43, 0.00015664446), (44, 0.00015664446), (45, 0.00015664446), (46, 0.00015664446), (47, 0.00015664446), (48, 0.016266108), (49

In [26]:
# Drop the topic ids and keep one list of probabilities per document.
topics_dist_docs_matrix = [
    [prob for _topic_id, prob in topics_dist_doc]
    for topics_dist_doc in docs_topics_dist
]

[50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
