In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
doc1 = "I am learning NLP, it is very interesting and exciting. it includes machine learning and deep learning "
doc2 = "My father is a data scientist and he is nlp expert"
doc3 = "My sister has good exposure into android development"

In [4]:
doc_complete = [doc1, doc2, doc3]
doc_complete

['I am learning NLP, it is very interesting and exciting. it includes machine learning and deep learning ',
 'My father is a data scientist and he is nlp expert',
 'My sister has good exposure into android development']

### Cleaning and preprocessing

In [5]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
  stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
  punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
  normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
  return normalized

In [6]:
doc_clean = [clean(doc).split() for doc in doc_complete]
doc_clean

[['learning',
  'nlp',
  'interesting',
  'exciting',
  'includes',
  'machine',
  'learning',
  'deep',
  'learning'],
 ['father', 'data', 'scientist', 'nlp', 'expert'],
 ['sister', 'good', 'exposure', 'android', 'development']]

### Preparing document term matrix

In [7]:
%pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [8]:
# Importing gensim
import gensim
from gensim import corpora

# Tao dictionary
dictionary = corpora.Dictionary(doc_clean)
dictionary.token2id

{'deep': 0,
 'exciting': 1,
 'includes': 2,
 'interesting': 3,
 'learning': 4,
 'machine': 5,
 'nlp': 6,
 'data': 7,
 'expert': 8,
 'father': 9,
 'scientist': 10,
 'android': 11,
 'development': 12,
 'exposure': 13,
 'good': 14,
 'sister': 15}

In [9]:
#Converting a list of documents (corpus)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# doc2bow(doc) trả về danh sách các từ theo thứ tự tăng dần của word_id thay vì thứ tự xuất hiện ban đầu trong văn bản
# Mỗi phần tư tử trong danh sách là một tuple (word_id, word_frequency)
doc_term_matrix

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 3), (5, 1), (6, 1)],
 [(6, 1), (7, 1), (8, 1), (9, 1), (10, 1)],
 [(11, 1), (12, 1), (13, 1), (14, 1), (15, 1)]]

### LDA model

In [10]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel
# Running and Training LDA model on the document term matrix for 3 topics.
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)
# Results
ldamodel.print_topics()

[(0,
  '0.063*"father" + 0.063*"expert" + 0.063*"scientist" + 0.063*"data" + 0.063*"nlp" + 0.062*"includes" + 0.062*"machine" + 0.062*"exciting" + 0.062*"deep" + 0.062*"interesting"'),
 (1,
  '0.173*"learning" + 0.121*"nlp" + 0.069*"deep" + 0.069*"includes" + 0.069*"interesting" + 0.069*"machine" + 0.069*"exciting" + 0.069*"scientist" + 0.069*"data" + 0.069*"expert"'),
 (2,
  '0.129*"sister" + 0.129*"good" + 0.129*"exposure" + 0.129*"development" + 0.129*"android" + 0.032*"father" + 0.032*"scientist" + 0.032*"data" + 0.032*"expert" + 0.032*"nlp"')]

In [14]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary

# Chuẩn bị dữ liệu
documents = doc_complete

# Tiền xử lý dữ liệu
tokenized_documents = [document.split() for document in documents]

# Tạo từ điển
dictionary = Dictionary(tokenized_documents)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]

# Huấn luyện mô hình LDA
lda_model = LdaModel(corpus, num_topics=3, id2word=dictionary, passes=15, random_state=42)

# Dự đoán chủ đề cho một tài liệu cụ thể
new_document = 'My father is a data scientist and he is nlp expert'
new_bow = dictionary.doc2bow(new_document.split())
topic_distribution = lda_model.get_document_topics(new_bow)

# In kết quả
print(topic_distribution)
print("Topic distribution for the input:")
for topic in topic_distribution:
    print(f"Topic {topic[0]}: {topic[1]:.4f}")

lda_model.print_topics()

[(0, 0.028784046), (1, 0.028392164), (2, 0.9428238)]
Topic distribution for the input:
Topic 0: 0.0288
Topic 1: 0.0284
Topic 2: 0.9428


[(0,
  '0.127*"learning" + 0.089*"and" + 0.089*"it" + 0.051*"deep" + 0.051*"I" + 0.051*"very" + 0.051*"machine" + 0.051*"exciting." + 0.051*"interesting" + 0.051*"am"'),
 (1,
  '0.077*"My" + 0.077*"android" + 0.077*"sister" + 0.077*"good" + 0.077*"development" + 0.077*"into" + 0.077*"has" + 0.077*"exposure" + 0.019*"learning" + 0.019*"and"'),
 (2,
  '0.115*"is" + 0.066*"My" + 0.066*"expert" + 0.066*"he" + 0.066*"data" + 0.066*"father" + 0.066*"a" + 0.066*"nlp" + 0.066*"scientist" + 0.065*"and"')]

In [12]:
# Dự đoán chủ đề cho một tài liệu cụ thể
new_document = 'I am developing a new android application'
new_bow = dictionary.doc2bow(new_document.split())
topic_distribution = lda_model.get_document_topics(new_bow)

# In kết quả
print(topic_distribution)
print("Topic distribution for the input:")
for topic in topic_distribution:
    print(f"Topic {topic[0]}: {topic[1]:.4f}")

print("LDA Model Topics:")
lda_model.print_topics()

[(0, 0.4685647), (1, 0.27251175), (2, 0.2589236)]
Topic distribution for the input:
Topic 0: 0.4686
Topic 1: 0.2725
Topic 2: 0.2589
LDA Model Topics:


[(0,
  '0.127*"learning" + 0.089*"and" + 0.089*"it" + 0.051*"deep" + 0.051*"I" + 0.051*"very" + 0.051*"machine" + 0.051*"exciting." + 0.051*"interesting" + 0.051*"am"'),
 (1,
  '0.077*"My" + 0.077*"android" + 0.077*"sister" + 0.077*"good" + 0.077*"development" + 0.077*"into" + 0.077*"has" + 0.077*"exposure" + 0.019*"learning" + 0.019*"and"'),
 (2,
  '0.115*"is" + 0.066*"My" + 0.066*"expert" + 0.066*"he" + 0.066*"data" + 0.066*"father" + 0.066*"a" + 0.066*"nlp" + 0.066*"scientist" + 0.065*"and"')]