In [1]:
# !pip install gensim

In [2]:
import pandas as pd
import gensim
from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

In [3]:
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

In [4]:
# 1. Dataset (example dummy data)
# Five short English sentences used as the toy corpus; each list element is
# treated as one document by the preprocessing step that follows.
documents = [
    "Machine learning models are a subset of artificial intelligence.",
    "Natural language processing and computer vision are branches of AI.",
    "Deep learning has transformed the field of AI.",
    "Support vector machines and neural networks are popular algorithms.",
    "Generative AI like ChatGPT and DALL-E is advancing rapidly."
]

In [5]:
# 2. Preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def preprocess(doc):
    """Turn one raw document string into a list of lemmatized tokens.

    The text is lower-cased and tokenized with ``word_tokenize``. A token is
    kept only if ``str.isalnum()`` is true for it and it is not in
    ``stop_words``; each kept token is passed through
    ``lemmatizer.lemmatize``. Token order is preserved.
    """
    lemmas = []
    for token in word_tokenize(doc.lower()):
        # Skip punctuation / mixed-symbol tokens and stop words.
        if not token.isalnum() or token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas


# One token list per input document, in the same order as `documents`.
processed_docs = list(map(preprocess, documents))


In [6]:
# 3. Dictionary and Corpus
# Build the token <-> integer-id mapping from the preprocessed documents.
dictionary = corpora.Dictionary(processed_docs)
# Convert every token list to its bag-of-words form via the dictionary.
corpus = list(map(dictionary.doc2bow, processed_docs))


In [7]:
# 4. Train LDA Model
num_topics = 3  # Set number of topics
# LDA training is randomized; without a fixed seed every run yields different
# topics, so the tables printed later could not be reproduced on a fresh
# Restart & Run All. Pin the seed explicitly.
random_seed = 42
lda_model = LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=15,
    random_state=random_seed,
)


In [8]:
# 5. Extract Topic Proportions in Documents
# Build one record per document with a column for EVERY topic. Indexing the
# model directly (lda_model[corpus]) omits topics whose probability falls below
# the model's minimum-probability threshold, which can leave columns missing
# or ordered by first appearance, and makes fillna(0) report small non-zero
# proportions as exactly 0. Seeding each record with all topic keys and
# requesting minimum_probability=0.0 avoids both problems.
topic_proportions_docs = []
for i, bow in enumerate(corpus):
    # Fixed column set and order: "Topic 0" .. "Topic k-1".
    topic_proportions = {
        f"Topic {topic_id}": 0.0 for topic_id in range(lda_model.num_topics)
    }
    for topic_id, prop in lda_model.get_document_topics(bow, minimum_probability=0.0):
        topic_proportions[f"Topic {topic_id}"] = float(prop)
    topic_proportions['Document'] = f"Doc {i+1}"
    topic_proportions_docs.append(topic_proportions)

# fillna(0) is kept as a harmless safeguard; every topic key is already set.
df_topic_proportions = pd.DataFrame(topic_proportions_docs).fillna(0)

In [9]:










# 6. Extract Word Proportions in Topics
# One record per (topic, word) pair, taking the 10 highest-weighted words
# that show_topic returns for each topic id.
topic_word_proportions = [
    {"Topic": f"Topic {topic_id}", "Word": term, "Proportion": weight}
    for topic_id in range(num_topics)
    for term, weight in lda_model.show_topic(topic_id, topn=10)
]

df_word_proportions = pd.DataFrame(topic_word_proportions)

# Display the results
print("Proporsi Topik dalam Dokumen:")
print(df_topic_proportions)

print("\nProporsi Kata dalam Topik:")
print(df_word_proportions)


Proporsi Topik dalam Dokumen:
    Topic 0   Topic 1   Topic 2 Document
0  0.901470  0.048057  0.050473    Doc 1
1  0.042490  0.915027  0.042483    Doc 2
2  0.059659  0.058401  0.881940    Doc 3
3  0.042807  0.042047  0.915146    Doc 4
4  0.901269  0.049666  0.049066    Doc 5

Proporsi Kata dalam Topik:
      Topic          Word  Proportion
0   Topic 0      learning    0.063589
1   Topic 0          like    0.063474
2   Topic 0    generative    0.063474
3   Topic 0     advancing    0.063474
4   Topic 0       chatgpt    0.063473
5   Topic 0       rapidly    0.063473
6   Topic 0    artificial    0.063457
7   Topic 0        subset    0.063454
8   Topic 0  intelligence    0.063452
9   Topic 0         model    0.063451
10  Topic 1            ai    0.084178
11  Topic 1    processing    0.083171
12  Topic 1      computer    0.083170
13  Topic 1       natural    0.083170
14  Topic 1      language    0.083170
15  Topic 1        branch    0.083170
16  Topic 1        vision    0.083170
17  Topic 1 