In [1]:
import os
import json
from tqdm import tqdm

os.chdir('dataset/Diff_Quality_Estimation')
msg_file = 'comments.jsonl'

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from gensim import corpora, models

class ReviewCorpus:
    def __init__(self, filepath):
        self.filepath = filepath
        self.dictionary = corpora.Dictionary(self.iter_docs())

    def iter_docs(self):
        # Iterate over the file and yield documents
        with open(self.filepath, 'r', encoding='utf-8') as fr:
            for line in tqdm(fr):
                stop_words = set(stopwords.words('english'))
                words = [word for word in word_tokenize(line.lower()) if word.isalpha() and word not in stop_words]
                yield words

    def __iter__(self):
        # Iterate over the file and yield BOW vectors
        for tokens in self.iter_docs():
            yield self.dictionary.doc2bow(tokens)

corpus_stream = ReviewCorpus(msg_file)

# Train LDA model
lda_model = models.LdaModel(corpus=(corpus_stream), num_topics=5, id2word=corpus_stream.dictionary, passes=15)

# Display topics
topics = lda_model.print_topics()
for topic in topics:
    print(topic)


132918it [00:18, 7246.47it/s]
132918it [00:17, 7633.51it/s]
132918it [00:30, 4358.46it/s]
132918it [00:30, 4358.03it/s]
132918it [00:25, 5272.25it/s]
132918it [00:25, 5283.98it/s]
132918it [00:24, 5348.41it/s]
132918it [00:24, 5358.29it/s]
132918it [00:24, 5350.80it/s]
132918it [00:24, 5376.43it/s]
132918it [00:24, 5381.71it/s]
132918it [00:24, 5377.03it/s]
132918it [00:24, 5393.68it/s]
132918it [00:24, 5374.89it/s]
132918it [00:24, 5377.50it/s]
132918it [00:24, 5390.00it/s]
132918it [00:24, 5385.23it/s]


(0, '0.023*"like" + 0.013*"would" + 0.012*"seems" + 0.012*"think" + 0.011*"set" + 0.010*"could" + 0.009*"used" + 0.008*"call" + 0.008*"value" + 0.008*"needed"')
(1, '0.046*"use" + 0.025*"instead" + 0.019*"would" + 0.017*"name" + 0.016*"think" + 0.015*"method" + 0.014*"make" + 0.011*"using" + 0.011*"could" + 0.011*"better"')
(2, '0.029*"need" + 0.024*"test" + 0.013*"think" + 0.011*"want" + 0.010*"tests" + 0.010*"get" + 0.008*"sure" + 0.008*"work" + 0.008*"necessary" + 0.008*"check"')
(3, '0.061*"line" + 0.034*"return" + 0.032*"error" + 0.024*"remove" + 0.019*"long" + 0.016*"null" + 0.014*"check" + 0.014*"message" + 0.014*"empty" + 0.013*"missing"')
(4, '0.025*"file" + 0.025*"add" + 0.023*"code" + 0.021*"please" + 0.016*"pr" + 0.015*"function" + 0.015*"move" + 0.015*"remove" + 0.014*"class" + 0.013*"change"')


In [4]:
from gensim import corpora, models
from gensim.models.phrases import Phrases, Phraser

# Assuming texts is a list of documents
with open(msg_file, 'r') as fr:
    texts = [json.loads(line) for line in fr]

# Enhanced Preprocessing
texts_tokenized = [[word for word in word_tokenize(document.lower()) if word.isalpha() and word not in stop_words] for document in texts]

# Detect and form bigrams
bigram_model = Phrases(texts_tokenized, min_count=5, threshold=100)
bigram_phraser = Phraser(bigram_model)
texts_bigrams = [bigram_phraser[text] for text in texts_tokenized]

# Create a dictionary and corpus for LDA
dictionary = corpora.Dictionary(texts_bigrams)
corpus = [dictionary.doc2bow(text) for text in texts_bigrams]

# Using NMF for topic modeling
nmf_model = models.Nmf(corpus, num_topics=5, id2word=dictionary, passes=15)


# Display topics
for idx, topic in nmf_model.print_topics(-1):
    print(f"Topic: {idx}\nWords: {topic}\n")


Topic: 0
Words: 0.072*"test" + 0.015*"make" + 0.014*"tests" + 0.014*"code" + 0.013*"also" + 0.012*"change" + 0.009*"file" + 0.009*"case" + 0.009*"sure" + 0.009*"one"

Topic: 1
Words: 0.054*"need" + 0.045*"think" + 0.009*"also" + 0.009*"method" + 0.008*"check" + 0.008*"add" + 0.007*"one" + 0.007*"since" + 0.006*"want" + 0.005*"new"

Topic: 2
Words: 0.064*"like" + 0.015*"looks" + 0.015*"seems" + 0.014*"something" + 0.014*"line" + 0.013*"could" + 0.008*"one" + 0.008*"change" + 0.008*"code" + 0.008*"name"

Topic: 3
Words: 0.096*"would" + 0.014*"make" + 0.010*"better" + 0.007*"could" + 0.006*"think" + 0.006*"error" + 0.006*"code" + 0.005*"also" + 0.005*"sense" + 0.005*"return"

Topic: 4
Words: 0.139*"use" + 0.037*"instead" + 0.014*"could" + 0.011*"please" + 0.007*"name" + 0.007*"string" + 0.006*"using" + 0.006*"function" + 0.006*"also" + 0.005*"method"

