In [1]:
#!pip install emoji
import pandas as pd
import regex as re
import string 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
# Module-level Porter stemmer instance; no cell visible in this section calls it.
stemmer = PorterStemmer()
# Module-level WordNet lemmatizer instance; likewise not used by the visible cells.
lemmatizer = WordNetLemmatizer()


In [2]:
# Load the labelled review dataset; the rendered frame below shows the
# columns `review`, `labels` and `split` (plus a saved index column).
DATA_PATH = 'good_bad_with_splits_full.csv'
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,review,labels,split
0,asked chatgpt to explain why fast growing lend...,bad,train
1,building a virtual machine inside chatgpt tco...,bad,train
2,imagining chatgpt but with video input n nin r...,bad,train
3,openai s chatgpt shows why implementation is k...,bad,train
4,google stock don t be afraid of chatgpt nasdaq...,bad,train
...,...,...,...
39995,chatgpt poetry this one is about womenlifefre...,bad,test
39996,taiyo oil sales units floor price nfts blockc...,bad,test
39997,chatgpt doesn t only pass turing s test but wo...,bad,test
39998,chatgpt is an incredible tool but when asked ...,bad,test


In [3]:
import nltk
from nltk.tokenize import word_tokenize
def tokenize(x):
    """Split one review string into a list of word tokens via NLTK."""
    return word_tokenize(x)


# First-pass tokenisation, stop words included. The `token` column is
# overwritten with a stop-word-filtered version in the next code cell.
df['token'] = df['review'].map(tokenize)
df

Unnamed: 0,review,labels,split,token
0,asked chatgpt to explain why fast growing lend...,bad,train,"[asked, chatgpt, to, explain, why, fast, growi..."
1,building a virtual machine inside chatgpt tco...,bad,train,"[building, a, virtual, machine, inside, chatgp..."
2,imagining chatgpt but with video input n nin r...,bad,train,"[imagining, chatgpt, but, with, video, input, ..."
3,openai s chatgpt shows why implementation is k...,bad,train,"[openai, s, chatgpt, shows, why, implementatio..."
4,google stock don t be afraid of chatgpt nasdaq...,bad,train,"[google, stock, don, t, be, afraid, of, chatgp..."
...,...,...,...,...
39995,chatgpt poetry this one is about womenlifefre...,bad,test,"[chatgpt, poetry, this, one, is, about, womenl..."
39996,taiyo oil sales units floor price nfts blockc...,bad,test,"[taiyo, oil, sales, units, floor, price, nfts,..."
39997,chatgpt doesn t only pass turing s test but wo...,bad,test,"[chatgpt, doesn, t, only, pass, turing, s, tes..."
39998,chatgpt is an incredible tool but when asked ...,bad,test,"[chatgpt, is, an, incredible, tool, but, when,..."


In [4]:
from nltk.corpus import stopwords

# Corpus-specific tokens to drop on top of NLTK's English stop words:
# stray single letters, "tco", "ni" and the ubiquitous "chatgpt".
# NOTE(review): "tco" / "n" / "ni" look like URL and newline residue from
# earlier cleaning -- confirm against the preprocessing that produced the CSV.
custom_stopwords = [
    "tco", "n", "s", "q", "e", "v", "u", "f", "l", "g", "p", "x", "c", "w",
    "z", "chatgpt", "j", "h", "b", "ni",
]
stopwords_list = stopwords.words('english') + custom_stopwords

# Frozen set copy of the list above: membership tests are O(1) instead of a
# linear scan per token, which matters when filtering every token of every
# review. `stopwords_list` is kept for any code that still expects the list;
# rebuild this set if that list is modified afterwards.
stopwords_set = frozenset(stopwords_list)


def tokenize_and_remove_stopwords(text):
    """Tokenise `text` and drop stop words.

    Parameters
    ----------
    text : str
        A single review.

    Returns
    -------
    list of str
        Tokens from `word_tokenize(text)`, in their original order and casing,
        excluding any token whose lower-cased form is in `stopwords_set`.
    """
    return [
        token
        for token in word_tokenize(text)
        if token.lower() not in stopwords_set
    ]


# Replaces the unfiltered `token` column created in the previous cell.
df["token"] = df["review"].apply(tokenize_and_remove_stopwords)

In [5]:
# Rows whose label is exactly "good".
is_good = df["labels"] == "good"
good_df = df[is_good]

In [6]:
# Rows whose label is exactly "bad".
is_bad = df["labels"] == "bad"
bad_df = df[is_bad]

In [7]:
# Display the bad-labelled subset to inspect its stop-word-filtered `token` column.
bad_df

Unnamed: 0,review,labels,split,token
0,asked chatgpt to explain why fast growing lend...,bad,train,"[asked, explain, fast, growing, lenders, often..."
1,building a virtual machine inside chatgpt tco...,bad,train,"[building, virtual, machine, inside, uizlbxv]"
2,imagining chatgpt but with video input n nin r...,bad,train,"[imagining, video, input, nin, real, time, ljf..."
3,openai s chatgpt shows why implementation is k...,bad,train,"[openai, shows, implementation, key, generativ..."
4,google stock don t be afraid of chatgpt nasdaq...,bad,train,"[google, stock, afraid, nasdaq, goog, tnupz]"
...,...,...,...,...
39995,chatgpt poetry this one is about womenlifefre...,bad,test,"[poetry, one, womenlifefreedom, point, qdsajhl..."
39996,taiyo oil sales units floor price nfts blockc...,bad,test,"[taiyo, oil, sales, units, floor, price, nfts,..."
39997,chatgpt doesn t only pass turing s test but wo...,bad,test,"[pass, turing, test, would, also, slide, dms, ..."
39998,chatgpt is an incredible tool but when asked ...,bad,test,"[incredible, tool, asked, code, program, outpu..."


## Topic modeling

### Good reviews

In [8]:
# Display the good-labelled subset to inspect its stop-word-filtered `token` column.
good_df

Unnamed: 0,review,labels,split,token
13981,honestly openai chatgpt has to be one of the b...,good,train,"[honestly, openai, one, best, ai, tool, ever, ..."
13982,are you a polyglot how many programminglanguag...,good,train,"[polyglot, many, programminglanguages, good, t..."
13983,free startup idea nwikihow generator n nenter ...,good,train,"[free, startup, idea, nwikihow, generator, nen..."
13984,give credit where credit is due nice n nmy ins...,good,train,"[give, credit, credit, due, nice, nmy, inspira..."
13985,the thing with chatgpt and using it for produc...,good,train,"[thing, using, production, whether, computer, ..."
...,...,...,...,...
33976,on chatgpt and stevejobs and how to grow obsol...,good,test,"[stevejobs, grow, obsolete, grace, wow, gvdngx..."
33977,world changing how to talk to a computer tco ...,good,test,"[world, changing, talk, computer, ob, uzjkprv,..."
33978,chatgpt is amazing,good,test,[amazing]
33979,late to the party but tested chatgpt today aft...,good,test,"[late, party, tested, today, email, garyvee, t..."


In [9]:
# One token list per good review, as a plain Python list of lists.
good_text = list(good_df['token'])

In [10]:
import gensim
from gensim import corpora
# Build the token -> integer id vocabulary from the good reviews, then encode
# each review as a bag-of-words list of (token_id, count) pairs.
good_dictionary = corpora.Dictionary(good_text)
corpus = list(map(good_dictionary.doc2bow, good_text))

In [11]:
# Hyper-parameters for the LDA model fitted on the good reviews.
good_lda_params = dict(
    num_topics=7,
    random_state=100,   # fixed seed so the fitted topics are repeatable
    iterations=1000,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

# Fit on the bag-of-words corpus built from the good reviews.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=good_dictionary,
    **good_lda_params,
)

In [12]:
# List every topic of the fitted model (-1 requests all of them) together
# with its highest-weighted words.
topic_template = 'Topic: {} \nWords: {}'
for topic_id, topic_words in lda_model.print_topics(-1):
    print(topic_template.format(topic_id, topic_words))

Topic: 0 
Words: 0.034*"amazing" + 0.030*"amp" + 0.028*"people" + 0.014*"things" + 0.014*"tech" + 0.013*"twitter" + 0.013*"made" + 0.011*"thanks" + 0.010*"awesome" + 0.010*"say"
Topic: 1 
Words: 0.040*"code" + 0.026*"much" + 0.024*"create" + 0.015*"right" + 0.014*"results" + 0.014*"impressed" + 0.014*"generated" + 0.013*"important" + 0.012*"ideas" + 0.012*"seems"
Topic: 2 
Words: 0.055*"ai" + 0.028*"openai" + 0.016*"new" + 0.014*"good" + 0.013*"write" + 0.012*"asked" + 0.012*"use" + 0.011*"using" + 0.011*"google" + 0.010*"one"
Topic: 3 
Words: 0.057*"like" + 0.017*"time" + 0.014*"chatbot" + 0.013*"human" + 0.013*"know" + 0.012*"really" + 0.011*"way" + 0.011*"future" + 0.011*"work" + 0.011*"language"
Topic: 4 
Words: 0.019*"conversation" + 0.018*"playing" + 0.017*"creative" + 0.016*"story" + 0.016*"web" + 0.015*"real" + 0.014*"trying" + 0.013*"truly" + 0.012*"nft" + 0.011*"another"
Topic: 5 
Words: 0.040*"gt" + 0.028*"life" + 0.024*"asking" + 0.015*"poem" + 0.013*"space" + 0.013*"soon" 

In [13]:
from gensim.models.coherencemodel import CoherenceModel

# Score the good-review topics with the c_v coherence measure, using the same
# tokenised texts and dictionary the model was trained on.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=good_text,
    dictionary=good_dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score:', coherence_lda)

Coherence Score: 0.2811737211565014


In [14]:
# Bad reviews: one token list per review, as a plain Python list of lists.
bad_text = list(bad_df['token'])

In [15]:
# Build the vocabulary and bag-of-words encoding for the bad reviews.
# NOTE: this rebinds `corpus`, so the good-review corpus built earlier is no
# longer reachable under that name after this cell runs.
bad_dictionary = corpora.Dictionary(bad_text)
corpus = list(map(bad_dictionary.doc2bow, bad_text))

In [16]:
# Hyper-parameters for the LDA model fitted on the bad reviews.
bad_lda_params = dict(
    num_topics=9,
    random_state=100,   # fixed seed so the fitted topics are repeatable
    iterations=1000,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

# NOTE: rebinds `lda_model`; the model fitted on the good reviews is replaced.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=bad_dictionary,
    **bad_lda_params,
)

In [17]:
# List every topic of the current model (-1 requests all of them) together
# with its highest-weighted words.
topic_template = 'Topic: {} \nWords: {}'
for topic_id, topic_words in lda_model.print_topics(-1):
    print(topic_template.format(topic_id, topic_words))

Topic: 0 
Words: 0.047*"gpt" + 0.024*"need" + 0.022*"chat" + 0.021*"content" + 0.019*"nchatgpt" + 0.018*"take" + 0.017*"business" + 0.015*"say" + 0.014*"wrong" + 0.013*"might"
Topic: 1 
Words: 0.077*"via" + 0.027*"day" + 0.024*"web" + 0.018*"vs" + 0.016*"knows" + 0.016*"bad" + 0.014*"script" + 0.013*"coming" + 0.013*"industry" + 0.011*"wonder"
Topic: 2 
Words: 0.042*"asked" + 0.021*"know" + 0.020*"amp" + 0.019*"people" + 0.017*"think" + 0.012*"really" + 0.011*"article" + 0.011*"r" + 0.010*"months" + 0.009*"data"
Topic: 3 
Words: 0.028*"answer" + 0.022*"questions" + 0.021*"work" + 0.021*"technology" + 0.016*"replace" + 0.015*"next" + 0.014*"many" + 0.013*"check" + 0.011*"anyone" + 0.010*"information"
Topic: 4 
Words: 0.106*"ai" + 0.060*"openai" + 0.022*"chatbot" + 0.012*"make" + 0.012*"future" + 0.012*"time" + 0.012*"would" + 0.010*"artificialintelligence" + 0.009*"ask" + 0.009*"like"
Topic: 5 
Words: 0.079*"new" + 0.035*"see" + 0.029*"tech" + 0.029*"answers" + 0.029*"bot" + 0.026*"twit

In [18]:
from gensim.models.coherencemodel import CoherenceModel

# Score the bad-review topics with the c_v coherence measure, using the same
# tokenised texts and dictionary the model was trained on.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=bad_text,
    dictionary=bad_dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score:', coherence_lda)

Coherence Score: 0.2870188782074861
