In [1]:
#!pip install emoji
import pandas as pd
import emoji
import regex as re
import string 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
# Shared NLTK normalisers (Porter stemmer and WordNet lemmatizer).
# NOTE(review): neither object is referenced by the cells visible below.
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()


In [2]:
# Load the labelled review corpus into a DataFrame. Per the displayed output
# it carries the review text, a good/bad label and a train/val/test split tag.
# NOTE(review): the 'Unnamed: 0' column looks like a row index that was saved
# into the CSV; consider index_col=0 once it is confirmed nothing reads it.
DATA_CSV = 'good_bad_with_splits_full.csv'
df = pd.read_csv(DATA_CSV)
df

Unnamed: 0,review,labels,split
0,asked chatgpt to explain why fast growing lend...,bad,train
1,building a virtual machine inside chatgpt tco...,bad,train
2,imagining chatgpt but with video input n nin r...,bad,train
3,openai s chatgpt shows why implementation is k...,bad,train
4,google stock don t be afraid of chatgpt nasdaq...,bad,train
...,...,...,...
39995,chatgpt poetry this one is about womenlifefre...,bad,test
39996,taiyo oil sales units floor price nfts blockc...,bad,test
39997,chatgpt doesn t only pass turing s test but wo...,bad,test
39998,chatgpt is an incredible tool but when asked ...,bad,test


In [3]:
import nltk
from nltk.tokenize import word_tokenize
def tokenize(text):
    """Split one review string into a list of word tokens.

    Thin wrapper around NLTK's ``word_tokenize``. It is a named function
    rather than a lambda bound to a name, so tracebacks and reprs show
    ``tokenize`` instead of ``<lambda>``.

    Parameters
    ----------
    text : str
        A single review.

    Returns
    -------
    list of str
        The tokens produced by ``word_tokenize``.
    """
    return word_tokenize(text)


# Store the token list of every review in a new 'token' column.
df['token'] = df['review'].apply(tokenize)
df

Unnamed: 0,review,labels,split,token
0,asked chatgpt to explain why fast growing lend...,bad,train,"[asked, chatgpt, to, explain, why, fast, growi..."
1,building a virtual machine inside chatgpt tco...,bad,train,"[building, a, virtual, machine, inside, chatgp..."
2,imagining chatgpt but with video input n nin r...,bad,train,"[imagining, chatgpt, but, with, video, input, ..."
3,openai s chatgpt shows why implementation is k...,bad,train,"[openai, s, chatgpt, shows, why, implementatio..."
4,google stock don t be afraid of chatgpt nasdaq...,bad,train,"[google, stock, don, t, be, afraid, of, chatgp..."
...,...,...,...,...
39995,chatgpt poetry this one is about womenlifefre...,bad,test,"[chatgpt, poetry, this, one, is, about, womenl..."
39996,taiyo oil sales units floor price nfts blockc...,bad,test,"[taiyo, oil, sales, units, floor, price, nfts,..."
39997,chatgpt doesn t only pass turing s test but wo...,bad,test,"[chatgpt, doesn, t, only, pass, turing, s, tes..."
39998,chatgpt is an incredible tool but when asked ...,bad,test,"[chatgpt, is, an, incredible, tool, but, when,..."


# SVM

In [25]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

In [26]:
# Partition the frame by its predefined 'split' column; rows whose tag is not
# one of these three values end up in none of the frames.
train_df, val_df, test_df = (
    df[df['split'] == name] for name in ('train', 'val', 'test')
)

In [29]:
# Fit the TF-IDF vectorizer on the training reviews only, then reuse the
# fitted vectorizer to transform the validation and test reviews.
# NOTE(review): the built-in 'english' stop list appears to include negations
# such as "not", which would explain why "chatgpt is not very good" is
# predicted 'good' in the spot checks. Confirm before trusting negated inputs.
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(train_df['review'])
X_val, X_test = (
    vectorizer.transform(part['review']) for part in (val_df, test_df)
)

# Target labels for each split, in the same row order as the feature matrices.
y_train, y_val, y_test = (
    part['labels'] for part in (train_df, val_df, test_df)
)


In [52]:
# Train a linear-kernel SVM on the TF-IDF training features. fit() returns the
# estimator itself, so the name is bound to the fitted model directly.
# NOTE(review): scikit-learn's docs suggest LinearSVC for large sample counts
# because SVC fit time grows at least quadratically; switching would change
# the reported metrics, so it is left as is.
svm = SVC(kernel='linear', C=1, random_state=42).fit(X_train, y_train)
svm

SVC(C=1, kernel='linear', random_state=42)

In [53]:
# Score the fitted SVM on the validation split: overall accuracy followed by
# the per-class precision/recall/F1 report.
y_val_pred = svm.predict(X_val)
acc, report = (
    accuracy_score(y_val, y_val_pred),
    classification_report(y_val, y_val_pred),
)
print(f"Accuracy: {acc}", report, sep="\n")

Accuracy: 0.9427893175074183
              precision    recall  f1-score   support

         bad       0.94      0.94      0.94      4194
        good       0.94      0.94      0.94      4231

    accuracy                           0.94      8425
   macro avg       0.94      0.94      0.94      8425
weighted avg       0.94      0.94      0.94      8425



In [64]:
# Score the fitted SVM on the held-out test split: overall accuracy followed
# by the per-class precision/recall/F1 report.
y_test_pred = svm.predict(X_test)
acc, report = (
    accuracy_score(y_test, y_test_pred),
    classification_report(y_test, y_test_pred),
)
print(f"Accuracy: {acc}", report, sep="\n")

Accuracy: 0.9446822798623352
              precision    recall  f1-score   support

         bad       0.94      0.95      0.95      6019
        good       0.95      0.94      0.94      5894

    accuracy                           0.94     11913
   macro avg       0.94      0.94      0.94     11913
weighted avg       0.94      0.94      0.94     11913



In [55]:
# Spot-check the fitted SVM on a few hand-written sentences.
sample_texts = [
    "chatgpt is helpful",
    "chatgpt is pretty cool",
    "chatgpt is slow",
    "the answers of chatgpt is crazy",
]

# One transform/predict call for the whole batch replaces four copy-pasted
# transform/predict pairs; each row is vectorised and classified independently.
sample_preds = svm.predict(vectorizer.transform(sample_texts))

for i, text in enumerate(sample_texts):
    # A one-item list and a length-1 slice keep the printed form
    # `['<text>'] ['<label>']` of the original per-sentence prints.
    print([text], sample_preds[i:i + 1])

['chatgpt is helpful'] ['good']
['chatgpt is pretty cool'] ['good']
['chatgpt is slow'] ['bad']
['the answers of chatgpt is crazy'] ['bad']


# Random Forest

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

In [56]:
# Train a 100-tree random forest with a fixed seed on the same TF-IDF training
# features. fit() returns the estimator itself, so the name is bound to the
# fitted model directly.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
rf

RandomForestClassifier(random_state=42)

In [57]:
# Score the fitted random forest on the validation split: overall accuracy
# followed by the per-class precision/recall/F1 report.
y_val_pred = rf.predict(X_val)
acc, report = (
    accuracy_score(y_val, y_val_pred),
    classification_report(y_val, y_val_pred),
)
print(f"Accuracy: {acc}", report, sep="\n")

Accuracy: 0.9045697329376855
              precision    recall  f1-score   support

         bad       0.91      0.89      0.90      4194
        good       0.90      0.92      0.91      4231

    accuracy                           0.90      8425
   macro avg       0.90      0.90      0.90      8425
weighted avg       0.90      0.90      0.90      8425



In [58]:
# Score the fitted random forest on the held-out test split: overall accuracy
# followed by the per-class precision/recall/F1 report.
y_test_pred = rf.predict(X_test)
acc, report = (
    accuracy_score(y_test, y_test_pred),
    classification_report(y_test, y_test_pred),
)
print(f"Accuracy: {acc}", report, sep="\n")

Accuracy: 0.9044741039200873
              precision    recall  f1-score   support

         bad       0.91      0.90      0.90      6019
        good       0.90      0.91      0.90      5894

    accuracy                           0.90     11913
   macro avg       0.90      0.90      0.90     11913
weighted avg       0.90      0.90      0.90     11913



In [65]:
# Spot-check the fitted random forest on a few hand-written sentences.
sample_texts = [
    "chatgpt is helpful",
    "chatgpt is not very good",
    "chatgpt is slow",
    "the answers of chatgpt is crazy",
]

# One transform/predict call for the whole batch replaces four copy-pasted
# transform/predict pairs; each row is vectorised and classified independently.
sample_preds = rf.predict(vectorizer.transform(sample_texts))

for i, text in enumerate(sample_texts):
    # A one-item list and a length-1 slice keep the printed form
    # `['<text>'] ['<label>']` of the original per-sentence prints.
    print([text], sample_preds[i:i + 1])

['chatgpt is helpful'] ['good']
['chatgpt is not very good'] ['good']
['chatgpt is slow'] ['bad']
['the answers of chatgpt is crazy'] ['bad']


# Naive Bayes

In [45]:
from sklearn.naive_bayes import MultinomialNB

In [60]:
# Train a multinomial naive Bayes classifier with default settings on the same
# TF-IDF training features. fit() returns the estimator itself, so the name is
# bound to the fitted model directly.
nb = MultinomialNB().fit(X_train, y_train)
nb

MultinomialNB()

In [61]:
# Score the fitted naive Bayes model on the validation split: overall accuracy
# followed by the per-class precision/recall/F1 report.
y_val_pred = nb.predict(X_val)
acc, report = (
    accuracy_score(y_val, y_val_pred),
    classification_report(y_val, y_val_pred),
)
print(f"Accuracy: {acc}", report, sep="\n")

Accuracy: 0.776379821958457
              precision    recall  f1-score   support

         bad       0.97      0.57      0.72      4194
        good       0.70      0.98      0.81      4231

    accuracy                           0.78      8425
   macro avg       0.83      0.78      0.77      8425
weighted avg       0.83      0.78      0.77      8425



In [62]:
# Score the fitted naive Bayes model on the held-out test split: overall
# accuracy followed by the per-class precision/recall/F1 report.
y_test_pred = nb.predict(X_test)
acc, report = (
    accuracy_score(y_test, y_test_pred),
    classification_report(y_test, y_test_pred),
)
print(f"Accuracy: {acc}", report, sep="\n")

Accuracy: 0.7800721900444892
              precision    recall  f1-score   support

         bad       0.97      0.58      0.73      6019
        good       0.70      0.98      0.82      5894

    accuracy                           0.78     11913
   macro avg       0.83      0.78      0.77     11913
weighted avg       0.84      0.78      0.77     11913



In [63]:
# Spot-check the fitted naive Bayes model on a few hand-written sentences.
sample_texts = [
    "chatgpt is helpful",
    "chatgpt is not very good",
    "chatgpt is slow",
    "the answers of chatgpt is crazy",
]

# One transform/predict call for the whole batch replaces four copy-pasted
# transform/predict pairs; each row is vectorised and classified independently.
sample_preds = nb.predict(vectorizer.transform(sample_texts))

for i, text in enumerate(sample_texts):
    # A one-item list and a length-1 slice keep the printed form
    # `['<text>'] ['<label>']` of the original per-sentence prints.
    print([text], sample_preds[i:i + 1])

['chatgpt is helpful'] ['good']
['chatgpt is not very good'] ['good']
['chatgpt is slow'] ['bad']
['the answers of chatgpt is crazy'] ['bad']
