In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix, classification_report
# Load the labelled sentences, using the first CSV column as the row index.
df = pd.read_csv(filepath_or_buffer='labeled_data.csv', index_col=0)

# Quick sanity check: (rows, columns) followed by the first five records.
print(df.shape)
df.head(n=5)

(61282, 2)


Unnamed: 0,sentence,prediction
0,twitter prior musk takeover talking directly n...,1
1,article say imply states feature turns crashes...,1
2,og musk duck lives wall,1
3,dare_speak way great powerful musk obviously d...,1
4,cannot_wait finally excuse shower douche,1


In [2]:
# Drop every row that has a missing value in any column (mutates `df` in place),
# then display the remaining (rows, columns).
df.dropna(axis=0, how='any', inplace=True)
df.shape

(61278, 2)

In [21]:
# Share of each label as a percentage of all rows.
df['prediction'].value_counts().div(len(df)).mul(100)

1    90.332583
0     9.667417
Name: prediction, dtype: float64

In [3]:
# Hold out 10% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in both partitions; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['sentence'],
    df['prediction'],
    test_size=0.1,
    random_state=42,
    stratify=df['prediction'],
)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words token counts, vocabulary capped at 50,000 entries, stored as int8.
countvec = CountVectorizer(max_features=50000, dtype=np.int8)

# fit_transform() returns a SciPy sparse matrix. Wrap it in a sparse-backed
# DataFrame rather than calling .toarray(): a dense table of ~55k training rows
# by up to 50k int8 columns needs roughly 2.7 GB even though almost every cell
# is zero. The sparse frame keeps the same column names and default RangeIndex.
count_matrix = countvec.fit_transform(X_train)

# NOTE: X_train is rebound from raw text to the feature table, so re-run the
# train/test split cell before re-running this one.
X_train = pd.DataFrame.sparse.from_spmatrix(
    count_matrix,
    columns=countvec.get_feature_names_out(),
)

In [5]:
# Encode the held-out text with the vocabulary already fitted on the training
# split (transform only, no refit). The sparse result is wrapped in a
# sparse-backed DataFrame instead of being densified with .toarray(), which
# would allocate one byte per (row, vocabulary term) for a mostly-zero table.
# NOTE: X_test is rebound from raw text to features; re-run the split cell
# before re-running this one.
X_test = pd.DataFrame.sparse.from_spmatrix(
    countvec.transform(X_test),
    columns=countvec.get_feature_names_out(),
)

In [6]:
# Bernoulli naive Bayes with default hyper-parameters, fitted on the
# training features and labels.
model_NB = BernoulliNB()
model_NB.fit(X=X_train, y=y_train)

In [7]:
# Score the naive Bayes model on the held-out split and print
# per-class precision / recall / F1.
y_pred_NB = model_NB.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_NB))

              precision    recall  f1-score   support

           0       0.91      0.60      0.73       592
           1       0.96      0.99      0.98      5536

    accuracy                           0.96      6128
   macro avg       0.94      0.80      0.85      6128
weighted avg       0.95      0.96      0.95      6128



In [6]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree limited to depth 5, with a fixed seed so the fit is repeatable.
model_DCF = DecisionTreeClassifier(max_depth=5, random_state=0)
model_DCF.fit(X=X_train, y=y_train)

In [9]:
# Score the decision tree on the held-out split and print
# per-class precision / recall / F1.
y_pred_DCF = model_DCF.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_DCF))

              precision    recall  f1-score   support

           0       0.94      0.35      0.51       592
           1       0.94      1.00      0.97      5536

    accuracy                           0.94      6128
   macro avg       0.94      0.68      0.74      6128
weighted avg       0.94      0.94      0.92      6128



In [12]:
# Rows are the true labels, columns the decision tree's predicted labels.
print(confusion_matrix(y_true=y_test, y_pred=y_pred_DCF))

[[ 209  383]
 [  13 5523]]


In [13]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with each tree limited to depth 50 and a fixed seed,
# fitted on the training features and labels.
model_RFC = RandomForestClassifier(max_depth=50, random_state=0)

model_RFC.fit(X=X_train, y=y_train)

In [15]:
model_RFC = model_RFC.predict(X_test)
print(classification_report(y_test, model_RFC))

              precision    recall  f1-score   support

           0       0.96      0.33      0.49       592
           1       0.93      1.00      0.96      5536

    accuracy                           0.93      6128
   macro avg       0.94      0.66      0.73      6128
weighted avg       0.94      0.93      0.92      6128



In [16]:
print(confusion_matrix(y_test, model_RFC))

[[ 195  397]
 [   9 5527]]
