In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [2]:
df = pd.read_csv('train.csv')
df.head()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r"[^a-z\s]", "", text)  
    text = re.sub(r"\s+", " ", text).strip()
    return text

x = df['tweet'].apply(clean_text)
y = df['class']

In [3]:
vectorizer = CountVectorizer(binary=True, stop_words='english')
x_vectorized = vectorizer.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x_vectorized, y, test_size=0.2, random_state=42)
model = BernoulliNB()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       290
           1       0.82      0.99      0.90      3832
           2       0.87      0.35      0.50       835

    accuracy                           0.83      4957
   macro avg       0.56      0.45      0.47      4957
weighted avg       0.78      0.83      0.78      4957



In [4]:
logistic_model_cv = LogisticRegression(max_iter=1000, class_weight='balanced')
logistic_model_cv.fit(x_train, y_train)
y_pred_logistic_cv = logistic_model_cv.predict(x_test)
print(classification_report(y_test, y_pred_logistic_cv))

              precision    recall  f1-score   support

           0       0.36      0.54      0.43       290
           1       0.96      0.89      0.93      3832
           2       0.79      0.93      0.86       835

    accuracy                           0.88      4957
   macro avg       0.71      0.79      0.74      4957
weighted avg       0.90      0.88      0.89      4957

