<a href="https://colab.research.google.com/github/yugetcodes/Cars24_Data_Analysis/blob/main/NLP_TextClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

In [None]:
df = pd.read_csv('hate_speech_balanced_data.csv')

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,class
0,0,i hate when black people try and be white so b...,0
1,1,eda the queers at aa lol gduenez httptcogzzrzj...,0
2,2,heauxmersimpson im jus tryna vaca away from th...,0
3,3,iycmicant get any work done if you keep showin...,1
4,4,eh bitch how about you worry about your own ps...,0


In [None]:
df = df.drop('Unnamed: 0', axis = 1)

In [None]:
df.shape

(57237, 2)

In [None]:
df.columns

Index(['tweet', 'class'], dtype='object')

In [None]:
df.duplicated().sum()

32639

In [None]:
df.isnull().sum()

Unnamed: 0,0
tweet,0
class,0


In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['class'], test_size=0.2, stratify = df['class'], random_state=42)

In [None]:
vectorizer = TfidfVectorizer(max_features=1000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC(),
    "Random Forest": RandomForestClassifier(),
    "DesicionTreeClassifier": DecisionTreeClassifier(),
    "Multinomial Naive Bayes": MultinomialNB(),
    "Bernoulli Naive Bayes": BernoulliNB(),
}

In [None]:
mnb_model = MultinomialNB()
mnb_model.fit(X_train_vec, y_train)
y_pred_mnb = mnb_model.predict(X_test_vec)

print("Multinomial Naive Bayes Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_mnb):.2f}")
print(classification_report(y_test, y_pred_mnb))
print("-" * 60)

Multinomial Naive Bayes Performance:
Accuracy: 0.77
              precision    recall  f1-score   support

           0       0.75      0.70      0.73      3816
           1       0.71      0.77      0.74      3816
           2       0.85      0.83      0.84      3816

    accuracy                           0.77     11448
   macro avg       0.77      0.77      0.77     11448
weighted avg       0.77      0.77      0.77     11448

------------------------------------------------------------


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

bernoulli_vectorizer = CountVectorizer(stop_words='english', binary=True, max_features=1000)
X_train_binary = bernoulli_vectorizer.fit_transform(X_train)
X_test_binary = bernoulli_vectorizer.transform(X_test)

bnb_model = BernoulliNB()
bnb_model.fit(X_train_binary, y_train)
y_pred_bnb = bnb_model.predict(X_test_binary)

print("Bernoulli Naive Bayes Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_bnb):.2f}")
print(classification_report(y_test, y_pred_bnb))

Bernoulli Naive Bayes Performance:
Accuracy: 0.78
              precision    recall  f1-score   support

           0       0.79      0.67      0.72      3816
           1       0.74      0.77      0.76      3816
           2       0.81      0.89      0.85      3816

    accuracy                           0.78     11448
   macro avg       0.78      0.78      0.78     11448
weighted avg       0.78      0.78      0.78     11448



In [None]:
from sklearn import tree
t_m = tree.DecisionTreeClassifier()
t_m.fit(X_train_vec, y_train)

In [None]:
y_pred = t_m.predict(X_test_vec)

print("DecisionTreeClassifier Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred))
print("-" * 60)

DecisionTreeClassifier Performance:
Accuracy: 0.94
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      3816
           1       0.99      0.84      0.91      3816
           2       0.93      0.98      0.95      3816

    accuracy                           0.94     11448
   macro avg       0.94      0.94      0.94     11448
weighted avg       0.94      0.94      0.94     11448

------------------------------------------------------------
DecisionTreeClassifier Performance:
Accuracy: 0.94
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      3816
           1       0.99      0.84      0.91      3816
           2       0.93      0.98      0.95      3816

    accuracy                           0.94     11448
   macro avg       0.94      0.94      0.94     11448
weighted avg       0.94      0.94      0.94     11448

------------------------------------------------------------


In [None]:
from sklearn.svm import SVC
sv = SVC(kernel='linear')
sv.fit(X_train_vec, y_train)

In [None]:
y_pred = sv.predict(X_test_vec)

print("SVC Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred))
print("-" * 60)

SVC Performance:
Accuracy: 0.82
              precision    recall  f1-score   support

           0       0.77      0.82      0.79      3816
           1       0.83      0.71      0.76      3816
           2       0.85      0.92      0.88      3816

    accuracy                           0.82     11448
   macro avg       0.82      0.82      0.81     11448
weighted avg       0.82      0.82      0.81     11448

------------------------------------------------------------


In [None]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=10)
model.fit(X_train_vec, y_train)