In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [2]:
# Load the tweet dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('hate_speech_balanced_data.csv')

In [3]:
# Preview the first five rows to see which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,class
0,0,i hate when black people try and be white so b...,0
1,1,eda the queers at aa lol gduenez httptcogzzrzj...,0
2,2,heauxmersimpson im jus tryna vaca away from th...,0
3,3,iycmicant get any work done if you keep showin...,1
4,4,eh bitch how about you worry about your own ps...,0


In [4]:
# NOTE(review): identical to the preview in the previous cell and run before any
# change to df — likely redundant.
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,class
0,0,i hate when black people try and be white so b...,0
1,1,eda the queers at aa lol gduenez httptcogzzrzj...,0
2,2,heauxmersimpson im jus tryna vaca away from th...,0
3,3,iycmicant get any work done if you keep showin...,1
4,4,eh bitch how about you worry about your own ps...,0


In [5]:
# The preview above shows two index-like columns, 'Unnamed: 0.1' and 'Unnamed: 0'.
# Drop both so that only 'tweet' and 'class' remain: removing just one of them
# leaves a third column behind, which contradicts the shape/columns reported
# below and (presumably being unique per row) would hide duplicate tweets from
# df.duplicated().
# errors='ignore' keeps this cell safe to re-run once the columns are gone.
df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')

In [6]:
# (rows, columns) of the frame at this point.
df.shape

(57237, 2)

In [7]:
# Column labels currently in the frame.
df.columns

Index(['tweet', 'class'], dtype='object')

In [8]:
# Number of rows that exactly repeat an earlier row (all columns equal).
df.duplicated().sum()

32639

In [9]:
# Missing-value count for each column.
df.isnull().sum()

tweet    0
class    0
dtype: int64

In [10]:
# Hold out 20% of the tweets for evaluation. Stratifying on the label keeps the
# class proportions equal in both partitions, and the fixed seed makes the split
# repeatable.
# NOTE(review): df.duplicated().sum() above reports 32639 duplicate rows and they
# are still present here, so identical tweets can land in both the train and the
# test partition and inflate the scores — consider de-duplicating first or using
# a group-aware split.
X_train, X_test, y_train, y_test = train_test_split(
    df['tweet'],
    df['class'],
    test_size=0.2,
    stratify=df['class'],
    random_state=42,
)

In [11]:
# TF-IDF features with the vocabulary capped at 1000 terms.
vectorizer = TfidfVectorizer(max_features=1000)
# Learn the vocabulary and IDF weights from the training tweets only, and
# vectorise those tweets in the same pass.
X_train_vec = vectorizer.fit_transform(X_train)
# Re-use the fitted vocabulary for the test tweets (transform only, no re-fit)
# so both matrices share the same feature columns.
X_test_vec = vectorizer.transform(X_test)

Let's break down what each part of this code does:

### 1. `vectorizer = TfidfVectorizer(max_features=1000)`

- **`TfidfVectorizer`**: 
  - This is a feature extraction method from the `scikit-learn` library, used to convert a collection of raw text documents into a matrix of TF-IDF (Term Frequency-Inverse Document Frequency) features.
  - **TF-IDF** is a numerical representation that reflects how important a word is in a document relative to a collection of documents. It combines two statistics:
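    - **Term Frequency (TF)**: How often a word appears in a document. The textbook definition divides the count by the total number of words in that document; scikit-learn uses the raw count and instead L2-normalizes each document vector afterwards (`norm='l2'` by default).
    - **Inverse Document Frequency (IDF)**: The logarithm of the total number of documents divided by the number of documents that contain the word. This helps to reduce the weight of commonly occurring words. With scikit-learn's default `smooth_idf=True`, the exact formula is $\text{idf}(t) = \ln\frac{1 + n}{1 + \text{df}(t)} + 1$, where $n$ is the number of documents and $\text{df}(t)$ is the number of documents containing term $t$.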
  - **`max_features=1000`**:
    - This parameter limits the vocabulary to the 1,000 terms with the highest term frequency across the corpus (the selection is by raw frequency, not by TF-IDF score).
    - This is often done to reduce dimensionality and remove less important or very rare words that might not contribute significantly to the model.

### 2. `X_train_vec = vectorizer.fit_transform(X_train)`

- **`fit_transform(X_train)`**:
  - **`fit`**: The vectorizer first learns the vocabulary of the training data (`X_train`) — the 1,000 most frequent terms — together with the IDF weight of each term.
  - **`transform`**: The text in the training data is then transformed into a matrix of TF-IDF features. Each document (e.g., each tweet) is converted into a vector where each element corresponds to a TF-IDF score of a word.
  - The result, `X_train_vec`, is a sparse matrix where rows represent documents (tweets), and columns represent the TF-IDF score of each word in the selected vocabulary.

### 3. `X_test_vec = vectorizer.transform(X_test)`

- **`transform(X_test)`**:
  - Here, the vectorizer transforms the test data (`X_test`) into the same TF-IDF feature space learned from the training data.
  - It uses the same vocabulary (top 1,000 words) identified during training and converts each test document into a vector of TF-IDF scores.
  - This ensures that the test data is represented in the same feature space as the training data, allowing for consistent model evaluation.

### Summary

- The `TfidfVectorizer` converts raw text data into numerical feature vectors using the TF-IDF approach, keeping only the 1,000 most frequent terms (`max_features=1000`).
- The `fit_transform(X_train)` method fits the vectorizer to the training data and transforms it into TF-IDF vectors.
- The `transform(X_test)` method then converts the test data into TF-IDF vectors using the same vocabulary, ensuring that the model can evaluate the test data consistently.

These steps are crucial for preparing text data for machine learning models.

In [12]:
# Candidate classifiers, all fitted on the same TF-IDF features so that their
# scores can be compared directly. Library defaults are used except where a
# seed is needed for reproducibility.
models = {
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC(),
    # Random forests are stochastic (bootstrap samples, random feature subsets);
    # seed with the same value as the train/test split so a re-run reproduces
    # the reported metrics.
    "Random Forest": RandomForestClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Multinomial Naive Bayes": MultinomialNB(),
    "Bernoulli Naive Bayes": BernoulliNB(),
}

# Fit each model on the training split and print its held-out accuracy and
# per-class precision/recall/F1, separated by a rule for readability.
for name, model in models.items():
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))
    print("-" * 60)

In [14]:
# Train Multinomial Naive Bayes on its own (TF-IDF features) and keep the fitted
# estimator and its test-set predictions under dedicated names.
mnb_model = MultinomialNB().fit(X_train_vec, y_train)  # fit() returns the estimator itself
y_pred_mnb = mnb_model.predict(X_test_vec)

# One print call, one item per line: heading, accuracy, per-class report, rule.
print(
    "Multinomial Naive Bayes Performance:",
    f"Accuracy: {accuracy_score(y_test, y_pred_mnb):.2f}",
    classification_report(y_test, y_pred_mnb),
    "-" * 60,
    sep="\n",
)

Multinomial Naive Bayes Performance:
Accuracy: 0.77
              precision    recall  f1-score   support

           0       0.75      0.70      0.73      3816
           1       0.71      0.77      0.74      3816
           2       0.85      0.83      0.84      3816

    accuracy                           0.77     11448
   macro avg       0.77      0.77      0.77     11448
weighted avg       0.77      0.77      0.77     11448

------------------------------------------------------------


In [16]:
# Bernoulli Naive Bayes models each term as present/absent, so it gets its own
# binary bag-of-words features (English stop words removed, vocabulary capped at
# 1000 terms) rather than the TF-IDF weights used for the other models.
# CountVectorizer is imported with the other dependencies in the first cell.
bernoulli_vectorizer = CountVectorizer(stop_words='english', binary=True, max_features=1000)
# Learn the vocabulary from the training tweets only, then apply it to the test tweets.
X_train_binary = bernoulli_vectorizer.fit_transform(X_train)
X_test_binary = bernoulli_vectorizer.transform(X_test)

# Fit on the binary features and predict the held-out tweets.
bnb_model = BernoulliNB()
bnb_model.fit(X_train_binary, y_train)
y_pred_bnb = bnb_model.predict(X_test_binary)

print("Bernoulli Naive Bayes Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_bnb):.2f}")
print(classification_report(y_test, y_pred_bnb))

Bernoulli Naive Bayes Performance:
Accuracy: 0.78
              precision    recall  f1-score   support

           0       0.79      0.67      0.72      3816
           1       0.74      0.77      0.76      3816
           2       0.81      0.89      0.85      3816

    accuracy                           0.78     11448
   macro avg       0.78      0.78      0.78     11448
weighted avg       0.78      0.78      0.78     11448

