In [None]:
# Data Analysis
import pandas as pd
import numpy as np

# Data Visualization
from matplotlib import pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# NLP
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from wordcloud import WordCloud, STOPWORDS
import re

# Warning
import warnings
# Called with only the 'ignore' action, this suppresses every warning category
# for the rest of the session, including any deprecation notices emitted by
# the libraries used in the cells below.
warnings.filterwarnings('ignore')

In [None]:
# Read the labelled comments (latin-1 encoded CSV) and preview the first rows.
DATA_PATH = '/content/hate.csv'
df = pd.read_csv(DATA_PATH, encoding="latin-1")
print(f'data shape: {df.shape}')
df.head()

data shape: (41144, 2)


Unnamed: 0,comment,label
0,dalits are lowlives,N
1,gay people are a burden to our society,N
2,Arabs are not welcome,N
3,I'm not saying we should actually eliminate he...,N
4,bananas are for black people,N


In [None]:
# Per-label summary of the comment column (count, unique, top, freq).
df.groupby('label').describe()

Unnamed: 0_level_0,comment,comment,comment,comment
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
N,22158,22151,Blacks are such pondlife,5
O,36,36,"My teacher is Chinese and he is awful, no surp...",1
P,18950,18947,Politicians are crap,2


In [None]:
# Drop the 'O' label, which the summary above shows on only 36 rows, leaving
# the two labels 'N' and 'P'.
# .copy() makes `df` an independent frame, so the column assignments in later
# cells write to it directly rather than to a slice of the loaded frame.
df = df[df['label'] != 'O'].copy()
print(df.head())

df.groupby('label').describe()

                                             comment label
0                                dalits are lowlives     N
1             gay people are a burden to our society     N
2                              Arabs are not welcome     N
3  I'm not saying we should actually eliminate he...     N
4                       bananas are for black people     N


Unnamed: 0_level_0,comment,comment,comment,comment
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
N,22158,22151,Blacks are such pondlife,5
P,18950,18947,Politicians are crap,2


In [None]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder numbers the classes in sorted order, so the two remaining
# labels become 'N' -> 0 and 'P' -> 1.
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(df['label'])
df['label'] = encoded_labels

In [None]:
# Count of missing values in each column.
df.isnull().sum()

comment    0
label      0
dtype: int64

In [None]:
# Number of rows that repeat an earlier row in every column.
df.duplicated().sum()

0

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

In [None]:
# Re-check after drop_duplicates: the count of fully duplicated rows should be 0.
df.duplicated().sum()

0

In [None]:
# NOTE(review): the recorded output is (10000, 2), yet no visible cell reduces
# the roughly 41k rows loaded above. Presumably a sampling cell was run and
# later removed; confirm and restore it so Restart & Run All reproduces the
# recorded results.
df.shape

(10000, 2)

In [None]:
import nltk

# word_tokenize needs the Punkt tokenizer data. Older NLTK releases load it
# from the 'punkt' resource, while newer releases look for 'punkt_tab', so
# request both. nltk.download reports a failed fetch by returning False rather
# than raising, so asking for a resource that is not needed is harmless.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

# Built once and reused by every call: the stemmer does not depend on the
# text being processed, so re-creating it per comment is wasted work.
_STEMMER = PorterStemmer()


# Function to tokenize and clean the text
def tokenize_and_clean(text):
    """Lower-case, strip '@user' mentions, tokenize and stem one comment.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        Porter-stemmed tokens joined by single spaces. Tokens whose first
        character is not a word character (e.g. pure punctuation) are dropped.
    """
    # Changing case of the text to lower case, then removing '@user' mentions
    cleaned = re.sub('@user', '', text.lower())

    stems = [
        _STEMMER.stem(token)
        for token in word_tokenize(cleaned)
        # re.match anchors at the start of the token, so this keeps tokens
        # that begin with at least one \w character.
        if re.match(r'\w{1,}', token)
    ]
    return ' '.join(stems)

In [None]:
# Run every raw comment through tokenize_and_clean and store the result in a
# new 'transformed_comment' column.
df['transformed_comment'] = df['comment'].map(tokenize_and_clean)

In [None]:
# TF-IDF features computed from the cleaned comments.
# NOTE(review): the 'english' stop-word list is matched against tokens that
# were already stemmed, so stop words whose stem differs from the listed form
# are presumably not removed - confirm this is intended.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
corpus = df['transformed_comment']
X = tfidf_vectorizer.fit_transform(corpus)

In [None]:
# Target vector: the encoded label column as a NumPy array.
y = df['label'].to_numpy()

In [None]:
# Split the cleaned texts first, then fit TF-IDF on the training portion only.
# Fitting the vectorizer on the full corpus before splitting lets the
# vocabulary and IDF weights depend on the held-out comments, which leaks
# test-set information into the features.
text_train, text_test, y_train, y_test = train_test_split(
    df['transformed_comment'], y, test_size=0.2, random_state=8
)
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(8000, 10904) (2000, 10904) (8000,) (2000,)


In [None]:
# Functions to print scores
def _print_scores(split_name, y_act, y_pred, average='binary'):
    """Print accuracy and F1, both rounded to 3 decimals, for one data split."""
    acc = round(accuracy_score(y_act, y_pred), 3)
    f1 = round(f1_score(y_act, y_pred, average=average), 3)
    print(f'{split_name} Scores: Accuracy={acc}, F1-Score={f1}')


def training_scores(y_act, y_pred, average='binary'):
    """Print accuracy and F1 for training-set predictions.

    Parameters
    ----------
    y_act : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    average : str, default 'binary'
        Forwarded to ``f1_score``. The default matches the previous
        behaviour; pass 'weighted' to get a figure comparable with the
        grid-search cells, which report weighted F1.
    """
    _print_scores('Training', y_act, y_pred, average)


def validation_scores(y_act, y_pred, average='binary'):
    """Print accuracy and F1 for validation-set predictions.

    Takes the same parameters as ``training_scores``.
    """
    _print_scores('Validation', y_act, y_pred, average)

In [None]:
# Logistic Regression with default hyper-parameters (fit returns the estimator).
lr = LogisticRegression().fit(X_train, y_train)
y_train_pred, y_test_pred = lr.predict(X_train), lr.predict(X_test)
training_scores(y_train, y_train_pred)
validation_scores(y_test, y_test_pred)

Training Scores: Accuracy=0.784, F1-Score=0.787
Validation Scores: Accuracy=0.652, F1-Score=0.661


In [None]:

# Define the parameter grid to search
param_grid = {
    'penalty': ['l1', 'l2'],  # Regularization penalty
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Inverse of regularization strength
    'solver': ['liblinear', 'saga']  # Optimization algorithm
}

# Instantiate Logistic Regression model.
# Both solvers in the grid shuffle the data internally, so random_state is
# fixed to make the search and the selected estimator repeatable across runs.
lr = LogisticRegression(max_iter=1000, random_state=42)

# 5-fold cross-validated search, ranked by weighted F1, using all CPU cores.
grid_search = GridSearchCV(lr, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)

# Perform grid search
grid_search.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination and its cross-validated score.
for caption, value in (
    ("Best Parameters: ", grid_search.best_params_),
    ("Best F1-Score: ", grid_search.best_score_),
):
    print(caption, value)

Best Parameters:  {'C': 1, 'penalty': 'l1', 'solver': 'saga'}
Best F1-Score:  0.6421304026476147


In [None]:
# Take the best estimator found by the grid search and predict with it on the
# training and validation matrices.
best_lr = grid_search.best_estimator_
y_train_pred, y_test_pred = best_lr.predict(X_train), best_lr.predict(X_test)

In [None]:
# Training accuracy and weighted F1 of the tuned logistic regression.
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
train_f1_score = f1_score(y_true=y_train, y_pred=y_train_pred, average='weighted')
print("Training Scores: Accuracy={}, F1-Score={}".format(train_accuracy, train_f1_score))

Training Scores: Accuracy=0.712875, F1-Score=0.7123306802717211


In [None]:
# Validation accuracy and weighted F1 of the tuned logistic regression.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
test_f1_score = f1_score(y_true=y_test, y_pred=y_test_pred, average='weighted')
print("Validation Scores: Accuracy={}, F1-Score={}".format(test_accuracy, test_f1_score))

Validation Scores: Accuracy=0.6935, F1-Score=0.693338476460744


In [None]:
# Multinomial Naive Bayes with default settings.
mnb = MultinomialNB().fit(X_train, y_train)
y_train_pred, y_test_pred = mnb.predict(X_train), mnb.predict(X_test)
training_scores(y_train, y_train_pred)
validation_scores(y_test, y_test_pred)

Training Scores: Accuracy=0.797, F1-Score=0.795
Validation Scores: Accuracy=0.619, F1-Score=0.613


In [None]:
# Random Forest Classifier

# 'auto' is left out of max_features: for classifiers it was an alias of
# 'sqrt', so it only duplicated that candidate, and current scikit-learn
# releases reject it. 'log2' takes its place as a genuinely different option.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', 0.5]
}

rf = RandomForestClassifier(random_state=42)
# 5-fold cross-validated search, ranked by weighted F1, using all CPU cores.
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
grid_search.fit(X_train, y_train)
# Print best parameters and best score
print("Best Parameters: ", grid_search.best_params_)
print("Best F1-Score: ", grid_search.best_score_)


Best Parameters:  {'max_depth': 15, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 150}
Best F1-Score:  0.634275298381876


In [None]:
# Best random forest from the grid search, with its predictions on the
# training and validation matrices.
best_rf = grid_search.best_estimator_
y_train_pred, y_test_pred = best_rf.predict(X_train), best_rf.predict(X_test)


In [None]:
# Training accuracy and weighted F1 of the tuned random forest.
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
train_f1_score = f1_score(y_true=y_train, y_pred=y_train_pred, average='weighted')
print("Training Scores: Accuracy={}, F1-Score={}".format(train_accuracy, train_f1_score))

Training Scores: Accuracy=0.722, F1-Score=0.7206555144939829


In [None]:
# Validation accuracy and weighted F1 of the tuned random forest.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
test_f1_score = f1_score(y_true=y_test, y_pred=y_test_pred, average='weighted')
print("Validation Scores: Accuracy={}, F1-Score={}".format(test_accuracy, test_f1_score))

Validation Scores: Accuracy=0.668, F1-Score=0.6674819960022271


In [None]:
# Extreme Gradient Boosting Classifier (binary logistic objective, log-loss metric).
xgb_params = {'objective': 'binary:logistic', 'eval_metric': 'logloss'}
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)
y_train_pred, y_test_pred = xgb.predict(X_train), xgb.predict(X_test)
training_scores(y_train, y_train_pred)
validation_scores(y_test, y_test_pred)

Training Scores: Accuracy=0.758, F1-Score=0.739
Validation Scores: Accuracy=0.696, F1-Score=0.671


In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

In [None]:
# One fresh, unfitted instance of each Naive Bayes variant.
gnb, mnb, bnb = GaussianNB(), MultinomialNB(), BernoulliNB()

In [None]:
# Bernoulli Naive Bayes Classifier
bnb.fit(X_train, y_train)
y_train_pred = bnb.predict(X_train)
y_test_pred = bnb.predict(X_test)
training_scores(y_train, y_train_pred)
validation_scores(y_test, y_test_pred)

Training Scores: Accuracy=0.726, F1-Score=0.669
Validation Scores: Accuracy=0.58, F1-Score=0.484


In [None]:
# Support Vector Classifier with a sigmoid kernel and gamma fixed at 1.0.
svc = SVC(kernel='sigmoid', gamma=1.0).fit(X_train, y_train)
y_train_pred, y_test_pred = svc.predict(X_train), svc.predict(X_test)
training_scores(y_train, y_train_pred)
validation_scores(y_test, y_test_pred)

Training Scores: Accuracy=0.723, F1-Score=0.695
Validation Scores: Accuracy=0.677, F1-Score=0.646


In [None]:

# Unfitted Support Vector Classifier that the grid search will tune.
svc = SVC()

# Candidate values for the search: regularization strength C, kernel type,
# and the kernel coefficient gamma.
param_grid = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf', 'poly'],
    gamma=['scale', 'auto'],
)

In [None]:
# 5-fold cross-validated search over the SVC grid, ranked by weighted F1 and
# run on all CPU cores.
grid_search = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination and its cross-validated score.
for caption, value in (
    ("Best Parameters: ", grid_search.best_params_),
    ("Best F1-Score: ", grid_search.best_score_),
):
    print(caption, value)

Best Parameters:  {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Best F1-Score:  0.6296134522962906


In [None]:
# Keep the best estimator found by the most recent grid search (the SVC one).
best_svc = grid_search.best_estimator_

In [None]:
# Predict on the training and validation matrices with the tuned SVC.
y_train_pred, y_test_pred = best_svc.predict(X_train), best_svc.predict(X_test)

In [None]:
# Training accuracy and weighted F1 of the tuned SVC.
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
train_f1_score = f1_score(y_true=y_train, y_pred=y_train_pred, average='weighted')
print("Training Scores: Accuracy={}, F1-Score={}".format(train_accuracy, train_f1_score))

Training Scores: Accuracy=0.721375, F1-Score=0.721018473745245


In [None]:
# Validation accuracy and weighted F1 of the tuned SVC.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
test_f1_score = f1_score(y_true=y_test, y_pred=y_test_pred, average='weighted')
print("Validation Scores: Accuracy={}, F1-Score={}".format(test_accuracy, test_f1_score))

Validation Scores: Accuracy=0.683, F1-Score=0.6828541216486594


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited decision tree. random_state is fixed because the tree permutes
# features at every split, so equally good splits could otherwise be resolved
# differently from one run to the next.
dtc = DecisionTreeClassifier(max_depth=5, random_state=42)
dtc.fit(X_train, y_train)
y_train_pred = dtc.predict(X_train)
y_test_pred = dtc.predict(X_test)
training_scores(y_train, y_train_pred)
validation_scores(y_test, y_test_pred)

Training Scores: Accuracy=0.574, F1-Score=0.174
Validation Scores: Accuracy=0.572, F1-Score=0.177


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees: 50 estimators, seeded.
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=2).fit(X_train, y_train)
y_train_pred, y_test_pred = gbdt.predict(X_train), gbdt.predict(X_test)
training_scores(y_train, y_train_pred)
validation_scores(y_test, y_test_pred)

Training Scores: Accuracy=0.596, F1-Score=0.277
Validation Scores: Accuracy=0.583, F1-Score=0.251


In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost: 50 estimators, seeded.
abc = AdaBoostClassifier(n_estimators=50, random_state=2).fit(X_train, y_train)
y_train_pred, y_test_pred = abc.predict(X_train), abc.predict(X_test)
training_scores(y_train, y_train_pred)
validation_scores(y_test, y_test_pred)

Training Scores: Accuracy=0.644, F1-Score=0.664
Validation Scores: Accuracy=0.639, F1-Score=0.659


In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra Trees: 50 estimators, seeded.
etc = ExtraTreesClassifier(n_estimators=50, random_state=2).fit(X_train, y_train)
y_train_pred, y_test_pred = etc.predict(X_train), etc.predict(X_test)
training_scores(y_train, y_train_pred)
validation_scores(y_test, y_test_pred)

Training Scores: Accuracy=0.997, F1-Score=0.997
Validation Scores: Accuracy=0.613, F1-Score=0.605


In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging of 500 decision trees, each trained on a bootstrap sample of 25% of
# the training rows.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases, and the
# positional form is accepted by both the old and the new signature.
bc = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, max_samples=0.25, bootstrap=True, random_state=42)
bc.fit(X_train, y_train)
y_train_pred = bc.predict(X_train)
y_test_pred = bc.predict(X_test)
training_scores(y_train, y_train_pred)
validation_scores(y_test, y_test_pred)

Training Scores: Accuracy=0.812, F1-Score=0.804
Validation Scores: Accuracy=0.638, F1-Score=0.615


In [None]:
# Bagging of 500 logistic-regression models, each trained on a bootstrap
# sample of 25% of the training rows.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases, and the
# positional form is accepted by both the old and the new signature.
bc = BaggingClassifier(LogisticRegression(), n_estimators=500, max_samples=0.25, bootstrap=True, random_state=42)
bc.fit(X_train, y_train)
y_train_pred = bc.predict(X_train)
y_test_pred = bc.predict(X_test)
training_scores(y_train, y_train_pred)
validation_scores(y_test, y_test_pred)

Training Scores: Accuracy=0.729, F1-Score=0.735
Validation Scores: Accuracy=0.65, F1-Score=0.66


In [None]:
# Bagging of 500 random forests, each trained on a bootstrap sample of 25% of
# the training rows.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases, and the
# positional form is accepted by both the old and the new signature.
bc = BaggingClassifier(RandomForestClassifier(), n_estimators=500, max_samples=0.25, bootstrap=True, random_state=42)
bc.fit(X_train, y_train)
y_train_pred = bc.predict(X_train)
y_test_pred = bc.predict(X_test)
training_scores(y_train, y_train_pred)
validation_scores(y_test, y_test_pred)

Training Scores: Accuracy=0.825, F1-Score=0.822
Validation Scores: Accuracy=0.626, F1-Score=0.623


In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import ExtraTreesClassifier
# Voting Classifier: the three member models.
# The random forest is seeded like the other two members so that the ensemble
# gives the same result on every run.
rf = RandomForestClassifier(random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)

In [None]:
# Soft-voting ensemble over the three member models defined above.
ensemble_members = [('rfc', rf), ('et', etc), ('bc', bc)]
voting = VotingClassifier(estimators=ensemble_members, voting='soft')
voting.fit(X_train, y_train)

In [None]:
# Validation accuracy and binary F1 of the voting ensemble.
# NOTE(review): the recorded accuracy is not a multiple of 1/2000, so it was
# presumably produced with a different split than the 2000-row test set
# printed earlier - re-run to confirm.
y_pred = voting.predict(X_test)
voting_accuracy = accuracy_score(y_test, y_pred)
voting_f1 = f1_score(y_test, y_pred)
print("Accuracy", voting_accuracy)
print("f1", voting_f1)

Accuracy 0.6085158150851582
f1 0.5563826854149435


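Overall, the models perform only moderately: validation accuracy ranges from about 0.57 to 0.70, and several models score far higher on the training set than on validation (for example, Extra Trees reaches 0.997 in training but 0.613 on validation), which points to overfitting. The tuned Logistic Regression (validation weighted F1 ≈ 0.69), the tuned Support Vector Classifier (≈ 0.68), XGBoost (accuracy ≈ 0.70, F1 ≈ 0.67) and the tuned Random Forest (weighted F1 ≈ 0.67) achieve the highest validation scores, while the Voting Classifier (accuracy ≈ 0.61, F1 ≈ 0.56) does not improve on them. Note that the tuned models report weighted F1 whereas the other cells report binary F1, so the F1 values are not strictly comparable across all models.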