In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

import pickle
import time
import warnings
warnings.filterwarnings("ignore")

# Step 1: Data Loading and Exploration

Load the dataset

In [50]:
df = pd.read_csv('HateSpeech_Kenya.csv')

Display basic information

In [51]:
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(df.head())

Dataset shape: (48076, 5)
Columns: ['hate_speech', 'offensive_language', 'neither', 'Class', 'Tweet']
   hate_speech  offensive_language  neither  Class  \
0            0                   0        3      0   
1            0                   0        3      0   
2            0                   0        3      0   
3            0                   0        3      0   
4            0                   0        3      0   

                                               Tweet  
0  ['The political elite are in desperation. Ordi...  
1  ["Am just curious the only people who are call...  
2  ['USERNAME_3 the area politicians are the one ...  
3  ['War expected in Nakuru if something is not d...  
4  ['USERNAME_4 tells kikuyus activists that they...  


Check for missing values

In [52]:
print("\nMissing values:")
print(df.isnull().sum())


Missing values:
hate_speech           0
offensive_language    0
neither               0
Class                 0
Tweet                 0
dtype: int64


Class distribution

In [53]:
class_counts = df['Class'].value_counts()
print("\nClass distribution:")
print(class_counts)



Class distribution:
Class
0    36352
1     8543
2     3181
Name: count, dtype: int64


Map class values to readable labels

In [54]:
class_mapping = {
    0: "Neither",
    1: "Offensive",
    2: "Hate Speech"
}

df['class_label'] = df['Class'].map(class_mapping)



Visualize class distribution


In [55]:
plt.figure(figsize=(10, 6))
sns.countplot(x='class_label', data=df)
plt.title('Distribution of Classes')
plt.ylabel('Count')
plt.xlabel('Class')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('class_distribution.png')
plt.close()

# Step 2: Text Preprocessing


Download NLTK resources if needed


In [60]:
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('punkt')
    nltk.download('punkt_tab')
    nltk.download('stopwords')
    nltk.download('wordnet')



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Initialize lemmatizer and stopwords

In [57]:

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Clean and preprocess text data"""
    if not isinstance(text, str):
        return ""
    
    # Remove the list formatting if present (specific to this dataset)
    text = re.sub(r"^\['|'\]$", "", text)
    text = text.replace("\\\"", "")
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    
    # Remove usernames (specific to this dataset)
    text = re.sub(r'USERNAME_\d+', '', text)
    
    # Remove special characters and numbers
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Remove stopwords and lemmatize
    cleaned_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words and len(token) > 2]
    
    # Rejoin tokens
    cleaned_text = ' '.join(cleaned_tokens)
    
    return cleaned_text



Apply preprocessing to the Tweet column


In [None]:
try:
    df['processed_text'] = df['Tweet'].apply(preprocess_text)
except LookupError as e:
    print(f"NLTK resource error: {e}")
    print("Please run the nltk.download() commands above first")

Compare original and processed text


In [None]:
print("\nOriginal vs Processed text samples:")
for i in range(3):
    print(f"Original: {df['Tweet'].iloc[i]}")
    print(f"Processed: {df['processed_text'].iloc[i]}")
    print()