In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

import pickle
import time
import warnings
# Install a blanket "ignore" filter for every warning category.
# NOTE(review): this also hides any warnings that libraries emit in later cells
# (e.g. solver or deprecation notices) — consider narrowing it to specific categories.
warnings.simplefilter("ignore")

# Step 1: Data Loading and Exploration

Load the dataset

In [82]:
# Read the labelled tweets CSV (path is relative to the notebook's working directory).
csv_path = 'HateSpeech_Kenya.csv'
df = pd.read_csv(csv_path)

Display basic information

In [83]:
# Summarise the frame: its dimensions, its column names, then its first rows.
for summary_item in (
    f"Dataset shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    df.head(),
):
    print(summary_item)

Dataset shape: (48076, 5)
Columns: ['hate_speech', 'offensive_language', 'neither', 'Class', 'Tweet']
   hate_speech  offensive_language  neither  Class  \
0            0                   0        3      0   
1            0                   0        3      0   
2            0                   0        3      0   
3            0                   0        3      0   
4            0                   0        3      0   

                                               Tweet  
0  ['The political elite are in desperation. Ordi...  
1  ["Am just curious the only people who are call...  
2  ['USERNAME_3 the area politicians are the one ...  
3  ['War expected in Nakuru if something is not d...  
4  ['USERNAME_4 tells kikuyus activists that they...  


Check for missing values

In [84]:
# Count null entries per column and print them under a heading.
missing_per_column = df.isnull().sum()
print("\nMissing values:", missing_per_column, sep="\n")


Missing values:
hate_speech           0
offensive_language    0
neither               0
Class                 0
Tweet                 0
dtype: int64


Class distribution

In [85]:
# Tally how many rows carry each numeric class id (most frequent class first).
class_counts = df.loc[:, 'Class'].value_counts()
print("\nClass distribution:", class_counts, sep="\n")



Class distribution:
Class
0    36352
1     8543
2     3181
Name: count, dtype: int64


Map class values to readable labels

In [86]:
# Readable names keyed by numeric class id: 0 -> "Neither", 1 -> "Offensive", 2 -> "Hate Speech".
class_mapping = dict(enumerate(("Neither", "Offensive", "Hate Speech")))

# Look up each row's class id to build the human-readable label column.
readable_labels = df['Class'].map(class_mapping)
df['class_label'] = readable_labels



Visualize class distribution


In [87]:
# Bar chart of how many tweets fall into each readable class; written to disk, not shown inline.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='class_label', data=df, ax=ax)
ax.set_title('Distribution of Classes')
ax.set_ylabel('Count')
ax.set_xlabel('Class')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig('class_distribution.png')
plt.close(fig)

# Step 2: Text Preprocessing


Download NLTK resources if needed


In [88]:
# Map each NLTK package id (as accepted by nltk.download) to the path(s) under
# which nltk.data.find can locate it once installed.
# 'punkt_tab' is checked explicitly: the original guard never looked for it, so
# it was only downloaded when one of the other three resources happened to be missing.
# NOTE(review): some installs appear to keep WordNet only as corpora/wordnet.zip,
# so the archive path is accepted as well — confirm against the NLTK version in use.
nltk_packages = {
    'punkt': ('tokenizers/punkt',),
    'punkt_tab': ('tokenizers/punkt_tab',),
    'stopwords': ('corpora/stopwords',),
    'wordnet': ('corpora/wordnet', 'corpora/wordnet.zip'),
}

# Check every package on its own so that one missing resource does not trigger
# a re-download of all the others, and so that no missing resource is skipped.
for package_id, lookup_paths in nltk_packages.items():
    for lookup_path in lookup_paths:
        try:
            nltk.data.find(lookup_path)
        except LookupError:
            continue  # not available under this path; try the next candidate
        break  # found: nothing to download for this package
    else:
        # None of the candidate paths resolved, so fetch the package.
        nltk.download(package_id)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Initialize lemmatizer and stopwords

In [89]:

# English stopwords held in a set for fast membership tests during token filtering.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

# Shared WordNet lemmatizer instance reused for every tweet.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Clean a raw tweet and return its lemmatized tokens joined by single spaces.

    Steps, in order: strip the stringified-list wrapper, remove ``USERNAME_<n>``
    placeholders, lowercase, remove URLs, remove non-word characters and digits,
    tokenize, drop stopwords and tokens of two characters or fewer, lemmatize.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value yields an empty string.

    Returns
    -------
    str
        The cleaned text; may be empty if every token was filtered out.
    """
    if not isinstance(text, str):
        return ""

    # Remove the list formatting if present (specific to this dataset).
    # Tweets are stored as "['...']" or '["..."]', so accept either quote style.
    text = re.sub(r"^\[['\"]|['\"]\]$", "", text)
    text = text.replace("\\\"", "")

    # Remove username placeholders (specific to this dataset).
    # This must run BEFORE lowercasing: the pattern is upper-case, so after
    # text.lower() it would never match and a stray "username_" token would
    # survive the later digit/punctuation stripping.
    text = re.sub(r'USERNAME_\d+', '', text)

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)

    # Remove special characters and numbers
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)

    # Tokenize text
    tokens = word_tokenize(text)

    # Remove stopwords and very short tokens, then lemmatize what is left.
    # The length check is applied to the token before lemmatization.
    cleaned_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words and len(token) > 2
    ]

    # Rejoin tokens
    return ' '.join(cleaned_tokens)



Apply preprocessing to the Tweet column


In [None]:
# Run every raw tweet through preprocess_text and keep the result in a new column.
raw_tweets = df['Tweet']
df['processed_text'] = raw_tweets.map(preprocess_text)



Compare original and processed text


In [None]:
# Show the first three tweets next to their cleaned versions, one blank line between pairs.
print("\nOriginal vs Processed text samples:")
for i in range(3):
    print(
        f"Original: {df['Tweet'].iloc[i]}",
        f"Processed: {df['processed_text'].iloc[i]}",
        "",
        sep="\n",
    )

# Step 3: Feature Engineering and Data Split


In [None]:
# Progress banner marking the start of the feature-engineering stage.
step_3_banner = "\nStep 3: Creating features and splitting data..."
print(step_3_banner)



Check for and remove empty processed texts


In [None]:
# Boolean mask of rows whose processed text is empty once surrounding whitespace is stripped.
empty_texts = df['processed_text'].str.strip().eq('')
print(f"Number of empty texts after preprocessing: {sum(empty_texts)}")

# Keep only the non-empty rows and renumber the index from zero.
df = df.loc[~empty_texts].reset_index(drop=True)



Split data into features and target


In [None]:
# Features are the cleaned tweets; the target is the numeric class id rather than the text label.
X, y = df['processed_text'], df['Class']



Split into training and testing sets


In [None]:
# Hold out 20% of the rows for testing, stratified on the target, with a fixed seed.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)



Convert text to numerical features using TF-IDF


In [None]:
# Vectorizer limited to 5000 features, ignoring terms seen in fewer than 5 training documents.
tfidf_settings = {'max_features': 5000, 'min_df': 5}
tfidf_vectorizer = TfidfVectorizer(**tfidf_settings)

# Fit on the training split only, then reuse the fitted vectorizer for the test split.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(
    f"Training features shape: {X_train_tfidf.shape}",
    f"Testing features shape: {X_test_tfidf.shape}",
    sep="\n",
)



Preview 20 words from the fitted TF-IDF vocabulary


In [None]:
# Rank vocabulary terms by their mean TF-IDF weight over the training documents.
# vocabulary_ only maps each term to its column index, and the vectorizer assigns
# those indices in sorted term order, so sorting by index (as before) merely
# listed the alphabetically first terms rather than the highest-weighted ones.
feature_names = tfidf_vectorizer.get_feature_names_out()

# .mean(axis=0) on a sparse matrix yields a 1 x n_features result; flatten it to 1-D.
mean_tfidf = np.asarray(X_train_tfidf.mean(axis=0)).ravel()

top_words = (
    pd.DataFrame({
        'Word': feature_names,
        'Index': np.arange(len(feature_names)),  # column index, kept for compatibility
        'MeanTfidf': mean_tfidf,
    })
    .sort_values('MeanTfidf', ascending=False)
    .head(20)
)
print("\nTop TF-IDF vocabulary words:")
print(top_words['Word'].tolist())

# Step 4: Model Building and Evaluation


In [None]:
# Progress banner marking the start of the modelling stage.
step_4_banner = "\nStep 4: Building and evaluating models..."
print(step_4_banner)



Define models to test


In [None]:
# One seed shared by every estimator that accepts it, so that re-running the
# notebook trains the same models. Previously only the random forest was seeded,
# while LinearSVC's dual solver shuffles the data using its own random state.
model_seed = 42

# Candidate classifiers keyed by display name. class_weight='balanced' is set on
# the estimators that support it; MultinomialNB takes no seed or class weights.
models = {
    'Logistic Regression': LogisticRegression(
        max_iter=1000, C=1, class_weight='balanced', random_state=model_seed
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=100, random_state=model_seed, class_weight='balanced'
    ),
    'Linear SVM': LinearSVC(
        C=1, class_weight='balanced', max_iter=10000, random_state=model_seed
    ),
    'Multinomial NB': MultinomialNB(alpha=0.1)
}



Train and evaluate each model


In [None]:
# Fit every candidate model, score it on the held-out split, and collect the
# metrics per model name. A confusion-matrix image is written to disk for each model.
results = {}

# Fix the label order once so the classification report, the confusion-matrix
# rows/columns and the heatmap tick labels are guaranteed to line up, even if a
# class happens to be absent from the predictions.
class_ids = list(class_mapping.keys())
class_names = list(class_mapping.values())

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Time only the fit call, so "Training time" no longer includes prediction
    # and metric computation. perf_counter is a monotonic clock meant for durations.
    start_time = time.perf_counter()
    model.fit(X_train_tfidf, y_train)
    training_time = time.perf_counter() - start_time

    # Make predictions on the held-out split
    y_pred = model.predict(X_test_tfidf)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(
        y_test, y_pred, labels=class_ids, target_names=class_names
    )
    conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)

    # Store results
    results[name] = {
        'accuracy': accuracy,
        'report': report,
        'confusion_matrix': conf_matrix,
        'training_time': training_time
    }

    # Print results
    print(f"{name} Accuracy: {accuracy:.4f}")
    print(f"Training time: {training_time:.2f} seconds")
    print(f"Classification Report:\n{report}")

    # Plot the confusion matrix on its own figure and close it afterwards so
    # figures do not accumulate in memory across loop iterations.
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names,
                ax=ax)
    ax.set_title(f'Confusion Matrix - {name}')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.tight_layout()
    fig.savefig(f'confusion_matrix_{name.replace(" ", "_")}.png')
    plt.close(fig)



Compare model accuracies


In [None]:
# Collect each model's test accuracy in the order the models were evaluated.
model_names = list(results)
accuracies = [results[model_name]['accuracy'] for model_name in model_names]

# Bar chart of accuracy per model on a fixed 0-1 scale; written to disk, not shown inline.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(model_names, accuracies)
ax.set_title('Model Accuracy Comparison')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Model')
ax.set_ylim(0, 1)
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig('model_comparison.png')
plt.close(fig)



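Identify the best model by test-set accuracy. Because the classes are heavily imbalanced (about 76% of the tweets are "Neither"), also compare the per-class precision and recall in the classification reports above before settling on a model.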


In [None]:
# Pick the entry with the highest stored accuracy; on ties the earliest-evaluated model wins.
best_model_name = max(results.items(), key=lambda entry: entry[1]['accuracy'])[0]
print(f"\nBest model: {best_model_name} with accuracy {results[best_model_name]['accuracy']:.4f}")