Imports

In [2]:
import re
import string
import warnings

import joblib
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

Import the Dataset

**Note: Make sure to run the imports cell above first!**

In [None]:
# Load the raw tweets; the first CSV column is used as the DataFrame index.
try:
    data = pd.read_csv("twitter.csv", index_col=0)
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook exit() asks the kernel
    # to shut down, which discards all state and hides the real cause. A raised
    # exception stops "Run All" at this cell and leaves a readable traceback.
    raise FileNotFoundError("Error: twitter.csv not found") from err
except pd.errors.ParserError as err:
    raise pd.errors.ParserError("Error: Failed to parse twitter.csv") from err

# Only reached when the read succeeded.
print("CSV loaded successfully!")
print("Columns:", data.columns.tolist())
print("Shape:", data.shape)

Check Missing Values and Preview the Data

In [None]:
# Per-column count of missing values, then a preview of the first five rows.
print("Missing values:\n", data.isna().sum())
print("\nFirst few rows:", data.head(n=5), sep="\n")

Map Columns for Hate Speech

In [6]:
def map_labels(df, column='class'):
    """Add a human-readable ``labels`` column derived from numeric class codes.

    Codes are translated as 0 -> "Hate Speech", 1 -> "Offensive Language",
    2 -> "Normal". Any other value ends up as NaN, because ``Series.map``
    leaves keys that are absent from the mapping as missing.

    Args:
        df: DataFrame containing the class codes. It is modified in place
            (the ``labels`` column is added or overwritten).
        column: Name of the column that holds the numeric codes.

    Returns:
        The same DataFrame object, now carrying the ``labels`` column.

    Raises:
        KeyError: If ``column`` is not a column of ``df``.
    """
    if column not in df.columns:
        # Raise rather than exit(): terminating the interpreter/kernel from a
        # helper gives the caller no chance to handle or even see the error.
        raise KeyError(f"Error: {column} column not found")

    code_to_label = {0: "Hate Speech", 1: "Offensive Language", 2: "Normal"}
    df['labels'] = df[column].map(code_to_label)
    return df

# Attach the text labels derived from the numeric 'class' column.
data = map_labels(data, column='class')

Display the Mapped Labels

In [None]:
# Spot-check the first ten tweets side by side with their mapped labels.
print(data.loc[:, ['tweet', 'labels']].head(n=10))

Select Relevant Columns

In [None]:
# Keep only what the model needs: the tweet text and its label.
required_columns = ['tweet', 'labels']
missing_columns = [name for name in required_columns if name not in data.columns]
if missing_columns:
    # Raise instead of exit(): a notebook should stop at the failing cell with a
    # traceback, not terminate the kernel. The message names what is missing.
    raise KeyError(f"Error: Required columns missing: {missing_columns}")

# .copy() detaches the narrowed frame from the one it was sliced from, so the
# later in-place assignment to data['tweet'] writes to an independent frame.
data = data[required_columns].copy()
print("Selected data shape:", data.shape)
print(data.head())

Define the Text-Cleaning Function

In [None]:
# Fetch the NLTK stop-word corpus, then build the two module-level helpers
# that clean() reads: the Porter stemmer and the English stop-word set.
nltk.download('stopwords')
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean(text):
    """Normalise a raw tweet into a space-separated string of stemmed tokens.

    Processing order: lower-case; remove ``[bracketed]`` spans, URLs,
    ``<tag>``-like spans and ASCII punctuation; turn newlines into spaces;
    remove every token that contains a digit; drop tokens found in the
    module-level ``stop_words`` set; stem the rest with the module-level
    ``stemmer``.

    Args:
        text: Value to clean; it is coerced with ``str()`` first, so
            non-string input (e.g. NaN) does not raise.

    Returns:
        The cleaned text, or the literal string ``"empty"`` when no token
        survives cleaning.
    """
    text = str(text).lower()
    # Raw strings: sequences such as "\[" or "\S" are invalid escapes in a
    # normal string literal and only work there by accident (with a warning).
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space, not with nothing: deleting them outright
    # glues the last word of one line to the first word of the next
    # ("hello\nworld" -> "helloworld"). split() below collapses extra spaces.
    text = text.replace('\n', ' ')
    text = re.sub(r'\w*\d\w*', '', text)

    # Filter stop words and stem in one pass over the whitespace-split tokens.
    tokens = [stemmer.stem(word) for word in text.split() if word not in stop_words]
    cleaned = " ".join(tokens)
    return cleaned if cleaned.strip() else "empty"

Apply Cleaning (Stop-Word Removal and Stemming) to the Tweets

In [11]:
# nltk.data.find() raises LookupError when a resource is missing; it never
# returns a falsy value. The download therefore has to be triggered from the
# except branch, otherwise a missing corpus crashes the cell instead of being
# fetched.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Replace every raw tweet with its cleaned, stemmed form.
data['tweet'] = data['tweet'].apply(clean)

Vectorize the Text and Split into Train/Test Sets

In [12]:
x = np.array(data['tweet'])
y = np.array(data['labels'])

# Split the raw text BEFORE fitting the vectorizer so the vocabulary is learned
# from the training tweets only. Fitting on the full corpus would leak
# information about the test set into the features. The split parameters are
# unchanged, so the same rows land in each partition as before.
x_train_text, x_test_text, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42
)

cv = CountVectorizer()
X_train = cv.fit_transform(x_train_text)
X_test = cv.transform(x_test_text)

# Full-corpus matrix, kept so any code that still refers to `X` keeps working.
X = cv.transform(x)

Train, Save, and Evaluate the Model

In [None]:
# Seed the tree so "Restart & Run All" reproduces the same model and accuracy;
# an unseeded DecisionTreeClassifier may pick different splits from run to run.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Persist the fitted model together with the vectorizer that produced its
# features; both are needed to score new text later.
joblib.dump(clf, 'hate_speech_model.pkl')
joblib.dump(cv, 'vectorizer.pkl')

# accuracy_score comes from the notebook's imports cell.
y_pred = clf.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Preview Predictions on the Test Set

In [None]:
# Predict labels for the held-out split and show the first five of them.
y_pred = clf.predict(X_test)
print("Sample predictions:", y_pred[0:5])

Sample Prediction

In [None]:
def predict_sample(text, vectorizer, model):
    """Classify one piece of raw text.

    The text is passed through ``clean()``, turned into a feature row by
    ``vectorizer.transform`` and handed to ``model.predict``.

    Args:
        text: Raw input text.
        vectorizer: Object exposing ``transform``; called with a one-item list.
        model: Object exposing ``predict``.

    Returns:
        The first element of ``model.predict``'s result, or the string
        "Error: Invalid input after cleaning" when ``clean()`` returns its
        "empty" sentinel.
    """
    cleaned = clean(text)
    if cleaned != "empty":
        # Dense one-row matrix built from the cleaned text.
        features = vectorizer.transform([cleaned]).toarray()
        return model.predict(features)[0]
    return "Error: Invalid input after cleaning"

# Run the fitted vectorizer and classifier on a single example word.
sample = "kill"
prediction = predict_sample(sample, cv, clf)
print("Prediction for '{}': {}".format(sample, prediction))