In [1]:
# If NLTK or scikit-learn is not installed, uncomment and run the line below:
# %pip install nltk scikit-learn pandas numpy

import os
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Tokenizer models needed by word_tokenize. Recent NLTK releases (3.9+) load
# the 'punkt_tab' resource, while older releases load 'punkt', so fetch both
# to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word lists read by stopwords.words('english'). Kept as the last
# statement so the cell's displayed value is still the result of this call.
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\VALLI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\VALLI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:

# Location of the tweets CSV. Set the CYBERBULLYING_CSV environment variable
# to load the file from somewhere else; when it is unset, the original
# Windows path is used, so existing setups keep working unchanged.
file_path = os.environ.get(
    "CYBERBULLYING_CSV",
    r"E:\cyberbullying_tweets(ML).csv",  # raw string keeps the backslash literal
)

# Read the whole CSV into a DataFrame.
df = pd.read_csv(file_path)

# Preview the first five rows as the cell's output.
df.head()


Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [3]:

# Print the column labels to confirm the names that the later cells index
# into ('tweet_text' and 'cyberbullying_type').
print(df.columns)



Index(['tweet_text', 'cyberbullying_type'], dtype='object')


In [4]:

# English stop words, stored as a set so the per-token membership test in
# preprocess_text is a hash lookup.
stop_words = set(stopwords.words('english'))
# One Porter stemmer instance, reused by every preprocess_text call.
stemmer = PorterStemmer()

def preprocess_text(text):
    """Normalize one tweet into a space-separated string of stemmed tokens.

    The input is coerced with ``str(text)`` (so non-string cells do not
    raise), split into tokens with ``word_tokenize``, and each token is
    lower-cased. Tokens are dropped when they are in ``stop_words`` or when
    they consist solely of punctuation characters; the remaining tokens are
    stemmed with the module-level ``stemmer``.

    Parameters
    ----------
    text : Any
        Raw tweet text; converted to ``str`` before tokenizing.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces (an empty
        string when nothing survives).
    """
    cleaned_words = []
    for word in word_tokenize(str(text)):
        # `word in string.punctuation` would be a substring test against one
        # long string, so multi-character punctuation tokens such as "..."
        # or "--" would not match and would leak into the output. Checking
        # every character drops any token made only of punctuation.
        if all(char in string.punctuation for char in word):
            continue
        lowered = word.lower()  # lower-case once; reused for lookup and stemming
        if lowered in stop_words:
            continue
        cleaned_words.append(stemmer.stem(lowered))
    return " ".join(cleaned_words)


# Run the preprocessing function over every raw tweet, element by element,
# and store the normalized text in a new 'cleaned_tweet' column.
df['cleaned_tweet'] = df['tweet_text'].map(preprocess_text)


In [5]:

# Turn the cleaned tweets into TF-IDF features, capping the vocabulary at
# 5000 terms to bound the width of the feature matrix.
vectorizer = TfidfVectorizer(max_features=5000)

# Keep the sparse matrix that fit_transform returns instead of densifying it
# with .toarray(): a dense rows x 5000 float64 array needs gigabytes of RAM
# for a corpus of this size, while train_test_split and MultinomialNB both
# accept sparse input directly.
# NOTE(review): the vocabulary and IDF weights are fitted on every row,
# including the rows later held out as the test split. Fit on the training
# rows only if a strictly held-out estimate is required.
X = vectorizer.fit_transform(df['cleaned_tweet'])


In [6]:
# Target labels: the cyberbullying category recorded for each tweet.
y = df['cyberbullying_type']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [7]:

# Train a multinomial Naive Bayes model on the training split; fit() returns
# the estimator itself, so construction and fitting can be chained.
classifier = MultinomialNB().fit(X_train, y_train)

# Predict a label for every held-out tweet.
y_pred = classifier.predict(X_test)

# Per-class precision / recall / F1, followed by the raw confusion counts
# (rows are true labels, columns are predicted labels).
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Classification Report:
                     precision    recall  f1-score   support

                age       0.79      0.96      0.87      1603
          ethnicity       0.87      0.91      0.89      1603
             gender       0.82      0.82      0.82      1531
  not_cyberbullying       0.68      0.43      0.53      1624
other_cyberbullying       0.64      0.61      0.62      1612
           religion       0.84      0.96      0.90      1566

           accuracy                           0.78      9539
          macro avg       0.77      0.78      0.77      9539
       weighted avg       0.77      0.78      0.77      9539


Confusion Matrix:
[[1546    6    6   20   17    8]
 [  45 1456   14   11   32   45]
 [  19   35 1261  101   96   19]
 [ 179   80  108  697  414  146]
 [ 164   86  134  169  984   75]
 [  11    8   13   21    5 1508]]
