In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'cyber-bullying-data-for-multi-label-classification:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5434643%2F9018821%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240804%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240804T224735Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D75734c437406cc4f482813b25f0095f0a773711eee7fc9f0ae34df504a816b90c7d0b0d6524da54fcd77920ff609bb699ca6bdcf0acb0c96cd84c7914df67ad7ae9b19b55161f61b3a3e52f30d5072bf1927f48db359ad3f48d749f2c9aae5831675ea64003972e8d943bc0e0140328193fc9f1530239154d9ba3db0884b8528c3fa2f8d52675c888d98b5dc15b25b0e0795454a03668e3e3dce3459cf4b1bd950b74d234d95cb99384445845bcb4facabd68b27b7ed589c4897b86fd6375067446ece1be1e2f25ed48193059753c98abc07014f0d77bec1a8dd404c2fe0623ccd95bd08419fb3f87dddb8f4883292bd2638850a42e24812f36b3c190eb57e08'

# Root directories that are created below and symlinked as ../input and ../working.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): KAGGLE_SYMLINK is not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
# Start from an empty input directory, then make sure both Kaggle paths exist.
shutil.rmtree('/kaggle/input', ignore_errors=True)
for kaggle_dir in (KAGGLE_INPUT_PATH, KAGGLE_WORKING_PATH):
    os.makedirs(kaggle_dir, 0o777, exist_ok=True)

# Expose the directories as ../input and ../working relative to the current
# working directory; a link that already exists is left untouched.
for link_target, link_name in (
    (KAGGLE_INPUT_PATH, 'input'),
    (KAGGLE_WORKING_PATH, 'working'),
):
    try:
        os.symlink(link_target, os.path.join("..", link_name), target_is_directory=True)
    except FileExistsError:
        pass

# Download every data source listed in DATA_SOURCE_MAPPING and unpack it under
# KAGGLE_INPUT_PATH/<directory>. Failures are reported and skipped so the
# remaining sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<url-encoded download URL>"; split only on the
    # first ':' so a literal colon in the URL part cannot break unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header may be absent (e.g. chunked transfer);
            # in that case the progress bar stays empty instead of crashing.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here would
                # rebind the module and break the next tar download.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading cyber-bullying-data-for-multi-label-classification, 2975215 bytes compressed
Downloaded and uncompressed: cyber-bullying-data-for-multi-label-classification
Data source import complete.


In [2]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
df = pd.read_csv('/kaggle/input/cyber-bullying-data-for-multi-label-classification/final_hateXplain.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20109 entries, 0 to 20108
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   comment             20109 non-null  object
 1   label               20109 non-null  object
 2   Race                20109 non-null  object
 3   Religion            20109 non-null  object
 4   Gender              20109 non-null  object
 5   Sexual Orientation  20109 non-null  object
 6   Miscellaneous       3533 non-null   object
dtypes: object(7)
memory usage: 1.1+ MB


In [5]:
df.describe()

Unnamed: 0,comment,label,Race,Religion,Gender,Sexual Orientation,Miscellaneous
count,20109,20109,20109,20109,20109,20109,3533
unique,20109,3,8,6,3,5,5
top,0 u0 lmao wow fuck you too 😂 😂,normal,No_race,Nonreligious,No_gender,No_orientation,Other
freq,1,7818,13579,15387,16380,17827,2017


In [6]:
df.isna().sum().sort_values(ascending=False)

Unnamed: 0,0
Miscellaneous,16576
comment,0
label,0
Race,0
Religion,0
Gender,0
Sexual Orientation,0


In [7]:
df.duplicated().any()

False

In [8]:
# Replace missing Miscellaneous values with an explicit 'Missing' token.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that is chained assignment, which emits a FutureWarning in
# pandas 2.x and leaves df unchanged once Copy-on-Write is enabled.
df['Miscellaneous'] = df['Miscellaneous'].fillna('Missing')

In [9]:
# Fit the encoder on the label column, then overwrite the column with the
# integer codes; the fitted encoder is kept under the name `encoder`.
encoder = LabelEncoder().fit(df['label'])
df['label'] = encoder.transform(df['label'])

In [10]:
# Build one text document per row: the comment followed by the five
# target-group columns, separated by single spaces.
# String concatenation with a NaN operand yields NaN, so this relies on the
# earlier cell that fills missing 'Miscellaneous' values with 'Missing'.
X = df['comment'] + ' ' + df['Race'] + ' ' + df['Religion'] + ' ' + df['Gender'] + ' ' + df['Sexual Orientation'] + ' ' + df['Miscellaneous']

In [11]:
wn = WordNetLemmatizer()
# Build the stop-word set once. Calling stopwords.words('english') for every
# token rebuilds the word list each time and then scans it linearly.
ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

def preprocessing(content):
    """Normalise one document for vectorisation.

    Steps: replace every non-letter character with a space, lower-case the
    text, drop English stop words, lemmatise the remaining tokens with
    WordNet, and join them back with single spaces.

    Parameters
    ----------
    content : str
        Raw text of a single row.

    Returns
    -------
    str
        The cleaned, space-separated tokens. An empty or all-stop-word input
        yields '' (the previous loop-based version returned None for an empty
        string, because its loop body never ran).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content).lower()
    return ' '.join(
        wn.lemmatize(word)
        for word in letters_only.split()
        if word not in ENGLISH_STOP_WORDS
    )

X = X.apply(preprocessing)

In [12]:
X.tail()

Unnamed: 0,0
20104,mad caucasian failing forget black selling u african nonreligious gender orientation missing
20105,mmm yes trebuchet meme much nicer catapult meme top shelf quality good attacking turk arab arab islam gender orientation missing
20106,ghetto ready get tf race nonreligious gender orientation
20107,spelled name wrong name literally front face nation sucking sand nigger already affecting white iq arab nonreligious gender orientation missing
20108,hm six million yep disarmed ceppin one warsaw ghetto survived defended came usa started jew nd amendment club still exists day race jewish gender orientation missing


In [13]:

# TF-IDF over unigrams and bigrams, capped at 10000 features.
# NOTE(review): the vectorizer is fitted on all of X here, before the
# train/test split performed in a later cell, so the vocabulary and IDF
# weights are learned from rows that end up in the test set. Consider
# fitting on the training rows only.
cv = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
# .toarray() turns the sparse TF-IDF matrix into a dense array with one
# column per feature, which is what the later pd.DataFrame(...) cell consumes.
X_res = cv.fit_transform(X).toarray()
# Target: the 'label' column, integer-encoded by the LabelEncoder cell.
y = df['label']

In [14]:
cv.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 10000,
 'min_df': 1,
 'ngram_range': (1, 2),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [15]:
cv.get_feature_names_out()[:50]

array(['aa', 'ab', 'abandon', 'abandoned', 'abc', 'abducted',
       'abducted raped', 'ability', 'able', 'abolished', 'abomination',
       'aborted', 'abortion', 'abraham', 'abroad', 'absolute',
       'absolutely', 'absolutely retarded', 'absorbed', 'absurd', 'abt',
       'abuse', 'abused', 'abuser', 'abusing', 'abusive', 'abusive woman',
       'academia', 'academic', 'accent', 'accept', 'accept refugee',
       'acceptable', 'acceptance', 'accepted', 'accepting', 'access',
       'accident', 'accidentally', 'accommodate', 'accomplish',
       'accomplished', 'accomplishment', 'according', 'account',
       'account user', 'accurate', 'accusation', 'accuse', 'accused'],
      dtype=object)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y, test_size=0.2, random_state=42)

In [17]:
count_df = pd.DataFrame(X_train, columns=cv.get_feature_names_out())
count_df.head()

Unnamed: 0,aa,ab,abandon,abandoned,abc,abducted,abducted raped,ability,able,abolished,...,zhid,zion,zionism,zionist,zionist jew,zog,zombie,zone,zoo,zuckerberg
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Candidate classifiers, all trained and scored on the same split.
# Every estimator that accepts a seed gets random_state=42 so that a
# Restart & Run All reproduces the reported accuracies. Previously
# PassiveAggressiveClassifier and XGBClassifier were left unseeded.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'MultinomialNB': MultinomialNB(),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'PassiveAggressiveClassifier': PassiveAggressiveClassifier(random_state=42),
    'XGBClassifier': XGBClassifier(random_state=42)
}

# Fit each model on the training rows and report test-set accuracy as a
# percentage rounded to two decimals.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
    print(name, 'Analysis\n')
    print(f"Accuracy: {accuracy}%\n")

LogisticRegression Analysis

Accuracy: 72.38%

MultinomialNB Analysis

Accuracy: 67.33%

RandomForestClassifier Analysis

Accuracy: 71.46%

PassiveAggressiveClassifier Analysis

Accuracy: 60.67%

XGBClassifier Analysis

Accuracy: 72.25%

