**Importing Libraries**

In [1]:
import pandas as pd
import numpy as np
import re
import unicodedata
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
import joblib
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoConfig
from datasets import Dataset
import gradio as gr
import warnings
warnings.filterwarnings("ignore")

**Upload CSV in Colab**

In [4]:
from google.colab import files
uploaded = files.upload()  # Upload 'Multilablel Cyberbully Data.csv'

# Automatically get uploaded file name
DATA_PATH = list(uploaded.keys())[0]
print("Using dataset:", DATA_PATH)

Saving Multilablel Cyberbully Data.csv to Multilablel Cyberbully Data.csv
Using dataset: Multilablel Cyberbully Data.csv


**Load dataset**

In [5]:
df = pd.read_csv(DATA_PATH)
print("Columns:", df.columns.tolist())
# Define columns
text_col = "comment"
label_cols = ["bully", "sexual", "religious", "threat", "spam"]

Columns: ['Gender', 'Profession', 'comment', 'bully', 'sexual', 'religious', 'threat', 'spam']


**PREPROCESSING**

In [6]:
def clean_text(text):
    if pd.isna(text):
        return ""
    # Unicode normalization
    text = unicodedata.normalize("NFKC", text)
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    # Remove emails
    text = re.sub(r"\S+@\S+\.\S+", "", text)
    # Remove mentions
    text = re.sub(r"@\w+", "", text)
    # Remove hashtags but keep text
    text = re.sub(r"#(\w+)", r"\1", text)
    # Remove non-Bangla/English/numbers/punctuation
    allowed_pattern = r"[^a-zA-Z0-9অ-৹০-৯.,!?;:\-()\s]"
    text = re.sub(allowed_pattern, " ", text)
    # Lowercase English
    text = text.lower()
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Preprocessing report
before_count = len(df)
before_empty = df[text_col].isna().sum() + (df[text_col].str.strip() == "").sum()

df["clean_comment"] = df[text_col].astype(str).apply(clean_text)
after_empty = (df["clean_comment"].str.strip() == "").sum()
df = df[df["clean_comment"].str.strip() != ""]
after_count = len(df)

print("\n===== PREPROCESSING REPORT =====")
print(f"Total rows before cleaning         : {before_count}")
print(f"Empty/invalid rows before cleaning : {before_empty}")
print(f"Rows removed due to empty comment  : {after_empty}")
print(f"Total rows after cleaning          : {after_count}")
print("=================================\n")

# Update text_col to clean_comment
text_col = "clean_comment"


===== PREPROCESSING REPORT =====
Total rows before cleaning         : 12546
Empty/invalid rows before cleaning : 0
Rows removed due to empty comment  : 0
Total rows after cleaning          : 12546



**Convert labels to list of labels**

In [7]:
def row_to_labels(row):
    return [c for c in label_cols if int(row[c])==1]

df['labels_list'] = df.apply(row_to_labels, axis=1)

**Train/Val/Test split**

In [8]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.125, random_state=42)  # 70/10/20
print("Train/Val/Test sizes:", len(train_df), len(val_df), len(test_df))

Train/Val/Test sizes: 8781 1255 2510


**TF-IDF + Logistic Regression baseline**

In [9]:
bangla_token_pattern = r"(?u)[\w\u0980-\u09FF]+"

tfidf = TfidfVectorizer(max_features=30000, ngram_range=(1,2), token_pattern=bangla_token_pattern)
X_train_tfidf = tfidf.fit_transform(train_df[text_col].values)
X_val_tfidf = tfidf.transform(val_df[text_col].values)
X_test_tfidf = tfidf.transform(test_df[text_col].values)

mlb = MultiLabelBinarizer(classes=label_cols)
y_train = mlb.fit_transform(train_df['labels_list'])
y_val = mlb.transform(val_df['labels_list'])
y_test = mlb.transform(test_df['labels_list'])

ovr = OneVsRestClassifier(LogisticRegression(solver='liblinear', max_iter=500))
print("Training TF-IDF Logistic Regression baseline...")
ovr.fit(X_train_tfidf, y_train)

# Evaluation function
def multilabel_eval(model, X, y_true, label_names):
    probs = model.predict_proba(X)
    if isinstance(probs, list):
        probs = np.vstack([p[:,1] if p.ndim==2 and p.shape[1]>1 else p.ravel() for p in probs]).T
    preds = (probs >= 0.5).astype(int)
    print("Micro F1:", f1_score(y_true, preds, average='micro', zero_division=0))
    print("Macro F1:", f1_score(y_true, preds, average='macro', zero_division=0))
    print("\nPer-label classification report:\n")
    print(classification_report(y_true, preds, target_names=label_names, zero_division=0))
    return probs, preds

print("\nTF-IDF baseline evaluation on validation set:")
baseline_val_probs, baseline_val_preds = multilabel_eval(ovr, X_val_tfidf, y_val, label_cols)

print("\nTF-IDF baseline evaluation on test set:")
baseline_test_probs, baseline_test_preds = multilabel_eval(ovr, X_test_tfidf, y_test, label_cols)

joblib.dump({'tfidf': tfidf, 'ovr': ovr, 'mlb': mlb}, 'tfidf_ovr_mlb.joblib')
print("Saved baseline model to tfidf_ovr_mlb.joblib")

Training TF-IDF Logistic Regression baseline...

TF-IDF baseline evaluation on validation set:
Micro F1: 0.7839272175890827
Macro F1: 0.6655914260765139

Per-label classification report:

              precision    recall  f1-score   support

       bully       0.83      0.94      0.88       800
      sexual       0.91      0.25      0.39       203
   religious       0.97      0.44      0.61       172
      threat       0.94      0.61      0.74       137
        spam       0.97      0.55      0.70       123

   micro avg       0.86      0.72      0.78      1435
   macro avg       0.93      0.56      0.67      1435
weighted avg       0.88      0.72      0.75      1435
 samples avg       0.68      0.59      0.61      1435


TF-IDF baseline evaluation on test set:
Micro F1: 0.7653021442495127
Macro F1: 0.6432350244812374

Per-label classification report:

              precision    recall  f1-score   support

       bully       0.81      0.93      0.87      1566
      sexual       0.90   

**Transformer-based BanglaBERT**

**Token:** 1ca7ea874cecbcc3802ea014cc2760c028af4f58

In [None]:
model_name = "csebuetnlp/banglabert"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prepare HF dataset
def prepare_hf(df_in):
    return Dataset.from_pandas(df_in[[text_col, 'labels_list']].rename(columns={text_col:'text'}))

hf_train = prepare_hf(train_df)
hf_val = prepare_hf(val_df)
hf_test = prepare_hf(test_df)

# Tokenize
max_length = 128
def tokenize_fn(batch):
    tokens = tokenizer(batch['text'], truncation=True, padding='max_length', max_length=max_length)
    return tokens

hf_train = hf_train.map(tokenize_fn, batched=True)
hf_val = hf_val.map(tokenize_fn, batched=True)
hf_test = hf_test.map(tokenize_fn, batched=True)

# Encode labels to multi-hot
def encode_labels(batch):
    multi = []
    for labs in batch['labels_list']:
        # Ensure labels are float type for BCEWithLogitsLoss
        arr = [1.0 if l in labs else 0.0 for l in label_cols]
        multi.append(arr)
    return {"labels": multi}

hf_train = hf_train.map(encode_labels, batched=True)
hf_val   = hf_val.map(encode_labels, batched=True)
hf_test  = hf_test.map(encode_labels, batched=True)

cols_to_return = [c for c in hf_train.column_names if c not in ['text','labels_list']]
hf_train.set_format(type='torch', columns=cols_to_return + ['labels'])
hf_val.set_format(type='torch', columns=cols_to_return + ['labels'])
hf_test.set_format(type='torch', columns=cols_to_return + ['labels'])

# Load model for multi-label
config = AutoConfig.from_pretrained(model_name, num_labels=len(label_cols), problem_type="multi_label_classification")
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

# TrainingArguments
training_args = TrainingArguments(
    output_dir="./hf_bangla_multilabel",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch", # Changed from evaluation_strategy to eval_strategy
    logging_steps=50,
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Compute metrics
from sklearn.metrics import f1_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = 1 / (1 + np.exp(-logits))
    preds = (probs >= 0.5).astype(int)
    micro_f1 = f1_score(labels, preds, average='micro', zero_division=0)
    macro_f1 = f1_score(labels, preds, average='macro', zero_division=0)
    return {"micro_f1": micro_f1, "macro_f1": macro_f1}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train,
    eval_dataset=hf_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train
print("Starting BanglaBERT fine-tuning...")
trainer.train()

# Evaluate on test set
print("Evaluating transformer on test set...")
test_metrics = trainer.evaluate(eval_dataset=hf_test)
print(test_metrics)

# Save transformer
trainer.save_model("hf_bangla_multilabel_best")
tokenizer.save_pretrained("hf_bangla_multilabel_best")
print("Saved transformer model to ./hf_bangla_multilabel_best/")

Prediction functions

In [11]:
def baseline_predict(texts, threshold=0.5):
    X = tfidf.transform([str(t) for t in texts])
    probs = ovr.predict_proba(X)
    if isinstance(probs, list):
        probs = np.vstack([p[:,1] if p.ndim==2 and p.shape[1]>1 else p.ravel() for p in probs]).T
    preds = (probs >= threshold).astype(int)
    out = []
    for p_row, pred_row in zip(probs, preds):
        out.append({label_cols[i]: {"prob": float(p_row[i]), "pred": int(pred_row[i])} for i in range(len(label_cols))})
    return out

def transformer_predict(texts, threshold=0.5):
    enc = tokenizer(list(texts), truncation=True, padding=True, max_length=max_length, return_tensors='pt')
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    enc = {k:v.to(device) for k,v in enc.items()}
    with torch.no_grad():
        logits = model(**enc).logits.cpu().numpy()
    probs = 1/(1+np.exp(-logits))
    preds = (probs >= threshold).astype(int)
    out = []
    for p_row, pred_row in zip(probs, preds):
        out.append({label_cols[i]: {"prob": float(p_row[i]), "pred": int(pred_row[i])} for i in range(len(label_cols))})
    return out

**Gradio Demo**

In [12]:
def demo_fn(text):
    return baseline_predict([text])[0], transformer_predict([text])[0]

demo = gr.Interface(fn=demo_fn,
                    inputs=gr.Textbox(lines=3, label="Bangla comment"),
                    outputs=[gr.JSON(label="TF-IDF baseline"), gr.JSON(label="Transformer")],
                    title="Bangla Multilabel Abuse Detection Demo")

**Text Handling also automatically label**

In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import pandas as pd
# Configuration
MODEL_PATH = "./hf_bangla_multilabel_best/"  # Path to your saved model

# Mapping internal label IDs
LABEL_MAPPING = {
    "LABEL_0": "bully",
    "LABEL_1": "sexual",
    "LABEL_2": "religious",
    "LABEL_3": "threat",
    "LABEL_4": "spam"
}

# Load Model & Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, return_all_scores=True)

# Single Text Prediction (show all labels)
def predict_text(text):
    results = classifier(text)
    return {LABEL_MAPPING[item['label']]: round(item['score'], 4)
              for item in results[0]}

# Example Usage
if __name__ == "__main__":

    sample_text = "ভিডিও Call দেই"
    print("Single text prediction:")
    print(predict_text(sample_text))

Device set to use cuda:0


Single text prediction:
{'bully': 0.0518, 'sexual': 0.0276, 'religious': 0.001, 'threat': 0.0025, 'spam': 0.5873}


**CSV file Handling also automatically label each row**

In [None]:
!pip install -q transformers pandas torch chardet

import pandas as pd
import chardet
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from google.colab import files

# Config
MODEL_PATH = "/content/hf_bangla_multilabel_best"
LABEL_MAPPING = {
    "LABEL_0": "bully",
    "LABEL_1": "sexual",
    "LABEL_2": "religious",
    "LABEL_3": "threat",
    "LABEL_4": "spam"
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model & tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH, local_files_only=True)
model.to(device)
model.eval()

# Prediction functions
def detect_encoding(file_path):
    rawdata = open(file_path, 'rb').read()
    result = chardet.detect(rawdata)
    return result['encoding']

def read_csv_auto(path):
    # Try common encodings, including the one detected by chardet if it's not None
    detected_enc = detect_encoding(path)
    print(f"Detected CSV encoding: {detected_enc}")

    # Prioritize detected encoding, then common ones
    encodings_to_try = []
    if detected_enc and detected_enc.lower() != 'ascii': # Exclude 'ascii' if chardet isn't confident
        encodings_to_try.append(detected_enc)
    encodings_to_try.extend(['utf-8', 'utf-16', 'cp1252', 'latin1'])
    encodings_to_try = list(dict.fromkeys(encodings_to_try)) # Remove duplicates while preserving order

    for enc in encodings_to_try:
        try:
            df = pd.read_csv(path, encoding=enc)
            print(f"Successfully read CSV with encoding: {enc}")
            return df
        except UnicodeDecodeError:
            print(f"Failed to read with {enc}, trying next...")
            continue
        except Exception as e:
            print(f"An unexpected error occurred with encoding {enc}: {e}")
            continue

    raise UnicodeDecodeError(f"Failed to read CSV {path} with any of the attempted encodings.")

def predict_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits.squeeze(0)
        probs = torch.sigmoid(logits)
    scores = probs.tolist()
    return {LABEL_MAPPING[f"LABEL_{i}"]: round(scores[i], 4) for i in range(len(scores))}

def predict_csv(input_csv_path, output_csv_path, threshold=0.0, text_column='comment'): # Changed text_column to 'comment'
    df = read_csv_auto(input_csv_path)

    for label in LABEL_MAPPING.values():
        df[label] = 0.0
        df[label + "_flag"] = False

    for idx, text in enumerate(df[text_column]):
        # Ensure text is string, handle potential non-string entries
        if not isinstance(text, str):
            text = str(text)

        pred = predict_text(text)
        for label_name, score in pred.items():
            df.at[idx, label_name] = score
            if score >= threshold:
                df.at[idx, label_name + "_flag"] = True

    df.to_csv(output_csv_path, index=False, encoding='utf-8')
    return output_csv_path

# Upload CSV and run prediction
from google.colab import files
uploaded = files.upload()
input_csv = list(uploaded.keys())[0]
output_csv = "labeled_output.csv"
predict_csv(input_csv, output_csv, threshold=0.5, text_column='text')
files.download(output_csv)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Saving input.csv to input.csv
Detected CSV encoding: utf-8
Successfully read CSV with encoding: utf-8


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Zip best Model**

In [None]:
!zip -r hf_bangla_multilabel_best.zip hf_bangla_multilabel_best

from google.colab import files
files.download("hf_bangla_multilabel_best.zip")