## Cleaning the data

### Fix the test data

The test labels contain rows whose values are -1. These rows were added for the Kaggle competition and do not carry usable labels, so we need to remove them.

In [7]:
import pandas as pd

test_data = pd.read_csv("../data/test.csv")
test_labels = pd.read_csv("../data/test_labels.csv")

In [None]:
test_data.head()

In [None]:
test_labels.head()

In [None]:
# Columns that hold the six toxicity labels
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Join each comment with its labels; the inner join keeps only ids present in both frames
merged_test_data = test_data.merge(test_labels, on='id', how='inner')

# Flag rows in which at least one label equals -1, then keep all the other rows
has_unlabelled = merged_test_data[label_columns].eq(-1).any(axis=1)
valid_test_data = merged_test_data[~has_unlabelled]

print(valid_test_data.head())
# Renumber the rows 0..n-1 now that some of them have been dropped
valid_test_data.reset_index(drop=True, inplace=True)

In [None]:
valid_test_data.head()

In [None]:
# Check if all rows are valid (i.e. all labels are not -1)
print((valid_test_data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']] == -1).any(axis=1).sum() == 0)

In [13]:
# Save the valid test data under the file name that the tokenization step
# loads ("../data/valid_test_data.csv"). The previous target,
# "../data/train_data.csv", did not match that name.
valid_test_data.to_csv('../data/valid_test_data.csv', index=False)

### Fix the train data

In the previous experiment, which used the complete dataset, we saw that the classes are imbalanced. We need to fix that.

In [14]:
import pandas as pd
train_data = pd.read_csv("../data/train.csv")

In [None]:
train_data.info()

In [None]:
import pandas as pd
import numpy as np

# Read the full training set used for the label analysis below
# (same file that was loaded into `train_data` earlier)
df = pd.read_csv('../data/train.csv')

# Names of the six label columns; analyze_toxic_comments reads this module-level
# list, and later cells pass it to the balancing/analysis helpers.
toxic_categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

def analyze_toxic_comments(df, categories=None):
    """Print a four-part summary of how the label columns are distributed.

    Sections printed:
      1. how many comments carry exactly one label, per label
         (labels that never occur alone are omitted),
      2. how many comments carry 2, 3, ... labels,
      3. the total number of positives per label,
      4. the total number of comments.

    Args:
        df: DataFrame containing one 0/1 column per label.
        categories: Label column names to analyse. When omitted, the
            module-level ``toxic_categories`` list is used.

    Returns:
        None. The report is written to stdout.
    """
    if categories is None:
        categories = toxic_categories

    # Number of positive labels per comment; computed once and reused by
    # both the "exclusive" and the "combination" sections.
    label_counts = df[categories].sum(axis=1)
    single_label = label_counts == 1

    # 1. Exclusive labels analysis
    print("1. EXCLUSIVE LABELS ANALYSIS")
    print("-" * 50)

    exclusive_counts = {}
    for category in categories:
        # Comments whose only positive label is this category
        n_exclusive = int((single_label & (df[category] == 1)).sum())
        if n_exclusive > 0:
            exclusive_counts[category] = n_exclusive

    print("Comments with exactly one label:")
    for category, count in exclusive_counts.items():
        print(f"{category}: {count}")
    print(f"Total exclusive labels: {sum(exclusive_counts.values())}")

    print("\n2. COMBINATION LABELS ANALYSIS")
    print("-" * 50)

    # Count comments carrying 2..len(categories) labels at once
    for i in range(2, len(categories) + 1):
        count = int((label_counts == i).sum())
        if count > 0:
            print(f"Comments with {i} labels: {count}")

    print("\n3. TOTAL COUNT PER LABEL")
    print("-" * 50)

    # Count total occurrences of each label
    for category in categories:
        print(f"{category}: {df[category].sum()}")

    print("\n4. TOTAL COMMENTS")
    print("-" * 50)
    print(f"Total number of comments: {len(df)}")

# Run the analysis
analyze_toxic_comments(df)

Exclusive Labels Analysis
- **Total comments with single labels:** 6,360
  - **toxic alone:** 5,666 (89% of exclusive labels)
  - Very few comments are exclusively labeled as other categories.
  - **severe_toxic:** never appears alone (0 exclusive counts).
  - **threat:** only 22 exclusive cases.
  - This suggests that **severe_toxic** always appears with other labels.

Label Combinations
- **Total comments with multiple labels:** 9,865 (3,480 + 4,209 + 1,760 + 385 + 31).
- **Interesting pattern:** More comments have **3 labels (4,209)** than **2 labels (3,480)**.
- Very few comments have all 6 labels (31).

Distribution of Combinations:
- **2 labels:** 35.3% of multi-label cases.
- **3 labels:** 42.7% of multi-label cases.
- **4 labels:** 17.8% of multi-label cases.
- **5 labels:** 3.9% of multi-label cases.
- **6 labels:** 0.3% of multi-label cases.

Total Label Distribution:
- **toxic** appears in 15,294 comments.
- Other labels in descending order:
  - **obscene:** 8,449 (55.2% of toxic comments).
  - **insult:** 7,877 (51.5% of toxic comments).
  - **severe_toxic:** 1,595 (10.4% of toxic comments).
  - **identity_hate:** 1,405 (9.2% of toxic comments).
  - **threat:** 478 (3.1% of toxic comments).

Key Findings:
1. The dataset has a **hierarchical nature**, where **toxic** acts as a parent category.
2. There's a **strong correlation between labels**, especially with **toxic**.
3. **Imbalances** in the dataset:
   - Between **single** vs **multiple labels**.
   - Between **different categories**.
   - In the **combination patterns** of labels.


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def create_balanced_datasets(input_file, output_dir, toxic_cols, ratios=(1, 2, 3), random_state=11):
    """Write down-sampled datasets with fixed non-toxic:toxic ratios.

    Every comment with at least one positive label in ``toxic_cols`` is kept.
    For each ratio ``r``, ``r * n_toxic`` non-toxic comments are sampled
    without replacement, the two groups are concatenated and shuffled, and
    the result is written to ``{output_dir}/balanced_{r}_1_ratio.csv``.

    Args:
        input_file: Path of the CSV holding the original data.
        output_dir: Directory the balanced CSV files are written to.
        toxic_cols: Names of the 0/1 label columns.
        ratios: Non-toxic:toxic ratios to generate. The default reproduces
            the 1:1, 2:1 and 3:1 datasets.
        random_state: Seed used for both the sampling and the shuffling.

    Returns:
        DataFrame with one row for the original data and one per generated
        dataset, with columns ``dataset``, ``total_samples``,
        ``toxic_samples``, ``non_toxic_samples``, ``toxic_ratio`` and
        ``non_toxic_ratio``.

    Raises:
        ValueError: Propagated from ``DataFrame.sample`` when a ratio asks
            for more non-toxic comments than the data contains.
    """

    def _toxic_mask(frame):
        # A comment counts as toxic when any of its label columns is positive
        return frame[toxic_cols].sum(axis=1) > 0

    def _describe(frame, name):
        # The mask is derived from the frame itself, so the statistics do not
        # depend on the frame sharing index labels with the original data.
        total = len(frame)
        toxic = int(_toxic_mask(frame).sum())
        non_toxic = total - toxic
        return {
            'dataset': name,
            'total_samples': total,
            'toxic_samples': toxic,
            'non_toxic_samples': non_toxic,
            # An empty frame has no meaningful ratio; report NaN instead of
            # failing with a division by zero.
            'toxic_ratio': toxic / total if total else float('nan'),
            'non_toxic_ratio': non_toxic / total if total else float('nan'),
        }

    # Read original data
    df = pd.read_csv(input_file)

    # Identify toxic and non-toxic comments
    is_toxic = _toxic_mask(df)
    toxic_samples = df[is_toxic]
    non_toxic_samples = df[~is_toxic]
    n_toxic = len(toxic_samples)

    # Build every dataset before writing anything, so that a failing ratio
    # does not leave a partial set of files behind.
    balanced = {}
    for ratio in ratios:
        sampled = non_toxic_samples.sample(n=n_toxic * ratio, random_state=random_state)
        mixed = pd.concat([toxic_samples, sampled])
        balanced[f"balanced_{ratio}_1"] = mixed.sample(frac=1, random_state=random_state)

    # Save datasets
    for name, frame in balanced.items():
        frame.to_csv(f"{output_dir}/{name}_ratio.csv", index=False)

    # Create distribution statistics
    rows = [_describe(df, 'original')]
    rows.extend(_describe(frame, name) for name, frame in balanced.items())
    return pd.DataFrame(rows)

# Create the datasets and get statistics
stats = create_balanced_datasets('../data/train.csv', '../data', toxic_categories)
print("\nDataset Statistics:")
print("-" * 80)
print(stats.to_string(index=False))

In [18]:
bal11 = pd.read_csv('../data/balanced_1_1_ratio.csv')
bal21 = pd.read_csv('../data/balanced_2_1_ratio.csv')
bal31 = pd.read_csv('../data/balanced_3_1_ratio.csv')  

In [None]:
bal11.head()

In [None]:
bal21.head()

In [None]:
bal31.head()

In [None]:
analyze_toxic_comments(bal11)

In [None]:
analyze_toxic_comments(bal21)

In [None]:
analyze_toxic_comments(bal31)

In [25]:
import pandas as pd
import numpy as np

def analyze_toxic_categories(df, toxic_cols):
    """Print per-category statistics restricted to comments with any label.

    Only rows with at least one positive value in ``toxic_cols`` are
    considered. The first entry of ``toxic_cols`` is skipped in both
    sections; the remaining columns are reported as the specific categories.

    Args:
        df: DataFrame containing the label columns.
        toxic_cols: Label column names, with the umbrella label first.

    Returns:
        None. The report is written to stdout.
    """
    divider = "-" * 50
    specific_cols = toxic_cols[1:]  # everything except the leading parent label

    # Keep only the comments that carry at least one label
    flagged = df[df[toxic_cols].sum(axis=1) > 0]
    n_flagged = len(flagged)

    print("\nDistribution of Specific Toxic Categories:")
    print(divider)
    for name in specific_cols:
        hits = flagged[name].sum()
        share = (hits / n_flagged) * 100
        print(f"{name}: {hits} ({share:.1f}%)")

    print("\nLabel Combination Analysis:")
    print(divider)
    # Number of specific categories set on each flagged comment
    per_comment = flagged[specific_cols].sum(axis=1)
    for k in range(1, len(toxic_cols)):
        matches = (per_comment == k).sum()
        if matches > 0:
            share = (matches / n_flagged) * 100
            print(f"Comments with {k} toxic categories: {matches} ({share:.1f}%)")

In [None]:
analyze_toxic_categories(bal11, toxic_categories)

In [None]:
analyze_toxic_categories(bal21, toxic_categories)

In [None]:
analyze_toxic_categories(bal31, toxic_categories)

## Tokenize the dataset

In [1]:
import pandas as pd

data_url = "../data/balanced_3_1_ratio.csv"
train_data = pd.read_csv(data_url)

In [2]:
from datasets import load_dataset
data = load_dataset("csv", data_files=data_url)
data = data.shuffle(seed=11)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import pandas as pd
from datasets import Dataset
temp = pd.read_csv("../data/valid_test_data.csv", encoding="utf-8")
test_dataset = Dataset.from_pandas(temp)

In [4]:
train_test_dataset = data['train'].train_test_split(test_size=0.2, seed=11)

In [5]:
from datasets import DatasetDict
# Bundle the three splits used downstream:
#   train      - the 80% part of the split made from the balanced data
#   test       - the separately loaded `test_dataset`
#   validation - the 20% hold-out of that same split (its 'test' part)
train_test_val_dataset = DatasetDict({
    'train': train_test_dataset['train'],
    'test': test_dataset,
    'validation': train_test_dataset['test']
})

In [6]:
train_test_val_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 51920
    })
    test: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 63978
    })
    validation: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 12980
    })
})

In [7]:
train_test_val_dataset['train'].column_names

['id',
 'comment_text',
 'toxic',
 'severe_toxic',
 'obscene',
 'threat',
 'insult',
 'identity_hate']

In [8]:
labels = [column for column in train_test_val_dataset['train'].column_names[2:]]

In [9]:
labels

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [10]:
non_labels = [column for column in train_test_val_dataset['train'].column_names if column not in labels]

In [11]:
label2id = {label:id for id, label in enumerate(labels)}

In [12]:
id2label = {id:label for label, id in label2id.items()}

In [13]:
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)

In [14]:
label2id, id2label

({'toxic': 0,
  'severe_toxic': 1,
  'obscene': 2,
  'threat': 3,
  'insult': 4,
  'identity_hate': 5},
 {0: 'toxic',
  1: 'severe_toxic',
  2: 'obscene',
  3: 'threat',
  4: 'insult',
  5: 'identity_hate'})

In [15]:
def tokenize_function(batch, labels, tokenizer):
    """Tokenize a batch of comments and attach one float label vector per row.

    Args:
        batch: Mapping of column name to a list of values; must contain
            "comment_text" and every column named in ``labels``.
        labels: Label column names, in the order they should appear in each
            label vector.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, max_length=512)``.

    Returns:
        The tokenizer output with an added "labels" entry: a list holding,
        for each comment, the label values converted to ``float``.
    """
    texts = batch["comment_text"]

    # One row per comment, one float per label column
    label_rows = []
    for row_idx in range(len(texts)):
        label_rows.append([float(batch[name][row_idx]) for name in labels])

    # Padding is not applied here; sequences are only truncated to 512 tokens
    encoded = tokenizer(texts, truncation=True, max_length=512)
    encoded["labels"] = label_rows
    return encoded


In [16]:
# Tokenize every split in batches. All original columns (id, text and the six
# label columns) are removed, so each split keeps only what tokenize_function
# returns: the tokenizer outputs plus the combined "labels" vector.
tokenized_datasets = train_test_val_dataset.map(
    tokenize_function,
    fn_kwargs={'labels': labels, 'tokenizer': tokenizer},
    remove_columns=non_labels + labels,
    batched=True,
)


Map: 100%|██████████| 63978/63978 [00:36<00:00, 1734.51 examples/s]


In [22]:
tokenized_datasets.save_to_disk('dataset-bert-31')

Saving the dataset (1/1 shards): 100%|██████████| 51920/51920 [00:00<00:00, 1298581.75 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 63978/63978 [00:00<00:00, 1592720.73 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 12980/12980 [00:00<00:00, 1230579.46 examples/s]


## Train the model

In [1]:
import evaluate
import numpy as np
import torch
from datasets import load_from_disk
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from torch import nn
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoTokenizer,
    DataCollatorWithPadding,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configuration for this training run
CONFIG = {
    'checkpoint': 'bert-base-uncased',  # pretrained model / tokenizer to start from
    'labels': ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
    'data': "dataset-bert-31",  # directory of the tokenized DatasetDict saved with save_to_disk
    'run-name': "bert-31"  # short identifier for this experiment
}


# Labels: index <-> name mappings handed to the model config
labels = CONFIG['labels']
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}

# Load dataset and tokenizer
checkpoint = CONFIG['checkpoint']
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_datasets = load_from_disk(CONFIG['data'])


In [3]:
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute multi-label classification metrics from raw logits.

    Args:
        predictions: Logits as a torch tensor or array-like; a sigmoid is
            applied to turn them into probabilities.
        labels: Ground-truth indicator matrix (tensor or array), indexed as
            ``[sample, label]``.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        dict with ``roc_auc`` (NaN when it cannot be computed), micro/macro
        F1, precision and recall, ``accuracy``, and one ``f1_label_{i}``
        entry per label column.
    """
    # Bring tensors back to NumPy so the scikit-learn scorers accept them
    if isinstance(predictions, torch.Tensor):
        predictions = predictions.detach().cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.detach().cpu().numpy()

    # Logits -> probabilities -> hard 0/1 decisions
    probs = torch.nn.Sigmoid()(torch.tensor(predictions)).numpy()
    y_pred = (probs >= threshold).astype(int)
    y_true = labels

    metrics = {}

    # ROC AUC works on the probabilities; scikit-learn raises ValueError when
    # it cannot be computed (e.g. only one class present in y_true).
    try:
        metrics['roc_auc'] = roc_auc_score(y_true, probs, average='micro', multi_class='ovr')
    except ValueError:
        metrics['roc_auc'] = np.nan

    # Aggregate scores, each reported with micro and macro averaging
    scorers = (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    for prefix, scorer in scorers:
        for averaging in ('micro', 'macro'):
            metrics[f'{prefix}_{averaging}'] = scorer(
                y_true, y_pred, average=averaging, zero_division=0
            )
    metrics['accuracy'] = accuracy_score(y_true, y_pred)

    # Per-label F1, keyed by the label's column position
    for i in range(y_true.shape[1]):
        metrics[f'f1_label_{i}'] = f1_score(y_true[:, i], y_pred[:, i], zero_division=0)

    return metrics

def compute_metrics_v2(p, threshold=0.5):
    """
    Adapter that lets the Hugging Face Trainer call multi_label_metrics.
    Args:
        p (EvalPrediction): Object exposing ``predictions`` and ``label_ids``.
        threshold (float): Probability cut-off forwarded to multi_label_metrics.
    Returns:
        dict: The metrics dictionary produced by multi_label_metrics.
    """
    raw = p.predictions
    # Some models return a tuple such as (logits, hidden_states, attentions);
    # only the first element is scored.
    logits = raw[0] if isinstance(raw, tuple) else raw

    return multi_label_metrics(
        predictions=logits,
        labels=p.label_ids,
        threshold=threshold
    )


In [4]:

class CustomTrainer(Trainer):
    """Trainer whose loss is a plain (unweighted) BCEWithLogitsLoss on the logits."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run a forward pass and score the logits against the float labels.

        Args:
            model: Model called as ``model(**inputs)``.
            inputs: Batch mapping; its "labels" entry is read but left in
                place, so the model receives the labels as well.
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Accepted and ignored.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        targets = inputs.get("labels")

        # Forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')

        # No pos_weight/weight is passed, so every label contributes equally;
        # the loss expects float targets.
        criterion = nn.BCEWithLogitsLoss()
        loss = criterion(logits, targets.float())

        if return_outputs:
            return loss, outputs
        return loss



In [5]:

# Load the pretrained checkpoint with a classification head sized to the label
# list; problem_type marks the task as multi-label (several labels may be
# active for one comment).
model = AutoModelForSequenceClassification.from_pretrained(
    CONFIG['checkpoint'],
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification",
)

# Data collator: pads each batch at collation time (the stored examples were
# only truncated, not padded)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:

# Training arguments
training_args = TrainingArguments(
    # NOTE(review): the directory name mentions distilbert although
    # CONFIG['checkpoint'] is bert-base-uncased; kept unchanged so existing
    # checkpoints stay where they are.
    output_dir="v2-distilbert-31-5ep",
    # Name the tracked run after the configured run name instead of letting
    # it default to the output directory.
    run_name=CONFIG['run-name'],
    logging_strategy="steps",
    logging_steps=200,
    logging_first_step=True,
    # Evaluate and checkpoint on the same schedule so the best checkpoint
    # can be restored at the end of training.
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    per_device_train_batch_size=16,  # Adjust based on memory usage
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=3,  # Simulate effective batch size of 48
    fp16=False,  # Mixed precision disabled
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1_micro",  # "f1_micro" from compute_metrics_v2 with the eval_ prefix
    report_to="mlflow",
)

# Trainer: trains on the balanced train split and evaluates on the validation split
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics_v2,
)

In [None]:
# Train the model
trainer.train()