# Social Media Extremism Detection - RoBERTa PRO

**Goal:** Break 0.90 Accuracy.
**Strategy:**
1.  **Model:** Upgrade to `roberta-large` (higher capacity, but slower to train and run).
2.  **Technique:** Pseudo-Labeling (Use confident test predictions as new training data).

In [None]:
# Install the third-party packages this notebook relies on (-q keeps pip's output quiet).
!pip install -q transformers datasets torch scikit-learn simpletransformers

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging
import torch

# Detect GPU support once; this flag is handed to the model constructors
# through their `use_cuda` argument.
cuda_available = torch.cuda.is_available()

# Emit INFO-level messages in general, but raise the "transformers" logger
# to WARNING to reduce its verbosity.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

## 1. Load Data

In [None]:
# Read the labelled training set and the unlabelled test set from the
# current working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Missing messages become empty strings so every row has text to feed the model.
train['text'] = train['Original_Message'].fillna("")
test['text'] = test['Original_Message'].fillna("")

# Encode the string labels as integer class ids.
label_map = {'NON_EXTREMIST': 0, 'EXTREMIST': 1}
train['labels'] = train['Extremism_Label'].map(label_map)

# Series.map yields NaN for any value that is not a key of label_map
# (typos, different casing, missing labels). Fail fast here with the
# offending values instead of letting NaN labels reach the CV split/training.
unmapped_mask = train['labels'].isna()
if unmapped_mask.any():
    unexpected_labels = train.loc[unmapped_mask, 'Extremism_Label'].unique().tolist()
    raise ValueError(
        f"Unexpected Extremism_Label values in train.csv: {unexpected_labels}; "
        f"expected one of {list(label_map)}"
    )

## 2. Configuration (The "Pro" Settings)
*   `roberta-large`: A much larger model (about 355M parameters), so it has more capacity but needs more memory and time.
*   `num_train_epochs`: 5 (Train longer).
*   `learning_rate`: 1e-5 (Learn slower and more carefully).

In [None]:
# Training configuration used for every model built in this notebook.
model_args = ClassificationArgs(
    # Optimisation schedule
    num_train_epochs=5,
    learning_rate=1e-5,
    train_batch_size=8,  # Smaller batch size for 'large' model to fit in GPU
    eval_batch_size=16,
    max_seq_length=128,
    # Output handling: reuse the output directory and skip intermediate checkpoints
    overwrite_output_dir=True,
    save_model_every_epoch=False,
    save_eval_checkpoints=False,
    # Multiprocessing is switched off for both training and evaluation
    use_multiprocessing=False,
    use_multiprocessing_for_evaluation=False,
    # Fixed seed for reproducibility
    manual_seed=42,
)

## 3. Phase 1: Train `roberta-large` on 5 Folds

In [None]:
import gc

# Phase 1: 5-fold stratified cross-validation. Each fold trains a fresh
# roberta-large model, scores it on the held-out fold, and records its raw
# outputs on the test set so they can be averaged afterwards.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
test_preds_list = []  # raw test-set outputs, one entry per fold
fold_scores = []      # validation accuracy, one entry per fold

print("--- Starting Phase 1: Training Base Models ---")

# The test texts do not change between folds, so build the list once.
test_texts = test['text'].tolist()

for fold, (train_idx, val_idx) in enumerate(skf.split(train, train['labels'])):
    print(f"\n--- Fold {fold+1} ---")

    train_df = train.iloc[train_idx][['text', 'labels']]
    val_df = train.iloc[val_idx][['text', 'labels']]

    model = ClassificationModel(
        "roberta",
        "roberta-large", # <--- The Big Change
        num_labels=2,
        args=model_args,
        use_cuda=cuda_available
    )

    model.train_model(train_df)

    # Evaluate on the held-out fold; the predicted class is the argmax of the
    # per-class outputs.
    result, model_outputs, wrong_predictions = model.eval_model(val_df)
    preds = np.argmax(model_outputs, axis=1)
    acc = accuracy_score(val_df['labels'], preds)
    print(f"Fold {fold+1} Accuracy: {acc:.4f}")
    fold_scores.append(acc)

    # Predict on Test
    predictions, raw_outputs = model.predict(test_texts)
    test_preds_list.append(raw_outputs) # Save raw logits for averaging

    # Drop this fold's model before the next one is created. Otherwise the old
    # and new roberta-large models are alive at the same time (and the last
    # fold's model would stay resident during the later phases), which can
    # exhaust GPU memory. NOTE: `model` is intentionally unbound after the loop.
    del model
    gc.collect()
    if cuda_available:
        torch.cuda.empty_cache()

print(f"\nPhase 1 Average Accuracy: {np.mean(fold_scores):.4f}")

## 4. Phase 2: Pseudo-Labeling
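We take the test samples that the Phase 1 ensemble predicts with more than 0.90 confidence, label them with the predicted class, and add them to the training set.
This technique (pseudo-labeling) is a common way to boost performance when unlabeled test data is available.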

In [None]:
# Minimum ensemble probability a test prediction must exceed to be reused
# as a pseudo-label.
CONFIDENCE_THRESHOLD = 0.90

# Average the logits from every fold model
avg_logits = np.mean(test_preds_list, axis=0)
# Convert to probabilities
probs = torch.softmax(torch.tensor(avg_logits), dim=1).numpy()

# Select confident predictions: rows whose top class probability exceeds the threshold
high_conf_indices = np.where(np.max(probs, axis=1) > CONFIDENCE_THRESHOLD)[0]
pseudo_labels = np.argmax(probs[high_conf_indices], axis=1)

print(f"Found {len(high_conf_indices)} high-confidence test samples to add.")

# Create Pseudo-Labeled Dataset: the selected test texts paired with their
# predicted class ids
pseudo_df = test.iloc[high_conf_indices][['text']].copy()
pseudo_df['labels'] = pseudo_labels

# Combine with original train. ignore_index=True gives the result a fresh
# 0..n-1 index; without it the rows taken from `test` keep their original
# index labels, which collide with the index labels of `train`.
full_train_df = pd.concat([train[['text', 'labels']], pseudo_df], ignore_index=True)
print(f"New Training Size: {len(full_train_df)}")

## 5. Phase 3: Final Training
Train one final model on the combined dataset.

In [None]:
print("--- Starting Phase 3: Final Training ---")

# Build one more roberta-large classifier with the shared settings and fit it
# on `full_train_df`.
final_model = ClassificationModel(
    model_type="roberta",
    model_name="roberta-large",
    num_labels=2,
    args=model_args,
    use_cuda=cuda_available,
)
final_model.train_model(full_train_df)

# Predict a class id for every test message; the raw outputs are not needed here.
test_messages = test['text'].tolist()
final_predictions, _ = final_model.predict(test_messages)

In [None]:
# Translate the predicted integer class ids back into label strings.
inv_label_map = {0: 'NON_EXTREMIST', 1: 'EXTREMIST'}
final_labels = list(map(inv_label_map.__getitem__, final_predictions))

# Pair each test ID with its predicted label and write the submission file
# without the DataFrame index.
submission_columns = {
    'ID': test['ID'],
    'Extremism_Label': final_labels,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission_roberta_pro.csv", index=False)

print("Saved submission_roberta_pro.csv")