# Social Media Extremism Detection - RoBERTa Baseline

To hit 0.9+ accuracy, we need a model that understands **context**, not just keywords.
This notebook fine-tunes **RoBERTa (Robustly Optimized BERT Pretraining Approach)**, a strong pretrained baseline for text classification on a dataset of this size.

In [None]:
!pip install -q transformers datasets torch scikit-learn simpletransformers

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging
import torch

# Detect whether a CUDA device is usable; the flag is reported here and
# reused later when the model is constructed.
cuda_available = torch.cuda.is_available()

# Root logger reports INFO and above, while the "transformers" logger is
# raised to WARNING so that only its warnings and errors are shown.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

print(f"CUDA Available: {cuda_available}")

## 1. Data Preparation

In [None]:
# Load the train/test splits from the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Simple Transformers expects columns: ['text', 'labels'].
# Missing messages become empty strings; astype(str) then guarantees every
# entry is a str regardless of the dtype pandas inferred for the column.
train['text'] = train['Original_Message'].fillna("").astype(str)
test['text'] = test['Original_Message'].fillna("").astype(str)

# Map labels to integers
label_map = {'NON_EXTREMIST': 0, 'EXTREMIST': 1}
mapped_labels = train['Extremism_Label'].map(label_map)

# Series.map yields NaN for any value that is not a key of label_map (missing
# labels, different casing, stray whitespace, ...). Fail fast with a clear
# message instead of letting float/NaN labels reach the CV split and training.
unmapped_mask = mapped_labels.isna()
if unmapped_mask.any():
    unexpected_values = train.loc[unmapped_mask, 'Extremism_Label'].unique().tolist()
    raise ValueError(
        f"{int(unmapped_mask.sum())} training rows have labels outside "
        f"{list(label_map)}: {unexpected_values}"
    )
train['labels'] = mapped_labels.astype(int)

# Last expression of the cell: rendered with the notebook's rich display.
train.head()

## 2. Configuration
We use `roberta-base`. For even better results, try `roberta-large` (requires more GPU memory).

In [None]:
# Start from the library defaults, then override the options below.
# Each key is set as an attribute on `model_args`, in the order listed.
model_args = ClassificationArgs()

_arg_overrides = {
    # Optimisation settings
    "num_train_epochs": 3,
    "train_batch_size": 16,
    "eval_batch_size": 32,
    "learning_rate": 2e-5,
    "max_seq_length": 128,
    # Output / checkpoint handling
    "overwrite_output_dir": True,
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    # Multiprocessing switches
    "use_multiprocessing": False,
    "use_multiprocessing_for_evaluation": False,
    # Fixed seed so repeated runs use the same value
    "manual_seed": 42,
}
for _option, _value in _arg_overrides.items():
    setattr(model_args, _option, _value)

## 3. Training with Cross-Validation (5 Folds)
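Averaging accuracy over 5 stratified folds gives a more reliable estimate than a single train/validation split, so a good score is less likely to be luck.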

In [None]:
# Stratified 5-fold CV: every fold trains a fresh model, scores it on the
# held-out split, and predicts the test set. Per-fold test predictions are
# collected in `test_preds_list` for the majority vote in the next section.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = []
test_preds_list = []

# The test texts are identical for every fold, so build the list once.
test_texts = test['text'].tolist()

# Defined up front so the release step at the top of the loop is valid on the
# first iteration as well.
model = None

for fold, (train_idx, val_idx) in enumerate(skf.split(train, train['labels'])):
    print(f"\n--- Fold {fold+1} ---")

    train_df = train.iloc[train_idx][['text', 'labels']]
    val_df = train.iloc[val_idx][['text', 'labels']]

    # Drop this notebook's reference to the previous fold's model *before*
    # building the next one; otherwise both would be alive at the same time.
    # empty_cache() then hands PyTorch's unused cached GPU blocks back to the
    # driver. The last fold's model remains bound to `model` after the loop.
    model = None
    if cuda_available:
        torch.cuda.empty_cache()

    # Initialize Model
    model = ClassificationModel(
        "roberta",
        "roberta-base",
        num_labels=2,
        args=model_args,
        use_cuda=cuda_available
    )

    # Train
    model.train_model(train_df)

    # Evaluate (only model_outputs is used below)
    result, model_outputs, wrong_predictions = model.eval_model(val_df)

    # Calculate Accuracy: predicted class = index of the largest output per row
    preds = np.argmax(model_outputs, axis=1)
    acc = accuracy_score(val_df['labels'], preds)
    print(f"Fold {fold+1} Accuracy: {acc:.4f}")
    fold_scores.append(acc)

    # Predict on Test Set (only the hard predictions are kept for voting)
    predictions, raw_outputs = model.predict(test_texts)
    test_preds_list.append(predictions)

print(f"\nAverage Accuracy: {np.mean(fold_scores):.4f}")

## 4. Submission (Majority Vote)

In [None]:
# Stack the per-fold prediction arrays and transpose, so each row holds the
# votes that the different folds cast for one test example.
test_preds_matrix = np.array(test_preds_list).T

# Majority vote per row: the most frequent class id wins. np.argmax returns
# the first maximum, so a tie resolves to the lowest class id.
final_preds = [np.argmax(np.bincount(votes)) for votes in test_preds_matrix]

# Map back to strings
inv_label_map = {0: 'NON_EXTREMIST', 1: 'EXTREMIST'}
final_labels = list(map(inv_label_map.__getitem__, final_preds))

# Assemble the submission frame: test IDs alongside the voted label strings.
submission = pd.DataFrame({'ID': test['ID'], 'Extremism_Label': final_labels})

# Write without the index column, then confirm and preview the result.
submission.to_csv("submission_roberta.csv", index=False)
print("Saved submission_roberta.csv")
print(submission.head())