In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, recall_score, precision_recall_curve, precision_score, f1_score
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification, BertModel, XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

# Pick the compute device once at import time: CUDA when PyTorch can see a GPU,
# otherwise the CPU. Every model and batch below is moved to this device.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

class RacialHoaxDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Indexable, sized collection of texts. Each item is passed
            through ``str()`` before tokenization.
        labels: Indexable, sized collection of integer class labels aligned
            with ``texts``.
        tokenizer: Callable following the Hugging Face tokenizer ``__call__``
            API; it must return a mapping with ``input_ids`` and
            ``attention_mask`` tensors.
        max_len: Length every example is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Returns:
            dict with flattened ``input_ids`` and ``attention_mask`` tensors
            and a scalar ``labels`` tensor of dtype ``torch.long``.
        """
        text = str(self.texts[idx])

        # Call the tokenizer directly: ``encode_plus`` is deprecated in
        # Transformers in favour of ``__call__``, which accepts the same
        # keyword arguments for a single text.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' adds a leading batch axis; flatten drops it
            # so the DataLoader can stack items into [batch, max_len].
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }

class CNNModel(nn.Module):
    """Frozen encoder followed by a 1-D convolution, global max-pool and linear head.

    The encoder is only ever called under ``torch.no_grad()``, so it acts as a
    fixed feature extractor; only the convolution and the linear layer receive
    gradients.

    Args:
        bert_model: Encoder called as ``bert_model(input_ids=..., attention_mask=...)``
            whose result exposes ``last_hidden_state``.
        num_classes: Number of output logits.

    NOTE(review): ``attention_mask`` is only forwarded to the encoder, so
    padding positions still take part in the convolution and the max-pool.
    """

    def __init__(self, bert_model, num_classes=2):
        super().__init__()
        self.bert = bert_model
        # Read the encoder width from its config instead of assuming 768, so
        # encoders of other sizes work; 768 stays the fallback.
        hidden_size = getattr(getattr(bert_model, 'config', None), 'hidden_size', 768)
        self.conv1 = nn.Conv1d(hidden_size, 128, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()

    def train(self, mode=True):
        """Set train/eval mode on the head while keeping the frozen encoder in eval mode.

        Without this, ``model.train()`` would also enable the encoder's
        dropout, making the supposedly fixed features stochastic.
        """
        super().train(mode)
        if isinstance(self.bert, nn.Module):
            self.bert.eval()
        return self

    def forward(self, input_ids, attention_mask):
        """Return logits of shape ``[batch, num_classes]``."""
        # The encoder is frozen: no graph is built through it.
        with torch.no_grad():
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state.permute(0, 2, 1)  # [batch, hidden_size, seq_len]
        x = self.relu(self.conv1(x))
        x = self.pool(x).squeeze(-1)  # [batch, 128]
        x = self.dropout(x)
        x = self.fc(x)
        return x

class TransformerFFNNModel(nn.Module):
    """Frozen encoder followed by one Transformer encoder layer and a two-layer head.

    The encoder is only ever called under ``torch.no_grad()``, so only the
    extra Transformer layer and the feed-forward head receive gradients.

    Args:
        bert_model: Encoder called as ``bert_model(input_ids=..., attention_mask=...)``
            whose result exposes ``last_hidden_state``.
        num_classes: Number of output logits.
    """

    def __init__(self, bert_model, num_classes=2):
        super().__init__()
        self.bert = bert_model
        # Read the encoder width from its config instead of assuming 768;
        # 768 stays the fallback. The width must be divisible by nhead=8.
        hidden_size = getattr(getattr(bert_model, 'config', None), 'hidden_size', 768)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=8,
            dim_feedforward=2048,
            dropout=0.1,
            # Inputs are [batch, seq_len, hidden]. With the default
            # batch_first=False the layer would read axis 0 as the sequence
            # and attend across the samples of a batch.
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=1)
        self.fc1 = nn.Linear(hidden_size, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()

    def train(self, mode=True):
        """Set train/eval mode on the head while keeping the frozen encoder in eval mode.

        Without this, ``model.train()`` would also enable the encoder's
        dropout, making the supposedly fixed features stochastic.
        """
        super().train(mode)
        if isinstance(self.bert, nn.Module):
            self.bert.eval()
        return self

    def forward(self, input_ids, attention_mask):
        """Return logits of shape ``[batch, num_classes]``."""
        # The encoder is frozen: no graph is built through it.
        with torch.no_grad():
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state  # [batch, seq_len, hidden]
        # True marks positions attention must ignore (attention_mask == 0).
        padding_mask = None if attention_mask is None else attention_mask == 0
        x = self.transformer(x, src_key_padding_mask=padding_mask)  # [batch, seq_len, hidden]
        x = x[:, 0, :]  # Representation of the first token of each sequence
        x = self.dropout(x)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

class RacialHoaxDetector:
    """Train, evaluate, persist and query one binary racial-hoax classifier.

    Supported ``model_type`` values:
        * ``'bert'`` / ``'xlm-roberta'``: Hugging Face sequence-classification
          models; they are called with ``labels`` and return ``.loss`` / ``.logits``.
        * ``'cnn'`` / ``'transformer-ffnn'``: ``CNNModel`` / ``TransformerFFNNModel``
          on top of a multilingual BERT encoder; they return raw logits.

    The model is moved to the module-level ``device`` on construction.

    Attributes:
        model_type: The selected model type.
        label_column: Name of the label column in the input CSV files.
        best_accuracy: Best validation accuracy seen by ``evaluate_model``
            when called with ``save_best=True``.
        tokenizer: Tokenizer matching the selected model.
        model: The classifier.
        bert_model: The BERT encoder (only set for the custom model types).
    """

    # Model types served by a Hugging Face classification head.
    _HF_MODEL_TYPES = ('bert', 'xlm-roberta')
    # Model types that wrap a BERT encoder with a custom head.
    _CUSTOM_MODEL_TYPES = ('cnn', 'transformer-ffnn')

    def __init__(self, model_type='bert', label_column='labels'):
        """Load the tokenizer and model for ``model_type``.

        Raises:
            ValueError: If ``model_type`` is not one of the supported values.
        """
        supported = self._HF_MODEL_TYPES + self._CUSTOM_MODEL_TYPES
        if model_type not in supported:
            # Fail fast: otherwise no branch below would assign self.model and
            # the constructor would die with an unrelated AttributeError.
            raise ValueError(
                f"Unsupported model_type '{model_type}'. Expected one of: {', '.join(supported)}"
            )

        self.model_type = model_type
        self.label_column = label_column
        self.best_accuracy = 0.0

        if model_type == 'bert':
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
            self.model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
        elif model_type == 'xlm-roberta':
            self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
            self.model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=2)
        else:
            # Both custom heads share the same tokenizer and encoder.
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
            self.bert_model = BertModel.from_pretrained('bert-base-multilingual-cased')
            head_cls = CNNModel if model_type == 'cnn' else TransformerFFNNModel
            self.model = head_cls(self.bert_model)

        self.model.to(device)

    def load_and_prepare_data(self, train_file, dev_file):
        """Read the train and dev CSV files and check that both contain the label column.

        Returns:
            ``(train_df, dev_df)`` as pandas DataFrames.

        Raises:
            KeyError: If ``self.label_column`` is missing from either file.
        """
        train_df = pd.read_csv(train_file)
        dev_df = pd.read_csv(dev_file)

        for split_name, frame in (('training', train_df), ('dev', dev_df)):
            if self.label_column not in frame.columns:
                raise KeyError(
                    f"Label column '{self.label_column}' not found in {split_name} data. "
                    f"Available columns: {frame.columns}"
                )

        # Debug: Check label distribution
        print(f"Training label distribution:\n{train_df[self.label_column].value_counts()}")
        print(f"Validation label distribution:\n{dev_df[self.label_column].value_counts()}")

        return train_df, dev_df

    def create_dataloaders(self, train_df, dev_df, batch_size=16):
        """Build DataLoaders over the ``clean_text`` and label columns.

        The training loader is shuffled; the dev loader keeps row order so
        predictions line up with ``dev_df``.

        Returns:
            ``(train_loader, dev_loader)``.
        """
        train_dataset = RacialHoaxDataset(
            train_df['clean_text'].values,
            train_df[self.label_column].values,
            self.tokenizer
        )
        dev_dataset = RacialHoaxDataset(
            dev_df['clean_text'].values,
            dev_df[self.label_column].values,
            self.tokenizer
        )

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
        return train_loader, dev_loader

    def train_model(self, train_loader, dev_loader, epochs=3, learning_rate=2e-5,
                    class_weights=(0.3, 0.7), head_learning_rate=5e-4):
        """Train the model, evaluating on ``dev_loader`` after every epoch.

        Args:
            train_loader: Batches with ``input_ids``, ``attention_mask``, ``labels``.
            dev_loader: Validation batches passed to ``evaluate_model``.
            epochs: Number of passes over ``train_loader``.
            learning_rate: AdamW learning rate for the ``'bert'`` and
                ``'xlm-roberta'`` model types.
            class_weights: Per-class loss weights used for the custom model
                types. The Hugging Face types use the loss returned by the
                model itself, so these weights do not apply to them.
            head_learning_rate: AdamW learning rate for the custom model types.
        """
        uses_hf_head = self.model_type in self._HF_MODEL_TYPES
        optimizer = AdamW(
            self.model.parameters(),
            lr=learning_rate if uses_hf_head else head_learning_rate,
        )
        # Weighted loss to handle class imbalance; built once, not per batch.
        weight_tensor = torch.as_tensor(class_weights, dtype=torch.float).to(device)
        weighted_loss = nn.CrossEntropyLoss(weight=weight_tensor)

        for epoch in range(epochs):
            print(f"\nEpoch {epoch + 1}/{epochs} ({self.model_type})")
            self.model.train()
            total_loss = 0.0
            num_batches = 0

            for batch in tqdm(train_loader, desc="Training"):
                optimizer.zero_grad()
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                if uses_hf_head:
                    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                    loss = outputs.loss
                else:
                    logits = self.model(input_ids=input_ids, attention_mask=attention_mask)
                    loss = weighted_loss(logits, labels)

                total_loss += loss.item()
                num_batches += 1
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)  # Gradient clipping
                optimizer.step()

            # Guard against an empty loader instead of dividing by zero.
            avg_train_loss = total_loss / max(num_batches, 1)
            print(f"Average training loss: {avg_train_loss:.4f}")

            accuracy, dev_loss = self.evaluate_model(dev_loader, save_best=True)[:2]
            print(f"Validation accuracy: {accuracy:.4f}, Validation loss: {dev_loss:.4f}")

    def evaluate_model(self, dev_loader, save_best=False):
        """Evaluate on ``dev_loader`` and print a classification report.

        Accuracy and loss are averaged over samples, not over batches, so a
        short final batch does not skew them.

        Args:
            dev_loader: Batches with ``input_ids``, ``attention_mask``, ``labels``.
            save_best: When True, save the model to ``best_model_<model_type>``
                whenever the accuracy beats ``self.best_accuracy``.

        Returns:
            ``(accuracy, loss, true_labels, predictions, probabilities, cm)``
            where the three lists hold one entry per sample, ``probabilities``
            is the softmax score of class 1 and ``cm`` is a 2x2 confusion matrix.

        Raises:
            ValueError: If ``dev_loader`` yields no samples.
        """
        self.model.eval()
        uses_hf_head = self.model_type in self._HF_MODEL_TYPES
        unweighted_loss = nn.CrossEntropyLoss()
        total_eval_loss = 0.0
        predictions = []
        true_labels = []
        probabilities = []

        with torch.no_grad():
            for batch in tqdm(dev_loader, desc="Evaluating"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                if uses_hf_head:
                    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                    loss = outputs.loss
                    logits = outputs.logits
                else:
                    logits = self.model(input_ids=input_ids, attention_mask=attention_mask)
                    loss = unweighted_loss(logits, labels)

                # Each loss is treated as a per-batch mean (the Hugging Face
                # loss is assumed to be one as well); scale by the batch size
                # so the final average is per sample.
                total_eval_loss += loss.item() * labels.size(0)
                probs = torch.softmax(logits, dim=1)

                predictions.extend(torch.argmax(probs, dim=1).cpu().numpy())
                true_labels.extend(labels.cpu().numpy())
                probabilities.extend(probs[:, 1].cpu().numpy())

        num_samples = len(true_labels)
        if num_samples == 0:
            raise ValueError("dev_loader yielded no samples; nothing to evaluate")

        # Debug: check predictions and probabilities over the whole set; a
        # single small batch can legitimately contain only one class.
        if len(set(predictions)) == 1:
            print(f"Warning: Model {self.model_type} predicts only one class: {set(predictions)}")
        if np.any(np.isnan(probabilities)):
            print(f"Warning: Model {self.model_type} produces NaN probabilities")

        avg_accuracy = float(np.mean(np.asarray(predictions) == np.asarray(true_labels)))
        avg_loss = total_eval_loss / num_samples
        # Fix the label set so the matrix is 2x2 even if one class is absent.
        cm = confusion_matrix(true_labels, predictions, labels=[0, 1])

        if save_best and avg_accuracy > self.best_accuracy:
            self.best_accuracy = avg_accuracy
            self.save_model(f'best_model_{self.model_type}')
            print(f"New best model saved with accuracy: {avg_accuracy:.4f}")

        print("\nClassification Report:")
        report = classification_report(true_labels, predictions, zero_division=0)
        print(report)
        print("\nConfusion Matrix:")
        print(cm)
        print(f"Model {self.model_type} predictions: {np.bincount(predictions, minlength=2)}")

        return avg_accuracy, avg_loss, true_labels, predictions, probabilities, cm

    def predict(self, text, max_len=128):
        """Classify a single text.

        Args:
            text: The text to classify.
            max_len: Length the text is padded or truncated to.

        Returns:
            ``(prediction, probs)``: the arg-max class index and the softmax
            probabilities of all classes as a NumPy array.
        """
        self.model.eval()
        # ``encode_plus`` is deprecated in favour of calling the tokenizer.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        with torch.no_grad():
            if self.model_type in self._HF_MODEL_TYPES:
                logits = self.model(input_ids=input_ids, attention_mask=attention_mask).logits
            else:
                logits = self.model(input_ids=input_ids, attention_mask=attention_mask)
            probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
            prediction = np.argmax(probs)

        return prediction, probs

    def save_model(self, output_dir):
        """Save the model under ``output_dir``, creating the directory if needed.

        Hugging Face models are saved with ``save_pretrained`` together with
        their tokenizer; custom models are saved as a ``model.pt`` state dict.
        """
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(output_dir, exist_ok=True)
        if self.model_type in self._HF_MODEL_TYPES:
            self.model.save_pretrained(output_dir)
            self.tokenizer.save_pretrained(output_dir)
        else:
            torch.save(self.model.state_dict(), os.path.join(output_dir, 'model.pt'))
        print(f"Model saved to {output_dir}")

    def save_predictions(self, dev_df, predictions, output_file):
        """Write text, gold label and prediction columns to ``output_file`` as CSV.

        ``dev_df`` itself is left unmodified.

        Args:
            dev_df: DataFrame with ``clean_text`` and the label column.
            predictions: One prediction per row of ``dev_df``.
            output_file: Destination CSV path.
        """
        # assign() returns a new frame, so the caller's DataFrame does not
        # silently gain a 'prediction' column.
        output_df = dev_df[['clean_text', self.label_column]].assign(prediction=predictions)
        output_df.to_csv(output_file, index=False)
        print(f"Predictions saved to {output_file}")
        # Debug: Check prediction distribution
        print(f"Prediction distribution for {self.model_type}:\n{output_df['prediction'].value_counts()}")

def plot_metrics(models_results):
    """Plot and tabulate evaluation results for every model.

    Args:
        models_results: Mapping of model name to a dict with the keys
            ``'accuracy'``, ``'true_labels'``, ``'predictions'``,
            ``'probabilities'`` and ``'confusion_matrix'``.

    Writes to the current working directory:
        ``confusion_matrices.png``, ``roc_curves.png``,
        ``precision_recall_curves.png``, ``recall_comparison.png``,
        ``model_comparison.csv``, ``model_comparison_report.txt`` and
        ``model_comparison.png``.
    """
    # Plot Confusion Matrices
    n_cols = 2
    # Keep the 2x2 layout for up to four models and add rows only beyond
    # that, so more than four models no longer overflow the subplot grid.
    n_rows = max(2, (len(models_results) + n_cols - 1) // n_cols)
    plt.figure(figsize=(15, 5 * n_rows))
    for position, (model_name, results) in enumerate(models_results.items(), 1):
        plt.subplot(n_rows, n_cols, position)
        sns.heatmap(results['confusion_matrix'], annot=True, fmt='d', cmap='Blues')
        plt.title(f'Confusion Matrix - {model_name}')
        plt.xlabel('Predicted')
        plt.ylabel('True')
    plt.tight_layout()
    plt.savefig('confusion_matrices.png')
    plt.close()

    # Plot ROC Curves; remember each AUC so it is computed only once.
    auc_by_model = {}
    plt.figure(figsize=(10, 8))
    for model_name, results in models_results.items():
        fpr, tpr, _ = roc_curve(results['true_labels'], results['probabilities'])
        roc_auc = auc(fpr, tpr)
        auc_by_model[model_name] = roc_auc
        plt.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')  # Chance-level diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend()
    plt.savefig('roc_curves.png')
    plt.close()

    # Plot Precision-Recall Curves
    plt.figure(figsize=(10, 8))
    for model_name, results in models_results.items():
        precision, recall, _ = precision_recall_curve(results['true_labels'], results['probabilities'])
        plt.plot(recall, precision, label=f'{model_name}')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curves')
    plt.legend()
    plt.savefig('precision_recall_curves.png')
    plt.close()

    # Create Comparison Table and Report
    comparison_data = {
        'Model': [],
        'Accuracy': [],
        'Precision': [],
        'Recall': [],
        'F1-Score': [],
        'AUC': []
    }
    report_lines = ["Model Comparison Report\n" + "="*50 + "\n"]
    for model_name, results in models_results.items():
        true_labels = results['true_labels']
        predictions = results['predictions']
        accuracy = results['accuracy']
        precision = precision_score(true_labels, predictions, zero_division=0)
        recall = recall_score(true_labels, predictions, zero_division=0)
        f1 = f1_score(true_labels, predictions, zero_division=0)
        roc_auc = auc_by_model[model_name]

        comparison_data['Model'].append(model_name)
        comparison_data['Accuracy'].append(accuracy)
        comparison_data['Precision'].append(precision)
        comparison_data['Recall'].append(recall)
        comparison_data['F1-Score'].append(f1)
        comparison_data['AUC'].append(roc_auc)

        report_lines.extend([
            f"Model: {model_name}\n",
            f"Accuracy: {accuracy:.4f}\n",
            f"Precision: {precision:.4f}\n",
            f"Recall: {recall:.4f}\n",
            f"F1-Score: {f1:.4f}\n",
            f"AUC: {roc_auc:.4f}\n",
            "-"*50 + "\n",
        ])

    comparison_df = pd.DataFrame(comparison_data)
    comparison_df.to_csv('model_comparison.csv', index=False)

    # Save Comparison Report
    with open('model_comparison_report.txt', 'w', encoding='utf-8') as f:
        f.writelines(report_lines)

    # Plot Recall Comparison (reuses the recall values from the table above)
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Model', y='Recall', data=comparison_df[['Model', 'Recall']])
    plt.title('Recall Comparison')
    plt.ylabel('Recall')
    plt.savefig('recall_comparison.png')
    plt.close()

    # Plot Model Comparison
    plt.figure(figsize=(15, 8))
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC']
    for i, metric in enumerate(metrics, 1):
        plt.subplot(2, 3, i)
        sns.barplot(x='Model', y=metric, data=comparison_df)
        plt.title(f'{metric} Comparison')
    plt.tight_layout()
    plt.savefig('model_comparison.png')
    plt.close()

def main():
    """Train and evaluate every model type, compare them, and print a sample prediction.

    For each model type this loads the CSV data, trains, evaluates on the dev
    set, stores the results for ``plot_metrics`` and writes
    ``predictions_<model_type>.csv``. The sample prediction is produced by
    the trained ``'bert'`` detector and printed after the comparison plots.
    """
    model_types = ['bert', 'xlm-roberta', 'cnn', 'transformer-ffnn']
    models_results = {}

    train_file = '/content/Racial_train.csv'
    dev_file = '/content/Racial_val.csv'

    # Example prediction input and the model used to answer it.
    sample_text = "ahamadtalwar ki nok par tumahre amao ne saya khol diya tha"
    sample_model_type = 'bert'
    sample_result = None

    for model_type in model_types:
        print(f"\nTraining {model_type} model...")
        detector = RacialHoaxDetector(model_type=model_type, label_column='labels')
        train_df, dev_df = detector.load_and_prepare_data(train_file, dev_file)
        train_loader, dev_loader = detector.create_dataloaders(train_df, dev_df)

        detector.train_model(train_loader, dev_loader)
        accuracy, dev_loss, true_labels, predictions, probabilities, cm = detector.evaluate_model(dev_loader)

        models_results[model_type] = {
            'accuracy': accuracy,
            'dev_loss': dev_loss,
            'true_labels': true_labels,
            'predictions': predictions,
            'probabilities': probabilities,
            'confusion_matrix': cm
        }

        detector.save_predictions(dev_df, predictions, f'predictions_{model_type}.csv')

        if model_type == sample_model_type:
            # Predict while the trained detector is still alive. Constructing
            # a new RacialHoaxDetector afterwards would reload the pretrained
            # checkpoint without any of the fine-tuning done above.
            sample_result = detector.predict(sample_text)

    plot_metrics(models_results)

    # Example prediction
    if sample_result is not None:
        prediction, probability = sample_result
        print("\nSample Prediction:")
        print(f"Text: {sample_text}")
        print(f"Prediction: {'Racial Hoax' if prediction == 1 else 'Not Racial Hoax'}")
        print(f"Confidence: {max(probability)*100:.2f}%")


if __name__ == "__main__":
    main()