<a href="https://colab.research.google.com/github/vnavya2004/BTP/blob/main/Semisupervised_cotraining_bangla.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from tqdm import tqdm
from google.colab import files

# Load the XLM-RoBERTa tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Assuming you're using Google Colab and uploaded a file
uploaded = files.upload()

# Read the Excel file
df = pd.read_excel(pd.ExcelFile(list(uploaded.keys())[0]), header=0)
# Define the columns for features (tweets) and labels
tweets_column = 'tweets'
labels_column = 'labels'

# Convert the labels to numerical values
NUM_LABELS = len(df[labels_column].unique())
possible_labels = df[labels_column].unique()
label_dict = {possible_label: index for index, possible_label in enumerate(possible_labels)}
df[labels_column] = df[labels_column].map(label_dict)

# Split the dataset into labeled, unlabeled, and test sets
labeled_data, temp_data = train_test_split(df, test_size=0.6, stratify=df[labels_column])
unlabeled_data, test_data = train_test_split(temp_data, test_size=1/3, stratify=temp_data[labels_column])

# Tokenize the dataset
def tokenize_data(data):
    return tokenizer.batch_encode_plus(
        data[tweets_column].tolist(),
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True,
        max_length=256,
        return_tensors='pt'
    )

# Tokenize labeled, unlabeled, and test data
encoded_labeled = tokenize_data(labeled_data)
encoded_unlabeled = tokenize_data(unlabeled_data)
encoded_test = tokenize_data(test_data)

# Convert labels to tensors
labels_labeled = torch.tensor(labeled_data[labels_column].values)
labels_unlabeled = torch.tensor(unlabeled_data[labels_column].values)
labels_test = torch.tensor(test_data[labels_column].values)

# Create TensorDatasets
dataset_labeled = TensorDataset(encoded_labeled['input_ids'], encoded_labeled['attention_mask'], labels_labeled)
dataset_unlabeled = TensorDataset(encoded_unlabeled['input_ids'], encoded_unlabeled['attention_mask'], labels_unlabeled)
dataset_test = TensorDataset(encoded_test['input_ids'], encoded_test['attention_mask'], labels_test)

# Initialize teacher and student models
teacher_model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=NUM_LABELS)
student_model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=NUM_LABELS)

# Set up the device for training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
teacher_model.to(device)
student_model.to(device)

# Optimizers and schedulers for both models
teacher_optimizer = AdamW(teacher_model.parameters(), lr=1e-5, eps=1e-8)
student_optimizer = AdamW(student_model.parameters(), lr=1e-5, eps=1e-8)

epochs = 5
batch_size = 4

teacher_scheduler = get_linear_schedule_with_warmup(teacher_optimizer, num_warmup_steps=0, num_training_steps=len(dataset_labeled)*epochs)
student_scheduler = get_linear_schedule_with_warmup(student_optimizer, num_warmup_steps=0, num_training_steps=len(dataset_labeled)*epochs)

# Training loop
def train_model(dataset, model, optimizer, scheduler, epochs=5):
    for epoch in range(1, epochs + 1):
        model.train()
        loss_train_total = 0
        progress_bar = tqdm(DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size), desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
        for batch in progress_bar:
            model.zero_grad()
            batch = tuple(b.to(device) for b in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
            outputs = model(**inputs)
            loss = outputs[0]
            loss_train_total += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})
        loss_train_avg = loss_train_total / len(dataset)
        tqdm.write('\nEpoch {epoch}')
        tqdm.write(f'Training loss: {loss_train_avg}')

# Function to evaluate the model
def evaluate_model(model, dataloader):
    model.eval()
    loss_val_total = 0
    predictions, true_vals = [], []
    for batch in tqdm(dataloader, desc='Evaluating', leave=False, disable=False):
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        with torch.no_grad():
            outputs = model(**inputs)
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    loss_val_avg = loss_val_total / len(dataloader)
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
    return loss_val_avg, predictions, true_vals

# Metrics functions
def binary_accuracy(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(labels_flat, preds_flat)

def binary_f1_score(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def binary_precision(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return precision_score(labels_flat, preds_flat, average='weighted')

def binary_recall(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return recall_score(labels_flat, preds_flat, average='weighted')

# Train teacher and student on labeled data
train_model(dataset_labeled, teacher_model, teacher_optimizer, teacher_scheduler, epochs=epochs)
train_model(dataset_labeled, student_model, student_optimizer, student_scheduler, epochs=epochs)

# Generate pseudo-labels with the teacher model
teacher_model.eval()
unlabeled_dataloader = DataLoader(dataset_unlabeled, sampler=SequentialSampler(dataset_unlabeled), batch_size=batch_size)
teacher_predictions = []
teacher_confidences = []

for batch in tqdm(unlabeled_dataloader, desc='Teacher Predicting', leave=False, disable=False):
    batch = tuple(b.to(device) for b in batch)
    inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
    with torch.no_grad():
        outputs = teacher_model(**inputs)
    logits = outputs[0]
    probs = torch.softmax(logits, dim=1)
    confidence, preds = torch.max(probs, dim=1)
    teacher_predictions.extend(preds.cpu().numpy())
    teacher_confidences.extend(confidence.cpu().numpy())

# Generate pseudo-labels with the student model
student_model.eval()
student_predictions = []
student_confidences = []

for batch in tqdm(unlabeled_dataloader, desc='Student Predicting', leave=False, disable=False):
    batch = tuple(b.to(device) for b in batch)
    inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
    with torch.no_grad():
        outputs = student_model(**inputs)
    logits = outputs[0]
    probs = torch.softmax(logits, dim=1)
    confidence, preds = torch.max(probs, dim=1)
    student_predictions.extend(preds.cpu().numpy())
    student_confidences.extend(confidence.cpu().numpy())

# Convert predictions and confidences to tensors
teacher_predictions = torch.tensor(teacher_predictions)
teacher_confidences = torch.tensor(teacher_confidences)

student_predictions = torch.tensor(student_predictions)
student_confidences = torch.tensor(student_confidences)

# Select samples with high confidence from teacher and student predictions
teacher_high_confidence_idx = torch.where(teacher_confidences > 0.9)[0]
student_high_confidence_idx = torch.where(student_confidences > 0.9)[0]

# Swap high-confidence pseudo-labels between teacher and student
teacher_pseudo_labels = student_predictions[student_high_confidence_idx]
student_pseudo_labels = teacher_predictions[teacher_high_confidence_idx]

teacher_dataset_pseudo = TensorDataset(
    encoded_unlabeled['input_ids'][student_high_confidence_idx],
    encoded_unlabeled['attention_mask'][student_high_confidence_idx],
    teacher_pseudo_labels
)

student_dataset_pseudo = TensorDataset(
    encoded_unlabeled['input_ids'][teacher_high_confidence_idx],
    encoded_unlabeled['attention_mask'][teacher_high_confidence_idx],
    student_pseudo_labels
)

# Combine labeled data and pseudo-labeled data
combined_teacher_dataset = TensorDataset(
    torch.cat((encoded_labeled['input_ids'], encoded_unlabeled['input_ids'][student_high_confidence_idx]), dim=0),
    torch.cat((encoded_labeled['attention_mask'], encoded_unlabeled['attention_mask'][student_high_confidence_idx]), dim=0),
    torch.cat((labels_labeled, teacher_pseudo_labels), dim=0)
)

combined_student_dataset = TensorDataset(
    torch.cat((encoded_labeled['input_ids'], encoded_unlabeled['input_ids'][teacher_high_confidence_idx]), dim=0),
    torch.cat((encoded_labeled['attention_mask'], encoded_unlabeled['attention_mask'][teacher_high_confidence_idx]), dim=0),
    torch.cat((labels_labeled, student_pseudo_labels), dim=0)
)

# Train teacher and student on combined datasets
train_model(combined_teacher_dataset, teacher_model, teacher_optimizer, teacher_scheduler, epochs=epochs)
train_model(combined_student_dataset, student_model, student_optimizer, student_scheduler, epochs=epochs)

# Evaluate both models on the test set
test_dataloader = DataLoader(dataset_test, sampler=SequentialSampler(dataset_test), batch_size=batch_size)
teacher_test_loss, teacher_test_predictions, test_true_vals = evaluate_model(teacher_model, test_dataloader)
student_test_loss, student_test_predictions, test_true_vals = evaluate_model(student_model, test_dataloader)

# Combine predictions from both models on the test set
final_predictions = (teacher_test_predictions + student_test_predictions) / 2

# Calculate evaluation metrics on test data
test_accuracy = binary_accuracy(final_predictions, test_true_vals)
test_f1 = binary_f1_score(final_predictions, test_true_vals)
test_precision = binary_precision(final_predictions, test_true_vals)
test_recall = binary_recall(final_predictions, test_true_vals)

# Print out the evaluation metrics on test data
print(f'Testing Accuracy: {test_accuracy}')
print(f'Testing F1 Score: {test_f1}')
print(f'Testing Precision: {test_precision}')
print(f'Testing Recall: {test_recall}')


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Saving Bangla.xlsx to Bangla (1).xlsx




model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch {epoch}
Training loss: 0.17397943158010365





Epoch {epoch}
Training loss: 0.16057043974175386





Epoch {epoch}
Training loss: 0.15433111665663907





Epoch {epoch}
Training loss: 0.1574710117797385





Epoch {epoch}
Training loss: 0.15011202290256564





Epoch {epoch}
Training loss: 0.17470145661009917





Epoch {epoch}
Training loss: 0.16062375134403853





Epoch {epoch}
Training loss: 0.15930448542556133





Epoch {epoch}
Training loss: 0.1294584636691262





Epoch {epoch}
Training loss: 0.12790745736976666





Epoch {epoch}
Training loss: 0.12180481197779605





Epoch {epoch}
Training loss: 0.11179176526719246





Epoch {epoch}
Training loss: 0.10319783735228188





Epoch {epoch}
Training loss: 0.10361091929059314





Epoch {epoch}
Training loss: 0.08452312224518878





Epoch {epoch}
Training loss: 0.11393253872432675





Epoch {epoch}
Training loss: 0.1089677065125099





Epoch {epoch}
Training loss: 0.09231917556838845





Epoch {epoch}
Training loss: 0.09348069843407596





Epoch {epoch}
Training loss: 0.0933826210366072


                                                           

Testing Accuracy: 0.7233502538071066
Testing F1 Score: 0.7231344486348837
Testing Precision: 0.7240488006617039
Testing Recall: 0.7233502538071066


