<a href="https://colab.research.google.com/github/vnavya2004/BTP/blob/main/l%2Cul_Mteacherstudent_bangla_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import pandas as pd
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from tqdm import tqdm
from google.colab import files
from transformers import AutoModelForSequenceClassification

# Load the XLM-RoBERTa tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Upload the dataset through the Colab file picker (only the first uploaded file is read)
uploaded = files.upload()

# Read the Excel file and keep a reproducible 40% subsample of its rows
df = pd.read_excel(pd.ExcelFile(list(uploaded.keys())[0]), header=0)
df = df.sample(frac=0.4, random_state=42)

tweets_column = 'tweets'
labels_column = 'labels'

# Map every distinct raw label to an integer class id, in order of first appearance.
# Note: labels_column is 'labels', so the raw label column is overwritten with the ids.
possible_labels = df[labels_column].unique()
NUM_LABELS = len(possible_labels)
label_dict = {possible_label: index for index, possible_label in enumerate(possible_labels)}
df['labels'] = df[labels_column].map(label_dict)

# Seed for the stratified splits so that re-running the cell yields the same partitions
SPLIT_SEED = 42
# Every sequence is padded/truncated to this many tokens
MAX_SEQ_LENGTH = 256

# Split the dataset into labeled (20%), unlabeled (60%), and test (20%) sets
df_labeled, df_temp = train_test_split(
    df, stratify=df[labels_column], test_size=0.8, random_state=SPLIT_SEED
)
df_unlabeled, df_test = train_test_split(
    df_temp, stratify=df_temp[labels_column], test_size=0.25, random_state=SPLIT_SEED
)


def encode_texts(texts, max_length=MAX_SEQ_LENGTH):
    """Tokenize a list of strings into fixed-length PyTorch tensors.

    Args:
        texts: list of raw text strings.
        max_length: number of tokens every sequence is padded or truncated to.

    Returns:
        The tokenizer's batch encoding, holding 'input_ids' and 'attention_mask'.
    """
    return tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        # Explicit replacements for the deprecated `pad_to_max_length=True`, which
        # also relied on an implicit truncation default (and emitted a warning).
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )


# Tokenize the labeled data for training
encoded_data_train = encode_texts(df_labeled[tweets_column].tolist())
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df_labeled['labels'].values)

# Tokenize the unlabeled data (its labels are deliberately not used)
encoded_data_unlabeled = encode_texts(df_unlabeled[tweets_column].tolist())
input_ids_unlabeled = encoded_data_unlabeled['input_ids']
attention_masks_unlabeled = encoded_data_unlabeled['attention_mask']

# Tokenize the test data
encoded_data_test = encode_texts(df_test[tweets_column].tolist())
input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(df_test['labels'].values)

# Wrap the tensors as datasets; the unlabeled dataset carries no label tensor
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_unlabeled = TensorDataset(input_ids_unlabeled, attention_masks_unlabeled)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]



Saving Bangla.xlsx to Bangla.xlsx


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Student and teacher are two separate XLM-RoBERTa sequence classifiers loaded
# from the same pretrained checkpoint, each sized for NUM_LABELS output classes
student_model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=NUM_LABELS)
teacher_model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=NUM_LABELS)
for network in (student_model, teacher_model):
    network.to(device)

# Mini-batch size shared by every DataLoader in this notebook
batch_size = 4

# 1. Pre-train the teacher model on labeled data for 1 epoch

# Build the labeled-data loader first: its length is the number of optimizer
# steps in the epoch, which is what the linear schedule must be sized with
# (sizing it with the number of samples would barely decay the learning rate).
teacher_pretrain_loader = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)
teacher_pretrain_steps = len(teacher_pretrain_loader)

# Set up the optimizer and scheduler for the teacher's pre-training
teacher_optimizer = AdamW(teacher_model.parameters(), lr=1e-5, eps=1e-8)
teacher_scheduler = get_linear_schedule_with_warmup(
    teacher_optimizer,
    num_warmup_steps=0,
    num_training_steps=teacher_pretrain_steps,
)

teacher_model.train()
pretrain_loss_total = 0

# Pre-training loop for 1 epoch on labeled data
tqdm.write("\nStarting Teacher Pre-training (1 epoch on labeled data)")
progress_bar = tqdm(teacher_pretrain_loader, desc='Teacher Pre-training', leave=False, disable=False)

for step, batch in enumerate(progress_bar, start=1):
    teacher_model.zero_grad()
    batch = tuple(b.to(device) for b in batch)
    inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

    outputs_teacher = teacher_model(**inputs)
    loss_teacher = outputs_teacher.loss  # Supervised loss for teacher
    pretrain_loss_total += loss_teacher.item()

    loss_teacher.backward()
    torch.nn.utils.clip_grad_norm_(teacher_model.parameters(), 1.0)
    teacher_optimizer.step()
    teacher_scheduler.step()

    # Show the running mean loss while the bar is still active
    progress_bar.set_postfix({'teacher_pretrain_loss': '{:.3f}'.format(pretrain_loss_total / step)})

# Each accumulated value is already a per-batch mean, so average over batches
# (not samples); max() guards against an empty labeled set.
pretrain_loss_avg = pretrain_loss_total / max(teacher_pretrain_steps, 1)
tqdm.write(f'Teacher Pre-training Loss: {pretrain_loss_avg}')


# Set up the optimizer and scheduler for the student
optimizer = AdamW(student_model.parameters(), lr=1e-5, eps=1e-8)
epochs = 7
# The student takes one optimizer step per labeled batch, so the schedule length
# is (batches per epoch) * epochs; ceil-division accounts for a final partial batch.
steps_per_epoch = (len(dataset_train) + batch_size - 1) // batch_size
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=steps_per_epoch * epochs,
)

# Define the evaluation metrics (accuracy, F1, precision, recall)
def compute_metrics(preds, labels):
    """Compute accuracy and weighted F1 / precision / recall from raw scores.

    Args:
        preds: 2-D array of per-class scores; the predicted class of each row
            is the argmax along axis 1.
        labels: array of true class ids; it is flattened before scoring.

    Returns:
        Tuple ``(accuracy, f1, precision, recall)``; the last three are
        weighted averages over classes.
    """
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    accuracy = accuracy_score(labels_flat, preds_flat)
    # zero_division=0 keeps the value scikit-learn already falls back to for a
    # class with no predicted/true samples, without the UndefinedMetricWarning.
    f1 = f1_score(labels_flat, preds_flat, average='weighted', zero_division=0)
    precision = precision_score(labels_flat, preds_flat, average='weighted', zero_division=0)
    recall = recall_score(labels_flat, preds_flat, average='weighted', zero_division=0)
    return accuracy, f1, precision, recall

# EMA decay (alpha) schedule used by the Mean Teacher training loop below
def calculate_alpha(epoch, total_epochs, base_alpha=0.99, final_alpha=0.999):
    """
    Linearly interpolate the EMA decay for the given epoch.

    The result equals base_alpha when epoch is 0 and final_alpha when epoch
    equals total_epochs, moving in equal increments per epoch in between.
    """
    progress = epoch / total_epochs
    alpha_span = final_alpha - base_alpha
    return base_alpha + alpha_span * progress
# Mean Teacher training: the student is trained with a supervised loss on labeled
# batches plus a consistency loss against the teacher on unlabeled batches, and
# the teacher's parameters track an exponential moving average (EMA) of the student's.
base_alpha = 0.95  # Starting alpha
final_alpha = 0.999  # Final alpha


def _cycle_batches(loader):
    """Yield batches from `loader` forever, starting a new pass whenever one ends."""
    while True:
        for item in loader:
            yield item


if len(dataset_unlabeled) == 0:
    raise ValueError('dataset_unlabeled is empty; the consistency loss needs unlabeled batches')

# Build every loader once. The RandomSampler reshuffles on each new pass, and a
# persistent iterator over the unlabeled loader avoids constructing a DataLoader
# (and a full random permutation) for every single training step.
train_loader = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)
unlabeled_loader = DataLoader(dataset_unlabeled, sampler=RandomSampler(dataset_unlabeled), batch_size=batch_size)
unlabeled_batches = _cycle_batches(unlabeled_loader)
eval_loader = DataLoader(dataset_test, sampler=SequentialSampler(dataset_test), batch_size=batch_size)

for epoch in range(1, epochs + 1):
    student_model.train()
    # Keep the teacher in eval mode for every epoch. Previously it was in train
    # mode during epoch 1 and in eval mode afterwards, only because the
    # evaluation step below switched it and nothing switched it back.
    teacher_model.eval()
    loss_train_total = 0
    progress_bar = tqdm(train_loader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    alpha = calculate_alpha(epoch, epochs, base_alpha, final_alpha)

    for batch in progress_bar:
        student_model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

        # Supervised loss on labeled data
        outputs_student = student_model(**inputs)
        loss_supervised = outputs_student.loss
        loss_train_total += loss_supervised.item()

        # Next unlabeled batch for the consistency loss
        unlabeled_batch = tuple(b.to(device) for b in next(unlabeled_batches))

        # Teacher predictions are fixed targets, so no gradient flows through them
        with torch.no_grad():
            outputs_teacher = teacher_model(input_ids=unlabeled_batch[0], attention_mask=unlabeled_batch[1])
            logits_teacher = outputs_teacher.logits

        outputs_student_unlabeled = student_model(input_ids=unlabeled_batch[0], attention_mask=unlabeled_batch[1])
        logits_student_unlabeled = outputs_student_unlabeled.logits

        # Consistency loss between student and teacher predictions on unlabeled data
        consistency_loss = F.mse_loss(logits_student_unlabeled, logits_teacher)

        # Total loss: Supervised (on labeled data) + Consistency loss (on unlabeled data)
        total_loss = loss_supervised + consistency_loss
        total_loss.backward()

        torch.nn.utils.clip_grad_norm_(student_model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        # EMA update after every student step, done in place:
        # teacher = alpha * teacher + (1 - alpha) * student
        with torch.no_grad():
            for teacher_param, student_param in zip(teacher_model.parameters(), student_model.parameters()):
                teacher_param.mul_(alpha).add_(student_param, alpha=1.0 - alpha)

        # Both loss terms are already batch means, so report the value as is
        # (len(batch) is the number of tensors in the tuple, not the batch size).
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(total_loss.item())})

    # Average the supervised loss over batches; max() guards an empty loader
    loss_train_avg = loss_train_total / max(len(train_loader), 1)
    tqdm.write(f'\nEpoch {epoch}')
    tqdm.write(f'Training loss: {loss_train_avg}')

    # Per-epoch evaluation of both models.
    # NOTE(review): this runs on dataset_test; there is no separate validation
    # split, so the "validation" numbers below are test-set numbers.
    student_model.eval()
    loss_val_total = 0
    predictions_student, true_vals = [], []
    predictions_teacher = []

    for batch in tqdm(eval_loader, desc='Evaluating', leave=False, disable=False):
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        with torch.no_grad():
            outputs_student = student_model(**inputs)
            outputs_teacher = teacher_model(**inputs)

        # The reported validation loss is the student's loss only
        loss_val_total += outputs_student.loss.item()

        predictions_student.append(outputs_student.logits.detach().cpu().numpy())
        predictions_teacher.append(outputs_teacher.logits.detach().cpu().numpy())
        true_vals.append(inputs['labels'].cpu().numpy())

    loss_val_avg = loss_val_total / max(len(eval_loader), 1)

    predictions_student = np.concatenate(predictions_student, axis=0)
    predictions_teacher = np.concatenate(predictions_teacher, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    val_accuracy_student, val_f1_student, val_precision_student, val_recall_student = compute_metrics(predictions_student, true_vals)
    val_accuracy_teacher, val_f1_teacher, val_precision_teacher, val_recall_teacher = compute_metrics(predictions_teacher, true_vals)

    tqdm.write(f'Validation loss: {loss_val_avg}')
    tqdm.write(f'Student Model - Accuracy: {val_accuracy_student}, F1 Score: {val_f1_student}, Precision: {val_precision_student}, Recall: {val_recall_student}')
    tqdm.write(f'Teacher Model - Accuracy: {val_accuracy_teacher}, F1 Score: {val_f1_teacher}, Precision: {val_precision_teacher}, Recall: {val_recall_teacher}')

# The EMA teacher is always taken as the final model; no metric-based
# selection between student and teacher is performed here.
final_model = teacher_model
# Evaluation on test data: iterate in dataset order so the returned predictions
# line up row-for-row with dataset_test (shuffling brings nothing at evaluation time).
dataloader_test = DataLoader(dataset_test, sampler=SequentialSampler(dataset_test), batch_size=batch_size)

def evaluate_test(model, dataloader):
    """Run `model` over `dataloader` without gradients and collect its outputs.

    Each batch is expected to provide input ids, attention mask and labels at
    positions 0, 1 and 2.

    Returns:
        Tuple ``(mean loss per batch, logits stacked along axis 0,
        labels stacked along axis 0)``.
    """
    model.eval()
    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for raw_batch in tqdm(dataloader, desc='Testing', leave=False, disable=False):
        moved = tuple(tensor.to(device) for tensor in raw_batch)
        model_inputs = {'input_ids': moved[0], 'attention_mask': moved[1], 'labels': moved[2]}
        with torch.no_grad():
            outputs = model(**model_inputs)
        # With labels supplied, index 0 is the loss and index 1 the logits
        batch_loss, batch_logits = outputs[0], outputs[1]
        running_loss += batch_loss.item()
        logit_chunks.append(batch_logits.detach().cpu().numpy())
        label_chunks.append(model_inputs['labels'].cpu().numpy())

    loss_test_avg = running_loss / len(dataloader)

    stacked_logits = np.concatenate(logit_chunks, axis=0)
    stacked_labels = np.concatenate(label_chunks, axis=0)

    return loss_test_avg, stacked_logits, stacked_labels

# Score the final model on the test loader
test_loss, test_predictions, test_true_vals = evaluate_test(final_model, dataloader_test)

# Turn the collected logits and labels into summary metrics
test_accuracy, test_f1, test_precision, test_recall = compute_metrics(test_predictions, test_true_vals)

# Report each metric on its own line
test_report = (
    ('Accuracy', test_accuracy),
    ('F1 Score', test_f1),
    ('Precision', test_precision),
    ('Recall', test_recall),
)
for metric_name, metric_value in test_report:
    print(f'Testing {metric_name}: {metric_value}')

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting Teacher Pre-training (1 epoch on labeled data)




Teacher Pre-training Loss: 0.17816184090960557





Epoch 1
Training loss: 0.17731955172909294


  _warn_prf(average, modifier, msg_start, len(result))


Validation loss: 0.17516126398798787
Student Model - Accuracy: 0.5443037974683544, F1 Score: 0.4087534863763141, Precision: 0.7573565674831497, Recall: 0.5443037974683544
Teacher Model - Accuracy: 0.5189873417721519, F1 Score: 0.35464135021097043, Precision: 0.2693478609197244, Recall: 0.5189873417721519





Epoch 2
Training loss: 0.17624935147109305




Validation loss: 0.17113488912582397
Student Model - Accuracy: 0.6139240506329114, F1 Score: 0.6118294377178868, Precision: 0.6209777375467911, Recall: 0.6139240506329114
Teacher Model - Accuracy: 0.5886075949367089, F1 Score: 0.5125340198647322, Precision: 0.6869082133257289, Recall: 0.5886075949367089





Epoch 3
Training loss: 0.16172948536599518




Validation loss: 0.16333497325076332
Student Model - Accuracy: 0.6645569620253164, F1 Score: 0.6611893064394226, Precision: 0.6670177553588613, Recall: 0.6645569620253164
Teacher Model - Accuracy: 0.6139240506329114, F1 Score: 0.6089648725531236, Precision: 0.6153199427523783, Recall: 0.6139240506329114





Epoch 4
Training loss: 0.13895131789954604




Validation loss: 0.16373400974877272
Student Model - Accuracy: 0.6582278481012658, F1 Score: 0.6521738227765057, Precision: 0.6638847664775207, Recall: 0.6582278481012658
Teacher Model - Accuracy: 0.6455696202531646, F1 Score: 0.6432744583315402, Precision: 0.6462158736512066, Recall: 0.6455696202531646





Epoch 5
Training loss: 0.11842011987783346




Validation loss: 0.16561629632605782
Student Model - Accuracy: 0.6645569620253164, F1 Score: 0.6602481679559927, Precision: 0.6682632962735563, Recall: 0.6645569620253164
Teacher Model - Accuracy: 0.6772151898734177, F1 Score: 0.6754537625979785, Precision: 0.6781401104091449, Recall: 0.6772151898734177





Epoch 6
Training loss: 0.10485318132266877




Validation loss: 0.17529764516821392
Student Model - Accuracy: 0.6518987341772152, F1 Score: 0.6327882820544526, Precision: 0.6783811803253511, Recall: 0.6518987341772152
Teacher Model - Accuracy: 0.6772151898734177, F1 Score: 0.6760305390276213, Precision: 0.6775254102432499, Recall: 0.6772151898734177





Epoch 7
Training loss: 0.09163736238790925




Validation loss: 0.1642797518568703
Student Model - Accuracy: 0.6392405063291139, F1 Score: 0.6388490540794208, Precision: 0.6389192056222525, Recall: 0.6392405063291139
Teacher Model - Accuracy: 0.6772151898734177, F1 Score: 0.6760305390276213, Precision: 0.6775254102432499, Recall: 0.6772151898734177


                                                        

Testing Accuracy: 0.6772151898734177
Testing F1 Score: 0.6760305390276213
Testing Precision: 0.6775254102432499
Testing Recall: 0.6772151898734177


