<a href="https://colab.research.google.com/github/vnavya2004/BTP/blob/main/semi_supervised_arabic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from tqdm import tqdm
from google.colab import files

# Load the XLM-RoBERTa tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Assuming you're using Google Colab and uploaded a file
uploaded = files.upload()

# Read the Excel file
df = pd.read_excel(pd.ExcelFile(list(uploaded.keys())[0]), header=0)

# Define the columns for features (tweets) and labels
tweets_column = 'tweet'
labels_column = 'label'

# Convert the labels to numerical values
NUM_LABELS = len(df[labels_column].unique())
possible_labels = df[labels_column].unique()
label_dict = {possible_label: index for index, possible_label in enumerate(possible_labels)}
df[labels_column] = df[labels_column].map(label_dict)

# Split the dataset into labeled, unlabeled, and test sets
labeled_data, temp_data = train_test_split(df, test_size=0.6, stratify=df[labels_column])
unlabeled_data, test_data = train_test_split(temp_data, test_size=1/3, stratify=temp_data[labels_column])

# Tokenize the dataset
def tokenize_data(data):
    return tokenizer.batch_encode_plus(
        data[tweets_column].tolist(),
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True,
        max_length=256,
        return_tensors='pt'
    )

# Tokenize labeled, unlabeled, and test data
encoded_labeled = tokenize_data(labeled_data)
encoded_unlabeled = tokenize_data(unlabeled_data)
encoded_test = tokenize_data(test_data)

# Convert labels to tensors
labels_labeled = torch.tensor(labeled_data[labels_column].values)
labels_unlabeled = torch.tensor(unlabeled_data[labels_column].values)
labels_test = torch.tensor(test_data[labels_column].values)

# Create TensorDatasets
dataset_labeled = TensorDataset(encoded_labeled['input_ids'], encoded_labeled['attention_mask'], labels_labeled)
dataset_unlabeled = TensorDataset(encoded_unlabeled['input_ids'], encoded_unlabeled['attention_mask'], labels_unlabeled)
dataset_test = TensorDataset(encoded_test['input_ids'], encoded_test['attention_mask'], labels_test)

# Define the XLM-RoBERTa model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=NUM_LABELS)

# Set up the device for training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set up the optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 5
batch_size = 4
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataset_labeled)*epochs)

# Training loop
def train_model(dataset, model, optimizer, scheduler, epochs=5):
    for epoch in range(1, epochs + 1):
        model.train()
        loss_train_total = 0
        progress_bar = tqdm(DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size), desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
        for batch in progress_bar:
            model.zero_grad()
            batch = tuple(b.to(device) for b in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
            outputs = model(**inputs)
            loss = outputs[0]
            loss_train_total += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})
        loss_train_avg = loss_train_total / len(dataset)
        tqdm.write('\nEpoch {epoch}')
        tqdm.write(f'Training loss: {loss_train_avg}')

# Function to evaluate the model
def evaluate_model(model, dataloader):
    model.eval()
    loss_val_total = 0
    predictions, true_vals = [], []
    for batch in tqdm(dataloader, desc='Evaluating', leave=False, disable=False):
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        with torch.no_grad():
            outputs = model(**inputs)
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    loss_val_avg = loss_val_total / len(dataloader)
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
    return loss_val_avg, predictions, true_vals

# Metrics functions
def binary_accuracy(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(labels_flat, preds_flat)

def binary_f1_score(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def binary_precision(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return precision_score(labels_flat, preds_flat, average='weighted')

def binary_recall(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return recall_score(labels_flat, preds_flat, average='weighted')

# Train model on labeled data
train_model(dataset_labeled, model, optimizer, scheduler, epochs=epochs)

# Predict labels for the unlabeled data
model.eval()
unlabeled_dataloader = DataLoader(dataset_unlabeled, sampler=SequentialSampler(dataset_unlabeled), batch_size=batch_size)
predicted_labels = []
for batch in tqdm(unlabeled_dataloader, desc='Predicting', leave=False, disable=False):
    batch = tuple(b.to(device) for b in batch)
    inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs[0]
    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    predicted_labels.extend(preds)
predicted_labels = torch.tensor(predicted_labels)

# Create a new dataset with labeled and pseudo-labeled data
all_input_ids = torch.cat((encoded_labeled['input_ids'], encoded_unlabeled['input_ids']), dim=0)
all_attention_masks = torch.cat((encoded_labeled['attention_mask'], encoded_unlabeled['attention_mask']), dim=0)
all_labels = torch.cat((labels_labeled, predicted_labels), dim=0)
combined_dataset = TensorDataset(all_input_ids, all_attention_masks, all_labels)

# Train model on combined dataset
train_model(combined_dataset, model, optimizer, scheduler, epochs=epochs)

# Evaluate the model on the test set
test_dataloader = DataLoader(dataset_test, sampler=SequentialSampler(dataset_test), batch_size=batch_size)
test_loss, test_predictions, test_true_vals = evaluate_model(model, test_dataloader)

# Calculate evaluation metrics on test data
test_accuracy = binary_accuracy(test_predictions, test_true_vals)
test_f1 = binary_f1_score(test_predictions, test_true_vals)
test_precision = binary_precision(test_predictions, test_true_vals)
test_recall = binary_recall(test_predictions, test_true_vals)

# Print out the evaluation metrics on test data
print(f'Testing Accuracy: {test_accuracy}')
print(f'Testing F1 Score: {test_f1}')
print(f'Testing Precision: {test_precision}')
print(f'Testing Recall: {test_recall}')


Saving Arabic_Depression_10.000_Tweets.xlsx to Arabic_Depression_10.000_Tweets (1).xlsx


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch {epoch}
Training loss: 0.09626102793587779





Epoch {epoch}
Training loss: 0.038181092046852425





Epoch {epoch}
Training loss: 0.023587136437612573





Epoch {epoch}
Training loss: 0.019737434895917432





Epoch {epoch}
Training loss: 0.012330489172246417





Epoch {epoch}
Training loss: 0.021818320398408104





Epoch {epoch}
Training loss: 0.011668018804793064





Epoch {epoch}
Training loss: 0.00942273153186443





Epoch {epoch}
Training loss: 0.00505145797461202





Epoch {epoch}
Training loss: 0.0019357146396427397


                                                             

Testing Accuracy: 0.9755
Testing F1 Score: 0.9754989648312642
Testing Precision: 0.9755803730830511
Testing Recall: 0.9755


