<a href="https://colab.research.google.com/github/vnavya2004/BTP/blob/main/xlm_roberta_spanish(20%25).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
#bangla
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from tqdm import tqdm
from google.colab import files
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification

# Load the XLM-RoBERTa tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Assuming you're using Google Colab and uploaded a file
uploaded = files.upload()

# Read the Excel file
df = pd.read_excel(pd.ExcelFile(list(uploaded.keys())[0]), header=0)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

Saving Spanish.xlsx to Spanish.xlsx


In [15]:
# Tokenize the text data in the dataset
tokenized_texts = []
max_length = 128
for text in df['Tweets']:  # Replace 'tweet' with the name of the column containing text data
    tokenized_text = tokenizer.encode(str(text), add_special_tokens=True, truncation=True, max_length=max_length)
    tokenized_texts.append(tokenized_text)

# Convert tokenized texts to PyTorch tensors
max_length = max(len(text) for text in tokenized_texts)
padded_input_ids = [torch.tensor(tokenized_text + [tokenizer.pad_token_id] * (max_length - len(tokenized_text))) for tokenized_text in tokenized_texts]
input_ids = torch.stack(padded_input_ids)

# Print the tokenized texts
print("Tokenized texts:", input_ids)

# Specify the columns for features (tweets) and labels
tweets_column = 'Tweets'
labels_column = 'Labels'
NUM_LABELS = len(df[labels_column].unique())
possible_labels = df[labels_column].unique()

# Split the data into 40% (20% train + 20% test) and 60% (discarded)
df_subset, _, labels_subset, _ = train_test_split(
    df[tweets_column], df[labels_column], stratify=df[labels_column], test_size=0.6, random_state=42
)

# Further split the 40% subset into 20% train and 20% test
X_train, X_test, y_train, y_test = train_test_split(
    df_subset, labels_subset, stratify=labels_subset, test_size=0.5, random_state=42
)

X_train = X_train.astype(str)
X_test = X_test.astype(str)
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()
# Tokenize the training data
encoded_data_train = tokenizer.batch_encode_plus(
    X_train.tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(y_train)

# Tokenize the validation data
encoded_data_val = tokenizer.batch_encode_plus(
    X_test.tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(y_test)

dataset_train = TensorDataset(input_ids_train,
                              attention_masks_train,
                              labels_train)

dataset_val = TensorDataset(input_ids_val,
                            attention_masks_val,
                            labels_val)

# Define the XLM-RoBERTa model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=NUM_LABELS)

# Set up the device for training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set up the optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 5
batch_size = 4  # Define the batch size here
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataset_train)*epochs)

# Training loop
def binary_accuracy(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(labels_flat, preds_flat)

def binary_f1_score(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat)

def binary_precision(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return precision_score(labels_flat, preds_flat)

def binary_recall(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return recall_score(labels_flat, preds_flat)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Tokenized texts: tensor([[     0,    438,  78397,  ...,      1,      1,      1],
        [     0,  13015,    525,  ...,      1,      1,      1],
        [     0, 228269,     11,  ...,      1,      1,      1],
        ...,
        [     0, 228269,     11,  ...,      1,      1,      1],
        [     0,   1720,   2194,  ...,  30418,    198,      2],
        [     0,  72686,    220,  ...,      1,      1,      1]])


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:

# Training loop
for epoch in range(1, epochs + 1):
    model.train()
    loss_train_total = 0
    progress_bar = tqdm(DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size), desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        outputs = model(**inputs)
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})

    loss_train_avg = loss_train_total / len(dataset_train)

    # Evaluation on validation data
    model.eval()
    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in tqdm(DataLoader(dataset_val, sampler=SequentialSampler(dataset_val), batch_size=batch_size), desc='Evaluating', leave=False, disable=False):
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        with torch.no_grad():
            outputs = model(**inputs)
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(dataset_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    val_accuracy = binary_accuracy(predictions, true_vals)
    val_f1 = binary_f1_score(predictions, true_vals)
    val_precision = binary_precision(predictions, true_vals)
    val_recall = binary_recall(predictions, true_vals)

# Evaluation on test data
dataloader_val = DataLoader(dataset_val, sampler=RandomSampler(dataset_val), batch_size=batch_size)

def evaluate_test(model, dataloader):
    model.eval()
    loss_test_total = 0
    predictions, true_vals = [], []

    for batch in tqdm(dataloader, desc='Testing', leave=False, disable=False):
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        with torch.no_grad():
            outputs = model(**inputs)
        loss = outputs[0]
        logits = outputs[1]
        loss_test_total += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_test_avg = loss_test_total/len(dataloader)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_test_avg, predictions, true_vals

test_loss, test_predictions, test_true_vals = evaluate_test(model, dataloader_val)

# Calculate evaluation metrics on test data
test_accuracy = binary_accuracy(test_predictions, test_true_vals)
test_f1 = binary_f1_score(test_predictions, test_true_vals)
test_precision = binary_precision(test_predictions, test_true_vals)
test_recall = binary_recall(test_predictions, test_true_vals)

# Print out the evaluation metrics on test data
print(f'Testing Accuracy: {test_accuracy:.4f}')
print(f'Testing F1 Score: {test_f1:.4f}')
print(f'Testing Precision: {test_precision:.4f}')
print(f'Testing Recall: {test_recall:.4f}')



Testing Accuracy: 0.9126
Testing F1 Score: 0.9009
Testing Precision: 0.8994
Testing Recall: 0.9025
