<a href="https://colab.research.google.com/github/vnavya2004/BTP/blob/main/semi_supervised_spanish_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from tqdm import tqdm
from google.colab import files
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification

# Load the XLM-RoBERTa tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Assuming you're using Google Colab and uploaded a file
uploaded = files.upload()

# Read the CSV file
df = pd.read_csv("spanish.csv", encoding='latin1')
df = df.sample(frac=1, random_state=42)  # Shuffle the dataset


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Saving spanish.csv to spanish.csv


In [2]:
df.head()

NameError: name 'df' is not defined

In [4]:
# Specify the columns for features (tweets) and labels
tweets_column = 'traducido'
labels_column = 'class'
NUM_LABELS = len(df[labels_column].unique())

# Split the dataset
labeled_df, unlabeled_df = train_test_split(df, test_size=0.6, stratify=df[labels_column])
unlabeled_df, test_df = train_test_split(unlabeled_df, test_size=1/3, stratify=unlabeled_df[labels_column])

# Tokenize and encode the data
def encode_data(df, column):
    encoded_data = tokenizer.batch_encode_plus(
        df[column].tolist(),
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True,
        max_length=256,
        return_tensors='pt'
    )
    input_ids = encoded_data['input_ids']
    attention_masks = encoded_data['attention_mask']
    return input_ids, attention_masks

input_ids_labeled, attention_masks_labeled = encode_data(labeled_df, tweets_column)
labels_labeled = torch.tensor(labeled_df[labels_column].values, dtype=torch.long)  # Changed dtype to long

input_ids_unlabeled, attention_masks_unlabeled = encode_data(unlabeled_df, tweets_column)
input_ids_test, attention_masks_test = encode_data(test_df, tweets_column)
labels_test = torch.tensor(test_df[labels_column].values, dtype=torch.long)  # Changed dtype to long

dataset_labeled = TensorDataset(input_ids_labeled, attention_masks_labeled, labels_labeled)
dataset_unlabeled = TensorDataset(input_ids_unlabeled, attention_masks_unlabeled)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

# Define the XLM-RoBERTa model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=NUM_LABELS)

# Set up the device for training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set up the optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 5
batch_size = 32
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataset_labeled)*epochs)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

# Training loop
def binary_accuracy(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(labels_flat, preds_flat)

def binary_f1_score(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def binary_precision(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return precision_score(labels_flat, preds_flat, average='weighted')

def binary_recall(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return recall_score(labels_flat, preds_flat, average='weighted')

def train_model(model, dataset, optimizer, scheduler, epochs):
    for epoch in range(1, epochs + 1):
        model.train()
        loss_train_total = 0
        progress_bar = tqdm(DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size), desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
        for batch in progress_bar:
            model.zero_grad()
            batch = tuple(b.to(device) for b in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
            outputs = model(**inputs)
            loss = outputs[0]
            loss_train_total += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})

        loss_train_avg = loss_train_total / len(dataset)
        tqdm.write(f'\nEpoch {epoch}')
        tqdm.write(f'Training loss: {loss_train_avg}')

# Train the model on labeled data
train_model(model, dataset_labeled, optimizer, scheduler, epochs)

# Predict labels for unlabeled data
model.eval()
predicted_labels = []

for batch in tqdm(DataLoader(dataset_unlabeled, batch_size=batch_size), desc='Predicting labels for unlabeled data', leave=False, disable=False):
    batch = tuple(b.to(device) for b in batch)
    inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs[0]
    predictions = torch.argmax(logits, dim=1).cpu().numpy()
    predicted_labels.extend(predictions)

predicted_labels = torch.tensor(predicted_labels, dtype=torch.long)  # Changed dtype to long

# Create a new dataset with original labeled data and newly labeled data
combined_input_ids = torch.cat((input_ids_labeled, input_ids_unlabeled), dim=0)
combined_attention_masks = torch.cat((attention_masks_labeled, attention_masks_unlabeled), dim=0)
combined_labels = torch.cat((labels_labeled, predicted_labels), dim=0)

dataset_combined = TensorDataset(combined_input_ids, combined_attention_masks, combined_labels)

# Train the model on the combined dataset
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataset_combined)*epochs)
train_model(model, dataset_combined, optimizer, scheduler, epochs)

# Evaluate the model on the test data
model.eval()
predictions, true_vals = [], []

for batch in tqdm(DataLoader(dataset_test, batch_size=batch_size), desc='Evaluating', leave=False, disable=False):
    batch = tuple(b.to(device) for b in batch)
    inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs[1]
    logits = logits.detach().cpu().numpy()
    label_ids = inputs['labels'].cpu().numpy()
    predictions.append(logits)
    true_vals.append(label_ids)

predictions = np.concatenate(predictions, axis=0)
true_vals = np.concatenate(true_vals, axis=0)

test_accuracy = binary_accuracy(predictions, true_vals)
test_f1 = binary_f1_score(predictions, true_vals)
test_precision = binary_precision(predictions, true_vals)
test_recall = binary_recall(predictions, true_vals)

# Print out the evaluation metrics on test data
print(f'Testing Accuracy: {test_accuracy}')
print(f'Testing F1 Score: {test_f1}')
print(f'Testing Precision: {test_precision}')
print(f'Testing Recall: {test_recall}')




Epoch 1
Training loss: 0.008201904392280343





Epoch 2
Training loss: 0.005237101758083718





Epoch 3
Training loss: 0.004134625454981548





Epoch 4
Training loss: 0.003228462260345851





Epoch 5
Training loss: 0.002651800587727997


Predicting labels for unlabeled data:  29%|██▉       | 306/1052 [02:27<05:59,  2.07it/s]