In [None]:
!pip install -q transformers
%pip install -q torch
%pip install -q datasets
%pip install -q accelerate wandb

In [2]:
import os
import logging
import gdown
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader
from transformers import (AutoTokenizer, 
                          AutoModelForSequenceClassification, 
                          Trainer, 
                          TrainingArguments, 
                          EarlyStoppingCallback
                          )
from torch.nn import CrossEntropyLoss
import wandb  # Add wandb

In [None]:
# Use the GPU when CUDA reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:


def download_drive_file_by_id(file_id, target_file_name):
  """Download a Google Drive file to a local path using gdown.

  Args:
    file_id (str): Google Drive file identifier inserted into the download URL.
    target_file_name (str): Local path the file is written to.

  Raises:
    RuntimeError: If gdown returns no output path, i.e. the download did not succeed.
  """
  dataset_url = f"https://drive.google.com/u/1/uc?id={file_id}&export=download"
  downloaded_path = gdown.download(dataset_url, target_file_name)
  # gdown.download returns None when it could not fetch the file; report that
  # instead of printing a success message for a file that does not exist.
  if downloaded_path is None:
    raise RuntimeError(f"Failed to download '{target_file_name}' (Drive file id: {file_id})")
  print("File Download succesfull")

# Google Drive ids and the local file names they are saved under (paired by position).
FILE_ID = [
    '166k7N9KV6jEDvvAr9iLTwrWyfdEcAs5p',
    '1-2TjS6xPfjWj9YaJGSf-JXXXfNz-2pNT',
    '1-1k1yHOGP7Wij1mUG2iKaSTN8i1WUgPz',
]
FILENAME = ["train.csv", "val_tweet.csv", "val_label.csv"]


# Fetch only the files that are not already present in the working directory.
for file_id, file_name in zip(FILE_ID, FILENAME):
    if os.path.exists(file_name):
        print(f"File '{file_name}' already exists.")
        continue
    download_drive_file_by_id(file_id, file_name)
        
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# The CSVs are read and merged in the next cell, so this cell only downloads them.

File 'train.csv' already exists.
File 'val_tweet.csv' already exists.
File 'val_label.csv' already exists.


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Column names applied to both splits; 'index' is only needed as the join key.
COL_NAMES = ['index', 'text', 'label']

# Training split: one CSV whose header row is replaced by COL_NAMES.
train_df = pd.read_csv(FILENAME[0], header=0, names=COL_NAMES).drop(columns='index')

# Validation split: tweets and labels live in separate CSVs joined on 'index'.
valid_df_tweet = pd.read_csv(FILENAME[1])
valid_df_label = pd.read_csv(FILENAME[2])
valid_df = valid_df_tweet.merge(valid_df_label, on='index')
valid_df.columns = COL_NAMES  # positional rename of the merged columns
valid_df = valid_df.drop(columns='index')

train_df.head()

In [None]:
import re

def preprocess_text(text):
    """Strip hyperlinks and every character that is not an ASCII letter or whitespace.

    Args:
        text: Raw text of one example. Non-string values (e.g. the float NaN that
            pandas uses for an empty CSV cell, or None) are treated as empty text.

    Returns:
        str: The cleaned text; '' for non-string input.
    """
    # re.sub raises TypeError on non-strings, so a single missing cell would
    # abort the whole DataFrame.apply call.
    if not isinstance(text, str):
        return ''
    # Remove hyperlinks
    text = re.sub(r'http\S+', '', text)
    # Remove everything except ASCII letters and whitespace.
    # NOTE(review): this also deletes digits, punctuation and all non-Latin
    # scripts; confirm that is intended given the multilingual checkpoint used
    # later in this notebook.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

# Clean the text column of both splits with the same preprocessing function.
for frame in (train_df, valid_df):
    frame['text'] = frame['text'].apply(preprocess_text)

In [13]:
def tokenize_dataset(dataset, tokenizer, max_length):
    """Tokenize the 'text' column of a dataset, padding/truncating to max_length.

    Args:
        dataset: Object exposing ``map(fn, batched=True)`` whose batches contain
            a 'text' field.
        tokenizer: Callable accepting the batch of texts plus ``padding``,
            ``truncation`` and ``max_length`` keyword arguments.
        max_length (int): Length every example is padded or truncated to.

    Returns:
        Whatever ``dataset.map`` returns for the tokenizing function.
    """
    def _encode(batch):
        # Honour the caller's max_length rather than a hard-coded constant.
        return tokenizer(batch['text'], padding="max_length", truncation=True, max_length=max_length)

    return dataset.map(_encode, batched=True)

In [None]:
# Checkpoint identifier; passed to the tokenizer here and reused by later cells
# (model loading and W&B run naming).
model_name = "xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [14]:
# Read the W&B API key from the environment so it is never hardcoded in the notebook.
api_key = os.getenv("WANDB_API_KEY")
if api_key:
    # Log in explicitly with the key; when it is unset, wandb.init() below falls
    # back to its interactive prompt.
    wandb.login(key=api_key)

# get_run_name() is only defined in a later cell, so calling it here raises a
# NameError on a fresh kernel. The run receives its final name in that later
# cell via `wandb.run.name = run_name`.
wandb.init(project="NLP CHIPSAL")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.Module):
  """Multi-class focal loss: alpha_t * (1 - p_t) ** gamma * CE, computed from logits."""

  def __init__(self, gamma=2.0, alpha=None, reduction='mean'):
    """
    Args:
      gamma (float): Focusing parameter that adjusts the rate at which easy examples are down-weighted.
      alpha (list or Tensor, optional): Weights for each class. Can be used to tackle class imbalance.
        ``None`` (the default) applies no class weighting.
      reduction (str, optional): Specifies the reduction to apply to the output. Can be 'mean', 'sum', or 'none'.

    Raises:
      ValueError: If ``reduction`` is not one of the supported values.
    """
    super(FocalLoss, self).__init__()
    # Fail at construction time instead of silently returning None from forward().
    if reduction not in ('mean', 'sum', 'none'):
      raise ValueError(f"reduction must be 'mean', 'sum' or 'none', got {reduction!r}")
    self.gamma = gamma
    self.alpha = alpha
    self.reduction = reduction

  def get_aggregated_loss(self, loss):
    """Reduce the per-example loss tensor according to ``self.reduction``."""
    if self.reduction == 'mean':
      return torch.mean(loss)
    elif self.reduction == 'sum':
      return torch.sum(loss)
    return loss

  def forward(self, inputs, targets):
    """
    Args:
      inputs (Tensor): Raw logits, as accepted by ``F.cross_entropy``.
      targets (Tensor): Integer class indices, as accepted by ``F.cross_entropy``.

    Returns:
      Tensor: The reduced focal loss (per-example losses when reduction is 'none').
    """
    # Unweighted cross-entropy, so that exp(-ce) is the true-class probability p_t.
    # Folding the class weights into this term would turn it into p_t ** alpha_t
    # and distort the focusing factor.
    ce_loss = F.cross_entropy(inputs, targets, reduction='none')

    # Get probabilities from the cross-entropy loss
    probs = torch.exp(-ce_loss)

    # Compute Focal Loss
    focal_loss = ((1 - probs) ** self.gamma) * ce_loss

    if self.alpha is not None:
      # as_tensor accepts both lists and tensors without the copy-construct
      # warning that torch.tensor(tensor) emits.
      alpha = torch.as_tensor(self.alpha, dtype=focal_loss.dtype, device=focal_loss.device)
      # Clamp so cross_entropy's default ignore_index (-100) cannot index out of
      # range; those positions already carry a zero loss.
      focal_loss = alpha[targets.clamp(min=0)] * focal_loss

    return self.get_aggregated_loss(focal_loss)

In [None]:
class CustomTrainer(Trainer):
  """Trainer that computes its loss with FocalLoss on the model's logits."""

  def __init__(self, *args, focal_loss_gamma=2.0, focal_loss_alpha=None, **kwargs):
    """
    Args:
      *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
      focal_loss_gamma (float): ``gamma`` passed to FocalLoss.
      focal_loss_alpha (optional): ``alpha`` (per-class weights) passed to FocalLoss.
    """
    super().__init__(*args, **kwargs)
    self.focal_loss = FocalLoss(gamma=focal_loss_gamma, alpha=focal_loss_alpha)

  def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    """Run the model without labels and score its logits with the focal loss.

    Args:
      model: The model being trained or evaluated.
      inputs (dict): Batch containing a "labels" entry plus the model inputs.
      return_outputs (bool): When True, return ``(loss, outputs)``.
      num_items_in_batch: Accepted because newer transformers releases pass it
        to ``compute_loss``; not used by the focal loss.

    Returns:
      The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is True.
    """
    labels = inputs["labels"]
    # Build a label-free copy instead of popping, so the caller's batch is left intact.
    model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
    outputs = model(**model_inputs)
    logits = outputs.logits
    loss = self.focal_loss(logits, labels)
    return (loss, outputs) if return_outputs else loss

In [None]:
# Load the checkpoint for sequence classification with 3 output labels
# (num_labels=3 matches the three class weights given to the trainer later).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)


In [None]:
# W&B settings read from the environment: log checkpoints and keep W&B enabled.
os.environ.update({
    "WANDB_LOG_MODEL": "checkpoint",
    "WANDB_DISABLED": "false",
})

from datetime import datetime

def get_run_name(model_name):
    """Return 'model-<model_name>-<YYYY-MM-DD>' with '/' in the name replaced by '_'."""
    safe_model_name = model_name.replace('/', '_')
    today = datetime.now().strftime('%Y-%m-%d')
    return f"model-{safe_model_name}-{today}"

# Build the run name once; it is reused below for the W&B run and by TrainingArguments.
run_name = get_run_name(model_name)
# Rename the active W&B run.
# NOTE(review): presumably wandb.run is None unless wandb.init() has already run — confirm cell order.
wandb.run.name = run_name

In [None]:
# Training configuration: evaluate and log every 50 steps, report to W&B.
training_args = TrainingArguments(
    # The f prefix must sit outside the quotes; otherwise the directory is
    # literally named "f./{run_name}_results".
    output_dir=f"./{run_name}_results",
    num_train_epochs=5,  # Adjusted number of epochs
    per_device_train_batch_size=8,  # Adjusted batch size
    per_device_eval_batch_size=8,
    learning_rate=1e-5,
    evaluation_strategy="steps",
    eval_steps=50,
    save_total_limit=1,
    save_strategy="steps",
    load_best_model_at_end=True,
    logging_dir='./logs',
    logging_steps=50,
    fp16=True,
    gradient_accumulation_steps=1,  # Adjusted based on batch size
    warmup_steps=500,
    weight_decay=0.01,
    report_to="wandb",  # Enable reporting to WandB
    run_name=run_name,  # Set the run name dynamically
    logging_first_step=True,
)

In [None]:
# NOTE(review): train_dataset / valid_dataset are not defined in the cells shown
# here; confirm they are created (e.g. via tokenize_dataset) before this cell runs.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    focal_loss_gamma=4.0,
    # One weight per class, in label-index order.
    focal_loss_alpha=torch.tensor([2.0615, 2.5864, 7.7958], dtype=torch.float)
)

# Make every parameter tensor contiguous before training starts.
# NOTE(review): presumably needed so checkpoints can be saved — confirm.
for param in model.parameters():
    if not param.data.is_contiguous():
        param.data = param.data.contiguous()

trainer.train()