## Install Modules

In [1]:
# !pip install -q transformers
# !pip install -q torch
# !pip install -q datasets
# !pip install -q accelerate -U
# !pip install wandb -q
!pip install gdown

Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown
Successfully installed gdown-5.2.0


## Load Modules

In [2]:
import os
import torch
import wandb
import gdown
import logging
import numpy as np
import pandas as pd
from datetime import datetime
from datasets import Dataset
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
# from google.colab import userdata
from transformers import Trainer, TrainingArguments
from transformers import (AutoTokenizer,
                          AutoModelForSequenceClassification,
                          Trainer,
                          TrainingArguments,
                          EarlyStoppingCallback
                          )
from torch.nn import CrossEntropyLoss

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Download Data

In [3]:
def download_drive_file_by_id(file_id, target_file_name):
  dataset_url = f"https://drive.google.com/u/1/uc?id={file_id}&export=download"
  gdown.download(dataset_url, target_file_name)
  print("File Download succesfull")

FILE_ID = ['166k7N9KV6jEDvvAr9iLTwrWyfdEcAs5p', '1-2TjS6xPfjWj9YaJGSf-JXXXfNz-2pNT', '1-1k1yHOGP7Wij1mUG2iKaSTN8i1WUgPz']
FILENAME = ["train.csv", "val_tweet.csv", "val_label.csv"]

for file_id, file_name in zip(FILE_ID, FILENAME):
    if not os.path.exists(file_name):
        download_drive_file_by_id(file_id, file_name)
    else:
        print(f"File '{file_name}' already exists.")

Downloading...
From: https://drive.google.com/u/1/uc?id=166k7N9KV6jEDvvAr9iLTwrWyfdEcAs5p&export=download
To: /kaggle/working/train.csv
100%|██████████| 914k/914k [00:00<00:00, 114MB/s]


File Download succesfull


Downloading...
From: https://drive.google.com/u/1/uc?id=1-2TjS6xPfjWj9YaJGSf-JXXXfNz-2pNT&export=download
To: /kaggle/working/val_tweet.csv
100%|██████████| 196k/196k [00:00<00:00, 70.7MB/s]


File Download succesfull


Downloading...
From: https://drive.google.com/u/1/uc?id=1-1k1yHOGP7Wij1mUG2iKaSTN8i1WUgPz&export=download
To: /kaggle/working/val_label.csv
100%|██████████| 3.80k/3.80k [00:00<00:00, 13.7MB/s]

File Download succesfull





## Load Data

In [4]:
COL_NAMES = ['index', 'text', 'label']
train_df = pd.read_csv(FILENAME[0], header=0, names=COL_NAMES)
valid_df_tweet = pd.read_csv(FILENAME[1])
valid_df_label = pd.read_csv(FILENAME[2])
valid_df = pd.merge(valid_df_tweet, valid_df_label, on='index')
valid_df.columns = COL_NAMES
train_df.drop('index', axis=1, inplace=True)
valid_df.drop('index', axis=1, inplace=True)

In [5]:
train_df.head()

Unnamed: 0,text,label
0,@me_sherya सक्छ। यो जाबो एमाले त मलाई मनै पर्दैन।,1
1,‘धार्मिक भावनाएँ भड़का कर दंगे करवाना चाहते है...,0
2,@belakoboli ये ट्वाके अङ्कल ओलि देउवा माकुने ...,0
3,"@suvashsanatani यिनिहरुले बाटो, पुल, स्कुल, बि...",2
4,हिजो पार्टी मा ओलिले एकलौटि गर्‍यो भनेर स्याल ...,0


In [6]:
valid_df.head()

Unnamed: 0,text,label
0,भोट त लौरोमा हाल्ने हो रुखमा त मल हाले हुन्छ ।,1
1,केन्द्रीय गृह एवं सहकारिता मंत्री अमित शाह (@A...,0
2,@Alive_Aleeza @bishwaprakash77 @kpsharmaoli वि...,0
3,"@cmprachanda, @ncp_madhavnepal , @SherBDeuba ,...",0
4,महाविद्ध्वान खड्ग प्रसादलाई भोट दिने भनेको जोक...,0


## Remove Hyperlinks

In [7]:
# import re

# def preprocess_text(text):
#     # Remove hyperlinks
#     text = re.sub(r'http\S+', '', text)
#     # Remove unnecessary things
#     text = re.sub(r'[^a-zA-Z\s]', '', text)
#     return text

# train_df['text'] = train_df['text'].apply(preprocess_text)
# valid_df['text'] = valid_df['text'].apply(preprocess_text)

## Tokenizer Functions

In [8]:
def tokenize_function(examples):
  return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=169)

## Create Dataset Class


In [9]:
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)

## Tokenizer and Dataset

In [10]:
from transformers import AutoTokenizer
model_name = "google/muril-large-cased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

train_dataset = train_dataset.map(tokenize_function, batched=True)
valid_dataset = valid_dataset.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/406 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



Map:   0%|          | 0/2214 [00:00<?, ? examples/s]

Map:   0%|          | 0/474 [00:00<?, ? examples/s]

## Model

In [11]:


model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
model

pytorch_model.bin:   0%|          | 0.00/2.03G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(197285, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((

## Loss Fuction (Focal Loss)

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import Trainer, TrainingArguments


class FocalLoss(nn.Module):
  def __init__(self, gamma=0.1, alpha=None, reduction='mean'):
    """
    Args:
      gamma (float): Focusing parameter that adjusts the rate at which easy examples are down-weighted.
      alpha (list, optional): Weights for each class. Can be used to tackle class imbalance._
      reduction (str, optional): Specifies the reduction to apply to the output. Can be 'mean', 'sum', or 'none'.
    """
    super(FocalLoss, self).__init__()
    self.gamma = gamma
    self.alpha = alpha
    self.reduction = reduction

  def get_aggregated_loss(self, loss):
    if self.reduction == 'mean':
      return torch.mean(loss)
    elif self.reduction == 'sum':
      return torch.sum(loss)
    elif self.reduction == 'none':
      return loss

  def forward(self, inputs, targets):
    device = targets.device

    # Calculate cross-entropy loss
    ce_loss = F.cross_entropy(inputs, targets, reduction='none', weight=torch.tensor(self.alpha, device=device))

    # Get probabilites from Crossentropy Loss
    probs = torch.exp(-ce_loss)

    # Compute Focal Loss
    focal_loss = ((1 - probs) ** self.gamma) * ce_loss

    return self.get_aggregated_loss(focal_loss)


## Custom Trainer

In [13]:
class CustomTrainer(Trainer):
  def __init__(self, *args, focal_loss_gamma=0.1, focal_loss_alpha=None, **kwargs):
    super().__init__(*args, **kwargs)
    self.focal_loss = FocalLoss(gamma=focal_loss_gamma, alpha=focal_loss_alpha)

  def compute_loss(self, model, inputs, return_outputs=False):
    labels = inputs.pop("labels")
    outputs = model(**inputs)
    logits = outputs.logits
    loss = self.focal_loss(logits, labels)
    return (loss, outputs) if return_outputs else loss

## WandB Config

In [14]:
def parse_today_date():
    return f"{datetime.now().strftime('%Y-%m-%d')}"

run_name = f'model-{model_name}-{parse_today_date()}-focal-loss-no-preprocessed'.replace('/','-')
output_dir=f"./{run_name}_results"


# change this accrodiang to your env i.e google colab, kaggle, sagemakers studio
# #############################################################################

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
WANDB_API_KEY = user_secrets.get_secret("WANDB_API_KEY")

os.environ['WANDB_API_KEY'] = WANDB_API_KEY
os.environ["WANDB_DISABLED"] = "false"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"
wandb.init(project="NLP CHIPSAL", name=run_name)
wandb.run.name = run_name


[34m[1mwandb[0m: Currently logged in as: [33msuman-805522[0m ([33msuman-smstu[0m). Use [1m`wandb login --relogin`[0m to force relogin


## Training Arguments

In [15]:
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,  # Adjusted number of epochs
    per_device_train_batch_size=8,  # Adjusted batch size
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    evaluation_strategy="steps",
    eval_steps=50,
    save_total_limit=1,
    save_strategy="steps",
    load_best_model_at_end=True,
    logging_dir='./logs',
    logging_steps=50,
    fp16=True,
    gradient_accumulation_steps=1,  # Adjusted based on batch size
    warmup_steps=500,
    weight_decay=0.01,
    report_to="wandb",  # Enable reporting to WandB
    run_name=run_name,  # Set the run name dynamically
    logging_first_step=True,
)



In [None]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    focal_loss_gamma=4.0,
    focal_loss_alpha=torch.tensor([2.0615, 2.5864, 7.7958], dtype=torch.float),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]

)

# Ensure model parameters are contiguous
for name, param in model.named_parameters():
    if not param.data.is_contiguous():
        param.data = param.data.contiguous()


trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  ce_loss = F.cross_entropy(inputs, targets, reduction='none', weight=torch.tensor(self.alpha, device=device))


Step,Training Loss,Validation Loss
50,2.7295,2.665742
100,2.7216,2.556474
150,2.5057,2.450201
200,2.3381,2.228677
250,1.9836,1.832031
300,1.7082,1.686206
350,1.5945,1.74727
400,1.3301,1.68794


## Predictions and Evaluations

In [None]:
# Predictions and evaluation
y_true = valid_dataset['label']
predictions = trainer.predict(valid_dataset)
y_hat = np.argmax(predictions.predictions, axis=1)

# Save metrics to CSV
metrics_df = pd.DataFrame({'y_true': y_true, 'y_hat': y_hat})
metrics_df.to_csv(f'muril_metrics_preprocessed.csv', index=False)
logging.info("Metrics saved to metrics.csv")

In [None]:
# Save classification report
report = classification_report(y_true, y_hat, output_dict=True)
report_df = pd.DataFrame(report).transpose()
report_df.to_csv(f'muril_classification_report_preprocessed.csv')
logging.info("Classification report saved to classification_report.csv")

# Log GPU memory usage
logging.info(f"Peak GPU memory usage: {torch.cuda.max_memory_allocated() / (1024 ** 3):.2f} GB")

# End the WandB run
wandb.finish()