### IMPORTING THE REQUIRED MODULES

In [1]:
!pip install transformers 
!pip install datasets
!pip install pynvml
!pip install evaluate 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import transformers
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import os
import nltk
import torch
import evaluate
import sys
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### UTILS FUNCTION TO WORK WITH GPU

In [4]:
# define utils functions to facilitate gpu 

def check_gpu_availability():
    # Check if CUDA is available
    print(f"Cuda is available: {torch.cuda.is_available()}")

def getting_device(gpu_prefence=True) -> torch.device:
    """
    This function gets the torch device to be used for computations, 
    based on the GPU preference specified by the user.
    """
    
    # If GPU is preferred and available, set device to CUDA
    if gpu_prefence and torch.cuda.is_available():
        device = torch.device('cuda')
    # If GPU is not preferred or not available, set device to CPU
    else: 
        device = torch.device("cpu")
    
    # Print the selected device
    print(f"Selected device: {device}")
    
    # Return the device
    return device

# Define a function to print GPU memory utilization
def print_gpu_utilization():
    # Initialize the PyNVML library
    nvmlInit()
    # Get a handle to the first GPU in the system
    handle = nvmlDeviceGetHandleByIndex(0)
    # Get information about the memory usage on the GPU
    info = nvmlDeviceGetMemoryInfo(handle)
    # Print the GPU memory usage in MB
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

# Define a function to print training summary information
def print_summary(result):
    # Print the total training time in seconds
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    # Print the number of training samples processed per second
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    # Print the GPU memory utilization
    print_gpu_utilization()

In [5]:
# CHECK IF GPU IS UP
check_gpu_availability()

Cuda is available: True


In [6]:
# SAVE THE DEVICE WE ARE WORKING WITH
device = getting_device(gpu_prefence=True)

Selected device: cuda


In [7]:
# SHOULD BE FEW MB
print_gpu_utilization()

GPU memory occupied: 261 MB.


### IMPORTING THE DATA

In [8]:
# Read in train and test CSV files using Pandas
path2train = '/content/drive/MyDrive/LT_SHARED_FOLDER/train.csv'
df = pd.read_csv(path2train)
# split the data into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# split the data into training and validation sets
train_df, validation_df = train_test_split(df, test_size=0.1, random_state=42)

In [9]:
train_df.head(n=10)

Unnamed: 0,id,keyword,location,text,target
4620,6568,injury,"Plano, Texas",'McFadden Reportedly to Test Hamstring Thursda...,0
2858,4107,drought,Nigeria,w--=-=-=-[ NEMA warns Nigerians to prepare for...,1
3098,4448,electrocuted,,When I was cooking earlier I got electrocuted ...,0
3751,5330,fire,Canada,I'm On Fire. http://t.co/WATsmxYTVa,0
5285,7552,outbreak,Indonesia,More than 40 families affected by the fatal ou...,1
5863,8375,ruin,"Winnipeg, Manitoba",Why do u ruin everything? @9tarbox u ruined t...,0
4827,6874,mass%20murder,Anonymous,http://t.co/c1H7JECFrV @RoyalCarribean do your...,1
5190,7410,obliterated,Tennessee,WACKOES like #MicheleBachman predict the WORLD...,0
5784,8253,rioting,Vidalia GA,@Reuters people like you should be charged aft...,0
4369,6206,hijacker,,Remove the http://t.co/2nS5TfnxpA and Linkury ...,0


In [10]:
train_df.rename(columns = {"target":"labels"}, inplace = True)
validation_df.rename(columns = {"target":"labels"}, inplace = True)
test_df.rename(columns = {"target":"labels"}, inplace = True)
train_df.head(10)

Unnamed: 0,id,keyword,location,text,labels
4620,6568,injury,"Plano, Texas",'McFadden Reportedly to Test Hamstring Thursda...,0
2858,4107,drought,Nigeria,w--=-=-=-[ NEMA warns Nigerians to prepare for...,1
3098,4448,electrocuted,,When I was cooking earlier I got electrocuted ...,0
3751,5330,fire,Canada,I'm On Fire. http://t.co/WATsmxYTVa,0
5285,7552,outbreak,Indonesia,More than 40 families affected by the fatal ou...,1
5863,8375,ruin,"Winnipeg, Manitoba",Why do u ruin everything? @9tarbox u ruined t...,0
4827,6874,mass%20murder,Anonymous,http://t.co/c1H7JECFrV @RoyalCarribean do your...,1
5190,7410,obliterated,Tennessee,WACKOES like #MicheleBachman predict the WORLD...,0
5784,8253,rioting,Vidalia GA,@Reuters people like you should be charged aft...,0
4369,6206,hijacker,,Remove the http://t.co/2nS5TfnxpA and Linkury ...,0


In [11]:
len(train_df), len(validation_df), len(test_df)

(6851, 762, 762)

### BUILDING IT INTO DATASETES

In [12]:
# pandas2dataset
ds_train = Dataset.from_pandas(train_df)
ds_validation = Dataset.from_pandas(validation_df)
ds_test = Dataset.from_pandas(test_df)

In [13]:
ds_train

Dataset({
    features: ['id', 'keyword', 'location', 'text', 'labels', '__index_level_0__'],
    num_rows: 6851
})

### MODEL CHOICE

In [14]:
model_nm = "distilbert-base-uncased" 

### TOKENIZING

In [15]:
# IMPORTING THE MODULE TO GET THE TOKENIZER
from transformers import AutoTokenizer

In [17]:
# IMPORTING OUR TOKENIZER
tokz = AutoTokenizer.from_pretrained(model_nm)

In [18]:
# getting the max length of the tokenized tweet

def getting_max_length(tokenizer, items):

  # initialize a list for lengths
  max_len = 0

  # iterate over the list
  for item in items:

    # record lenght of each item
    lenght_of_item = len(tokenizer.encode(item, add_special_tokens = True))

    if lenght_of_item > max_len:

      max_len = lenght_of_item

  return max_len

In [19]:
max_len = getting_max_length(tokz,ds_train["text"])

In [20]:
# DEFINING A TOKENIZE FUNCTION TO TOKENIZE BOTH THE TWO DATASETS
def tok_func(x): return tokz(x["text"], truncation=True, padding = "max_length", max_length=max_len)

In [21]:
# CHECK THAT TOKENIZER FUNCTION WORKS
tok_func(ds_train[19]) # the 1 are for padding it; the attention mask show to not care about the 1

{'input_ids': [101, 1001, 10507, 4140, 1001, 22975, 4140, 1001, 8249, 4517, 5057, 9651, 2415, 8299, 1024, 1013, 1013, 1056, 1012, 2522, 1013, 1047, 2546, 2581, 2549, 2080, 2475, 12458, 11020, 16770, 1024, 1013, 1013, 1056, 1012, 2522, 1013, 1050, 2475, 27922, 11140, 2278, 2290, 2615, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [22]:
# TOKENIZING THE DS
tok_ds_train = ds_train.map(tok_func, batched=True, remove_columns=['text','id', 'keyword', 'location'])
tok_ds_validation = ds_test.map(tok_func, batched=True, remove_columns=['text','id', 'keyword', 'location'])
tok_ds_test = ds_test.map(tok_func, batched=True, remove_columns=['text','id', 'keyword', 'location'])

Map:   0%|          | 0/6851 [00:00<?, ? examples/s]

Map:   0%|          | 0/762 [00:00<?, ? examples/s]

Map:   0%|          | 0/762 [00:00<?, ? examples/s]

In [23]:
# CREATE A DATASET TO FEED THE MODEL
ds = DatasetDict({"train":tok_ds_train,
                  "validation":tok_ds_validation,
             "test": tok_ds_test})

In [24]:
ds

DatasetDict({
    train: Dataset({
        features: ['labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 6851
    })
    validation: Dataset({
        features: ['labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 762
    })
    test: Dataset({
        features: ['labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 762
    })
})

In [25]:
# let's convert the tokens into text and check if it is the same as the printed above
tokz.decode(ds["train"][19]["input_ids"])


'[CLS] # ccot # tcot # radiation nuclear emergency tracking center http : / / t. co / kf74o2mcsc https : / / t. co / n2zhrchcgv [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [26]:
# importing the datacollator for DISTILBERT

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokz)

### IMPORTING THE MODEL

In [27]:
# IMPORTING THE MODEL
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels = 2).to(device) 

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [28]:
# checking if the model is on the gpu
print_gpu_utilization()

GPU memory occupied: 1349 MB.


### DEFINING METRICS

If we use the Trainer API to fine tune our model, we must define the metric function and pass it as one of the trainer argument; the trainer does not compute the metrics in automatic. Given the lack of class imbalance and cost imbalance, we will deploy Accuracy as evaluation strategy

In [29]:
import numpy as np

In [30]:
# we create an object called metrics; if the method "compute" is called and we pass in a list of predictions and true labels, it automatically compute them
metrics = evaluate.load("f1")

In [31]:
# now that we have our callable object, we define a function that the trainer can use to compute its metric => we cannot call directly metrics.compute because the output
# of the model is not a prediction but a logist
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    return {'accuracy': acc, 'f1': f1}

### SETTING HYPERPARAMETERS OF BERT

In [32]:
from transformers import TrainingArguments, Trainer

In [33]:
# setting the hyperparameter for the trainer 
training_args = TrainingArguments(
    model_nm,
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01, 
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8, 
    logging_steps = 50, # FROM BELOW MEMORY TRICKS
    gradient_accumulation_steps=16, # adding them to offset small batch size due to memory problem => so 2*8 => 16 batch-size traning
    fp16 = True
    )

### CREATING THE TRAINER

In [34]:
# passing in the hyperparameter for the trainer 
trainer = Trainer(
    model = model, # our model
    args = training_args, # hyperparameter defined before
    train_dataset = ds["train"], 
    eval_dataset = ds["test"],
    compute_metrics = compute_metrics, # evaluation function defined before
    data_collator = data_collator,
)

### TRAINING THE MODEL AND TESTING IT

In [35]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.5058,0.421968,0.82021,0.820061
1,0.3767,0.405418,0.841207,0.838827
2,0.3393,0.402863,0.833333,0.832726


TrainOutput(global_step=159, training_loss=0.40452393495811606, metrics={'train_runtime': 82.5594, 'train_samples_per_second': 248.948, 'train_steps_per_second': 1.926, 'total_flos': 442091314786464.0, 'train_loss': 0.40452393495811606, 'epoch': 2.97})

In [36]:
trainer = Trainer(model=model, compute_metrics=compute_metrics)
eval_result = trainer.evaluate(ds['test'])
print(eval_result)

{'eval_loss': 0.40285524725914, 'eval_accuracy': 0.8333333333333334, 'eval_f1': 0.83272646075782, 'eval_runtime': 2.0521, 'eval_samples_per_second': 371.33, 'eval_steps_per_second': 46.782}


### SAVING THE MODEL

In [None]:
import os
# Set the output directory
output_dir = '/content/output/'

# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the model and tokenizer to the output directory
trainer.save_model(output_dir)
tokz.save_pretrained(output_dir)
