### IMPORTING THE REQUIRED MODULES

In [None]:
# !pip install transformers 
# !pip install datasets
# !pip install pynvml
# !pip install evaluate 

In [None]:
import transformers
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer
import numpy as np
import os
import nltk
import torch
import evaluate
import sys
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

### UTILITY FUNCTIONS TO WORK WITH THE GPU

In [None]:
# Utility functions for selecting the device and monitoring GPU memory

def check_gpu_availability():
    """Print a one-line report saying whether PyTorch can use CUDA."""
    cuda_ok = torch.cuda.is_available()
    print(f"Cuda is available: {cuda_ok}")

def getting_device(gpu_prefence=True) -> torch.device:
    """
    Choose the torch device for computations and announce the choice.

    Args:
        gpu_prefence: Whether the caller would like to run on the GPU.

    Returns:
        ``torch.device('cuda')`` when the GPU is both requested and
        available to PyTorch; ``torch.device('cpu')`` in every other case.
        The selected device is printed before it is returned.
    """
    # `and` short-circuits, so CUDA is only probed when the GPU was requested.
    use_cuda = gpu_prefence and torch.cuda.is_available()
    device = torch.device('cuda') if use_cuda else torch.device("cpu")
    print(f"Selected device: {device}")
    return device

# Define a function to print GPU memory utilization
def print_gpu_utilization(device_index=0):
    """
    Print the amount of memory currently in use on one NVIDIA GPU.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0 (the
            first GPU), which is the device this function always queried
            before the parameter existed.

    The value printed is the ``used`` field of NVML's memory info for that
    device, floor-divided by 1024**2.

    Any error raised by the NVML calls propagates to the caller; NVML is
    shut down again either way.
    """
    # Local import: the notebook's import cell only pulls in the init/query helpers.
    from pynvml import nvmlShutdown

    # Initialize the PyNVML library
    nvmlInit()
    try:
        # Get a handle to the requested GPU
        handle = nvmlDeviceGetHandleByIndex(device_index)
        # Get information about the memory usage on the GPU
        info = nvmlDeviceGetMemoryInfo(handle)
        # Print the GPU memory usage in MB
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # NVML reference-counts initializations, so every nvmlInit() needs a
        # matching nvmlShutdown() -- otherwise each call leaks one reference.
        nvmlShutdown()

# Define a function to print training summary information
def print_summary(result):
    """
    Print the headline figures of a training run, then the GPU memory usage.

    Args:
        result: Object exposing a ``metrics`` mapping that contains the keys
            ``'train_runtime'`` and ``'train_samples_per_second'``.
    """
    stats = result.metrics
    # Both figures are shown with two decimals.
    print(f"Time: {stats['train_runtime']:.2f}")
    print(f"Samples/second: {stats['train_samples_per_second']:.2f}")
    # Finish with the current GPU memory reading.
    print_gpu_utilization()

In [None]:
# Print whether PyTorch can see a CUDA device
check_gpu_availability()

Cuda is available: True


In [None]:
# Store the device we are working with (CUDA when available, otherwise CPU)
device = getting_device(gpu_prefence=True)

Selected device: cuda


In [None]:
# Baseline GPU memory reading taken before the tokenizer or data are loaded;
# it is expected to be small.
print_gpu_utilization()

GPU memory occupied: 253 MB.


### IMPORTING THE DATA

In [None]:
# Directory that holds train.csv and test.csv. "/content" is presumably the
# Colab working directory; change this one constant to run elsewhere.
DATA_DIR = "/content"

# Read in train and test CSV files using Pandas
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

# Preview the first rows of the training data
train_df.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [None]:
# Convert the pandas DataFrames into `datasets.Dataset` objects
ds_train = Dataset.from_pandas(train_df)
ds_test = Dataset.from_pandas(test_df)

In [None]:
# Display the training Dataset summary (column names and number of rows)
ds_train

Dataset({
    features: ['discourse_id', 'essay_id', 'discourse_text', 'discourse_type', 'discourse_effectiveness'],
    num_rows: 36765
})

### MODEL CHOICE

We picked a Longformer because we plan to discard all information except the text and build a classification baseline with a small amount of fine-tuning. The text is pretty long, and the nice thing about Longformer is that its memory cost scales linearly with the input length, rather than quadratically (O(n^2)) as in a standard transformer (which would make it infeasible to fine-tune the transformer on a single Colab GPU).

In [None]:
# Name of the pretrained checkpoint; passed to `from_pretrained` to load the tokenizer
model_nm = "allenai/longformer-base-4096" 

### TOKENIZING

In [None]:
# IMPORTING THE MODULE TO GET THE TOKENIZER
from transformers import AutoTokenizer

In [None]:
# Load the tokenizer that belongs to the checkpoint named in `model_nm`
# (its files are downloaded when not already available locally)
tokz = AutoTokenizer.from_pretrained(model_nm)

Downloading (…)lve/main/config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# DEFINE A TOKENIZE FUNCTION USED FOR BOTH DATASETS
def tok_func(x, max_length=4096, padding="max_length"):
    """
    Tokenize the ``discourse_text`` entry of a single row or of a batch.

    Args:
        x: Mapping with a ``"discourse_text"`` entry. The notebook passes
            both a single dataset row and the batches produced by
            ``Dataset.map(..., batched=True)``.
        max_length: Maximum number of tokens per text; longer texts are
            truncated. Defaults to 4096, the value that used to be
            hard-coded here.
        padding: Padding strategy forwarded to the tokenizer. The default,
            ``"max_length"``, pads every text up to ``max_length`` tokens,
            exactly as before. Padding short texts this far is costly in
            memory, so callers may pass a different strategy.

    Returns:
        The tokenizer's output for the given text(s), unchanged.
    """
    return tokz(
        x["discourse_text"],
        truncation=True,
        max_length=max_length,
        padding=padding,
    )

In [None]:
# CHECK THAT THE TOKENIZER FUNCTION WORKS
tok_func(ds_train[19]) # the trailing 1s are padding token ids; the attention mask tells the model to ignore them

{'input_ids': [0, 43174, 512, 304, 965, 75, 1099, 4, 19079, 9, 2172, 634, 1677, 6, 959, 6, 16, 182, 1099, 4, 6068, 5, 9723, 4, 1648, 95, 442, 10, 10146, 6514, 1351, 7, 512, 3716, 6, 4806, 7, 173, 50, 334, 6, 304, 285, 4264, 6, 50, 304, 103, 2345, 9, 3626, 4264, 115, 28, 615, 7, 699, 62, 5, 935, 9, 103, 9, 24, 18, 5035, 6, 489, 36158, 8, 643, 16265, 8, 540, 5882, 6, 1871, 86, 8, 418, 6, 8, 1477, 49, 343, 70, 11, 65, 410, 568, 4, 85, 18, 95, 10, 948, 9, 164, 789, 8, 608, 24, 6, 98, 99, 16, 8197, 1268, 31, 164, 66, 8, 442, 14, 464, 122, 116, 1456, 5, 464, 47, 236, 7, 192, 8, 3000, 110, 512, 9453, 122, 4, 1437, 1437, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [None]:
# TOKENIZING THE DS
# The raw text and the id/type columns are dropped after tokenization;
# every other column is kept alongside the tokenizer output.
unused_cols = ["discourse_text", 'discourse_id', "essay_id", "discourse_type"]

tok_ds_train = ds_train.map(
    tok_func,
    batched=True,
    remove_columns=unused_cols,
)
tok_ds_test = ds_test.map(
    tok_func,
    batched=True,
    remove_columns=unused_cols,
)

  0%|          | 0/37 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# CREATE A DATASET TO FEED THE MODEL
# Bundle the two tokenized splits under the keys "train" and "test".
splits = {
    "train": tok_ds_train,
    "test": tok_ds_test,
}
ds = DatasetDict(splits)

In [None]:
# CHECK THAT THE TOKENIZER WORKS: first show the raw (untokenized) record at index 19
ds_train[19]

{'discourse_id': '9fd314f638e5',
 'essay_id': '00944C693682',
 'discourse_text': "Individual car use isn't bad. Millions of individuals using cars, however, is very bad. Note the emphasis. Even just making a concious effort to car pool, bike to work or school, use public transportation, or use some sort of alternative transportation could be enough to clear up the air of some of it's emissions, keep oneself and others happier and less stressed, save time and money, and improve their city all in one little decision. It's just a matter of going ahead and doing it, so what is stopping anyone from going out and making that change now? Be the change you want to see and limit your car usage now.  ",
 'discourse_type': 'Concluding Statement',
 'discourse_effectiveness': 'Effective'}

In [None]:
# Show the tokenized version of the record at index 19
tok_ds_train[19]

{'discourse_effectiveness': 'Effective',
 'input_ids': [0,
  43174,
  512,
  304,
  965,
  75,
  1099,
  4,
  19079,
  9,
  2172,
  634,
  1677,
  6,
  959,
  6,
  16,
  182,
  1099,
  4,
  6068,
  5,
  9723,
  4,
  1648,
  95,
  442,
  10,
  10146,
  6514,
  1351,
  7,
  512,
  3716,
  6,
  4806,
  7,
  173,
  50,
  334,
  6,
  304,
  285,
  4264,
  6,
  50,
  304,
  103,
  2345,
  9,
  3626,
  4264,
  115,
  28,
  615,
  7,
  699,
  62,
  5,
  935,
  9,
  103,
  9,
  24,
  18,
  5035,
  6,
  489,
  36158,
  8,
  643,
  16265,
  8,
  540,
  5882,
  6,
  1871,
  86,
  8,
  418,
  6,
  8,
  1477,
  49,
  343,
  70,
  11,
  65,
  410,
  568,
  4,
  85,
  18,
  95,
  10,
  948,
  9,
  164,
  789,
  8,
  608,
  24,
  6,
  98,
  99,
  16,
  8197,
  1268,
  31,
  164,
  66,
  8,
  442,
  14,
  464,
  122,
  116,
  1456,
  5,
  464,
  47,
  236,
  7,
  192,
  8,
  3000,
  110,
  512,
  9453,
  122,
  4,
  1437,
  1437,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,

In [None]:
# Decode the token ids of record 19 back into text and check that it matches the raw
# text of that record (the decoded string also contains the special tokens, e.g. <pad>)
tokz.decode(ds["train"][19]["input_ids"])


"<s>Individual car use isn't bad. Millions of individuals using cars, however, is very bad. Note the emphasis. Even just making a concious effort to car pool, bike to work or school, use public transportation, or use some sort of alternative transportation could be enough to clear up the air of some of it's emissions, keep oneself and others happier and less stressed, save time and money, and improve their city all in one little decision. It's just a matter of going ahead and doing it, so what is stopping anyone from going out and making that change now? Be the change you want to see and limit your car usage now.  </s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad