In [2]:
# To check the GPU config
!nvidia-smi

Fri Oct 20 18:22:12 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Install necessary libraries

In [3]:
!pip install transformers[sentencepiece] datasets evaluate

Collecting transformers[sentencepiece]
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m44.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[sentencepiece])
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[sentencepiece])
  Downloading tokenizers-0.14.1-cp310-cp310-man

In [4]:
#Necessary for running the training arguments
!pip install --upgrade accelerate
!pip install -y transformers accelerate
!pip install transformers accelerate

Collecting accelerate
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/258.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/258.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.23.0

Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

no such option: -y


In [None]:
# Data processing
import pandas as pd
import numpy as np

# Modeling
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Progress bar
from tqdm.auto import tqdm

# Hugging Face Dataset
from datasets import Dataset

# Model performance evaluation
import evaluate

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [None]:
train_df = pd.read_csv("/content/BBC News Train.csv")

In [None]:
train_df.Category.value_counts().to_frame()

Unnamed: 0,Category
sport,346
business,336
politics,274
entertainment,273
tech,261


In [None]:
#Encoding the labels
train_df['encoded_label'] = train_df['Category'].astype('category').cat.codes

train_df.head(10)

Unnamed: 0,ArticleId,Text,Category,encoded_label
0,1833,worldcom ex-boss launches defence lawyers defe...,business,0
1,154,german business confidence slides german busin...,business,0
2,1101,bbc poll indicates economic gloom citizens in ...,business,0
3,1976,lifestyle governs mobile choice faster bett...,tech,4
4,917,enron bosses in $168m payout eighteen former e...,business,0
5,1582,howard truanted to play snooker conservative...,politics,2
6,651,wales silent on grand slam talk rhys williams ...,sport,3
7,1797,french honour for director parker british film...,entertainment,1
8,2034,car giant hit by mercedes slump a slump in pro...,business,0
9,1866,fockers fuel festive film chart comedy meet th...,entertainment,1


###Split the dataset

In [None]:
# Training dataset
train_data = train_df.sample(frac=0.8, random_state=42)

# Testing dataset
test_data = train_df.drop(train_data.index)

# Check the number of records in training and testing dataset.
print(f'The training dataset has {len(train_data)} records.')
print(f'The testing dataset has {len(test_data)} records.')

The training dataset has 1192 records.
The testing dataset has 298 records.


In [None]:
# Convert pyhton dataframe to Hugging Face arrow dataset
hg_train_data = Dataset.from_pandas(train_data)
hg_test_data = Dataset.from_pandas(test_data)

In [None]:
# Length of the Dataset
print(f'The length of hg_train_data is {len(hg_train_data)}.\n')

# Check one review
hg_train_data[0]

The length of hg_train_data is 1192.



{'ArticleId': 2160,
 'Category': 'entertainment',
 'encoded_label': 1,
 '__index_level_0__': 941}

### Tokenize the text

In [None]:
# Tokenizer from a pretrained model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Take a look at the tokenizer
tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [None]:
# Funtion to tokenize data
def tokenize_dataset(data):
    return tokenizer(data["Text"],
                     max_length=512,
                     truncation=True,
                     padding="max_length")

# Tokenize the dataset
dataset_train = hg_train_data.map(tokenize_dataset)
dataset_test = hg_test_data.map(tokenize_dataset)

Map:   0%|          | 0/1192 [00:00<?, ? examples/s]

Map:   0%|          | 0/298 [00:00<?, ? examples/s]

In [None]:
# Take a look at the data
print(dataset_train)
print(dataset_test)

Dataset({
    features: ['ArticleId', 'Text', 'Category', 'encoded_label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1192
})
Dataset({
    features: ['ArticleId', 'Text', 'Category', 'encoded_label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 298
})


### Preprocess the dataset

In [None]:
# Remove the review and index columns because it will not be used in the model
dataset_train = dataset_train.remove_columns(["ArticleId", "Text", "Category", "__index_level_0__"])
dataset_test = dataset_test.remove_columns(["ArticleId", "Text", "Category", "__index_level_0__"])

# Rename label to labels because the model expects the name labels
dataset_train = dataset_train.rename_column("encoded_label", "labels")
dataset_test = dataset_test.rename_column("encoded_label", "labels")

# Change the format to PyTorch tensors
dataset_train.set_format("torch")
dataset_test.set_format("torch")

# Take a look at the data
print(dataset_train)
print(dataset_test)

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1192
})
Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 298
})


In [None]:
# Check the first record
dataset_train[100]

{'labels': tensor(1),
 'input_ids': tensor([  101, 12297,  7867,  5260,  2866,  7436,  8069,  3505, 11797,  1055,
          2143, 12297,  7867,  2097,  2599,  2329,  8069,  2012,  2023,  2095,
          1055,  2914,  2982,  2044,  2893,  2093,  9930,  1012, 10047, 14273,
          2050,  2358,  4887, 15104,  2001,  4222,  2005,  2190,  3883,  2005,
          2014,  2535,  1999,  1996, 11324,  3689,  2096, 11797,  2363, 11232,
          2005,  2190,  2472,  1998,  2434,  9000,  1012,  5736,  5222,  7485,
          2001,  2036,  4222,  1999,  1996,  2190,  3883,  4696,  2005,  2014,
          2535,  1999, 10721,  9609,  1997,  1996,  3962,  3238,  2568,  1012,
          1998, 14675,  7291,  1998,  8234,  7929, 17799,  2080,  2119,  2288,
          4222,  2005,  4637,  4395,  1999,  3553,  1998,  3309, 17591,  4414,
          1012,  7291,  2038,  2525,  2042,  2081,  2338, 12088,  8837,  2005,
          2190,  4637,  3364,  2005,  1996,  2535,  1999,  3553,  2008,  2038,
          2525, 1

### DataLoader

In [None]:
# Empty cache
torch.cuda.empty_cache()

# DataLoader
train_dataloader = DataLoader(dataset=dataset_train, shuffle=True, batch_size=4)
eval_dataloader = DataLoader(dataset=dataset_test, batch_size=4)

### Load the Pretrained Model

In [None]:
id2label = {0: "business", 1: "entertainment", 2: "politics", 3: "sport", 4: "tech"}
label2id = {"business": 0, "entertainment": 1, "politics": 2, "sport": 3, "tech": 4 }

In [None]:
# Load model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5,id2label=id2label, label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import evaluate

accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

### Training

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="bert-base-cased",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_steps = 50,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.190009,0.979866
2,No log,0.107408,0.97651
3,No log,0.060758,0.986577
4,No log,0.048019,0.989933
5,No log,0.051927,0.986577


TrainOutput(global_step=375, training_loss=0.19290401204427082, metrics={'train_runtime': 673.0898, 'train_samples_per_second': 8.855, 'train_steps_per_second': 0.557, 'total_flos': 1568184129085440.0, 'train_loss': 0.19290401204427082, 'epoch': 5.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.04801931604743004,
 'eval_accuracy': 0.9899328859060402,
 'eval_runtime': 10.2897,
 'eval_samples_per_second': 28.961,
 'eval_steps_per_second': 1.846,
 'epoch': 5.0}

In [None]:
# Saving the model and tokenizer
save_directory = "/saved_models"

model.save_pretrained(save_directory)

tokenizer.save_pretrained(save_directory)

('/saved_models/tokenizer_config.json',
 '/saved_models/special_tokens_map.json',
 '/saved_models/vocab.txt',
 '/saved_models/added_tokens.json',
 '/saved_models/tokenizer.json')

### Inferencing

In [None]:
#Loading the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(save_directory)

best_model = AutoModelForSequenceClassification.from_pretrained(save_directory)


In [None]:
# Loading the test data from csv
pred_df = pd.read_csv("/content/BBC News Test.csv")

# Convert pyhton dataframe to Hugging Face arrow dataset
hg_pred_data = Dataset.from_pandas(pred_df)
test_text = hg_pred_data[1]["Text"]
test_text

'software watching while you work software that can not only monitor every keystroke and action performed at a pc but also be used as legally binding evidence of wrong-doing has been unveiled.  worries about cyber-crime and sabotage have prompted many employers to consider monitoring employees. the developers behind the system claim it is a break-through in the way data is monitored and stored. but privacy advocates are concerned by the invasive nature of such software.  the system is a joint venture between security firm 3ami and storage specialists bridgehead software. they have joined forces to create a system which can monitor computer activity  store it and retrieve disputed files within minutes. more and more firms are finding themselves in deep water as a result of data misuse. sabotage and data theft are most commonly committed from within an organisation according to the national hi-tech crime unit (nhtcu) a survey conducted on its behalf by nop found evidence that more than 8

In [None]:
tokenized_prediction_input = tokenizer.encode(test_text, truncation = True, padding = True, return_tensors = 'pt')

tokenized_prediction_input

tensor([[  101,  4007,  3666,  2096,  2017,  2147,  4007,  2008,  2064,  2025,
          2069,  8080,  2296,  6309, 13181,  3489,  1998,  2895,  2864,  2012,
          1037,  7473,  2021,  2036,  2022,  2109,  2004, 10142,  8031,  3350,
          1997,  3308,  1011,  2725,  2038,  2042, 11521,  1012, 15508,  2055,
         16941,  1011,  4126,  1998, 20223,  2031,  9469,  2116, 12433,  2000,
          5136,  8822,  5126,  1012,  1996,  9797,  2369,  1996,  2291,  4366,
          2009,  2003,  1037,  3338,  1011,  2083,  1999,  1996,  2126,  2951,
          2003, 17785,  1998,  8250,  1012,  2021,  9394, 13010,  2024,  4986,
          2011,  1996, 17503,  3267,  1997,  2107,  4007,  1012,  1996,  2291,
          2003,  1037,  4101,  6957,  2090,  3036,  3813, 23842,  4328,  1998,
          5527, 15744,  2958,  4974,  4007,  1012,  2027,  2031,  2587,  2749,
          2000,  3443,  1037,  2291,  2029,  2064,  8080,  3274,  4023,  3573,
          2009,  1998, 12850, 11621,  6764,  2306,  

In [None]:
output = best_model(tokenized_prediction_input).logits

prediction_value_pt = torch.argmax(output, dim = 1 ).item()

prediction_value_pt


4

In [None]:
id2label = {0: "business", 1: "entertainment", 2: "politics", 3: "sport", 4: "tech"}
label2id = {"business": 0, "entertainment": 1, "politics": 2, "sport": 3, "tech": 4 }

In [None]:
print(id2label)

{0: 'business', 1: 'entertainment', 2: 'politics', 3: 'sport', 4: 'tech'}


In [None]:
best_model.config.id2label[prediction_value_pt]

'tech'