### ENV Setting

In [5]:
!module load conda
!conda activate myenv
!module load cudnn/9.1.0
!module load nccl/2.21.5

### Data preprocessing

In [6]:
import pandas as pd

df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
# Remove the break tags (<br />)
df['review_cleaned'] = df['review'].apply(lambda x: x.replace('<br />', ''))

# Remove unnecessary whitespace
df['review_cleaned'] = df['review_cleaned'].replace('\s+', ' ', regex=True)

# Compare 72 characters of the second review before and after cleaning
print('Before cleaning:')
print(df.iloc[1]['review'][0:72])

print('\nAfter cleaning:')
print(df.iloc[1]['review_cleaned'][0:72])

Before cleaning:
A wonderful little production. <br /><br />The filming technique is very

After cleaning:
A wonderful little production. The filming technique is very unassuming-


In [8]:
df['sentiment_encoded'] = df['sentiment'].apply(lambda x: 0 if x == 'negative' else 1)
df.head()

Unnamed: 0,review,sentiment,review_cleaned,sentiment_encoded
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production. The filming tec...,1
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,negative,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"Petter Mattei's ""Love in the Time of Money"" is...",1


### Tokenize

In [9]:
#pip install 'transformers[torch]'

In [10]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print(tokenizer)

  from .autonotebook import tqdm as notebook_tqdm


BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


In [11]:
# Just for test - return tensor pt/tf/np for each type
sample_sentence = 'I liked this movie'
token_ids = tokenizer.encode(sample_sentence, return_tensors='np')[0]
print(f'Token IDs: {token_ids}')

tokens = tokenizer.convert_ids_to_tokens(token_ids)
print(f'Tokens   : {tokens}')

Token IDs: [ 101 1045 4669 2023 3185  102]
Tokens   : ['[CLS]', 'i', 'liked', 'this', 'movie', '[SEP]']


In [12]:
#pip install torch

In [13]:
review = df['review_cleaned'].iloc[0]

token_ids = tokenizer.encode(
    review,
    max_length = 512,
    padding = 'max_length',
    truncation = True,
    return_tensors = 'pt')

print(token_ids)

tensor([[  101,  2028,  1997,  1996,  2060, 15814,  2038,  3855,  2008,  2044,
          3666,  2074,  1015, 11472,  2792,  2017,  1005,  2222,  2022, 13322,
          1012,  2027,  2024,  2157,  1010,  2004,  2023,  2003,  3599,  2054,
          3047,  2007,  2033,  1012,  1996,  2034,  2518,  2008,  4930,  2033,
          2055, 11472,  2001,  2049, 24083,  1998,  4895, 10258,  2378,  8450,
          5019,  1997,  4808,  1010,  2029,  2275,  1999,  2157,  2013,  1996,
          2773,  2175,  1012,  3404,  2033,  1010,  2023,  2003,  2025,  1037,
          2265,  2005,  1996,  8143, 18627,  2030,  5199,  3593,  1012,  2023,
          2265,  8005,  2053, 17957,  2007, 12362,  2000,  5850,  1010,  3348,
          2030,  4808,  1012,  2049,  2003, 13076,  1010,  1999,  1996,  4438,
          2224,  1997,  1996,  2773,  1012,  2009,  2003,  2170, 11472,  2004,
          2008,  2003,  1996,  8367,  2445,  2000,  1996, 17411,  4555,  3036,
          2110,  7279,  4221, 12380,  2854,  1012,  

In [14]:
import os
import pickle

output_file = 'tokenized_data.pkl'
def tokenize_data(df):
    token_ids = []
    attention_masks = []
    
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    for i, review in enumerate(df['review_cleaned']):
        if i % 100 == 0:
            print(f"{i} data processed")
        
        batch_encoder = tokenizer.encode_plus(
            review,
            max_length = 512,
            padding = 'max_length',
            truncation = True,
            return_tensors = 'pt'
        )

        token_ids.append(batch_encoder['input_ids'])
        attention_masks.append(batch_encoder['attention_mask'])

    token_ids = torch.cat(token_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return token_ids, attention_masks

def save_tokenized_data(token_ids, attention_masks, output_file):
    data = {'token_ids' : token_ids, 'attention_masks' : attention_masks}

    with open(output_file, 'wb') as f:
        pickle.dump(data, f)
    print(f"Tokenized data saved as {output_file}")

def load_tokenized_data(file_path):
    with open(file_path, 'rb') as f:
        data = pickle.load(f)
    print(f"Tokenized data loaded from {file_path}")
    return data['token_ids'], data['attention_masks']


if not os.path.exists(output_file):
    print("Preprocessing started")
    token_ids, attention_masks = tokenize_data(df)
    save_tokenized_data(token_ids, attention_masks, output_file)
else:
    print("Loaded Preprocessing data")
    token_ids, attention_masks = load_tokenized_data(output_file)

len(token_ids)
len(attention_masks)

Loaded Preprocessing data
Tokenized data loaded from tokenized_data.pkl


50000

### Dataloader

In [27]:
# Check if GPU is available for faster training time
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

device

device(type='cuda', index=0)

In [34]:
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

val_size = 0.1

# Split the token IDs
train_ids, val_ids = train_test_split(
                        token_ids,
                        test_size=val_size,
                        shuffle=False)

# Split the attention masks
train_masks, val_masks = train_test_split(
                            attention_masks,
                            test_size=val_size,
                            shuffle=False)

# Split the labels
labels = torch.tensor(df['sentiment_encoded'].values)
train_labels, val_labels = train_test_split(
                                labels,
                                test_size=val_size,
                                shuffle=False)

# Create the DataLoaders
train_ids = train_ids.to(device)
train_masks = train_masks.to(device)
train_labels = train_labels.to(device)
val_ids = val_ids.to(device)
val_masks = val_masks.to(device)
val_labels = val_labels.to(device)

train_data = TensorDataset(train_ids, train_masks, train_labels)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=16)
val_data = TensorDataset(val_ids, val_masks, val_labels)
test_dataloader = DataLoader(val_data, batch_size=16)

### Instantiate Model

In [35]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2)

model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Instantiate Optim, loss fn, scheduler

In [36]:
from torch.optim import AdamW
import torch.nn as nn
from transformers import get_linear_schedule_with_warmup

EPOCHS = 2

optimizer = AdamW(model.parameters())

loss_function = nn.CrossEntropyLoss()

num_training_steps = EPOCHS * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0, 
    num_training_steps=num_training_steps)

### Training
It seems that it works well, but i think i have to make job and upload to GPU server

In [41]:
from tqdm import tqdm

for epoch in range(0, EPOCHS):

    model.train()
    training_loss = 0

    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}'):

        model.zero_grad()
        
        batch_token_ids = batch[0].to(device)
        batch_attention_mask = batch[1].to(device)
        batch_labels = batch[2].to(device)

        loss, logits = model(
            batch_token_ids,
            token_type_ids = None,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
            return_dict=False)

        training_loss += loss.item()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    average_train_loss = training_loss / len(train_dataloader)

Epoch 1:   1%|          | 30/2813 [00:11<17:59,  2.58it/s]


KeyboardInterrupt: 

In [46]:
from tqdm import tqdm
import numpy as np

def calculate_accuracy(preds, labels):
    """ Calculate the accuracy of model predictions against true labels.

    Parameters:
        preds (np.array): The predicted label from the model
        labels (np.array): The true label

    Returns:
        accuracy (float): The accuracy as a percentage of the correct
            predictions.
    """
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    accuracy = np.sum(pred_flat == labels_flat) / len(labels_flat)

    return accuracy

def calculate_accuracy_gpu(preds, labels):
    pred_flat = torch.argmax(preds, dim=1).flatten()
    labels_flat = labels.flatten()
    accuracy = torch.sum(pred_flat == labels_flat).item() / len(labels_flat)
    return accuracy

model.to(device)
model.eval()
val_loss = 0
val_accuracy = 0

for batch in tqdm(test_dataloader):

    batch_token_ids = batch[0].to(device)
    batch_attention_mask = batch[1].to(device)
    batch_labels = batch[2].to(device)

    with torch.no_grad():
        (loss, logits) = model(
            batch_token_ids,
            attention_mask = batch_attention_mask,
            labels = batch_labels,
            token_type_ids = None,
            return_dict=False)

    # For CPU function
    # logits = logits.detach().cpu().numpy()
    # label_ids = batch_labels.to('cpu').numpy()
    # val_loss += loss.item()
    # val_accuracy += calculate_accuracy(logits, label_ids)

    val_loss += loss.item()
    val_accuracy += calculate_accuracy_gpu(logits, batch_labels)

average_val_accuracy = val_accuracy / len(test_dataloader)

100%|██████████| 313/313 [00:38<00:00,  8.07it/s]


In [47]:
average_val_accuracy

0.4936102236421725