In [167]:
import mlflow
import torch
import pandas as pd
from transformers import BertForSequenceClassification,  BertTokenizerFast
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler
from transformers import AdamW, get_linear_schedule_with_warmup
import numpy as np
import random
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm


In [168]:
# Point the MLflow client at the tracking server on localhost:5000, then enable
# the PyTorch autologging integration.
# NOTE(review): presumably autolog only hooks PyTorch Lightning training, not the
# hand-written loop used later in this notebook - confirm against the MLflow docs.
tracking_server_uri = "http://localhost:5000"
mlflow.set_tracking_uri(tracking_server_uri)
mlflow.pytorch.autolog()

# 1. Model Extraction

In [169]:
# Load the pretrained checkpoint with a two-label sequence-classification head.
# Attention maps and hidden states are not returned by the forward pass.
model_id = "bert-base-uncased"
model_options = dict(
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)
model = BertForSequenceClassification.from_pretrained(model_id, **model_options)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [201]:
# Mapping from the textual sentiment label to the integer class id used for training.
label_dict = {'positive': 1, 'negative': 0}

# Only the first 1000 rows are used, so stop parsing the CSV there instead of
# loading the whole file and slicing it afterwards.
data = pd.read_csv("data/imdb_dataset.csv", nrows=1000)
data['sentiment'] = data['sentiment'].map(label_dict)

# Series.map turns any value missing from label_dict into NaN, which would
# silently produce float/NaN label tensors further down. Fail fast instead.
if data['sentiment'].isna().any():
    raise ValueError(
        "Column 'sentiment' contains values that are not keys of label_dict: "
        f"expected one of {sorted(label_dict)}"
    )

# 2. Train Test Split

In [202]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
features = data.drop(columns=['sentiment'])
targets = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

# 3. Data Preparation

In [203]:
# Load the tokenizer from the same checkpoint id as the model (`model_id`) so the
# vocabulary cannot drift away from the weights if the checkpoint is ever changed.
tokenizer = BertTokenizerFast.from_pretrained(
    model_id,
    do_lower_case=True
)

In [204]:

# Upper bound (in tokens) applied to every encoded review.
MAX_SEQ_LENGTH = 256


def encode_reviews(reviews):
    """Tokenize a collection of review strings into PyTorch tensors.

    The same settings are used for the training and validation splits, so they
    live in one place: special tokens are added, an attention mask is returned,
    and padding and truncation are enabled with ``max_length=MAX_SEQ_LENGTH``.

    Args:
        reviews: iterable of raw review strings.

    Returns:
        The tokenizer output; this notebook reads its ``'input_ids'`` and
        ``'attention_mask'`` entries.
    """
    return tokenizer.batch_encode_plus(
        list(reviews),
        add_special_tokens=True,
        return_attention_mask=True,
        padding=True,
        max_length=MAX_SEQ_LENGTH,
        return_tensors='pt',
        truncation=True
    )


encoded_data_train = encode_reviews(X_train.review.values)
encoded_data_val = encode_reviews(X_test.review.values)

# Training tensors
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(y_train.values)

# Validation tensors
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(y_test.values)

In [205]:
# Bundle (input ids, attention mask, label) so that indexing a dataset yields
# the three aligned tensors for one review.
train_tensors = (input_ids_train, attention_masks_train, labels_train)
val_tensors = (input_ids_val, attention_masks_val, labels_val)

dataset_train = TensorDataset(*train_tensors)
dataset_val = TensorDataset(*val_tensors)

# 5. Creating Data Loaders


In [206]:
batch_size = 32

# Training batches are drawn in a fresh random order on every pass.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

# Validation does not benefit from shuffling. Iterating in dataset order keeps
# the evaluation batches identical from epoch to epoch and avoids consuming
# random-number state that the training sampler depends on.
dataloader_val = DataLoader(
    dataset_val,
    shuffle=False,
    batch_size=batch_size
)

# 6. Setting up Optimizer

In [207]:
# Use PyTorch's own AdamW: the `transformers.AdamW` class is deprecated in favour
# of it. torch.optim.AdamW defaults to weight_decay=0.01 whereas the transformers
# implementation defaulted to 0.0, so the value is pinned explicitly to keep the
# original optimisation behaviour.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-5,
    eps=1e-8,
    weight_decay=0.0,
)



In [208]:
epochs = 10

# The training loop steps the scheduler once per batch, so the linear schedule
# (no warm-up) has to span every batch of every epoch.
total_training_steps = len(dataloader_train) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

# 7. Defining our Performance Metrics

In [209]:
def f1_score_func(preds, labels):
    """Return the weighted F1 score of arg-max predictions against the true labels.

    Args:
        preds: 2-D array of per-class scores, one row per sample; the predicted
            class is the arg-max along axis 1.
        labels: array of true class ids; it is flattened before scoring.

    Returns:
        The value of ``f1_score(..., average='weighted')``.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.ravel()
    return f1_score(true_classes, predicted_classes, average='weighted')

In [234]:
def accuracy_per_class(preds, labels):
    """Print and return the per-class accuracy of arg-max predictions.

    For every class id present in ``labels`` this prints the class name (looked
    up through the notebook-level ``label_dict``) and "correct/total" counts.

    Args:
        preds: 2-D array of per-class scores, one row per sample; the predicted
            class is the arg-max along axis 1.
        labels: array of true class ids; it is flattened before use.

    Returns:
        ``(negative_accuracy, positive_accuracy)``: for each class, the share of
        samples whose true label is that class and that were predicted as that
        class. A class with no samples in ``labels`` yields ``float('nan')``
        instead of raising a division-by-zero error.
    """
    label_dict_inverse = {v: k for k, v in label_dict.items()}

    negative_class = 0
    positive_class = 1
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    def class_accuracy(class_id):
        # Restrict to the samples whose TRUE label is class_id. The previous code
        # reused the subset left over from the last loop iteration for both classes.
        class_mask = labels_flat == class_id
        n_true = int(np.sum(class_mask))
        if n_true == 0:
            return float('nan')
        n_correct = int(np.sum(preds_flat[class_mask] == class_id))
        return n_correct / n_true

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat == label]
        y_true = labels_flat[labels_flat == label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy:{len(y_preds[y_preds==label])}/{len(y_true)}\n')

    return class_accuracy(negative_class), class_accuracy(positive_class)

In [235]:
# NOTE(review): `predictions` and `true_vals` are not assigned by any top-level cell
# visible in this notebook (they only appear as locals inside `evaluate`/`train`), so
# this call relies on leftover kernel state and would raise NameError on a fresh
# Restart & Run All. Obtain them first, e.g.
# `_, predictions, true_vals = evaluate(dataloader_val)`, in a cell placed after
# `evaluate` is defined.
accuracy_per_class(predictions, true_vals)

Class: negative
Accuracy:5/6

Class: positive
Accuracy:9/14



(0.8333333333333334, 0.6428571428571429)

# 8. Creating our Training Loop

In [211]:
# Seed every random-number source used by the notebook with the same value:
# Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices).
seed_val = 17
for set_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    set_seed(seed_val)

In [212]:
# Use the GPU when PyTorch reports CUDA as available, otherwise stay on the CPU,
# then move the model there and show which device was chosen.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model.to(device)
print(device)

cpu


In [213]:
def evaluate(dataloader_val):
    """Run the notebook-level `model` over a data loader without gradient tracking.

    Each batch is expected to hold, in order, the input ids, the attention mask
    and the labels. The model is switched to eval mode before iterating.

    Args:
        dataloader_val: iterable of batches that also supports ``len()``.

    Returns:
        A tuple ``(loss_val_avg, predictions, true_vals)`` where ``loss_val_avg``
        is the sum of the per-batch losses divided by the number of batches, and
        ``predictions`` / ``true_vals`` are the per-batch logits and labels
        concatenated along axis 0 as NumPy arrays.
    """
    model.eval()

    running_loss = 0
    collected_logits = []
    collected_labels = []

    for batch in tqdm(dataloader_val):
        on_device = [tensor.to(device) for tensor in batch]
        input_ids, attention_mask, labels = on_device[0], on_device[1], on_device[2]

        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )

        # The first output is the loss, the second the logits.
        batch_loss, batch_logits = outputs[0], outputs[1]
        running_loss += batch_loss.item()

        collected_logits.append(batch_logits.detach().cpu().numpy())
        collected_labels.append(labels.cpu().numpy())

    loss_val_avg = running_loss / len(dataloader_val)

    stacked_logits = np.concatenate(collected_logits, axis=0)
    stacked_labels = np.concatenate(collected_labels, axis=0)
    return loss_val_avg, stacked_logits, stacked_labels

In [214]:
def train(model):
    """Fine-tune `model` and log validation metrics to MLflow after every epoch.

    Uses the notebook-level `epochs`, `dataloader_train`, `dataloader_val`,
    `device`, `optimizer` and `scheduler`. Each training batch is expected to
    hold, in order, the input ids, the attention mask and the labels.

    After each epoch the average training loss, the validation loss, the
    weighted F1 score and the per-class accuracies are written out and logged
    to MLflow with the epoch number as the metric step.

    Args:
        model: the model to train; it is called with ``input_ids``,
            ``attention_mask`` and ``labels`` keyword arguments and its first
            output is used as the loss.
    """
    for epoch in range(1, epochs + 1):
        model.train()
        loss_train_total = 0

        progress_bar = tqdm(dataloader_train,
                            desc=f'Epoch {epoch}',
                            leave=False,
                            disable=False)

        for batch in progress_bar:
            model.zero_grad()
            batch = tuple(b.to(device) for b in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[2]
            }

            outputs = model(**inputs)
            loss = outputs[0]
            batch_loss = loss.item()

            loss_train_total += batch_loss
            loss.backward()

            # Cap the total gradient norm at 1.0 before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            # `batch` is the tuple of tensors, so len(batch) is the number of
            # tensors in it, not the batch size. Dividing the loss by it only
            # distorted the displayed value; show the batch loss as it is.
            progress_bar.set_postfix({'training_loss': f'{batch_loss:.3f}'})

        tqdm.write(f'\nEpoch {epoch}')
        loss_train_avg = loss_train_total / len(dataloader_train)
        tqdm.write(f'Training loss: {loss_train_avg}')

        val_loss, predictions, true_vals = evaluate(dataloader_val)
        val_f1 = f1_score_func(predictions, true_vals)
        tqdm.write(f'Validation loss: {val_loss}')
        tqdm.write(f'F1 Score (weighted): {val_f1}')

        # accuracy_per_class returns (negative, positive), in that order.
        accuracy_negative, accuracy_positive = accuracy_per_class(predictions, true_vals)

        # Passing the epoch as `step` makes each metric a per-epoch series
        # rather than repeated values logged at the same step.
        mlflow.log_metrics({
            "train_loss": loss_train_avg,
            "val_loss": val_loss,
            "val_f1": val_f1,
            "accuracy_positive": accuracy_positive,
            "accuracy_negative": accuracy_negative
        }, step=epoch)

In [218]:
expr_name = "/Users/yuriivoievidka/bert-base-uncasedt-train"
s3_bucket = "s3://model-storage-05062024"
model_name = "BERT_IMDB_Classification"
# Run-relative directory under which the model artifacts are stored. It must be
# a plain relative path, not a `runs:/...` URI.
artifact_path = "model"

# create_experiment raises when the experiment already exists, which made this
# cell fail on every execution after the first. Only create it when missing.
if mlflow.get_experiment_by_name(expr_name) is None:
    mlflow.create_experiment(expr_name, artifact_location=s3_bucket)

mlflow.set_experiment(expr_name)


# The `with` block ends the run on exit, so no explicit mlflow.end_run() is needed.
with mlflow.start_run() as run:
    mlflow.set_tag("Training Info", "Training BERT on IMDB Dataset")
    mlflow.log_param("model_name", 'bert-base-uncased')
    mlflow.log_param("num_train_epochs", epochs)
    mlflow.log_param("per_device_train_batch_size", batch_size)

    train(model)

    print(run.info)

    # Log the trained model first: registering "runs:/<run_id>/<artifact_path>"
    # can only succeed once artifacts exist at that location in the run.
    mlflow.pytorch.log_model(model, artifact_path)
    print(mlflow.get_artifact_uri(artifact_path))

    result = mlflow.register_model(
        "runs:/{}/{}".format(run.info.run_id, artifact_path),
        model_name
    )

Epoch 1:   0%|          | 0/25 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Display the metadata of `run`, the object bound by `with mlflow.start_run() as run`
# in the run cell above (this cell has not been executed: In [None]).
run.info

In [None]:
# Unexecuted cell (In [None]) that repeats the log_model / get_artifact_uri calls
# already present in the run cell above, reusing its `model` and `artifact_path`.
# NOTE(review): presumably kept to retry model logging after the interrupted run -
# confirm, and drop it once the run cell completes normally.
mlflow.pytorch.log_model(model, artifact_path)
print(mlflow.get_artifact_uri(artifact_path))


In [None]:
# Explicitly end the MLflow run.
# NOTE(review): the `with mlflow.start_run()` block in the run cell above already
# delimits that run, so this is presumably only needed after an interrupted
# execution or after the manual logging cell above - confirm.
mlflow.end_run()
