In [1]:
import os
!pip install mlflow dagshub

Installing collected packages: rfc3986, fusepy, commonmark, aniso8601, treelib, rich, querystring-parser, pathvalidate, graphql-core, dacite, cachetools, requests-toolbelt, httpcore, gunicorn, graphql-relay, gql, botocore, httpx, graphene, mlflow, dagshub
  Attempting uninstall: rich
    Found existing installation: rich 13.7.0
    Uninstalling rich-13.7.0:
      Successfully uninstalled rich-13.7.0
  Attempting uninstall: dacite
    Found existing installation: dacite 1.8.1
    Uninstalling dacite-1.8.1:
      Successfully uninstalled dacite-1.8.1
  Attempting uninstall: cachetools
    Found existing installation: cachetools 4.2.4
    Uninstalling cachetools-4.2.4:
      Successfully uninstalled cachetools-4.2.4
  Attempting uninstall: requests-toolbelt
    Found existing installation: requests-toolbelt 0.10.1
    Uninstalling requests-toolbelt-0.10.1:
      Successfully uninstalled requests-toolbelt-0.10.1
  Attempting uninstall: httpcore
    Found existing installation: httpcore 1.0

In [None]:
# Download the AWS CLI v2 bundle for Linux x86_64, unpack it (overwriting any
# earlier extraction) and run the bundled installer in update mode.
!curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
!unzip -o awscliv2.zip 
!./aws/install --update

In [3]:
# Write the AWS credentials into the CLI configuration. The '<>' values are
# placeholders: fill them in locally and never commit real keys with the notebook.
!aws configure set aws_access_key_id '<>'
!aws configure set aws_secret_access_key '<>'

In [1]:
from dotenv import load_dotenv
import os

# Load settings from a local .env file into the process environment; later cells
# read them with os.getenv (AWS keys, bucket name, MLflow tracking URI).
load_dotenv()

True

In [None]:
from boto3.session import Session
import boto3

# Credentials and bucket name are read from the environment. When the key
# variables are unset, None is passed to Session unchanged, exactly as before.
ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
SECRET_KEY = os.getenv("AWS_SECRET_KEY")
BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
if not BUCKET_NAME:
    # Fail early with a clear message instead of an opaque boto3 error later.
    raise RuntimeError("AWS_BUCKET_NAME is not set; add it to the environment or the .env file")

session = Session(aws_access_key_id=ACCESS_KEY,
                  aws_secret_access_key=SECRET_KEY)

# The resource API is only used to list the bucket; it gets its own name so
# that `s3` always refers to the low-level client.
s3_resource = session.resource('s3')
your_bucket = s3_resource.Bucket(BUCKET_NAME)

for s3_file in your_bucket.objects.all():
    print(s3_file.key) # prints the contents of bucket

s3 = session.client('s3')

# NOTE(review): `data_path` is not defined in any cell visible here - confirm it
# is set (e.g. in a config cell) before this cell runs.
local_csv_path = f'{data_path}/imdb_dataset.csv'
# download_file does not create missing parent directories.
os.makedirs(os.path.dirname(local_csv_path) or '.', exist_ok=True)
s3.download_file(BUCKET_NAME, 'data/imdb_dataset.csv', local_csv_path)

In [None]:
# Point MLflow at the tracking server named by the MLFLOW_TRACKING_URI variable.
# For a local server use instead: mlflow.set_tracking_uri(uri="http://localhost:5000")
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
# NOTE(review): MLflow's PyTorch autologging is documented for PyTorch Lightning
# trainers; with the hand-written training loop in this notebook it may log
# nothing - confirm, metrics are logged explicitly inside train().
mlflow.pytorch.autolog()

# 1. Loading the Pre-trained Model

In [None]:
# Load the checkpoint configured for two output labels; attention weights and
# hidden states are not returned from the forward pass.
# NOTE(review): this cell runs before the seeding cell further down, so any
# random initialisation done here is not covered by those seeds - confirm.
model_id = "bert-base-uncased"
model = BertForSequenceClassification.from_pretrained(
    model_id, num_labels=2, output_attentions=False, output_hidden_states=False
)


In [None]:
# Text label -> integer class id used throughout the notebook.
label_dict = {'positive': 1, 'negative': 0}

# Read the reviews, encode the sentiment column as integers, then keep only
# the first 10 rows.
data = (
    pd.read_csv(f"{data_path}/imdb_dataset.csv")
    .assign(sentiment=lambda frame: frame['sentiment'].map(label_dict))
    .iloc[:10]
)

# 2. Train Test Split

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['sentiment']),
    data['sentiment'],
    test_size=0.2,
    random_state=42,
)

# 3. Data Preparation

In [None]:
# Reuse `model_id` (set in the model-loading cell) instead of repeating the
# checkpoint name, so the tokenizer cannot drift from the model it feeds.
tokenizer = BertTokenizerFast.from_pretrained(
    model_id,
    do_lower_case=True
)

In [None]:

def _encode_reviews(reviews):
    """Tokenize a sequence of review strings with the notebook's tokenizer.

    Sequences are truncated to at most 256 tokens, padded, and returned as
    PyTorch tensors together with their attention masks.
    """
    return tokenizer.batch_encode_plus(
        list(reviews),
        add_special_tokens=True,
        return_attention_mask=True,
        padding=True,
        max_length=256,
        return_tensors='pt',
        truncation=True,
    )


encoded_data_train = _encode_reviews(X_train.review.values)
encoded_data_val = _encode_reviews(X_test.review.values)

# Training tensors.
input_ids_train, attention_masks_train = (
    encoded_data_train['input_ids'],
    encoded_data_train['attention_mask'],
)
labels_train = torch.tensor(y_train.values)

# Validation tensors.
input_ids_val, attention_masks_val = (
    encoded_data_val['input_ids'],
    encoded_data_val['attention_mask'],
)
labels_val = torch.tensor(y_test.values)

In [None]:
# Each sample is (input_ids, attention_mask, label); train() and evaluate()
# index batches in exactly this order.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

# 5. Creating Data Loaders


In [None]:
batch_size = 32

# Training batches are drawn in random order each epoch.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

# Validation metrics do not depend on sample order, so iterate sequentially:
# this keeps evaluation deterministic and stops it from consuming the global
# RNG that the training sampler draws from.
dataloader_val = DataLoader(
    dataset_val,
    shuffle=False,
    batch_size=batch_size
)

# 6. Setting up Optimizer

In [None]:
# AdamW over every model parameter with a learning rate of 1e-5.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

In [None]:
epochs = 10

# The scheduler is stepped once per training batch, so the schedule spans
# (batches per epoch) * (number of epochs) steps, with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train) * epochs
)

# 7. Defining our Performance Metrics

In [None]:
def f1_score_func(preds, labels):
    """Return the weighted F1 score of the arg-max predictions.

    Args:
        preds: 2-D array of per-class scores, one row per sample.
        labels: array of true class ids.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(labels.flatten(), predicted_classes, average='weighted')

In [None]:
def accuracy_per_class(preds, labels, label_names=None):
    """Print per-class accuracy and return it as ``(negative, positive)``.

    Args:
        preds: 2-D array of per-class scores, one row per sample; the predicted
            class is the arg-max over axis 1.
        labels: array of true class ids (0 = negative, 1 = positive).
        label_names: optional mapping from class id to display name. Defaults
            to the inverse of the module-level ``label_dict``.

    Returns:
        Tuple ``(negative_accuracy, positive_accuracy)`` of floats in [0, 1].
        A class with no samples in ``labels`` is reported as 0.0 instead of
        raising ``ZeroDivisionError``.
    """
    if label_names is None:
        label_names = {v: k for k, v in label_dict.items()}

    negative_class = 0
    positive_class = 1
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = np.asarray(labels).flatten()

    # Accuracy is recorded per class inside the loop; the previous version
    # reused the last iteration's predictions for both returned values.
    per_class = {}
    for label in np.unique(labels_flat):
        class_mask = labels_flat == label
        n_total = int(class_mask.sum())
        n_correct = int((preds_flat[class_mask] == label).sum())
        print(f'Class: {label_names.get(label, label)}')
        print(f'Accuracy:{n_correct}/{n_total}\n')
        per_class[int(label)] = n_correct / n_total

    return (per_class.get(negative_class, 0.0),
            per_class.get(positive_class, 0.0))

# 8. Creating our Training Loop

In [None]:
seed_val = 17

# Seed Python's, NumPy's and PyTorch's (CPU and all CUDA devices) generators
# with the same value.
# NOTE(review): the model is instantiated in an earlier cell, i.e. before
# these seeds are applied.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU, and
# move the model there.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
print(device)

In [None]:
def evaluate(dataloader_val):
    """Run the module-level ``model`` over ``dataloader_val`` without gradients.

    Args:
        dataloader_val: iterable of batches whose first three tensors are
            input ids, attention mask and labels.

    Returns:
        Tuple ``(mean_batch_loss, logits, labels)`` where ``logits`` and
        ``labels`` are NumPy arrays concatenated over all batches.
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for raw_batch in tqdm(dataloader_val):
        on_device = tuple(tensor.to(device) for tensor in raw_batch)
        batch_labels = on_device[2]

        with torch.no_grad():
            outputs = model(
                input_ids=on_device[0],
                attention_mask=on_device[1],
                labels=batch_labels,
            )

        # outputs[0] is the loss, outputs[1] the logits.
        running_loss += outputs[0].item()
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(batch_labels.cpu().numpy())

    mean_loss = running_loss / len(dataloader_val)

    return (
        mean_loss,
        np.concatenate(logit_chunks, axis=0),
        np.concatenate(label_chunks, axis=0),
    )

In [None]:
def train(model):
    """Fine-tune ``model`` and log validation metrics to MLflow after each epoch.

    Relies on module-level state: ``epochs``, ``dataloader_train``,
    ``dataloader_val``, ``optimizer``, ``scheduler`` and ``device``.
    Note that ``evaluate`` runs the module-level ``model``, which is expected
    to be the same object passed in here.

    Args:
        model: the model to train; called with ``input_ids``,
            ``attention_mask`` and ``labels`` keyword arguments, and its first
            output is used as the loss.
    """
    for epoch in range(1, epochs + 1):
        model.train()
        loss_train_total = 0

        progress_bar = tqdm(dataloader_train,
                            desc=f'Epoch {epoch}',
                            leave=False,
                            disable=False)

        for batch in progress_bar:
            model.zero_grad()
            batch = tuple(b.to(device) for b in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[2]
            }

            outputs = model(**inputs)
            loss = outputs[0]

            loss_train_total += loss.item()
            loss.backward()

            # Clip the gradient norm to 1.0 before the optimizer update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            # Show the batch loss as-is: `len(batch)` is the number of tensors
            # in the tuple (3), not the batch size, so dividing by it was wrong.
            progress_bar.set_postfix({'training_loss': f'{loss.item():.3f}'})

        tqdm.write(f'\nEpoch {epoch}')
        loss_train_avg = loss_train_total / len(dataloader_train)
        tqdm.write(f'Training loss: {loss_train_avg}')

        val_loss, predictions, true_vals = evaluate(dataloader_val)
        val_f1 = f1_score_func(predictions, true_vals)
        tqdm.write(f'Validation loss: {val_loss}')
        tqdm.write(f'F1 Score (weighted): {val_f1}')
        # accuracy_per_class returns (negative, positive) in that order.
        accuracy_negative, accuracy_positive = accuracy_per_class(predictions, true_vals)

        mlflow.log_metrics({
            "train_loss": loss_train_avg,
            "val_loss": val_loss,
            "val_f1": val_f1,
            "accuracy_positive": accuracy_positive,
            "accuracy_negative": accuracy_negative
        },  step=epoch)

In [None]:
# Name of the MLflow experiment that runs from this notebook are recorded under.
expr_name = "/Users/yuriivoievidka/bert-base-uncased-train"
# S3 location referenced only by the disabled create_experiment call below.
s3_bucket = "s3://model-storage-05062024"
# One-time creation of the experiment with an S3 artifact location (disabled):
# mlflow.create_experiment(expr_name, artifact_location=s3_bucket)
# mlflow.set_experiment(expr_name)
mlflow.set_experiment(expr_name)


In [None]:
# Train inside a single MLflow run, then log the fine-tuned model and register
# it. The `with` block ends the run on exit (also on error), so no explicit
# mlflow.end_run() call is needed afterwards.
with mlflow.start_run() as run:
    mlflow.set_tag("Training Info", "Training BERT on IMDB Dataset")
    mlflow.log_param("model_name", 'bert-base-uncased')
    mlflow.log_param("num_train_epochs", epochs)
    mlflow.log_param("per_device_train_batch_size", batch_size)

    train(model)

    print(run.info)

    # Log the model first: registration needs an artifact that already exists.
    artifact_path = "bert-imdb-sentiment-analyser"
    model_info = mlflow.pytorch.log_model(model, artifact_path)
    print(mlflow.get_artifact_uri(artifact_path))

    # Register exactly what was just logged. The earlier code pointed at
    # "runs:/<run_id>/model", a path nothing in this notebook logs to, and did
    # so before any model had been logged.
    model_name = "BERT_IMDB_Classification"
    result = mlflow.register_model(
        model_info.model_uri,
        model_name
    )