# Python and SageMaker Setup

In [None]:
# %pip installs into the running kernel's environment; the version is pinned so
# a fresh runtime resolves the same SDK release every time.
%pip install -q sagemaker==2.150.0


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sagemaker
  Downloading sagemaker-2.150.0.tar.gz (747 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m747.7/747.7 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting attrs<23,>=20.3.0
  Downloading attrs-22.2.0-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boto3<2.0,>=1.26.28
  Downloading boto3-1.26.121-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.6/135.6 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf3-to-dict<1.0,>=0.1.5
  Downloading protobuf3-to-dict-0.1.5.tar.gz (3.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting smdebug_rulesconfig==1.0.1
  Downloading smdebug_rulesconfig-1.0.1-py2.py3-none

# Imports and Environment Setup

In [None]:
import numpy as np                                # For matrix operations and numerical processing
import pandas as pd                               # For munging tabular data
import matplotlib.pyplot as plt                   # For charts and visualizations
from IPython.display import Image                 # For displaying images in the notebook
from IPython.display import display               # For displaying outputs in the notebook
from time import gmtime, strftime                 # For labeling SageMaker models, endpoints, etc.
import sys                                        # For writing outputs to notebook
import math                                       # For ceiling function
import json                                       # For parsing hosting outputs
import os                                         # For manipulating filepath names
import sagemaker                                  # Amazon SageMaker's Python SDK provides many helper functions
from sagemaker.predictor import csv_serializer    # Converts strings for HTTP POST requests on inference
from tqdm import tqdm

In [None]:
# %pip installs into the running kernel's environment; the version is pinned so
# a fresh runtime resolves the same CLI release every time.
%pip install -q awscli==1.27.121

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting awscli
  Downloading awscli-1.27.121-py3-none-any.whl (4.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.1/4.1 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rsa<4.8,>=3.1.2
  Downloading rsa-4.7.2-py3-none-any.whl (34 kB)
Collecting colorama<0.4.5,>=0.2.5
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: rsa, colorama, awscli
  Attempting uninstall: rsa
    Found existing installation: rsa 4.9
    Uninstalling rsa-4.9:
      Successfully uninstalled rsa-4.9
Successfully installed awscli-1.27.121 colorama-0.4.4 rsa-4.7.2


In [None]:
# bucket = sagemaker.Session().default_bucket()
# prefix = 'sagemaker/amazon_fine_food_reviews'
 
# # Define IAM role
# import boto3
# import re
# from sagemaker import get_execution_role

# role = get_execution_role()
# region = boto3.Session().region_name 
# smclient = boto3.Session().client('sagemaker')

In [None]:
# Mount Google Drive at /content/gdrive so files stored on Drive can be read
# from this Colab runtime (prompts for authorisation the first time it runs).
from google.colab import drive 
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Load Data 

In [None]:
# Read the full reviews table from the mounted shared drive. The path is
# absolute (Drive is mounted at /content/gdrive) so this cell does not depend
# on the kernel's current working directory.
df = pd.read_csv('/content/gdrive/Shareddrives/CIS519/Reviews.csv')
print(df.shape)

# Down-sample to a reproducible 5000-row subset (fixed random_state).
df = df.sample(5000, random_state=42)
print(df.shape)

(568454, 10)
(5000, 10)


In [None]:
import boto3
import re

In [None]:
def clean_text(line):
    """Reduce a raw string to ASCII letters, commas and single spaces.

    The substitutions run in order: every run of hyphens becomes a space, every
    run of characters other than ASCII letters, commas and spaces becomes a
    space, and runs of spaces are collapsed to one. A full stop is appended to
    the result, which is then returned.
    """
    substitutions = (
        (r'-+', ' '),
        (r'[^a-zA-Z, ]+', " "),
        (r'[ ]+', " "),
    )
    cleaned = line
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned + "."

# NOTE: this cell rewrites `df` in place (it shifts "Score" and drops rows), so
# it must be run exactly once after the data-loading cell; running it a second
# time fails on the label lookup because the scores are already shifted.

# Keep only the columns used downstream. The explicit copy gives an independent
# frame, so the column assignments below never write into a slice of the
# sampled data.
columns_to_use = ["Score", "Summary", "Text"]
df = df[columns_to_use].copy()

# Attach a human-readable label for each 1-5 star score, then shift the score
# to 0-4 so it can be used directly as the class index for the 5-way classifier.
score2labels = {1: "very negative", 2: "negative", 3: "neutral", 4: "positive", 5: "very positive"}
df["Labels"] = df["Score"].apply(lambda x: score2labels[x])
df["Score"] = df["Score"] - 1

# Clean both free-text columns the same way (astype(str) also turns missing
# values into the literal string "nan" before cleaning).
for text_column in ("Summary", "Text"):
    df[text_column] = df[text_column].astype(str).apply(clean_text)

# Drop reviews whose cleaned text has fewer than 20 space-separated tokens.
token_counts = df["Text"].apply(lambda x: len(x.split(" ")))
df = df[token_counts >= 20].reset_index(drop=True)
df.head()

Unnamed: 0,Score,Summary,Text,Labels
0,4,Crunchy Good Gluten Free Sandwich Cookies .,Having tried a couple of other brands of glute...,very positive
1,4,great kitty treats.,My cat loves these treats If ever I can t find...,very positive
2,2,COFFEE TASTE.,A little less than I expected It tends to have...,neutral
3,1,So the Mini Wheats were too big .,"First there was Frosted Mini Wheats, in origin...",negative
4,4,Great Taste .,and I want to congratulate the graphic artist ...,very positive


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Carve the validation set out of the remaining rows (25% of them), which gives
# roughly a 60/20/20 train/validation/test partition of the original frame.
train_df, valid_df = train_test_split(train_df, random_state=42, test_size=0.25)

# Report how many rows ended up in each partition.
for caption, part in (
    ("Training set size:", train_df),
    ("Validation set size:", valid_df),
    ("Test set size:", test_df),
):
    print(caption, len(part))

Training set size: 2935
Validation set size: 979
Test set size: 979


# Transformer (BERT / RoBERTa / DistilBERT) for sequence classification

In [None]:
# Batch sizes for the three data loaders.
train_batch_size = 32
val_batch_size = 32
test_batch_size = 32

# Optimisation settings.
epochs = 5
seed = 42
learning_rate = 2e-5

# Which review column is fed to the model: "text" selects the 'Text' column,
# "summary" selects the 'Summary' column. Change this single assignment to
# switch experiments.
feature = "summary"
if feature not in ("text", "summary"):
    raise ValueError("Invalid feature value. Supported features: 'text', 'summary'")

# Pretrained checkpoint to fine-tune; also used to name the saved model file.
base_model = "roberta-base"
model_name = feature + "_" + base_model
print(model_name)

summary_roberta-base


In [None]:
# %pip installs into the running kernel's environment; the version is pinned so
# a fresh runtime resolves the same transformers release every time.
%pip install -q transformers==4.28.1


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [None]:
import torch
from torch import nn
from transformers import (BertTokenizer, BertForSequenceClassification,
                          RobertaTokenizer, RobertaForSequenceClassification,
                          DistilBertTokenizer, DistilBertForSequenceClassification,
                          set_seed, AdamW, get_linear_schedule_with_warmup)
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler, random_split
from torch.nn import CrossEntropyLoss

set_seed(seed)

# Each supported checkpoint name maps to its (tokenizer class, model class) pair.
supported_models = {
    'bert-base-uncased': (BertTokenizer, BertForSequenceClassification),
    "roberta-base": (RobertaTokenizer, RobertaForSequenceClassification),
    "distilbert-base-uncased": (DistilBertTokenizer, DistilBertForSequenceClassification),
}
if base_model not in supported_models:
    raise ValueError("Invalid base_model value. Supported models: 'bert-base-uncased', 'roberta-base', 'distilbert-base-uncased'")

# Load the pretrained tokenizer and a 5-class sequence classifier built on the
# same checkpoint.
tokenizer_class, model_class = supported_models[base_model]
tokenizer = tokenizer_class.from_pretrained(base_model)
model = model_class.from_pretrained(base_model, num_labels=5)

# Run on the GPU when one is visible to torch, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

print(device)


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

cuda


In [None]:
from sklearn.metrics import accuracy_score, f1_score

def flat_matrices(preds, labels):
    """Return (accuracy, weighted F1) of predicted classes against true labels.

    Args:
        preds: torch tensor of per-class scores (logits). It is reshaped to
            (n_samples, n_classes), taking n_classes from its last dimension,
            and the highest-scoring class per row is the prediction.
        labels: true class ids, as a torch tensor (on any device) or an
            array-like; flattened to one id per sample.

    Returns:
        Tuple ``(accuracy, f1)`` from scikit-learn's ``accuracy_score`` and
        ``f1_score(average='weighted')``.
    """
    # Softmax is monotonic, so the argmax of the raw scores is the same class
    # the softmax probabilities would select; no need to normalise first.
    n_classes = preds.shape[-1]
    pred_ids = preds.detach().reshape(-1, n_classes).argmax(dim=1).cpu().numpy()

    # Accept labels that live on the GPU as well as plain arrays.
    if hasattr(labels, "detach"):
        labels = labels.detach().cpu().numpy()
    labels_flat = np.asarray(labels).flatten()

    # scikit-learn's signature is (y_true, y_pred). The order matters for the
    # weighted F1, whose per-class weights come from the true-label support.
    return (
        accuracy_score(labels_flat, pred_ids),
        f1_score(labels_flat, pred_ids, average='weighted'),
    )


In [None]:
def prepare_input(text):
    """Tokenize `text` into fixed-length model inputs.

    The module-level `tokenizer` is called with special tokens enabled and with
    truncation/padding to exactly 256 tokens. Every field of the tokenizer's
    output is then converted to a ``torch.long`` tensor in place, and that same
    output object is returned.
    """
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
        max_length=256,
        return_tensors=None,
    )
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)
    return encoded

class custom_dataset(Dataset):
    """Torch dataset yielding (tokenized inputs, label) pairs from a review frame.

    Args:
        df: DataFrame with a 'Score' column (used as the label) and the text
            column selected by `feature`.
        feature: "text" to read the 'Text' column, "summary" to read 'Summary'.

    Raises:
        ValueError: if `feature` is not one of the supported values. Without
            this check the object would be created without a `text` attribute
            and only fail later, on the first `len()` or item access.
    """

    # Maps the `feature` argument to the DataFrame column holding the input text.
    _FEATURE_COLUMNS = {"text": "Text", "summary": "Summary"}

    def __init__(self, df, feature):
        if feature not in self._FEATURE_COLUMNS:
            raise ValueError(
                "Invalid feature value {!r}. Supported features: 'text', 'summary'".format(feature)
            )
        self.text = df[self._FEATURE_COLUMNS[feature]].values
        self.label = df['Score'].values

    def __len__(self):
        """Number of samples (rows) in the dataset."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize sample `item` and return it with its label as a long tensor."""
        inputs = prepare_input(self.text[item])
        target = torch.tensor(self.label[item], dtype=torch.long)
        return inputs, target

# Training samples are drawn in random order each epoch.
train_dataset = custom_dataset(train_df, feature)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, sampler=train_sampler)

# Validation samples are visited in dataset order.
val_dataset = custom_dataset(valid_df, feature)
val_sampler = SequentialSampler(val_dataset)
val_dataloader = DataLoader(val_dataset, batch_size=val_batch_size, sampler=val_sampler)

In [None]:
# One optimisation step is taken per training batch, so the schedule spans
# (batches per epoch) x (number of epochs) steps.
total_steps = len(train_dataloader) * epochs
# The learning rate ramps up linearly over the first 20% of those steps.
warmup_steps = int(total_steps * 0.2)

# torch.optim.AdamW replaces the deprecated transformers.AdamW. weight_decay is
# passed explicitly as 0.0 because that was the transformers default, whereas
# torch's own default is 0.01.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-8,
    weight_decay=0.0,
)

# Linear warm-up to `learning_rate`, then linear decay to zero at `total_steps`.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)
loss_fn = CrossEntropyLoss()



In [None]:
import os

# Directory where the best checkpoint is written during training.
# exist_ok=True makes the call a no-op when the directory is already there,
# without a separate (race-prone) existence check.
models_dir = './models'
os.makedirs(models_dir, exist_ok=True)


In [None]:
# Fine-tune for `epochs` passes over the training data. After each pass the
# model is scored on the validation set, and the checkpoint with the best
# validation accuracy so far is written to ./models/<model_name>.
best_eval_accuracy = 0

for epoch_i in range(epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # ---- training pass ----
    total_train_loss = 0
    model.train()
    for step, (inputs, target) in enumerate(tqdm(train_dataloader)):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        output = model(**inputs).logits
        loss = loss_fn(output, target)
        total_train_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    # Mean of the per-batch losses over the epoch.
    avg_train_loss = total_train_loss / len(train_dataloader)
    print("Average training loss: {0:.2f}".format(avg_train_loss))

    # ---- validation pass ----
    # Accuracy is accumulated as correct / seen over the whole validation set.
    # Averaging per-batch accuracies would over-weight the (smaller) last batch.
    correct_predictions = 0
    seen_samples = 0
    model.eval()
    with torch.no_grad():
        for inputs, target in tqdm(val_dataloader):
            for k, v in inputs.items():
                inputs[k] = v.to(device)
            output = model(**inputs).logits

            predicted_ids = output.argmax(dim=1).cpu()
            correct_predictions += (predicted_ids == target.cpu()).sum().item()
            seen_samples += target.size(0)

    avg_val_accuracy = correct_predictions / seen_samples
    print("Average validation accuracy: {0:.2f}".format(avg_val_accuracy))

    # Keep only the best-scoring checkpoint (the whole model object is pickled).
    if avg_val_accuracy > best_eval_accuracy:
        torch.save(model, './models/'+model_name)
        best_eval_accuracy = avg_val_accuracy

print("")
print("Training complete!")

Training...


100%|██████████| 92/92 [01:56<00:00,  1.27s/it]


Average training loss: 1.28


100%|██████████| 31/31 [00:13<00:00,  2.24it/s]


Average validation accuracy: 0.65
Training...


100%|██████████| 92/92 [01:59<00:00,  1.30s/it]


Average training loss: 0.93


100%|██████████| 31/31 [00:13<00:00,  2.29it/s]


Average validation accuracy: 0.70
Training...


100%|██████████| 92/92 [01:59<00:00,  1.30s/it]


Average training loss: 0.77


100%|██████████| 31/31 [00:13<00:00,  2.29it/s]


Average validation accuracy: 0.70
Training...


100%|██████████| 92/92 [02:00<00:00,  1.31s/it]


Average training loss: 0.67


100%|██████████| 31/31 [00:13<00:00,  2.29it/s]


Average validation accuracy: 0.70
Training...


100%|██████████| 92/92 [02:00<00:00,  1.30s/it]


Average training loss: 0.61


100%|██████████| 31/31 [00:13<00:00,  2.29it/s]


Average validation accuracy: 0.71

Training complete!


## Test 

In [None]:
# Evaluate the saved checkpoint on the held-out test split.
test_dataset = custom_dataset(test_df, feature)
test_dataloader = DataLoader(
    test_dataset,
    sampler=SequentialSampler(test_dataset),  # Pull out batches sequentially.
    batch_size=test_batch_size,               # Evaluate with this batch size.
)

# Reload the checkpoint written by the training cell. map_location places the
# weights on the current device even if they were saved from a different one.
# NOTE: torch.load unpickles arbitrary Python objects, so only load files this
# notebook produced. Recent torch releases default to weights_only=True, which
# rejects whole-model pickles; pass weights_only=False there if loading fails.
model = torch.load('./models/'+model_name, map_location=device)
model.eval()

# Collect every batch's logits and labels on the CPU and score them once at the
# end. Weighted F1 (and accuracy with a short last batch) is not the mean of
# per-batch values, so it has to be computed over the full prediction set.
all_logits = []
all_targets = []

with torch.no_grad():
    for inputs, target in tqdm(test_dataloader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)

        y_preds = model(**inputs).logits
        all_logits.append(y_preds.cpu())
        all_targets.append(target.cpu())

avg_test_accuracy, avg_test_f1 = flat_matrices(torch.cat(all_logits), torch.cat(all_targets))

print("Accuracy: {0:.4f}".format(avg_test_accuracy))
print("F1: {0:.4f}".format(avg_test_f1))


100%|██████████| 31/31 [00:13<00:00,  2.30it/s]

Accuracy: 0.7016
F1: 0.7312



