Load Data

In [1]:
import warnings
# Ignore DeprecationWarning messages for the rest of the kernel session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
import pandas as pd
# Read the IMDB reviews CSV; the relative path is resolved against the
# kernel's working directory.
data = pd.read_csv('IMDB-Dataset.csv')
# Preview the first rows.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
from datasets import Dataset
# Wrap the pandas DataFrame in a Hugging Face `datasets.Dataset`.
dataset= Dataset.from_pandas(data)
dataset

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['review', 'sentiment'],
    num_rows: 50000
})

In [4]:
# Hold out 30% of the reviews for evaluation. The fixed seed makes the shuffle
# deterministic, so a fresh run of the notebook produces the same train/test rows.
dataset = dataset.train_test_split(test_size=0.3, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 15000
    })
})

In [5]:
# Count the rows per sentiment class in the full DataFrame.
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [6]:
# Map each sentiment string to an integer class id, and build the reverse map.
label2id = {'negative': 0, 'positive': 1}
id2label = {class_id: name for name, class_id in label2id.items()}


def _add_label(example):
    """Return the new `label` column: the integer id of the row's `sentiment`."""
    return {'label': label2id[example['sentiment']]}


dataset = dataset.map(_add_label)

Map: 100%|██████████| 35000/35000 [00:02<00:00, 14748.55 examples/s]
Map: 100%|██████████| 15000/15000 [00:01<00:00, 13867.40 examples/s]


Data Tokenization

In [21]:
from transformers import AutoTokenizer
import torch

# Use the GPU when one is available, otherwise the CPU. The conditional selects
# the device *string*, and a single torch.device is built from it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Hub checkpoint name passed to `from_pretrained`.
model_ckpt = "huawei-noah/TinyBERT_General_4L_312D"
# use_fast=True requests the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

In [12]:
# Display the tokenizer's repr (vocabulary size, special tokens, padding side, ...).
tokenizer

BertTokenizerFast(name_or_path='huawei-noah/TinyBERT_General_4L_312D', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [22]:
# Tokenize the first training review to inspect what the tokenizer returns.
tokenizer(dataset['train'][0]['review'])

{'input_ids': [101, 2025, 1037, 2919, 7761, 2840, 2143, 1012, 2954, 5019, 2020, 2204, 1012, 8709, 18816, 18719, 2106, 1037, 2204, 3105, 9855, 2010, 2034, 2143, 2302, 3158, 5477, 4168, 1012, 2466, 2499, 2302, 12487, 2653, 1998, 2205, 2172, 2668, 1012, 11167, 3744, 7585, 10360, 15782, 2038, 1037, 2204, 2240, 2000, 1996, 2466, 2008, 2573, 1012, 2009, 2052, 2022, 2307, 2000, 2156, 2242, 2842, 2013, 2014, 1999, 1996, 2168, 6907, 1012, 2016, 7777, 1996, 2396, 1998, 2383, 2844, 2308, 20877, 8189, 3372, 1012, 2009, 2001, 5791, 4276, 3666, 1012, 1045, 16755, 1996, 2143, 2000, 2035, 3689, 1998, 7761, 2840, 10205, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [23]:
def tokenize_function(batch):
    """Tokenize a batch of reviews.

    Args:
        batch: Mapping with a 'review' entry holding the review texts, as
            supplied by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the batch, each sequence truncated to at most
        512 tokens and left unpadded.
    """
    # No padding at map time: padding here stretches every row to the longest
    # sequence in the mapped batch, and that padded length is then stored and
    # fed through the model for every example. The Trainer receives the
    # tokenizer and pads each mini-batch to its own longest sequence instead.
    return tokenizer(batch['review'], truncation=True, max_length=512)


# Without map-time padding there is no need to process a whole split as one
# batch, so the default batch size is used.
dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 35000/35000 [00:17<00:00, 1948.60 examples/s]
Map: 100%|██████████| 15000/15000 [00:07<00:00, 1892.04 examples/s]


In [24]:
# List the columns now present on a training example.
dataset['train'][0].keys()

dict_keys(['review', 'sentiment', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])

Building Model Evaluation Functions

In [27]:
import evaluate
import numpy as np

# Accuracy metric from the `evaluate` library, loaded once and reused per evaluation.
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for a set of evaluation predictions.

    Args:
        eval_pred: Pair `(predictions, labels)`; the predicted class is taken
            as the arg-max of `predictions` along axis 1.

    Returns:
        The result of `accuracy_metric.compute` for those predicted classes.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy_metric.compute(predictions=predicted_ids, references=references)

In [29]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint with a sequence-classification head sized to the label
# set, and pass both label maps along to `from_pretrained`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [30]:
# Display the model's module structure.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-1

In [34]:
# Training configuration: 3 epochs, batch size 32 per device for both training
# and evaluation, evaluation after every epoch, output directory 'train_dir'.
args = TrainingArguments(
    output_dir='train_dir',
    overwrite_output_dir=True,
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy='epoch',
)

# `processing_class` is the current name of Trainer's deprecated `tokenizer`
# argument; the old spelling emits a FutureWarning.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)

  trainer = Trainer(


In [35]:
# Run fine-tuning with the configuration held by `trainer`.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.3332,0.291536,0.876733
2,0.2735,0.278437,0.890067
3,0.2377,0.281084,0.888933




TrainOutput(global_step=3282, training_loss=0.29807736654240935, metrics={'train_runtime': 62016.5047, 'train_samples_per_second': 1.693, 'train_steps_per_second': 0.053, 'total_flos': 1505594603520000.0, 'train_loss': 0.29807736654240935, 'epoch': 3.0})

In [36]:
# Evaluate the current model on the Trainer's eval_dataset (the 'test' split).
trainer.evaluate()



{'eval_loss': 0.2810837924480438,
 'eval_accuracy': 0.8889333333333334,
 'eval_runtime': 1850.3495,
 'eval_samples_per_second': 8.107,
 'eval_steps_per_second': 0.253,
 'epoch': 3.0}

Model Save and Load For Inference

In [37]:
# Save the fine-tuned model to the local directory 'tinybert-sentiment-analysis'.
trainer.save_model('tinybert-sentiment-analysis')

In [40]:
# Example sentences for a quick check of the saved classifier.
# NOTE: this rebinds `data`, which previously held the reviews DataFrame.
data = [
    "Learning MlOps is not that hard",
    "Sometimes learning MlOps cab be very hard",
    "Well what do i know, I dont even know if you are willing to learn MlOps",
]

In [41]:
from transformers import pipeline
import torch

# Run inference on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build a text-classification pipeline from the locally saved model directory.
classifier = pipeline(
    'text-classification',
    model='tinybert-sentiment-analysis',
    device=device,
)

classifier(data)

Device set to use cpu


[{'label': 'positive', 'score': 0.6510012149810791},
 {'label': 'negative', 'score': 0.723044216632843},
 {'label': 'negative', 'score': 0.7455148100852966}]

Push Model To S3

In [44]:
import boto3

s3 = boto3.client('s3')

bucket_name = 'bucket-for-practice-vineet'


def create_bucket(bucket_name):
    """Create the S3 bucket `bucket_name` unless this account already lists it.

    Args:
        bucket_name: Name of the bucket to create.

    Prints whether the bucket was created or was already present.
    """
    response = s3.list_buckets()
    existing = {bucket['Name'] for bucket in response['Buckets']}
    if bucket_name in existing:
        print("Bucket already exists in your account!!! Feel free to use it.")
        return

    # S3 requires a LocationConstraint when the client targets a region other
    # than us-east-1 and rejects one for us-east-1 itself, so the configuration
    # is attached only when the client's region calls for it.
    region = s3.meta.region_name
    if region and region not in ('us-east-1', 'aws-global'):
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': region},
        )
    else:
        s3.create_bucket(Bucket=bucket_name)
    print("Bucket is created")


create_bucket(bucket_name)

Bucket is created


In [46]:
import os
import boto3

s3 = boto3.client('s3')
bucket_name = 'bucket-for-practice-vineet'


def upload_directory(directory_path, s3_prefix, bucket=None):
    """Upload every file under `directory_path` to S3 below `s3_prefix`.

    Args:
        directory_path: Local directory that is walked recursively.
        s3_prefix: Key prefix; each file's path relative to `directory_path`
            is joined onto it, using '/' as the separator in the final key.
        bucket: Target bucket name. Defaults to the module-level `bucket_name`.

    Raises:
        NotADirectoryError: If `directory_path` is not an existing directory.
    """
    # os.walk yields nothing for a missing directory, which would make a failed
    # upload indistinguishable from a successful one.
    if not os.path.isdir(directory_path):
        raise NotADirectoryError(f"Not a directory: {directory_path!r}")

    target_bucket = bucket_name if bucket is None else bucket
    for root, _dirs, files in os.walk(directory_path):
        for file in files:
            file_path = os.path.join(root, file)
            relpath = os.path.relpath(file_path, directory_path)
            # Convert only the platform's own separator, so a literal backslash
            # inside a POSIX file name is left intact.
            s3_key = os.path.join(s3_prefix, relpath).replace(os.sep, "/")

            s3.upload_file(file_path, target_bucket, s3_key)


upload_directory('tinybert-sentiment-analysis', 'ml-models/tinybert-sentiment-analysis')