## Colab_setup

In [None]:
from pathlib import Path
import os
from google.colab import drive

In [None]:
def create_path(path):
    """Make sure a directory exists and return it as a ``Path``.

    Args:
        path: Directory to create, as a ``str`` or a ``pathlib.Path``.

    Returns:
        pathlib.Path: ``path`` converted to a ``Path``. The directory and any
        missing parents exist when this function returns.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # Normalise first so that plain strings work too (they have no `.mkdir`).
    path = Path(path)
    # `exist_ok=True` makes this a no-op for an existing directory, so no
    # separate `isdir` check is needed.
    path.mkdir(parents=True, exist_ok=True)
    return path

In [None]:
# Mount Google Drive; the dataset and model directories used later are
# created underneath it.
from google.colab import drive
drive.mount('/content/drive')
root_dir = Path('/content/drive/My Drive')
# Everything this notebook persists lives under a `Bert` folder on Drive
# (`create_path` creates it if it does not exist yet).
base_path = create_path(root_dir/'Bert')
# Show the resulting base path as the cell output.
base_path

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


PosixPath('/content/drive/My Drive/Bert')

In [None]:
# Directory outside of Drive; the notebook changes back into it after the
# dataset download and keeps `bert_path` beneath it.
colab_path = Path('/content')

In [None]:
# Drive folder that the dataset archive is downloaded and extracted into.
data_path = create_path(base_path/'dataset')

In [None]:
# Drive folder where the fine-tuned model checkpoint is saved.
model_path = create_path(base_path/'models')

In [None]:
# Folder under `colab_path` (not on Drive) that `Config.BERT_PATH` points into.
bert_path = (create_path(colab_path/'input/bert_uncased'))

In [None]:
# TODO: join all of the setup cells above into a single cell.

## Download Data

In [None]:
# Download link for the IMDB reviews archive. The value deliberately carries
# its own single quotes: it is interpolated verbatim into a shell command, and
# the quotes keep the shell from interpreting the `&` characters.
# NOTE(review): the `Expires=` / `Signature=` query parameters suggest this is
# a time-limited signed link -- presumably it has to be regenerated before the
# download cell can be re-run; confirm.
url = "'https://storage.googleapis.com/kaggle-data-sets/134715/320111/bundle/archive.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1587033973&Signature=ldODSCHFe%2FEkLQ1K%2F7Jp8zS3%2B8C4WwUXdtaUk0rQv8sS%2BSvNjLynwk1%2FHqQgPbXc8VxIfZUTa%2F5ZnuE2sdqa0jsBmfvoQMII%2Fg6RRkvzx0APSFeajiVEnWf5dMMZTb1JDRKvM6DM4900brshBalN0%2BiwsXmdngokJ9FHQiNvcZKHlVhsUtqQeHidYDqyUVlXgSBCT6ZEtdGhJLSAEvHqSNabRsXR5VjiMpJqAb26HCm1R%2F7%2FIKpXUyJzF5BmxW%2BhZoydukE5QHTjXjlwbEdfHTjKooX2lNq13Z%2BCQCEeC8b3pwbPSEnyGwnZiZwEVoKCQnzF2LrFnB3CubBXDE5qbA%3D%3D&response-content-disposition=attachment%3B+filename%3Dimdb-dataset-of-50k-movie-reviews.zip'"

In [None]:
# Work from inside the dataset directory so the archive is extracted there
# (this also avoids quoting `data_path`, which contains a space, in the shell).
os.chdir(data_path)
# Download quietly to a temporary archive, unpack it, then delete the archive.
!wget -q {str(url)} -O temp.zip && unzip -q temp.zip && rm 'temp.zip'
# Go back to the Colab working directory.
os.chdir(colab_path)

## config.py

In [None]:
# Install Hugging Face Transformers quietly.
# NOTE(review): the version is not pinned, so a fresh run installs whatever
# release is current -- consider pinning the release this notebook was
# written against.
!pip -q install transformers

[K     |████████████████████████████████| 645kB 17.5MB/s 
[K     |████████████████████████████████| 3.8MB 50.0MB/s 
[K     |████████████████████████████████| 1.0MB 47.2MB/s 
[K     |████████████████████████████████| 890kB 40.0MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
import transformers
class Config:
    """Settings shared by the model, dataset, training and inference cells."""

    def __init__(self):
        # --- file locations -------------------------------------------------
        self.DATA_PATH = str(data_path/'IMDB Dataset.csv')
        self.SAVE_MODEL_PATH = str(model_path/'finetuned-bert.pth')
        # NOTE(review): not read anywhere in the visible cells -- confirm use.
        self.BERT_PATH = str(bert_path/'finetuned-bert-2.pth')

        # --- training hyper-parameters -------------------------------------
        self.NUM_EPOCHS = 10
        self.TRAIN_BATCH_SIZE = 8
        self.VALID_BATCH_SIZE = 4

        # --- tokenisation / model ------------------------------------------
        # Passed to the tokenizer as `max_length`.
        self.MAX_LEN = 512
        self.MODEL_NAME = 'bert-base-uncased'
        self.TOKENIZER = transformers.BertTokenizer.from_pretrained(
            pretrained_model_name_or_path=self.MODEL_NAME,
            do_lower_case=True,
            force_download=True,
        )


config = Config()

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




## model.py

In [None]:
import transformers
import torch
from torch import nn

# make a PyTorch model
class Bert(nn.Module):
    """Pretrained BERT encoder followed by dropout and a one-unit linear head.

    ``forward`` returns the raw head output (a logit). No sigmoid is applied
    inside the module: training relies on ``nn.BCEWithLogitsLoss``, and
    inference code applies ``torch.sigmoid`` itself.
    """

    def __init__(self):
        super().__init__()
        # load the pretrained BERT architecture and weights
        self.bert = transformers.BertModel.from_pretrained(config.MODEL_NAME)
        # dropout applied to the pooled output before the head
        self.drop = nn.Dropout(0.3)
        # classifier head; sized from the encoder config instead of a
        # hard-coded 768 so that other BERT checkpoints work as well
        self.head = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, stoi, mask, token_type_ids):
        """Return one logit per sequence.

        Args:
            stoi: Token ids, forwarded to the encoder as ``input_ids``.
            mask: Attention mask, forwarded as ``attention_mask``.
            token_type_ids: Segment ids, forwarded as ``token_type_ids``.

        Returns:
            The output of the linear head for the pooled encoder output.
        """
        encoder_outputs = self.bert(input_ids=stoi, attention_mask=mask, token_type_ids=token_type_ids)
        # Index instead of tuple-unpacking: position 1 is the pooled output
        # whether the encoder returns a plain tuple or a `ModelOutput`
        # (unpacking a `ModelOutput` would yield its keys, not tensors).
        pooled = encoder_outputs[1]
        # pass into dropout
        pooled = self.drop(pooled)
        # pass into classifier head
        return self.head(pooled)

## data.py

In [None]:
class BertDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenises one text per item.

    Each item is a dict with the keys ``'stoi'`` (token ids), ``'mask'``
    (attention mask), ``'token_type_ids'`` and ``'target'``.
    """

    def __init__(self, text, targ):
        """Keep the texts and targets and pick up the shared tokenizer settings."""
        self.text, self.targ = text, targ

        # tokenizer and maximum length come from the shared config object
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.text)

    def __getitem__(self, idx):
        """Tokenise the text at ``idx`` and return its tensors with the target."""
        # coerce to str and collapse every run of whitespace to one space
        cleaned = ' '.join(str(self.text[idx]).split())

        encoded = self.tokenizer.encode_plus(
            text=cleaned,
            text_pair=None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            padding_side='right',
        )

        # output key -> key in the tokenizer result
        #   input_ids       integer id of every token
        #   attention_mask  1 = attend to this position, 0 = ignore it
        #   token_type_ids  segment ids from the tokenizer
        field_map = (
            ('stoi', 'input_ids'),
            ('mask', 'attention_mask'),
            ('token_type_ids', 'token_type_ids'),
        )
        sample = {
            out_key: torch.tensor(encoded[enc_key]).long()
            for out_key, enc_key in field_map
        }
        sample['target'] = torch.tensor(self.targ[idx])
        return sample

In [None]:
# Encode a short example sentence into token ids (special tokens included).
example_text = 'The sheep jumped over the fence'
# The keyword is `max_length`; the earlier misspelling `max_lenght` was not a
# recognised argument, so no length limit was actually being requested.
config.TOKENIZER.encode(text=example_text, add_special_tokens=True, max_length=512)

[101, 1996, 8351, 5598, 2058, 1996, 8638, 102]

In [None]:
# Turn the ids from the previous cell back into a single string.
config.TOKENIZER.decode([101, 1996, 8351, 5598, 2058, 1996, 8638, 102])

'[CLS] the sheep jumped over the fence [SEP]'

In [None]:
# stoi: look up the id of every whitespace-separated token.
config.TOKENIZER.convert_tokens_to_ids('[CLS] the sheep jumped over the fence [SEP]'.split())

[101, 1996, 8351, 5598, 2058, 1996, 8638, 102]

In [None]:
# itos: map the integer ids back to their token strings
config.TOKENIZER.convert_ids_to_tokens([101, 1996, 8351, 5598, 2058, 1996, 8638, 102])

['[CLS]', 'the', 'sheep', 'jumped', 'over', 'the', 'fence', '[SEP]']

## train_utils.py

In [None]:
from tqdm import tqdm

loss_fn = nn.BCEWithLogitsLoss()

def training(data_loader, model, optimizer, scheduler, device):
    """Run one pass over ``data_loader``, updating ``model`` after every batch.

    Args:
        data_loader: Iterable of batches; each batch is a dict with the keys
            ``'stoi'``, ``'mask'``, ``'token_type_ids'`` and ``'target'``.
        model: Module called as ``model(stoi=..., mask=..., token_type_ids=...)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        None.
    """
    model.train()
    for _, batch in tqdm(enumerate(data_loader), total=len(data_loader)):
        # move the batch to the device with the dtypes the model/loss expect
        ids = batch['stoi'].to(device).long()
        mask = batch['mask'].to(device).long()
        token_type_ids = batch['token_type_ids'].to(device).long()
        target = batch['target'].to(device).float()

        # drop gradients left over from the previous batch
        optimizer.zero_grad()

        logits = model(stoi=ids, mask=mask, token_type_ids=token_type_ids)

        # targets are reshaped to a column to line up with the model output
        batch_loss = loss_fn(logits, target.view(-1, 1))
        batch_loss.backward()

        optimizer.step()
        scheduler.step()
    

def evaluate(data_loader, model, optimizer, scheduler, device):
    """Run ``model`` over ``data_loader`` without tracking gradients.

    Args:
        data_loader: Iterable of batches; each batch is a dict with the keys
            ``'stoi'``, ``'mask'``, ``'token_type_ids'`` and ``'target'``.
        model: Module called as ``model(stoi=..., mask=..., token_type_ids=...)``.
        optimizer: Unused; accepted so the signature mirrors ``training``.
        scheduler: Unused; accepted so the signature mirrors ``training``.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(targets, outputs)`` -- two lists holding, for every batch in
        iteration order, the entries of the target array and of the model
        output array. The outputs are exactly what ``model`` returned; no
        activation is applied here.
    """
    model.eval()
    collected_targets = []
    collected_outputs = []

    # gradients are not needed for evaluation
    with torch.no_grad():
        for _, batch in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = batch['stoi'].to(device).long()
            mask = batch['mask'].to(device).long()
            token_type_ids = batch['token_type_ids'].to(device).long()
            target = batch['target'].to(device).float()

            batch_output = model(stoi=ids, mask=mask, token_type_ids=token_type_ids)

            # bring back to the CPU as numpy arrays and accumulate
            collected_targets.extend(target.cpu().detach().numpy())
            collected_outputs.extend(batch_output.cpu().detach().numpy())

    return collected_targets, collected_outputs

## train.py

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Encoder that turns the sentiment labels into integers.
le = preprocessing.LabelEncoder()

# Load the reviews and keep only the two columns used below.
df = pd.read_csv(config.DATA_PATH)[['review', 'sentiment']]

# Fit the encoder on the labels and replace them with their integer codes.
df.sentiment = le.fit_transform(df.sentiment.values)

# 80/20 split, stratified on the label so both parts keep the class balance.
train, valid = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df.sentiment.values,
)

# Give both frames a fresh 0..n-1 index.
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)

# Datasets and loaders for training ...
train_dataset = BertDataset(train.review.values, train.sentiment.values)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=config.TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)

# ... and for validation.
valid_dataset = BertDataset(valid.review.values, valid.sentiment.values)
valid_dataloader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=config.VALID_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)

In [None]:
# set the device: use the GPU when one is available, otherwise fall back to
# the CPU instead of failing on a runtime without CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Bert().to(device)

In [None]:
# params you want optimized: (name, parameter) pairs for the whole model
param_optimizer = list(model.named_parameters())

# we don't want weight decay for these (bias and LayerNorm parameters)
no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']

In [None]:
# Two parameter groups: one with weight decay and one without.
# Names from `named_parameters()` are full dotted paths (they merely END in
# e.g. `bias` or `LayerNorm.weight`), so each `no_decay` entry has to be
# matched as a substring. A plain `n in no_decay` membership test never
# matches, which put every parameter into the decayed group.
optimizer_params = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay':0.001},
        #  no weight decay should be applied
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay':0.0}
]

In [None]:
from transformers import get_linear_schedule_with_warmup

# The scheduler is stepped once per batch, so the total number of steps is
# the number of batches per epoch times the number of epochs. Taking it from
# the loader also counts a final, smaller batch.
num_train_steps = len(train_dataloader) * config.NUM_EPOCHS

# Use PyTorch's AdamW: the copy that used to ship with `transformers` is
# deprecated upstream in favour of this one. `eps=1e-6` keeps the value the
# transformers implementation used by default; weight decay is set per
# parameter group.
optimizer = torch.optim.AdamW(optimizer_params, lr=3e-5, eps=1e-6)
scheduler = get_linear_schedule_with_warmup(
    optimizer = optimizer,
    num_training_steps = num_train_steps,
    # no warmup
    num_warmup_steps = 0 
)

In [None]:
# Run a garbage-collection pass before training; the cell output is the
# number of unreachable objects the collector found.
import gc
gc.collect()

231

In [None]:
from sklearn.metrics import accuracy_score

# Train for NUM_EPOCHS epochs; after each one, score the validation set and
# keep the weights of the best-scoring epoch.
best_accuracy = 0
for epoch in range(config.NUM_EPOCHS):
    # train
    training(train_dataloader, model, optimizer, scheduler, device)

    # eval
    target, output = evaluate(valid_dataloader, model, optimizer, scheduler, device)

    # The model returns raw logits (the sigmoid lives inside
    # BCEWithLogitsLoss, not in the model), so the decision boundary is 0:
    # sigmoid(x) >= 0.5 exactly when x >= 0. Thresholding the logits at 0.5
    # would mislabel every sample whose logit lies between 0 and 0.5.
    # `ravel` flattens the (N, 1) outputs to match the 1-D targets.
    predictions = np.array(output).ravel() >= 0.0 #returns bool

    # calculate accuracy
    accuracy =  accuracy_score(target, predictions)
    print(f'\n Accuracy Score: {accuracy}')

    # checkpoint only when the validation accuracy improves
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), config.SAVE_MODEL_PATH)

100%|██████████| 5000/5000 [39:26<00:00,  2.11it/s]
100%|██████████| 2500/2500 [03:45<00:00, 11.08it/s]



 Accuracy Score: 0.9395


 23%|██▎       | 1131/5000 [08:57<30:59,  2.08it/s]

In [None]:
def predictor(sentence, model, device=device):
    """Score one sentence with ``model`` and return the sigmoid of its logit.

    The text is prepared the same way ``BertDataset`` prepares training
    examples (whitespace collapsed, tokenised with ``max_length`` and
    ``pad_to_max_length``) so that inference inputs match training inputs.

    Args:
        sentence: Text to classify.
        model: Module called as ``model(stoi=..., mask=..., token_type_ids=...)``.
            It is switched to evaluation mode by this function.
        device: Device the input tensors are moved to.

    Returns:
        The sigmoid of the model output for the sentence (a numpy scalar).
    """
    tokenizer = config.TOKENIZER
    max_len = config.MAX_LEN

    # Same clean-up as in BertDataset. The text is no longer sliced to
    # `max_len` *characters*: the tokenizer limits the number of *tokens*.
    text = ' '.join(str(sentence).split())

    # `max_length` / `pad_to_max_length` were previously misspelled and
    # therefore ignored, which is why manual padding used to be necessary.
    inputs = tokenizer.encode_plus(text=text, text_pair=None,
                                   add_special_tokens=True,
                                   max_length=max_len,
                                   pad_to_max_length=True
                                   )

    # add a leading batch dimension and move everything to the device
    ids = torch.tensor(inputs['input_ids']).long().unsqueeze(0).to(device)
    mask = torch.tensor(inputs['attention_mask']).long().unsqueeze(0).to(device)
    token_type_ids = torch.tensor(inputs['token_type_ids']).long().unsqueeze(0).to(device)

    # inference: disable dropout and skip gradient tracking
    model.eval()
    with torch.no_grad():
        output = model(
            stoi = ids,
            mask = mask,
            token_type_ids = token_type_ids
        )

    # limit the result to within 0 and 1 using sigmoid
    output = torch.sigmoid(output).cpu().numpy()
    print(output)
    return output[0][0]

In [None]:
# Rebuild the architecture and restore the best checkpoint saved in training.
MODEL = Bert().to(device)
# `map_location` lets a checkpoint written on one device be loaded on another
# (for example saved on GPU, loaded on CPU).
MODEL.load_state_dict(torch.load(config.SAVE_MODEL_PATH, map_location=device))
# Switch to evaluation mode so dropout is disabled for predictions; the
# trailing `;` keeps the notebook from printing the whole module.
MODEL.eval();

In [None]:
# Example input for a quick check of the reloaded model.
sentence = 'This is a bad movie'

In [None]:
# Score the example sentence; the model's sigmoid output is reported as the
# positive score and its complement as the negative score.
pos_pred = predictor(sentence, model=MODEL, device=device)
neg_pred = 1-pos_pred
response = dict()
response['response'] = {
    # label spelling fixed (was 'Poistive')
    'Positive: ': str(pos_pred),
    'Negative: ': str(neg_pred),
    'Sentence:': str(sentence)
}
response

[[0.6289537]]


{'response': {'Negative: ': '0.3710463047027588',
  'Poistive: ': '0.6289537',
  'Sentence:': 'This is a bad movie'}}

## app.py

In [None]:
 import flask
 from flask import Flask, request

In [None]:
# initialize app
# `Flask` requires the import name of the application module; calling it with
# no arguments raises a TypeError.
app = Flask(__name__)

# Filled in at start-up, once the trained weights have been loaded.
MODEL = None
# The web app runs inference on the CPU.
device = 'cpu'

def predictor(sentence, model, device=device):
    """Score one sentence with ``model`` and return the sigmoid of its logit.

    The text is prepared the same way ``BertDataset`` prepares training
    examples (whitespace collapsed, tokenised with ``max_length`` and
    ``pad_to_max_length``) so that inference inputs match training inputs.

    Args:
        sentence: Text to classify.
        model: Module called as ``model(stoi=..., mask=..., token_type_ids=...)``.
            It is switched to evaluation mode by this function.
        device: Device the input tensors are moved to.

    Returns:
        The sigmoid of the model output for the sentence (a numpy scalar).
    """
    tokenizer = config.TOKENIZER
    max_len = config.MAX_LEN

    # Same clean-up as in BertDataset. The text is no longer sliced to
    # `max_len` *characters*: the tokenizer limits the number of *tokens*.
    text = ' '.join(str(sentence).split())

    # `max_length` / `pad_to_max_length` were previously misspelled and
    # therefore ignored, which is why manual padding used to be necessary.
    inputs = tokenizer.encode_plus(text=text, text_pair=None,
                                   add_special_tokens=True,
                                   max_length=max_len,
                                   pad_to_max_length=True
                                   )

    # Add a leading batch dimension and move everything to the device.
    # (The previous code read `ids` before assigning it -> NameError.)
    ids = torch.tensor(inputs['input_ids']).long().unsqueeze(0).to(device)
    mask = torch.tensor(inputs['attention_mask']).long().unsqueeze(0).to(device)
    token_type_ids = torch.tensor(inputs['token_type_ids']).long().unsqueeze(0).to(device)

    # Inference: disable dropout and skip gradient tracking. Without
    # `no_grad` the output requires grad and `.numpy()` raises.
    model.eval()
    with torch.no_grad():
        output = model(
            stoi = ids,
            mask = mask,
            token_type_ids = token_type_ids
        )

    # limit the result to within 0 and 1 using sigmoid
    output = torch.sigmoid(output).cpu().numpy()
    print(output)
    return output[0][0]

@app.route('/predict')
def predict():
    """Handle ``GET /predict?sentence=...``.

    Returns:
        A dict (serialised to JSON by Flask) with the positive score, the
        negative score and the input sentence, or an error dict with HTTP
        status 400 when the ``sentence`` query parameter is missing or empty.
    """
    sentence = request.args.get('sentence')
    if not sentence:
        # without this guard a missing parameter would be scored as the
        # literal text 'None' (or crash inside the tokenizer)
        return {'error': "missing required query parameter 'sentence'"}, 400

    # Use the module-level MODEL that is loaded at start-up; the lowercase
    # name `model` used before is not defined in this module.
    pos_pred = predictor(sentence, model=MODEL, device=device)
    neg_pred = 1-pos_pred
    response = dict()
    response['response'] = {
        # label spelling fixed (was 'Poistive')
        'Positive: ': str(pos_pred),
        'Negative: ': str(neg_pred),
        'Sentence:': str(sentence)
    }
    return response
 
if __name__ == '__main__':
    # Build the architecture (the class is named `Bert`, not `BERT`).
    MODEL = Bert().to(device)

    # load the model state dict
    # `load_state_dict` is a method of the module, not a function in `torch`,
    # and its return value must not replace MODEL. `map_location` lets a
    # checkpoint saved from a GPU run be loaded on this CPU-only app.
    MODEL.load_state_dict(torch.load(config.SAVE_MODEL_PATH, map_location=device))
    # evaluation mode: disables dropout for serving
    MODEL.eval()

    app.run(debug=True)