In [None]:
%%capture
!pip install transformers
!pip install pytorch-transformers
!pip install kaggle

In [None]:
# Mount Google Drive so that its files are reachable under /content/drive.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# List the Drive project folder to confirm that kaggle.json is stored there.
!ls "/content/drive/MyDrive/Deep_Learning/NLP_Vol3/Part_2/"

kaggle.json  Part_1.ipynb  Part_2.ipynb


In [None]:
!mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [None]:
# Copy kaggle.json from Drive into /root/.kaggle
# (presumably where the kaggle CLI looks for its credentials - confirm).
!cp "/content/drive/MyDrive/Deep_Learning/NLP_Vol3/Part_2/kaggle.json" "/root/.kaggle"

In [None]:
# Check that kaggle.json is now present in /root/.kaggle.
!ls /root/.kaggle

kaggle.json


In [None]:
# Download the IMDB 50k movie-review dataset from Kaggle as a zip archive.
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

imdb-dataset-of-50k-movie-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
import pandas as pd
import numpy as np
import os
import sys
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch 
import transformers
import zipfile
import gc

from warnings import filterwarnings
filterwarnings("ignore")

In [None]:
# Unpack the downloaded Kaggle archive into the current working directory.
# The context manager closes the archive even when extraction raises.
with zipfile.ZipFile("/content/imdb-dataset-of-50k-movie-reviews.zip") as zip_file:
    zip_file.extractall("./")

In [None]:
# Token length every review is padded / truncated to by the tokenizer.
MAX_LEN = 512
# Batch sizes passed to the training and validation DataLoaders.
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
# Number of passes over the training split.
EPOCHS = 10 
# NOTE(review): ACCUMULATION is not referenced anywhere in the visible notebook;
# presumably intended for gradient accumulation - confirm or remove.
ACCUMULATION = 2
# Name of the pretrained checkpoint used for both the tokenizer and the encoder.
BERT_PATH = "bert-base-uncased"
# File the best model state_dict is written to during training.
MODEL_PATH = "model.bin"
# CSV produced by extracting the Kaggle archive.
TRAINING_FILE = "/content/IMDB Dataset.csv"
# Shared tokenizer instance, reused by the dataset and by the inference helper.
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BERT_PATH, 
    do_lower_case = True
    )

In [None]:
class BERTBaseUncased(torch.nn.Module):
    """Pretrained BERT encoder followed by dropout and a one-logit linear head."""

    def __init__(self):
        super().__init__()

        # Encoder weights come from the checkpoint named by BERT_PATH.
        self.bert = transformers.BertModel.from_pretrained(BERT_PATH)
        self.bert_drop = torch.nn.Dropout(p=0.3)
        # 768 input features are projected to a single output logit.
        self.out = torch.nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        """Run the encoder and classify its pooled output.

        Args:
            ids: Token ids, passed to BERT as its first positional argument.
            mask: Passed to BERT as ``attention_mask``.
            token_type_ids: Passed to BERT as ``token_type_ids``.

        Returns:
            The raw (pre-sigmoid) output of the linear head.
        """
        encoded = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
        )
        pooled = encoded["pooler_output"]
        return self.out(self.bert_drop(pooled))

In [None]:
class BERTDataset:
    """Map-style dataset that tokenizes one review per item.

    Args:
        review: Indexable collection of review texts.
        target: Indexable collection of labels aligned with ``review``.
    """

    def __init__(self, review, target):
        self.review = review
        self.target = target
        self.tokenizer = TOKENIZER
        self.max_len = MAX_LEN

    def __len__(self):
        """Return the number of reviews."""
        return len(self.review)

    def __getitem__(self, item):
        """Tokenize the review at index ``item``.

        Returns:
            A dict with flattened ``torch.long`` tensors under ``"ids"``,
            ``"mask"`` and ``"token_type_ids"``, and the label as a
            ``torch.float`` tensor under ``"targets"``.
        """
        # Collapse every run of whitespace (including newlines) into one space.
        review = " ".join(str(self.review[item]).split())

        inputs = self.tokenizer.encode_plus(
            review,
            None,
            truncation=True,
            padding="max_length",
            add_special_tokens=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

        def _as_long_vector(key):
            # The tokenizer already returns tensors (return_tensors="pt").
            # Flatten and cast them directly instead of re-wrapping with
            # torch.tensor(), which triggers a copy-construct warning.
            return inputs[key].flatten().to(torch.long)

        return {
            "ids": _as_long_vector("input_ids"),
            "mask": _as_long_vector("attention_mask"),
            "token_type_ids": _as_long_vector("token_type_ids"),
            "targets": torch.tensor(self.target[item], dtype=torch.float),
        }

In [None]:
from tqdm import tqdm


def loss_fn(outputs, targets):
    """Binary cross-entropy between raw logits and targets.

    ``targets`` is viewed as a single column so that it lines up with the
    one-logit-per-row ``outputs``.
    """
    column_targets = targets.view(-1, 1)
    return torch.nn.functional.binary_cross_entropy_with_logits(
        outputs, column_targets
    )

def train_fn(data_loader,model,optimizer,device,schedular):
    """Train ``model`` for one pass over ``data_loader``.

    Each batch is moved to ``device``, the loss from ``loss_fn`` is
    back-propagated, and both ``optimizer`` and ``schedular`` are stepped
    once per batch.
    """
    model.train()

    progress = tqdm(enumerate(data_loader), total=len(data_loader))
    for _, batch in progress:
        # Move every field of the batch onto the target device.
        ids = batch["ids"].to(device, dtype=torch.long)
        token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        mask = batch["mask"].to(device, dtype=torch.long)
        targets = batch["targets"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids)

        batch_loss = loss_fn(logits, targets)
        batch_loss.backward()

        optimizer.step()
        schedular.step()



def eval_fn(data_loader,model,device):
    """Run ``model`` over ``data_loader`` without tracking gradients.

    Returns:
        A tuple ``(outputs, targets)`` of Python lists: the sigmoid of the
        model outputs and the corresponding targets, in loader order.
    """
    model.eval()
    collected_outputs, collected_targets = [], []

    with torch.no_grad():
        progress = tqdm(enumerate(data_loader), total=len(data_loader))
        for _, batch in progress:
            ids = batch["ids"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            targets = batch["targets"].to(device, dtype=torch.float)

            logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            probabilities = torch.sigmoid(logits)

            # Convert to plain Python lists so results can be accumulated on the host.
            collected_targets.extend(targets.cpu().detach().numpy().tolist())
            collected_outputs.extend(probabilities.cpu().detach().numpy().tolist())

    return collected_outputs, collected_targets

In [None]:
# Free PyTorch's cached GPU memory and run a garbage-collection pass before training.
torch.cuda.empty_cache()
gc.collect()

54

In [None]:
def run(max_rows=1000):
    """Fine-tune the BERT classifier on the IMDB reviews and keep the best checkpoint.

    Reads TRAINING_FILE, maps the ``sentiment`` column to 1 ("positive") or 0,
    makes a stratified 90/10 train/validation split, trains for EPOCHS epochs
    and writes the model ``state_dict`` to MODEL_PATH whenever the validation
    accuracy improves.

    Args:
        max_rows: Number of leading rows of the CSV to use. Defaults to 1000;
            pass ``None`` to use every row.
    """
    dfx = pd.read_csv(TRAINING_FILE).fillna("none")
    dfx["sentiment"] = (dfx["sentiment"] == "positive").astype(int)

    # Work on a leading subset of the data; a slice ending in None keeps all rows.
    dfx = dfx.iloc[:max_rows, :]

    df_train, df_valid = train_test_split(
        dfx,
        test_size=0.1,
        random_state=42,
        stratify=dfx.sentiment.values,
    )

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = BERTDataset(
        review=df_train.review.values,
        target=df_train.sentiment.values,
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        num_workers=os.cpu_count(),
        shuffle=True,
    )

    valid_dataset = BERTDataset(
        review=df_valid.review.values,
        target=df_valid.sentiment.values,
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=VALID_BATCH_SIZE,
        num_workers=1,
        shuffle=False,
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"

    model = BERTBaseUncased()
    model.to(device)

    # Biases and LayerNorm parameters are excluded from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    # train_fn steps the scheduler once per batch, so the schedule length must be
    # the real number of batches (the last partial batch included) times EPOCHS.
    num_train_steps = len(train_data_loader) * EPOCHS

    # transformers.AdamW is deprecated in favour of torch.optim.AdamW
    # (and, as far as I know, absent from recent transformers releases - confirm).
    # eps=1e-6 keeps the default epsilon of the implementation it replaces.
    optimizer = torch.optim.AdamW(
        optimizer_parameters,
        lr=3e-5,
        eps=1e-6,
    )

    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps,
    )

    best_accuracy = 0
    for epoch in range(EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, targets = eval_fn(valid_data_loader, model, device)
        # Threshold the sigmoid outputs at 0.5 to obtain class predictions.
        outputs = np.array(outputs) >= 0.5
        accuracy = accuracy_score(targets, outputs)
        print(f"Accuracy Score : {accuracy:.4f}\n")
        if accuracy > best_accuracy:
            # Checkpoint only when the validation accuracy improves.
            torch.save(model.state_dict(), MODEL_PATH)
            best_accuracy = accuracy


# Start training when this code runs as the main module (as it does in a notebook cell).
if __name__ == "__main__":
    run()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 113/113 [00:53<00:00,  2.12it/s]
100%|██████████| 25/25 [00:02<00:00, 12.19it/s]


Accuracy Score : 0.8300



100%|██████████| 113/113 [00:48<00:00,  2.31it/s]
100%|██████████| 25/25 [00:02<00:00, 12.08it/s]


Accuracy Score : 0.8600



  4%|▎         | 4/113 [00:02<01:02,  1.74it/s]


KeyboardInterrupt: ignored

# Flask APP

In [None]:
%%capture
!pip install flask-ngrok

In [None]:
# Use the GPU when one is available; otherwise run inference on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Rebuild the architecture and restore the fine-tuned weights.
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
MODEL = BERTBaseUncased()
MODEL.load_state_dict(torch.load("/content/model.bin", map_location=device))
MODEL.to(device)
# Evaluation mode switches the dropout layer off for inference.
MODEL.eval()

def sentence_prediction(sentence):
    """Return the sigmoid of MODEL's output for a single sentence.

    The text is whitespace-normalized and tokenized exactly as in training
    (padded / truncated to MAX_LEN), then scored without tracking gradients.

    Args:
        sentence: Text to score; it is converted with ``str()`` first.

    Returns:
        A Python float: the sigmoid of the model's single output logit.
    """
    review = " ".join(str(sentence).split())

    inputs = TOKENIZER.encode_plus(
        review,
        None,
        truncation=True,
        padding="max_length",
        add_special_tokens=True,
        max_length=MAX_LEN,
        return_tensors="pt",
    )

    # The tokenizer already returns tensors, so only the device / dtype move is
    # needed; re-wrapping them with torch.tensor() would emit a copy warning.
    ids = inputs["input_ids"].to(device, dtype=torch.long)
    mask = inputs["attention_mask"].to(device, dtype=torch.long)
    token_type_ids = inputs["token_type_ids"].to(device, dtype=torch.long)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = MODEL(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids,
        )

    return torch.sigmoid(outputs)[0][0].item()


sentence_prediction("I love this film.")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0.9854877591133118

In [None]:
from flask import Flask,request
import flask
import requests
import time
from flask_ngrok import run_with_ngrok

# Create the Flask application that serves the prediction endpoint.
app = Flask(__name__)
# NOTE(review): run_with_ngrok presumably makes app.run() also open an ngrok tunnel - confirm against flask-ngrok docs.
run_with_ngrok(app)


@app.route("/predict")
def predict():
    """Score the ``sentence`` query parameter and return the result as JSON.

    Returns:
        A JSON body ``{"response": {...}}`` holding the positive and negative
        scores, the input sentence and the elapsed time, all as strings.
        When ``sentence`` is missing, a JSON error body with HTTP status 400.
    """
    sentence = request.args.get("sentence")
    if sentence is None:
        # Without this guard the literal string "None" would be scored.
        error = {"error": "missing required query parameter 'sentence'"}
        return flask.jsonify(error), 400

    start_time = time.time()
    positive_prediction = sentence_prediction(sentence)
    negative_prediction = 1 - positive_prediction
    response = {
        "response": {
            "positive": str(positive_prediction),
            "negative": str(negative_prediction),
            "sentence": str(sentence),
            "time_taken": str(time.time() - start_time),
        }
    }

    # Fixed typo: flask.jsonfiy does not exist and raised AttributeError on every request.
    return flask.jsonify(response)

# Start the Flask server; this call keeps the cell running until the server is stopped.
app.run()