In [1]:
!pip install -qq transformers

[K     |████████████████████████████████| 2.9 MB 7.1 MB/s 
[K     |████████████████████████████████| 636 kB 95.0 MB/s 
[K     |████████████████████████████████| 895 kB 62.6 MB/s 
[K     |████████████████████████████████| 3.3 MB 59.7 MB/s 
[K     |████████████████████████████████| 56 kB 6.5 MB/s 
[K     |████████████████████████████████| 109 kB 83.2 MB/s 
[K     |████████████████████████████████| 546 kB 67.6 MB/s 
[?25h

# Importing Libraries

In [2]:
from tqdm import tqdm

In [3]:
import numpy as np
import pandas as pd

In [34]:
import random
import time

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report

In [6]:
import transformers
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

In [7]:
from transformers import logging
logging.set_verbosity_error()

In [8]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [9]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset,DataLoader, RandomSampler, SequentialSampler

In [10]:
# Pick the compute device once; later cells move the model and every batch
# onto `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
Device name: Tesla P100-PCIE-16GB


In [11]:
import re
import string

# Loading and cleaning data

In [12]:
# Load the three splits from CSV files in the working directory.
# NOTE: the name `train` is rebound later in this notebook by `def train(...)`,
# so re-running cells that index `train[...]` after that definition will fail.
train = pd.read_csv("Train.csv")
test = pd.read_csv("Test.csv")
valid = pd.read_csv("Valid.csv")

In [13]:
train.shape

(40000, 2)

In [14]:
train.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [15]:
train.isnull().sum()

text     0
label    0
dtype: int64

# BERT

In [16]:
# Pretrained tokenizer for 'bert-base-uncased'. Kept as a module-level global
# because preprocessing_for_bert() reads `tokenizer` by name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [17]:
# Compiled once at definition time instead of on every call.
_HTML_TAG_RE = re.compile('<.*?>')


def clean_html(text):
    """Strip HTML-like tags (anything from '<' to the next '>') from `text`.

    Each tag is replaced by a single space rather than deleted outright, so
    words separated only by markup (e.g. ``great.<br /><br />The``) are not
    fused into one token before tokenization.

    Args:
        text: Raw review string.

    Returns:
        The string with every ``<...>`` span replaced by one space.
    """
    return _HTML_TAG_RE.sub(' ', text)

In [18]:
def preprocessing_for_bert(data, max_length=128):
    """Tokenize an iterable of raw texts into fixed-length BERT inputs.

    Each text is passed through `clean_html` and then encoded with the
    module-level `tokenizer`. Padding and truncation are requested explicitly
    (`padding='max_length'`, `truncation=True`) instead of relying on the
    deprecated `pad_to_max_length` flag and implicit truncation.

    Args:
        data: Iterable of strings (e.g. a pandas Series of reviews).
        max_length: Number of token ids per example after padding/truncation.
            Defaults to 128, the value previously hard-coded.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of `torch.Tensor`s built from
        the per-example id and mask lists, one row per input text.
    """
    input_ids = []
    attention_masks = []

    for sent in data:
        encoded_sent = tokenizer(
            clean_html(sent),
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )

        input_ids.append(encoded_sent['input_ids'])
        attention_masks.append(encoded_sent['attention_mask'])

    return torch.tensor(input_ids), torch.tensor(attention_masks)

In [19]:
# Separate the review text (features) from the label column for the
# training and validation frames.
X_train=train["text"]
y_train=train["label"]
X_val=valid["text"]
y_val=valid["label"]

In [20]:
# Tokenize both splits into (input_ids, attention_mask) tensors via
# preprocessing_for_bert(); this iterates over every review one by one.
train_inputs, train_masks = preprocessing_for_bert(X_train)
val_inputs, val_masks = preprocessing_for_bert(X_val)

In [21]:
# Labels as tensors so they can be bundled with the encoded inputs.
train_labels = torch.tensor(y_train)
val_labels = torch.tensor(y_val)

# Number of examples per batch for both loaders.
batch_size = 32

# Training batches are drawn through a RandomSampler.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches go through a SequentialSampler, so predictions made from
# this loader line up row-for-row with y_val.
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [38]:
class IMDbClassifier(nn.Module):
    """Pretrained BERT encoder followed by a small feed-forward head with 2 outputs."""

    def __init__(self, freeze_bert=False):
        """Build the encoder and the classification head.

        Args:
            freeze_bert: When true, every encoder parameter has
                `requires_grad` switched off so only the head is trainable.
        """
        super().__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Head: 768 -> 150 -> 50 -> 2 with ReLU between the linear layers.
        head_layers = [
            nn.Linear(768, 150),
            nn.ReLU(),
            nn.Linear(150, 50),
            nn.ReLU(),
            nn.Linear(50, 2),
        ]
        self.classifier = nn.Sequential(*head_layers)

        if freeze_bert:
            for encoder_param in self.bert.parameters():
                encoder_param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Return the head's logits for a batch.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Mask tensor passed to the encoder.

        Returns:
            Output of the classification head applied to position 0 of the
            encoder's first output (presumably the [CLS] token's last hidden
            state -- confirm against the BertModel output contract).
        """
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = encoder_out[0][:, 0, :]
        return self.classifier(first_token_state)

In [43]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs=4):
    """Create the classifier, its optimizer and a linear LR schedule.

    Reads the module-level `device` and `train_dataloader`. The schedule
    length is ``len(train_dataloader) * epochs``, so `epochs` here should
    match the value later passed to the training loop.

    The optimizer is `torch.optim.AdamW` rather than the deprecated
    `transformers.AdamW`. `weight_decay=0.0` is passed explicitly because the
    PyTorch default (0.01) differs from the default of the class it replaces.

    Args:
        epochs: Number of training epochs the scheduler should span.

    Returns:
        Tuple ``(model, optimizer, scheduler)``; the model is already moved
        to `device`.
    """
    imdb_classifier = IMDbClassifier(freeze_bert=False)
    imdb_classifier.to(device)

    optimizer = torch.optim.AdamW(
        imdb_classifier.parameters(),
        lr=2e-5,
        eps=1e-8,
        weight_decay=0.0,
    )

    total_steps = len(train_dataloader) * epochs

    # No warm-up: the schedule starts at the optimizer's lr and runs for total_steps.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )
    return imdb_classifier, optimizer, scheduler

In [44]:
# Shared by train() and evaluate(), which both read `loss_fn` as a global.
loss_fn = nn.CrossEntropyLoss()

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False,
          save_path='hw1_newmodel_tanvi.bin'):
    """Fine-tune `model`, printing a progress table and checkpointing each epoch.

    Reads `optimizer`, `scheduler`, `device`, `loss_fn` and `evaluate` from
    module scope, so those names must be bound before this is called.
    Note that defining this function rebinds the module-level name `train`,
    which an earlier cell assigned to the training DataFrame.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_dataloader: Sized iterable of ``(input_ids, mask, labels)`` batches.
        val_dataloader: Batches handed to `evaluate`; required when
            `evaluation` is true.
        epochs: Number of passes over `train_dataloader`.
        evaluation: When true, run `evaluate` after every epoch and print a
            summary row.
        save_path: File the model's state_dict is written to after every
            epoch (overwritten each time). Pass None to skip saving.

    Raises:
        ValueError: If `evaluation` is requested without a `val_dataloader`.
    """
    # Fail before training starts instead of after a full epoch inside evaluate().
    if evaluation and val_dataloader is None:
        raise ValueError("evaluation=True requires a val_dataloader")

    num_batches = len(train_dataloader)

    for epoch_i in range(epochs):
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")

        t0_epoch, t0_batch = time.time(), time.time()

        # total_loss spans the epoch; batch_loss/batch_counts are reset after
        # each printed progress row.
        total_loss, batch_loss, batch_counts = 0, 0, 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            model.zero_grad()

            logits = model(b_input_ids, b_attn_mask)

            loss = loss_fn(logits, b_labels)
            step_loss = loss.item()
            batch_loss += step_loss
            total_loss += step_loss
            loss.backward()

            # Cap the gradient norm at 1.0 before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            # Progress row every 20 steps (not at step 0) and on the last batch.
            if (step % 20 == 0 and step != 0) or (step == num_batches - 1):
                time_elapsed = time.time() - t0_batch

                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # An empty loader yields NaN rather than a ZeroDivisionError.
        avg_train_loss = total_loss / num_batches if num_batches else float('nan')

        if evaluation:
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")

        if save_path is not None:
            torch.save(model.state_dict(), save_path)
          

In [41]:
def evaluate(model, val_dataloader):
    """Compute loss and accuracy of `model` over a validation loader.

    Reads the module-level `device` and `loss_fn`. Both metrics are weighted
    by the number of examples in each batch, so a smaller final batch does not
    count as much as a full one (a plain mean of per-batch means would skew
    the result whenever the dataset size is not a multiple of the batch size).
    Assumes `loss_fn` returns the batch mean, which is the default reduction
    of `nn.CrossEntropyLoss`.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        val_dataloader: Iterable of ``(input_ids, mask, labels)`` batches.

    Returns:
        Tuple ``(val_loss, val_accuracy)`` of floats: mean loss per example
        and accuracy in percent. Both are NaN when the loader yields nothing.
    """
    model.eval()

    loss_sum = 0.0
    correct = 0
    seen = 0

    for batch in val_dataloader:
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
            loss = loss_fn(logits, b_labels)

        n_examples = b_labels.size(0)
        # Undo the batch-mean so batches of different sizes combine correctly.
        loss_sum += loss.item() * n_examples

        preds = torch.argmax(logits, dim=1).flatten()
        correct += (preds == b_labels).sum().item()
        seen += n_examples

    if seen == 0:
        return float('nan'), float('nan')

    val_loss = loss_sum / seen
    val_accuracy = 100.0 * correct / seen

    return val_loss, val_accuracy

In [45]:
# `optimizer` and `scheduler` must be bound at module level under exactly
# these names: train() reads them as globals rather than taking arguments.
# The same `epochs` value is passed to both calls because the scheduler's
# total step count is derived from it in initialize_model().
imdb_classifier, optimizer, scheduler = initialize_model(epochs=1)
train(imdb_classifier, train_dataloader, val_dataloader, epochs=1, evaluation=True)

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
   1    |   20    |   0.683876   |     -      |     -     |   8.06   
   1    |   40    |   0.581084   |     -      |     -     |   7.63   
   1    |   60    |   0.497053   |     -      |     -     |   7.67   
   1    |   80    |   0.475778   |     -      |     -     |   7.65   
   1    |   100   |   0.434598   |     -      |     -     |   7.66   
   1    |   120   |   0.403947   |     -      |     -     |   7.66   
   1    |   140   |   0.420490   |     -      |     -     |   7.65   
   1    |   160   |   0.342294   |     -      |     -     |   7.65   
   1    |   180   |   0.392365   |     -      |     -     |   7.65   
   1    |   200   |   0.332920   |     -      |     -     |   7.67   
   1    |   220   |   0.341450   |     -      |     -     |   7.66   
   1    |   240   |   0.341606   |     -      |     -     |   7.66   
   1    |   260   |   0.344045   |     -      |     -     |   7.64   
   1    |   280   | 

In [46]:
import torch.nn.functional as F

def imdb_predict(model, test_dataloader):
    """Run `model` over a loader and return softmax probabilities.

    Reads the module-level `device`. Only the first two tensors of each batch
    (input ids and attention mask) are fed to the model, so loaders that also
    carry labels are accepted.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        test_dataloader: Iterable of tensor tuples.

    Returns:
        NumPy array of per-row probabilities (softmax over dim 1 of the
        concatenated logits).
    """
    model.eval()

    collected = []
    with torch.no_grad():
        for batch in test_dataloader:
            on_device = tuple(t.to(device) for t in batch)
            ids, mask = on_device[:2]
            collected.append(model(ids, mask))

    stacked_logits = torch.cat(collected, dim=0)
    return F.softmax(stacked_logits, dim=1).cpu().numpy()

In [47]:
from sklearn.metrics import accuracy_score, roc_curve, auc

def metrics_eval(p, y_true):
    """Print the accuracy of thresholded class-1 probabilities.

    Args:
        p: 2-D array of probabilities; column 1 is compared against 0.5.
        y_true: True labels, in the same row order as `p`.
    """
    # Column 1 at or above 0.5 counts as a prediction of class 1.
    y_pred = np.where(p[:, 1] >= 0.5, 1, 0)
    accuracy = accuracy_score(y_true, y_pred)
    print(f'Accuracy: {accuracy*100:.2f}%')
    


In [48]:
# Re-score the validation split through the prediction path. val_dataloader
# uses a SequentialSampler, so rows of `probs` line up with y_val.
probs = imdb_predict(imdb_classifier, val_dataloader)

metrics_eval(probs, y_val)

Accuracy: 88.96%


In [49]:
# Encode the test reviews the same way as the training data.
test_inputs, test_masks = preprocessing_for_bert(test.text)

# No labels in this dataset: imdb_predict() only consumes ids and masks.
# The SequentialSampler keeps the original row order of the test frame.
test_dataset = TensorDataset(test_inputs, test_masks)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=32)

In [50]:
def evaluate_test(p, y_true):
    """Print accuracy and a per-class report for thresholded probabilities.

    Args:
        p: 2-D array of probabilities; column 1 is compared against 0.5.
        y_true: True labels, in the same row order as `p`.
    """
    # Column 1 at or above 0.5 counts as a prediction of class 1.
    y_pred = np.where(p[:, 1] >= 0.5, 1, 0)

    accuracy = accuracy_score(y_true, y_pred)
    print(f'Accuracy: {accuracy*100:.2f}%')

    report = classification_report(y_true, y_pred)
    print(report)
  


In [51]:
# Text and label columns of the test frame. Only y_test is used by the
# evaluation cell below; X_test is not referenced again in the visible cells.
X_test=test["text"]
y_test=test["label"]

In [52]:
# Score the held-out test split. This rebinds `probs`, replacing the
# validation probabilities computed in the earlier cell.
probs = imdb_predict(imdb_classifier, test_dataloader)

evaluate_test(probs, y_test)

Accuracy: 89.52%
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      2495
           1       0.88      0.91      0.90      2505

    accuracy                           0.90      5000
   macro avg       0.90      0.90      0.90      5000
weighted avg       0.90      0.90      0.90      5000

