<a href="https://colab.research.google.com/github/sumanthd17/aspect-based-sentiment/blob/master/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

Gen RAM Free: 12.7 GB  | Proc size: 160.6 MB
GPU RAM Free: 15079MB | Used: 0MB | Util   0% | Total 15079MB


In [2]:
!pip install transformers



In [3]:
import time
import datetime
import random
from tqdm import tqdm

import numpy as np
import pandas as pd

import nltk
nltk.download('punkt')

import torch
import torch.nn.functional as F
import transformers as optimus

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
!git clone https://github.com/sumanthd17/aspect-based-sentiment.git

fatal: destination path 'aspect-based-sentiment' already exists and is not an empty directory.


In [5]:
cd aspect-based-sentiment

/content/aspect-based-sentiment


In [6]:
def load_train_data(input_dir):
    df = pd.read_csv(input_dir + "train-QA.csv", sep="\t", names=['id', 'ques', 'ans', 'sentiment'])
    return df

In [7]:
def load_val_data(input_dir):
    df = pd.read_csv(input_dir + "val-QA.csv", sep="\t", names=['id', 'ques', 'ans', 'sentiment'])
    return df

In [8]:
def hyper_params():
    BATCH_SIZE = 32
    MAX_SEQ_LENGTH = 256
    LEARNING_RATE = 2e-5
    EPOCHS = 5
    WARMUP = 0.1
    return BATCH_SIZE, MAX_SEQ_LENGTH, LEARNING_RATE, WARMUP, EPOCHS

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
!python create_data.py

2977
747
1491


In [11]:
train_data = load_train_data('QA_pairs/')
val_data = load_val_data('QA_pairs/')

batch_size, max_seq_len, lr, warmup, epochs = hyper_params()
num_training_steps = int(len(train_data) / batch_size) * epochs
num_warmup_steps = warmup * num_training_steps
print(len(train_data))
print(len(val_data))

45025
11246


In [12]:
train_data = train_data[:10000]
val_data = val_data[:5000]

In [13]:
tokenizer_class, pretrained_weights = (
    optimus.BertTokenizer,
    "bert-base-uncased",
)

tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

In [14]:
sent2idx = {
    'None': 0,
    'Positive': 1,
    'Negative': 2
}

In [15]:
train = pd.DataFrame()

question_tokens = train_data['ques'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
q_segment_tokens = question_tokens.apply(lambda x: [0]*len(x))

answer_tokens = train_data['ans'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
answer_tokens = answer_tokens.apply(lambda x: [i for i in x if i != 101])
a_segment_tokens = answer_tokens.apply(lambda x: [1]*len(x))

train['input_ids'] = question_tokens + answer_tokens
train['segment_ids'] = q_segment_tokens + a_segment_tokens

train['input_mask'] = ''
for i, row in train.iterrows():
  train.iloc[i]['input_mask'] = [1]*len(train.iloc[i]['input_ids'])

train['input_mask'] = train['input_mask'].apply(lambda x: x + (max_seq_len - len(x))*[0])
train['input_ids'] = train['input_ids'].apply(lambda x: x + (max_seq_len - len(x))*[0])
train['segment_ids'] = train['segment_ids'].apply(lambda x: x + (max_seq_len - len(x))*[0])

train['label'] = ''
train['label'] = train_data['sentiment'].apply(lambda x: sent2idx[x])

In [16]:
val = pd.DataFrame()

question_tokens = val_data['ques'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
q_segment_tokens = question_tokens.apply(lambda x: [0]*len(x))

answer_tokens = val_data['ans'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
answer_tokens = answer_tokens.apply(lambda x: [i for i in x if i != 101])
a_segment_tokens = answer_tokens.apply(lambda x: [1]*len(x))

val['input_ids'] = question_tokens + answer_tokens
val['segment_ids'] = q_segment_tokens + a_segment_tokens

val['input_mask'] = ''
for i, row in val.iterrows():
  val.iloc[i]['input_mask'] = [1]*len(val.iloc[i]['input_ids'])

val['input_mask'] = val['input_mask'].apply(lambda x: x + (max_seq_len - len(x))*[0])
val['input_ids'] = val['input_ids'].apply(lambda x: x + (max_seq_len - len(x))*[0])
val['segment_ids'] = val['segment_ids'].apply(lambda x: x + (max_seq_len - len(x))*[0])

val['label'] = ''
val['label'] = train_data['sentiment'].apply(lambda x: sent2idx[x])

In [17]:
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.sampler import RandomSampler

In [18]:
input_ids = torch.tensor([r for r in train['input_ids']], dtype=torch.long)
input_masks = torch.tensor([r for r in train['input_mask']], dtype=torch.long)
label_ids = torch.tensor([r for r in train['label']], dtype=torch.long)
segment_ids = torch.tensor([r for r in train['segment_ids']], dtype=torch.long)

In [19]:
train_dataset = TensorDataset(input_ids, input_masks, segment_ids, label_ids)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

In [20]:
input_ids = torch.tensor([r for r in val['input_ids']], dtype=torch.long)
input_masks = torch.tensor([r for r in val['input_mask']], dtype=torch.long)
label_ids = torch.tensor([r for r in val['label']], dtype=torch.long)
segment_ids = torch.tensor([r for r in val['segment_ids']], dtype=torch.long)

In [21]:
val_dataset = TensorDataset(input_ids, input_masks, segment_ids, label_ids)
val_sampler = RandomSampler(val_dataset)
val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=batch_size)

In [22]:
model_class, pretrained_weights = (
    optimus.BertForSequenceClassification,
    "bert-base-uncased",
)

model = model_class.from_pretrained(pretrained_weights, num_labels=3)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [23]:
model = model.to(device)

In [24]:
optimizer_params = [
  {'params': [p for n, p in model.named_parameters()], 'weight_decay_rate': 0.0}
]

In [25]:
optimizer = optimus.AdamW(model.parameters(), lr=lr, correct_bias=False)
scheduler = optimus.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

In [26]:
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [27]:
training_stats = []

total_t0 = time.time()

for epoch_i in range(0, 5):
    ## TRAINING
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(tqdm(train_dataloader)):

        if step % 100 == 0 and not step == 0:
            elapsed = str(datetime.timedelta(seconds=int(round(time.time() - t0))))
            
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        input_ids, input_mask, segment_ids, label_ids = batch
  
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        model.zero_grad()        

        loss, _ = model(input_ids=input_ids,
                        attention_mask=input_mask,
                        token_type_ids=segment_ids,
                        labels=label_ids)

        total_train_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)          
    
    training_time = str(datetime.timedelta(seconds=int(round(time.time() - t0))))

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
        
    ## VALIDATION

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for step, batch in enumerate(tqdm(val_dataloader)):
        
        input_ids, input_mask, segment_ids, label_ids = batch
  
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)
        
        with torch.no_grad():        
            loss, logits = model(input_ids=input_ids,
                        attention_mask=input_mask,
                        token_type_ids=segment_ids,
                        labels=label_ids)
            
        total_eval_loss += loss.item()

        logits = F.softmax(logits, dim=-1)
        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()
        outputs = np.argmax(logits, axis=1)

        total_eval_accuracy += np.sum(outputs == label_ids)
        

    avg_val_accuracy = total_eval_accuracy / len(val_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(val_dataloader)
    
    validation_time = str(datetime.timedelta(seconds=int(round(time.time() - t0))))
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(str(datetime.timedelta(seconds=int(round(time.time() - total_t0))))))

  0%|          | 0/313 [00:00<?, ?it/s]

Training...


 32%|███▏      | 100/313 [02:19<05:00,  1.41s/it]

  Batch   100  of    313.    Elapsed: 0:02:20.


 64%|██████▍   | 200/313 [04:40<02:39,  1.41s/it]

  Batch   200  of    313.    Elapsed: 0:04:40.


 96%|█████████▌| 300/313 [07:01<00:18,  1.40s/it]

  Batch   300  of    313.    Elapsed: 0:07:01.


100%|██████████| 313/313 [07:18<00:00,  1.40s/it]
  0%|          | 0/157 [00:00<?, ?it/s]


  Average training loss: 0.40
  Training epcoh took: 0:07:19

Running Validation...


100%|██████████| 157/157 [01:21<00:00,  1.92it/s]
  0%|          | 0/313 [00:00<?, ?it/s]

  Accuracy: 28.32
  Validation Loss: 0.50
  Validation took: 0:01:22
Training...


 32%|███▏      | 100/313 [02:20<04:59,  1.41s/it]

  Batch   100  of    313.    Elapsed: 0:02:21.


 64%|██████▍   | 200/313 [04:41<02:38,  1.40s/it]

  Batch   200  of    313.    Elapsed: 0:04:41.


 96%|█████████▌| 300/313 [07:01<00:18,  1.41s/it]

  Batch   300  of    313.    Elapsed: 0:07:02.


100%|██████████| 313/313 [07:19<00:00,  1.40s/it]
  0%|          | 0/157 [00:00<?, ?it/s]


  Average training loss: 0.16
  Training epcoh took: 0:07:20

Running Validation...


100%|██████████| 157/157 [01:21<00:00,  1.92it/s]
  0%|          | 0/313 [00:00<?, ?it/s]

  Accuracy: 27.37
  Validation Loss: 0.56
  Validation took: 0:01:22
Training...


 32%|███▏      | 100/313 [02:20<05:00,  1.41s/it]

  Batch   100  of    313.    Elapsed: 0:02:20.


 64%|██████▍   | 200/313 [04:41<02:39,  1.41s/it]

  Batch   200  of    313.    Elapsed: 0:04:41.


 96%|█████████▌| 300/313 [07:01<00:18,  1.40s/it]

  Batch   300  of    313.    Elapsed: 0:07:02.


100%|██████████| 313/313 [07:19<00:00,  1.40s/it]
  0%|          | 0/157 [00:00<?, ?it/s]


  Average training loss: 0.12
  Training epcoh took: 0:07:19

Running Validation...


100%|██████████| 157/157 [01:21<00:00,  1.92it/s]
  0%|          | 0/313 [00:00<?, ?it/s]

  Accuracy: 27.71
  Validation Loss: 0.61
  Validation took: 0:01:22
Training...


 32%|███▏      | 100/313 [02:20<04:59,  1.40s/it]

  Batch   100  of    313.    Elapsed: 0:02:20.


 64%|██████▍   | 200/313 [04:40<02:38,  1.40s/it]

  Batch   200  of    313.    Elapsed: 0:04:41.


 96%|█████████▌| 300/313 [07:00<00:18,  1.40s/it]

  Batch   300  of    313.    Elapsed: 0:07:01.


100%|██████████| 313/313 [07:18<00:00,  1.40s/it]
  0%|          | 0/157 [00:00<?, ?it/s]


  Average training loss: 0.08
  Training epcoh took: 0:07:18

Running Validation...


100%|██████████| 157/157 [01:21<00:00,  1.93it/s]
  0%|          | 0/313 [00:00<?, ?it/s]

  Accuracy: 27.73
  Validation Loss: 0.72
  Validation took: 0:01:22
Training...


 32%|███▏      | 100/313 [02:20<04:59,  1.41s/it]

  Batch   100  of    313.    Elapsed: 0:02:20.


 64%|██████▍   | 200/313 [04:40<02:38,  1.40s/it]

  Batch   200  of    313.    Elapsed: 0:04:41.


 96%|█████████▌| 300/313 [07:00<00:18,  1.40s/it]

  Batch   300  of    313.    Elapsed: 0:07:01.


100%|██████████| 313/313 [07:18<00:00,  1.40s/it]
  0%|          | 0/157 [00:00<?, ?it/s]


  Average training loss: 0.06
  Training epcoh took: 0:07:18

Running Validation...


100%|██████████| 157/157 [01:21<00:00,  1.91it/s]

  Accuracy: 27.90
  Validation Loss: 0.64
  Validation took: 0:01:22

Training complete!
Total training took 0:43:23 (h:mm:ss)





In [28]:
## INFERENCE

def load_test_data(input_dir):
    df = pd.read_csv(input_dir + "test-QA.csv", sep="\t", names=['id', 'ques', 'ans', 'sentiment'])
    return df

In [29]:
test_data = load_test_data('QA_pairs/')

In [30]:
idx2sentiment = {
    0: "None",
    1: "Positive",
    2: "Negative"
}

In [31]:
all_aspects = ['price',
               'shopping',
               'transit-location', 
               'safety',
               'nightlife',
               'live',
               'multiculture',
               'green-nature',
               'touristy',
               'quiet',
               'dining',
               'general']

In [32]:
test_grouped_by_id = test_data.groupby(['id'])

In [34]:
model.eval()

final_preds = pd.DataFrame()

for id, group in tqdm(test_grouped_by_id):
  test = pd.DataFrame()

  question_tokens = group['ques'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
  q_segment_tokens = question_tokens.apply(lambda x: [0]*len(x))

  answer_tokens = group['ans'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
  answer_tokens = answer_tokens.apply(lambda x: [i for i in x if i != 101])
  a_segment_tokens = answer_tokens.apply(lambda x: [1]*len(x))

  test['input_ids'] = question_tokens + answer_tokens
  test['segment_ids'] = q_segment_tokens + a_segment_tokens

  test['input_mask'] = test['input_ids'].apply(lambda x: [1]*len(x))

  test['input_mask'] = test['input_mask'].apply(lambda x: x + (max_seq_len - len(x))*[0])
  test['input_ids'] = test['input_ids'].apply(lambda x: x + (max_seq_len - len(x))*[0])
  test['segment_ids'] = test['segment_ids'].apply(lambda x: x + (max_seq_len - len(x))*[0])

  test['label'] = ''
  test['label'] = group['sentiment'].apply(lambda x: sent2idx[x])

  input_ids = torch.tensor([r for r in test['input_ids']], dtype=torch.long)
  input_mask = torch.tensor([r for r in test['input_mask']], dtype=torch.long)
  label_ids = torch.tensor([r for r in test['label']], dtype=torch.long)
  segment_ids = torch.tensor([r for r in test['segment_ids']], dtype=torch.long)

  input_ids = input_ids.to(device)
  input_mask = input_mask.to(device)
  segment_ids = segment_ids.to(device)
  label_ids = label_ids.to(device)

  with torch.no_grad():        
    loss, logits = model(input_ids=input_ids,
                        attention_mask=input_mask,
                        token_type_ids=segment_ids,
                        labels=label_ids)

  logits = F.softmax(logits, dim=-1)
  logits = logits.detach().cpu().numpy()
  label_ids = label_ids.to('cpu').numpy()
  outputs = np.argmax(logits, axis=1)

  test['pred'] = outputs

  res = [idx for idx, val in enumerate(outputs) if val != 0]
  for val in res:
    d = {}
    d['id'] = id
    d['text'] = group.iloc[val]['ans']
    d['aspect'] = group.iloc[val]['ques'].split(' ')[6]
    d['sentiment'] = idx2sentiment[outputs[val]]
    d['target'] = 'LOCATION1' if 'LOCATION1' in group.iloc[val]['ans'] else 'LOCATION2'
    final_preds = final_preds.append(d, ignore_index=True)

100%|██████████| 1491/1491 [06:32<00:00,  3.80it/s]


In [36]:
!nvidia-smi

Tue Jul 28 17:27:35 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    32W /  70W |  11353MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

In [37]:
final_preds.to_csv('prediction.csv', index=False)