In [151]:
import json
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook
from uuid import uuid4
import tqdm

## Torch Modules
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

from transformers import RobertaModel, RobertaTokenizer
from transformers import RobertaForSequenceClassification, RobertaConfig


In [152]:
def prepare_features(seq_1, max_seq_length = 140, zero_pad = True, include_CLS_token = True, include_SEP_token = True):
    ## Tokenzine Input
    
    tokens_a = tokenizer.tokenize(seq_1)

    ## Truncate
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    ## Initialize Tokens
    tokens = []
    if include_CLS_token:
        tokens.append(tokenizer.cls_token)
    ## Add Tokens and separators
    for token in tokens_a:
        tokens.append(token)

    if include_SEP_token:
        tokens.append(tokenizer.sep_token)
    

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    ## Input Mask 
    input_mask = [1] * len(input_ids)
    if zero_pad:
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
    return torch.tensor(input_ids).unsqueeze(0), input_mask
                     
                     

In [153]:
class Intents(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        utterance = self.data.text[index]
        label = self.data.label[index]
        X, _  = prepare_features(utterance)
        y = label_to_inx[self.data.label[index]]
        return X, y
    
    def __len__(self):
        return self.len

In [154]:
def get_result(pred, lst_true):
    from sklearn.metrics import accuracy_score, f1_score
     
    acc = accuracy_score(lst_true, pred)
    f1_micro = f1_score(lst_true, pred, average='micro')
    f1_macro = f1_score(lst_true, pred, average='macro')
    
    return acc, f1_micro, f1_macro

In [155]:
label_to_inx = {'unsustainable':0,'sustainable':1}

In [156]:
config = RobertaConfig.from_pretrained('roberta-base')

In [157]:
## if errors: /tmp/.cache/torch permission just give sudo chmod -R a+rw xxx/xx

In [158]:
config.num_labels = 2

In [159]:
config

RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [160]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [161]:
model = RobertaForSequenceClassification(config)

In [162]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = model.cuda()

In [163]:
df_train = pd.read_csv('../data/train.csv')
df_valid = pd.read_csv('../data/valid.csv')

In [164]:
training_set = Intents(df_train)
testing_set = Intents(df_valid)

In [165]:
# len(training_set), len(testing_set)
t, l = training_set[0]
t, l

(tensor([[    0,   100,  1729,   597,  1364,     7,  8924,  3942,  4918,     8,
           4921,     7,  7677, 35552,    31,  9092,     8,  9440,  1787,  9781,
             11,   391,   730,     4,     2,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,   

In [166]:
params = {'batch_size': 16,
          'shuffle': True,
          'drop_last': False,
          'num_workers': 0}
# params = {'batch_size': 16,
#           'shuffle': True,
#           'drop_last': False}

In [170]:
training_loader = DataLoader(dataset=training_set, **params)
testing_loader = DataLoader(dataset=testing_set, **params)

In [177]:
training_loader = iter(training_loader)
testing_loader = iter(testing_loader)

In [178]:
loss_function = nn.CrossEntropyLoss()
learning_rate = 1e-05
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

In [179]:
# inp = training_set.__getitem__(0)[0].cuda()
inp = training_set.__getitem__(0)[0]
output = model(inp)[0]
print(output.shape)

torch.Size([1, 2])


In [195]:
max_epochs = 30
model = model.train()
for epoch in tqdm.notebook.tqdm(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    for i, batch in enumerate(training_loader):
      sent, label = batch[0], batch[1]
      optimizer.zero_grad()
      sent = sent.squeeze(0)
      if torch.cuda.is_available():
        sent = sent.cuda()
        label = label.cuda()
      output = model(sent)[0]
      _, predicted = torch.max(output, 1)
      
      loss = loss_function(output, label)
      loss.backward()
      optimizer.step()
      
      if i%100 == 0:
          correct = 0
          total = 0
          for sent, label in testing_loader:
              sent = sent.squeeze(0)
              if torch.cuda.is_available():
                sent = sent.cuda()
                label = label.cuda()
              output = model.forward(sent)[0]
              _, predicted = torch.max(output.data, 1)
              total += label.size(0)
              correct += (predicted.cpu() == label.cpu()).sum()
          accuracy = 100.00 * correct.numpy() / total
          print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))


  0%|          | 0/30 [00:00<?, ?it/s]

EPOCH -- 0
EPOCH -- 1
EPOCH -- 2
EPOCH -- 3
EPOCH -- 4
EPOCH -- 5
EPOCH -- 6
EPOCH -- 7
EPOCH -- 8
EPOCH -- 9
EPOCH -- 10
EPOCH -- 11
EPOCH -- 12
EPOCH -- 13
EPOCH -- 14
EPOCH -- 15
EPOCH -- 16
EPOCH -- 17
EPOCH -- 18
EPOCH -- 19
EPOCH -- 20
EPOCH -- 21
EPOCH -- 22
EPOCH -- 23
EPOCH -- 24
EPOCH -- 25
EPOCH -- 26
EPOCH -- 27
EPOCH -- 28
EPOCH -- 29


In [197]:
max_epochs = 30
model = model.train()
for epoch in range(max_epochs):
    print("EPOCH -- {}".format(epoch))
    for i, batch in enumerate(training_loader):
        sent, label = batch[0], batch[1]
        optimizer.zero_grad()
        sent = sent.squeeze(1)
        if torch.cuda.is_available():
            sent = sent.cuda()
            label = label.cuda()
        output = model(sent)[0]
        _, predicted = torch.max(output, 1)

        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            model.eval()
            correct = 0
            total = 0
            for sent, label in testing_loader:
                sent = sent.squeeze(1)
                if torch.cuda.is_available():
                    sent = sent.cuda()
                    label = label.cuda()
                output = model.forward(sent)[0]
                _, predicted = torch.max(output.data, 1)
                total += label.size(0)
                correct += (predicted.cpu() == label.cpu()).sum()
            accuracy = 100.00 * correct.numpy() / total
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))

EPOCH -- 0
EPOCH -- 1
EPOCH -- 2
EPOCH -- 3
EPOCH -- 4
EPOCH -- 5
EPOCH -- 6
EPOCH -- 7
EPOCH -- 8
EPOCH -- 9
EPOCH -- 10
EPOCH -- 11
EPOCH -- 12
EPOCH -- 13
EPOCH -- 14
EPOCH -- 15
EPOCH -- 16
EPOCH -- 17
EPOCH -- 18
EPOCH -- 19
EPOCH -- 20
EPOCH -- 21
EPOCH -- 22
EPOCH -- 23
EPOCH -- 24
EPOCH -- 25
EPOCH -- 26
EPOCH -- 27
EPOCH -- 28
EPOCH -- 29


In [187]:
outputs = []
lst_prediction =[]
lst_test = list(df_valid['text'])
model.eval()
for msg in lst_test:
    input_msg, _ = prepare_features(msg)
    if torch.cuda.is_available():
        input_msg = input_msg.cuda()
        output = model(input_msg)[0]
        outputs.append(output)
        _, pred_label = torch.max(output.data, 1)
        prediction=list(label_to_inx.keys())[pred_label]
        lst_prediction.append(prediction)

In [188]:
outputs = [o.to('cpu').detach().numpy().copy() for o in outputs]

In [189]:
lst_class = ['unsustainable','sustainable']

In [190]:
predictions2 = [] 
[predictions2.append([x[1] for x in [sorted(zip(example[0], lst_class), reverse=True)][0]]) for example in outputs]

[]

In [191]:
predictions3 = [a[0] for a in predictions2]

In [192]:
lst_true = list(df_valid['label'])

In [193]:
acc, f1_micro, f1_macro = get_result(predictions3, lst_true)

ValueError: Found input variables with inconsistent numbers of samples: [266, 0]

In [54]:
acc, f1_micro, f1_macro 

(0.8533834586466166, 0.8533834586466166, 0.8518571408171134)