In [2]:
import json
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook
from uuid import uuid4

## Torch Modules
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader


In [3]:
def prepare_features(seq_1, max_seq_length = 140, zero_pad = False, include_CLS_token = True, include_SEP_token = True):
    ## Tokenzine Input
    
    tokens_a = tokenizer.tokenize(seq_1)

    ## Truncate
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    ## Initialize Tokens
    tokens = []
    if include_CLS_token:
        tokens.append(tokenizer.cls_token)
    ## Add Tokens and separators
    for token in tokens_a:
        tokens.append(token)

    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    ## Input Mask 
    input_mask = [1] * len(input_ids)
    if zero_pad:
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
    return torch.tensor(input_ids).unsqueeze(0), input_mask
                     
                     

In [4]:
class Intents(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        utterance = self.data.text[index]
        label = self.data.label[index]
        X, _  = prepare_features(utterance)
        y = label_to_inx[self.data.label[index]]
        return X, y
    
    def __len__(self):
        return self.len

In [None]:
def get_result(pred, lst_true):
    from sklearn.metrics import accuracy_score, f1_score
     
    acc = accuracy_score(lst_true, pred)
    f1_micro = f1_score(lst_true, pred, average='micro')
    f1_macro = f1_score(lst_true, pred, average='macro')
    
    return acc, f1_micro, f1_macro

In [6]:
label_to_inx = {'unsustainable':0,'sustainable':1}

In [9]:
config = RobertaConfig.from_pretrained('roberta-base')

In [None]:
## if errors: /tmp/.cache/torch permission just give sudo chmod -R a+rw xxx/xx

In [10]:
config.num_labels = 2

In [None]:
config

In [11]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [12]:
model = RobertaForSequenceClassification(config)

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.cuda()

In [24]:
df_train = pd.read_csv('../data/train.csv')
df_valid = pd.read_csv('../data/valid.csv')

In [25]:
training_set = Intents(df_train)
testing_set = Intents(df_valid)

In [26]:
params = {'batch_size': 16,
          'shuffle': True,
          'drop_last': False,
          'num_workers': 1}

In [33]:
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)

In [28]:
loss_function = nn.CrossEntropyLoss()
learning_rate = 1e-05
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

In [29]:
inp = training_set.__getitem__(0)[0].cuda()
output = model(inp)[0]
print(output.shape)

torch.Size([1, 2])


In [30]:
max_epochs = 30
model = model.train()
for epoch in tqdm_notebook(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(training_loader):
        optimizer.zero_grad()
        sent = sent.squeeze(0)
        if torch.cuda.is_available():
          sent = sent.cuda()
          label = label.cuda()
        output = model.forward(sent)[0]
        _, predicted = torch.max(output, 1)
        
        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()
        
        if i%100 == 0:
            correct = 0
            total = 0
            for sent, label in testing_loader:
                sent = sent.squeeze(0)
                if torch.cuda.is_available():
                  sent = sent.cuda()
                  label = label.cuda()
                output = model.forward(sent)[0]
                _, predicted = torch.max(output.data, 1)
                total += label.size(0)
                correct += (predicted.cpu() == label.cpu()).sum()
            accuracy = 100.00 * correct.numpy() / total
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/30 [00:00<?, ?it/s]

EPOCH -- 0
Iteration: 0. Loss: 0.1651824712753296. Accuracy: 47.36842105263158%
Iteration: 100. Loss: 0.48423951864242554. Accuracy: 47.36842105263158%
Iteration: 200. Loss: 0.8175522089004517. Accuracy: 55.26315789473684%
Iteration: 300. Loss: 0.6403018832206726. Accuracy: 48.49624060150376%
Iteration: 400. Loss: 0.5755423307418823. Accuracy: 53.00751879699248%
Iteration: 500. Loss: 0.9929331541061401. Accuracy: 51.12781954887218%
Iteration: 600. Loss: 0.6193049550056458. Accuracy: 51.8796992481203%
Iteration: 700. Loss: 0.3519832193851471. Accuracy: 52.63157894736842%
Iteration: 800. Loss: 0.4879077672958374. Accuracy: 52.255639097744364%
Iteration: 900. Loss: 0.7298197150230408. Accuracy: 51.12781954887218%
Iteration: 1000. Loss: 0.4408475160598755. Accuracy: 52.63157894736842%
Iteration: 1100. Loss: 0.8674148321151733. Accuracy: 55.26315789473684%
Iteration: 1200. Loss: 0.8232302069664001. Accuracy: 53.7593984962406%
Iteration: 1300. Loss: 0.7793020009994507. Accuracy: 49.248120300

Iteration: 1400. Loss: 0.0064970930106937885. Accuracy: 86.46616541353383%
Iteration: 1500. Loss: 1.4212934970855713. Accuracy: 85.71428571428571%
Iteration: 200. Loss: 0.7029555439949036. Accuracy: 87.96992481203007%
Iteration: 300. Loss: 0.01260469015687704. Accuracy: 85.71428571428571%
Iteration: 400. Loss: 0.023288190364837646. Accuracy: 87.96992481203007%
Iteration: 500. Loss: 0.016081858426332474. Accuracy: 82.70676691729324%
Iteration: 600. Loss: 0.0060133966617286205. Accuracy: 84.21052631578948%
Iteration: 700. Loss: 0.0031439659651368856. Accuracy: 86.84210526315789%
Iteration: 800. Loss: 0.0072953966446220875. Accuracy: 89.47368421052632%
Iteration: 900. Loss: 1.3164324760437012. Accuracy: 87.21804511278195%
Iteration: 1000. Loss: 0.020978815853595734. Accuracy: 77.06766917293233%
Iteration: 1100. Loss: 0.003361648181453347. Accuracy: 86.46616541353383%
Iteration: 1200. Loss: 0.005521285813301802. Accuracy: 88.34586466165413%
Iteration: 1300. Loss: 1.5787038803100586. Accura

Iteration: 1200. Loss: 0.0014174662064760923. Accuracy: 87.21804511278195%
Iteration: 1300. Loss: 0.001042894902639091. Accuracy: 86.46616541353383%
Iteration: 1400. Loss: 0.0013633014168590307. Accuracy: 87.21804511278195%
Iteration: 1500. Loss: 0.0016487350221723318. Accuracy: 90.6015037593985%
Iteration: 1600. Loss: 0.0029856651090085506. Accuracy: 89.09774436090225%
Iteration: 1700. Loss: 0.0025914679281413555. Accuracy: 89.47368421052632%
Iteration: 1800. Loss: 0.0018995827995240688. Accuracy: 88.7218045112782%
Iteration: 1900. Loss: 0.002393835224211216. Accuracy: 86.09022556390977%
EPOCH -- 12
Iteration: 0. Loss: 0.001536618685349822. Accuracy: 87.21804511278195%
Iteration: 100. Loss: 0.0034092895220965147. Accuracy: 86.46616541353383%
Iteration: 200. Loss: 0.0011102947173640132. Accuracy: 86.84210526315789%
Iteration: 300. Loss: 0.005297551397234201. Accuracy: 80.45112781954887%
Iteration: 400. Loss: 0.010045211762189865. Accuracy: 87.59398496240601%
Iteration: 500. Loss: 0.005

Iteration: 300. Loss: 0.0006800960982218385. Accuracy: 88.7218045112782%
Iteration: 400. Loss: 0.0029420447535812855. Accuracy: 88.7218045112782%
Iteration: 500. Loss: 0.0036427113227546215. Accuracy: 87.59398496240601%
Iteration: 600. Loss: 0.0026144154835492373. Accuracy: 68.04511278195488%
Iteration: 700. Loss: 0.0034623933024704456. Accuracy: 87.96992481203007%
Iteration: 800. Loss: 0.004420509096235037. Accuracy: 89.47368421052632%
Iteration: 900. Loss: 0.5754057765007019. Accuracy: 90.22556390977444%
Iteration: 1000. Loss: 0.08577845990657806. Accuracy: 88.34586466165413%
Iteration: 1100. Loss: 0.0018199799815192819. Accuracy: 89.09774436090225%
Iteration: 1200. Loss: 0.005014343652874231. Accuracy: 88.7218045112782%
Iteration: 1300. Loss: 0.00646381126716733. Accuracy: 89.47368421052632%
Iteration: 1400. Loss: 0.013326265849173069. Accuracy: 87.21804511278195%
Iteration: 1500. Loss: 0.0014467497821897268. Accuracy: 86.09022556390977%
Iteration: 1600. Loss: 5.802181720733643. Acc

Iteration: 1500. Loss: 0.0006669919239357114. Accuracy: 81.57894736842105%
Iteration: 1600. Loss: 0.20719191431999207. Accuracy: 80.45112781954887%
Iteration: 1700. Loss: 0.0006990373367443681. Accuracy: 81.57894736842105%
Iteration: 1800. Loss: 0.03556769713759422. Accuracy: 79.69924812030075%
Iteration: 1900. Loss: 0.04713219031691551. Accuracy: 80.82706766917293%
EPOCH -- 23
Iteration: 0. Loss: 0.055448729544878006. Accuracy: 80.82706766917293%
Iteration: 100. Loss: 0.0002824861148837954. Accuracy: 81.95488721804512%
Iteration: 200. Loss: 0.049611642956733704. Accuracy: 79.69924812030075%
Iteration: 300. Loss: 0.04654180258512497. Accuracy: 79.32330827067669%
Iteration: 400. Loss: 0.07550942897796631. Accuracy: 79.69924812030075%
Iteration: 500. Loss: 0.07230474054813385. Accuracy: 80.07518796992481%
Iteration: 600. Loss: 0.9977675676345825. Accuracy: 52.255639097744364%
Iteration: 700. Loss: 0.46485599875450134. Accuracy: 74.06015037593986%
Iteration: 800. Loss: 1.512055516242981. 

Iteration: 600. Loss: 0.0015650654677301645. Accuracy: 84.9624060150376%
Iteration: 700. Loss: 0.001279369811527431. Accuracy: 84.21052631578948%
Iteration: 800. Loss: 0.010949527844786644. Accuracy: 84.21052631578948%
Iteration: 900. Loss: 0.001276512397453189. Accuracy: 84.21052631578948%
Iteration: 1000. Loss: 0.0006652049487456679. Accuracy: 81.95488721804512%
Iteration: 1100. Loss: 0.0010040724882856011. Accuracy: 86.46616541353383%
Iteration: 1200. Loss: 0.0011632826644927263. Accuracy: 81.95488721804512%
Iteration: 1300. Loss: 0.0006046851049177349. Accuracy: 81.57894736842105%
Iteration: 1400. Loss: 0.0006491222884505987. Accuracy: 81.57894736842105%
Iteration: 1500. Loss: 0.002354946220293641. Accuracy: 82.33082706766918%
Iteration: 1600. Loss: 0.0029948167502880096. Accuracy: 85.71428571428571%
Iteration: 1700. Loss: 0.0008821171941235662. Accuracy: 82.70676691729324%
Iteration: 1800. Loss: 0.033363908529281616. Accuracy: 74.43609022556392%
Iteration: 1900. Loss: 0.0209606047

In [35]:
outputs = []
lst_prediction =[]
lst_test = list(df_valid['text'])
model.eval()
for msg in lst_test:
    input_msg, _ = prepare_features(msg)
    if torch.cuda.is_available():
        input_msg = input_msg.cuda()
        output = model(input_msg)[0]
        outputs.append(output)
        _, pred_label = torch.max(output.data, 1)
        prediction=list(label_to_inx.keys())[pred_label]
        lst_prediction.append(prediction)

In [38]:
outputs = [o.to('cpu').detach().numpy().copy() for o in outputs]

In [40]:
lst_class = ['unsustainable','sustainable']

In [42]:
predictions2 = [] 
[predictions2.append([x[1] for x in [sorted(zip(example[0], lst_class), reverse=True)][0]]) for example in outputs]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [45]:
predictions3 = [a[0] for a in predictions2]

In [47]:
lst_true = list(df_valid['label'])

In [53]:
acc, f1_micro, f1_macro = get_result(predictions3, lst_true)

In [54]:
acc, f1_micro, f1_macro 

(0.8533834586466166, 0.8533834586466166, 0.8518571408171134)