<a href="https://colab.research.google.com/github/yellow951321/aicup_paper_classification/blob/master/roberta_colab_eight_calss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook
from uuid import uuid4

## Torch Modules
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import OneHotEncoder

## Mount Drive into Colab
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install pytorch-transformers



In [0]:
## PyTorch Transformer
from pytorch_transformers import RobertaModel, RobertaTokenizer
from pytorch_transformers import RobertaForSequenceClassification, RobertaConfig

In [4]:
## Check if Cuda is Available
print(torch.cuda.is_available())

True


In [5]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:01_CDT_2018
Cuda compilation tools, release 10.0, V10.0.130


In [0]:
dataset_path = "drive/My Drive/data_paper_classification/"

In [7]:
dataset = pd.read_csv(dataset_path + "task2_trainset.csv", dtype=str)
dataset.head()

Unnamed: 0,Id,Title,Abstract,Authors,Categories,Created Date,Task 2
0,D00001,A Brain-Inspired Trust Management Model to Ass...,Rapid popularity of Internet of Things (IoT) a...,Mahmud/Kaiser/Rahman/Rahman/Shabut/Al-Mamun/Hu...,cs.CR/cs.AI/q-bio.NC,2018-01-11,THEORETICAL
1,D00002,On Efficient Computation of Shortest Dubins Pa...,"In this paper, we address the problem of compu...",Sadeghi/Smith,cs.SY/cs.RO/math.OC,2016-09-21,THEORETICAL
2,D00003,Data-driven Upsampling of Point Clouds,High quality upsampling of sparse 3D point clo...,Zhang/Jiang/Yang/Yamakawa/Shimada/Kara,cs.CV,2018-07-07,ENGINEERING
3,D00004,Accessibility or Usability of InteractSE? A He...,Internet is the main source of information now...,Aqle/Khowaja/Al-Thani,cs.HC,2018-08-29,EMPIRICAL
4,D00005,Spatio-Temporal Facial Expression Recognition ...,Automated Facial Expression Recognition (FER) ...,Hasani/Mahoor,cs.CV,2017-03-20,ENGINEERING


In [0]:
dataset.drop('Title',axis=1,inplace=True)
dataset.drop('Categories',axis=1,inplace=True)
dataset.drop('Created Date',axis=1, inplace=True)
dataset.drop('Authors',axis=1,inplace=True)

In [9]:
label_to_ix = {}
for label in dataset["Task 2"]:
      if label not in label_to_ix:
          label_to_ix[label]=len(label_to_ix)
label_to_ix

{'EMPIRICAL': 2,
 'ENGINEERING': 1,
 'ENGINEERING EMPIRICAL': 4,
 'OTHERS': 6,
 'THEORETICAL': 0,
 'THEORETICAL EMPIRICAL': 3,
 'THEORETICAL ENGINEERING': 5,
 'THEORETICAL ENGINEERING EMPIRICAL': 7}

In [0]:
# enc_array = []
# for key in label_to_ix:
#   enc_array.append([key])
# print(enc_array)
# enc = OneHotEncoder(handle_unknown='ignore')
# enc.fit(enc_array)

In [0]:
# enc.transform([['THEORETICAL']]).toarray()

In [0]:
# dataset.head()

In [0]:
# def label_to_one_hot(label):
#   label = [[label]]
#   return enc.transform(label).toarray()[0]

# dataset["Task 2"] = dataset["Task 2"].apply(label_to_one_hot)
# dataset.head()

In [10]:
config = RobertaConfig.from_pretrained('roberta-base')
config.num_labels = len(list(label_to_ix.values()))
config

{
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 8,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 1,
  "vocab_size": 50265
}

In [0]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification(config)

In [0]:
def prepare_features(seq_1, max_seq_length = 300, 
             zero_pad = True, include_CLS_token = True, include_SEP_token = True):
    ## Tokenzine Input
    tokens_a = tokenizer.tokenize(seq_1)

    ## Truncate
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    ## Initialize Tokens
    tokens = []
    if include_CLS_token:
        tokens.append(tokenizer.cls_token)
    ## Add Tokens and separators
    for token in tokens_a:
        tokens.append(token)

    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    ## Input Mask 
    input_mask = [1] * len(input_ids)
    ## Zero-pad sequence lenght
    if zero_pad:
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
    return torch.tensor(input_ids).unsqueeze(0), input_mask

In [13]:
msg = "Hello world!"
prepare_features(msg)

(tensor([[    0, 20920,   232,   328,     2,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,   

In [0]:
## Dataset Loader Classes

In [0]:
class Intents(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        abstract = self.data.Abstract[index]
        label = self.data["Task 2"][index]
        X, _  = prepare_features(abstract)
        y = label_to_ix[label]
        return X, y
    
    def __len__(self):
        return self.len

In [0]:
train_size = 0.8
train_dataset=dataset.sample(frac=train_size,random_state=200).reset_index(drop=True)
test_dataset=dataset.drop(train_dataset.index).reset_index(drop=True)

In [16]:
print("FULL Dataset: {}".format(dataset.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

FULL Dataset: (7000, 3)
TRAIN Dataset: (5600, 3)
TEST Dataset: (1400, 3)


In [0]:
training_set = Intents(train_dataset)
testing_set = Intents(test_dataset)

In [18]:
training_set.__getitem__(101)

(tensor([[    0, 30331,   154,  1703, 15744,    30, 17746,   583,    86,  1703,
           7964,    16,    41,   505,   936,    25,    24,  3315,     7,  2375,
            304,     9,  4240,  1915,     4, 42654,  1629, 28565,  1546,  1639,
            335,    59,  1713,     9,  5868,     8,   592,  1061,     4, 42654,
           1629, 42702,     6,    19,     5,   244,     9,   592,  1546,     6,
             52,    64, 14660,    61,  5868,    40,  2725,    10,  1989,   515,
             36,   179,   583,    86,    43,     8,    64,  3278,  3041,     9,
           1703,   716,    15,    24,     4, 42654,  1629,   713,  5699,    62,
             10,  1810,   443,     9,   557,    61,  9748,   240,     7,    33,
             10,  7208,    13,  1703,  1052,    14,    64,  5604,  4499, 17294,
              9,   588,    12,  5367,  5759,     8,   694,    10,   169,     7,
          41393,   877,  2115,     8, 10516,    92,  2956,     4, 42654,  1629,
           1121,    42,  2225,     6,   

In [19]:
model(training_set.__getitem__(0)[0])

(tensor([[ 0.2655,  0.2495, -0.4939, -0.3939,  0.1698,  0.1903, -0.1979, -0.3050]],
        grad_fn=<AddmmBackward>),)

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.cuda()

In [0]:
# Parameters
params = {'batch_size': 1,
          'shuffle': True,
          'drop_last': False,
          'num_workers': 1}

In [0]:
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)

In [0]:
loss_function = nn.CrossEntropyLoss()
learning_rate = 1e-05
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

In [24]:
## Test Forward Pass
inp = training_set.__getitem__(0)[0].cuda()
output = model(inp)[0]
print(output.shape)

torch.Size([1, 8])


In [25]:
max_epochs = 3
model = model.train()
for epoch in tqdm_notebook(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(training_loader):
        optimizer.zero_grad()
        sent = sent.squeeze(0)
        if torch.cuda.is_available():
          sent = sent.cuda()
          label = label.cuda()
        output = model.forward(sent)[0]
        _, predicted = torch.max(output, 1)
        
        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()
        
        if i%100 == 0:
            correct = 0
            total = 0
            for sent, label in testing_loader:
                sent = sent.squeeze(0)
                if torch.cuda.is_available():
                  sent = sent.cuda()
                  label = label.cuda()
                output = model.forward(sent)[0]
                _, predicted = torch.max(output.data, 1)
                total += label.size(0)
                correct += (predicted.cpu() == label.cpu()).sum()
            accuracy = 100.00 * correct.numpy() / total
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

EPOCH -- 0
Iteration: 0. Loss: 2.502314329147339. Accuracy: 14.214285714285714%
Iteration: 100. Loss: 1.710564136505127. Accuracy: 25.642857142857142%
Iteration: 200. Loss: 1.870490312576294. Accuracy: 28.928571428571427%
Iteration: 300. Loss: 2.017847776412964. Accuracy: 29.285714285714285%
Iteration: 400. Loss: 2.0381150245666504. Accuracy: 26.0%
Iteration: 500. Loss: 1.2430567741394043. Accuracy: 15.714285714285714%
Iteration: 600. Loss: 3.1389307975769043. Accuracy: 29.5%
Iteration: 700. Loss: 1.5796668529510498. Accuracy: 29.071428571428573%
Iteration: 800. Loss: 3.07033634185791. Accuracy: 27.214285714285715%
Iteration: 900. Loss: 2.183943271636963. Accuracy: 25.785714285714285%
Iteration: 1000. Loss: 2.9822802543640137. Accuracy: 28.785714285714285%
Iteration: 1100. Loss: 1.7851871252059937. Accuracy: 21.928571428571427%
Iteration: 1200. Loss: 2.884763479232788. Accuracy: 18.071428571428573%
Iteration: 1300. Loss: 1.472130537033081. Accuracy: 29.571428571428573%
Iteration: 1400.

In [0]:
 torch.save(model.state_dict(), dataset_path + str(uuid4())+'.pth')

In [42]:
model_path =  dataset_path + 'c19bcada-ee79-4886-9afa-32d8e06730b4.pth'
model.load_state_dict(torch.load(model_path, map_location=device))

<All keys matched successfully>

In [0]:
def eval(msg):
  model.eval()
  input_msg, _ = prepare_features(msg)
  if torch.cuda.is_available():
    input_msg = input_msg.cuda()
  with torch.no_grad():  
    output = model(input_msg)[0]
  return output

In [27]:
model(testing_set.__getitem__(31)[0].cuda())

(tensor([[ 1.1024,  0.9449,  0.1461, -0.7214, -0.2943,  0.4871, -1.1799, -2.0058]],
        device='cuda:0', grad_fn=<AddmmBackward>),)

In [28]:
testing_set.__getitem__(39)[1]

5

In [0]:
dataset = pd.read_csv(dataset_path + "task2_public_testset.csv", dtype=str)

In [0]:
dataset.drop('Title',axis=1,inplace=True)
dataset.drop('Categories',axis=1,inplace=True)
dataset.drop('Created Date',axis=1, inplace=True)
dataset.drop('Authors',axis=1,inplace=True)

In [33]:
## Test Forward Pass
inp = training_set.__getitem__(0)[0].cuda()
output = model(inp)[0]
print(output)

tensor([[ 0.8069,  1.3533,  0.1872, -0.7318, -0.4117,  0.4433, -1.0847, -1.4918]],
       device='cuda:0', grad_fn=<AddmmBackward>)


In [34]:
print(inp.shape)

torch.Size([1, 300])


In [0]:
submit_data = pd.read_csv(dataset_path + "task2_sample_submission.csv", dtype=str)

In [32]:
submit_data.head()

Unnamed: 0,order_id,THEORETICAL,ENGINEERING,EMPIRICAL,OTHERS
0,T00001,0,0,0,0
1,T00002,0,0,0,0
2,T00003,0,0,0,0
3,T00004,0,0,0,0
4,T00005,0,0,0,0


In [0]:
output = []
for index in range(dataset.shape[0]):
  output.append(eval(dataset.iloc[index]['Abstract']))

In [45]:
print(output)

[tensor([[ 0.9697,  0.9960,  0.0640, -0.5408, -0.1688,  0.3359, -1.2588, -1.6480]],
       device='cuda:0'), tensor([[ 1.0649,  0.9861,  0.0355, -0.5775, -0.1805,  0.3195, -1.2055, -1.7371]],
       device='cuda:0'), tensor([[ 0.9966,  0.9933,  0.0557, -0.5502, -0.1709,  0.3292, -1.2435, -1.6755]],
       device='cuda:0'), tensor([[ 1.0000,  0.9930,  0.0546, -0.5514, -0.1712,  0.3284, -1.2416, -1.6789]],
       device='cuda:0'), tensor([[ 2.0155,  0.6475, -0.3245, -1.1057, -0.4894,  0.4272, -0.4425, -2.2719]],
       device='cuda:0'), tensor([[ 0.9722,  0.9957,  0.0632, -0.5416, -0.1689,  0.3351, -1.2574, -1.6508]],
       device='cuda:0'), tensor([[ 1.1334,  0.9763,  0.0156, -0.6064, -0.1939,  0.3148, -1.1671, -1.7918]],
       device='cuda:0'), tensor([[ 0.9794,  0.9950,  0.0609, -0.5441, -0.1695,  0.3332, -1.2533, -1.6582]],
       device='cuda:0'), tensor([[ 0.9703,  0.9959,  0.0638, -0.5410, -0.1688,  0.3357, -1.2584, -1.6488]],
       device='cuda:0'), tensor([[ 0.9754,  0.9954, 

In [57]:
count = [0 for x in range(len(label_to_ix))]
print(count)
for item in output:
  _, pred_label = torch.max(item, 1)
  count[pred_label] += 1

[0, 0, 0, 0, 0, 0, 0, 0]


In [58]:
count

[8602, 11398, 0, 0, 0, 0, 0, 0]