In [0]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# check the availability of the GPU
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

## Imports

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import csv

import warnings
warnings.filterwarnings('ignore')

import gensim
import re
import seaborn as sns
from collections import Counter

import nltk
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer, WordPunctTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

from sklearn.decomposition import PCA
import pickle

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data Wrangling

In [0]:
# import the critical findings dataset which is in csv format
df_critical = pd.read_csv('/content/drive/My Drive/Critical_Findings/critical-findings-sample-data-20180601-20180901.csv')
df_critical.replace(['Complete Critical Finding', 'Complete Physician Decline'],['complete_critical_findings','complete_physician_decline'],inplace=True)
df_critical.head()

Unnamed: 0,Modality,Critical_Finding,Category,Data
0,CT,complete_critical_findings,Significant Vascular Pathology,STUDY: CT CHEST WITH CONTRAST<br /><br />REA...
1,CT,complete_critical_findings,Significant Vascular Pathology,STUDY: CT CHEST WITH CONTRAST<br /><br />REA...
2,CT,complete_critical_findings,Significant Vascular Pathology,STUDY: CT CHEST WITH CONTRAST<br /><br />REA...
3,CT,complete_critical_findings,Acute Vascular Event,HISTORY: left sided weakness<br /><br />TECHNI...
4,CT,complete_critical_findings,Acute Vascular Event,HISTORY: left sided weakness<br /><br />TECHNI...


In [0]:
# import the non-critical dataset which is in csv format
df_noncritical = pd.read_csv('/content/drive/My Drive/Critical_Findings/non-critical-findings-sample-data-20180601-20180901.csv')
df_noncritical.Critical_Finding.replace('None','no_critical_finding',inplace=True)
df_noncritical.Category.replace(np.nan,'no_category',inplace=True)
df_noncritical.head()

Unnamed: 0,Modality,Critical_Finding,Category,Data
0,CR,no_critical_finding,no_category,STUDY: X-RAY - LEFT HAND<br /><br />REASON F...
1,US,no_critical_finding,no_category,STUDY: VENOUS DOPPLER ULTRASOUND -LEFT UPPER...
2,CR,no_critical_finding,no_category,XR Chest 1 View<br /><br />INDICATION: for com...
3,CR,no_critical_finding,no_category,STUDY: X-RAY CHEST<br /><br />REASON FOR EXA...
4,US,no_critical_finding,no_category,STUDY: VENOUS DOPPLER ULTRASOUND - LEFT LOWE...


### Merge both datasets

In [0]:
# append both datasets into one dataframe 
df = df_critical.append(df_noncritical)

# free memory by deleting partial datasets
del df_noncritical
del df_critical
df.head()

Unnamed: 0,Modality,Critical_Finding,Category,Data
0,CT,complete_critical_findings,Significant Vascular Pathology,STUDY: CT CHEST WITH CONTRAST<br /><br />REA...
1,CT,complete_critical_findings,Significant Vascular Pathology,STUDY: CT CHEST WITH CONTRAST<br /><br />REA...
2,CT,complete_critical_findings,Significant Vascular Pathology,STUDY: CT CHEST WITH CONTRAST<br /><br />REA...
3,CT,complete_critical_findings,Acute Vascular Event,HISTORY: left sided weakness<br /><br />TECHNI...
4,CT,complete_critical_findings,Acute Vascular Event,HISTORY: left sided weakness<br /><br />TECHNI...


### Data Cleaning

**Steps in Text Preprocessing**
1. **tags removal** : \&gt;   \&lt;    \&quot;   \<br />
2. **Lower Case** 
3. **Deontraction**
4. **Replacing words not in Word Embeddings**
5. **Punctuations and Numbers Removal**:
6. **Stopwords Removal**
7. **Removal of Smaller length words (len <= 2)**
8. **Tokenization**
9. **Lematization**


In [0]:
# import stop words for english
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

In [0]:
# tokenizer
tok = WordPunctTokenizer()

# replacement dictionary
replace_dict ={'c-collar':'cervical collar',
               'c-section':'caesarean section',
               'c-spine':'cervical spine',
               'chest-shortness':'chest pain and shortness of breath',
               'cul-de-sac':'dead end',
               'csf':'cerebrospinal fluid',
               'd-dimer':'ddimer',
               'x-ray':'xray',
               'x-rays':'xray',
               'ct':'computed tomography',               
               'barchie':'',
               'kamholtz':'',
               'neuroforamina':'',
               'bibasilar':'',
               'breckwoldt': '',
               'costophrenic': 'places where the diaphragm meets the ribs',
               'ctdivol': 'volume computed tomography dose index',
               'hyperdensity': 'high hemoglobin content of retracted clot or blood',
               'mgycm': 'absorbed radiation per kilogram per centimeter',
               'neuroforamina': 'compression of a spinal nerve',
               'nonobstructing': 'non obstructing',
               'periappendiceal': 'near the appendix'
              }

# tags found in data
tags = ['&gt;','&lt;','&quot;','<br />']

In [0]:
# function for text-preprocessing 
def text_preprocessor(text):
  
  #tag removal
  for tag in tags:
    text = text.replace(tag, ' ')
  
  # decontract the contractions
  def decontract(phrase):
    #phrase = re.sub(r"n\'t", " not", phrase)
    #phrase = re.sub(r"\'re", " are", phrase)
    #phrase = re.sub(r"\'s", " is", phrase)
    #phrase = re.sub(r"\'d", " would", phrase)
    #phrase = re.sub(r"\'ll", " will", phrase)
    #phrase = re.sub(r"\'t", " not", phrase)
    #phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase
  decontracted = decontract(text)

  # lower case
  lower_case = decontracted.lower()

  # replacements
  for before, after in replace_dict.items():
    lower_case = ' '.join([after if before==word else word for word in lower_case.split() ])
  
  # punctuations and numeric removal
  letters_only = re.sub("[^a-zA-Z]", " ", lower_case)
  
  # remove words with 2 or fewer characters
  removed = re.sub(r'\b\w{1,2}\b', '', letters_only)
  
  # tokenize
  words = tok.tokenize(removed)

  # stop words removal
  #stop_word_removed = [word for word in words if word not in stopword_list]
  
  # lemmatize words using WordNet
  lmtzr = WordNetLemmatizer()
  lemmatized_list = [lmtzr.lemmatize(word) for word in words]
  lemmatized_sentence = ' '.join(lemmatized_list)
  
  return lemmatized_sentence

In [0]:
# Cleaning the text on entire df
df.Data = df.Data.apply(text_preprocessor)

# XL Net
- XLNet makes use of a permutation operation during training time that allows context to consists of tokens from both left and right, capturing the bidirectional context, making it a generalized order-aware AR language model.
- During pretraining, XLNet adopts the segment recurrent mechanism and relative encoding scheme proposed in Transformer-XL.

In [0]:
# configurations
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [0]:
# installing bert and transformers
!pip install pytorch-pretrained-bert
!pip install pytorch_transformers



In [0]:
# imports for XLNet

import torch
import os
from tqdm import tqdm,trange
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_pretrained_bert import BertAdam
from pytorch_transformers import (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)

In [0]:
# retain only required columns
df = df[['Data','Critical_Finding']]

# manual label encoding
df.Critical_Finding = df.Critical_Finding.replace(['complete_critical_findings', 'complete_physician_decline','no_critical_finding'],[1,0,0])

# data preparation
df.Critical_Finding = df.Critical_Finding.astype('int32')
df.rename(columns={'Data':'texts', 'Critical_Finding':'labels'}, inplace=True)

In [0]:
df.labels.value_counts()

0    509
1    491
Name: labels, dtype: int64

## Parser

In [0]:
# Get sentence data
sentences = df.texts.to_list()
sentences[0]

'study computed tomography chest with contrast reason for exam male year old chest pain and esophageal dilation radiation dosage supplied facility volume computed tomography dose index mgy dlp absorbed radiation per kilogram per centimeter technique transaxial imaging wa performed following intravenous administration isovue contrast material individualized dose optimization technique were used for this comparison none finding the lung are normal there demonstrated pleural abnormality normal heart and pericardium normal mediastinum normal hilar region pulmonary embolus are present right lower lobe arterial branch normal aorta arch and descending thoracic aorta normal osseous structure there demonstrated abnormality the visualized upper abdomen impression pulmonary embolus are present right lower lobe arterial branch evidence acute pulmonary mediastinal pathology'

In [0]:
# Get tag labels data
labels = df.labels.to_list()
print(labels[:10])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [0]:
# Set a dict for mapping id to tag name
tag2idx={'0': 0,
 '1': 1}

In [0]:
# Mapping index to name
tag2name={tag2idx[key] : key for key in tag2idx.keys()}
tag2name

{0: '0', 1: '1'}

## Make training data

In [0]:
# check the number of gpus
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
n_gpu

1

In [0]:
# Download the vocab for xlnet
!wget https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model
vocabulary = 'xlnet-base-cased-spiece.model'

--2019-08-20 08:11:41--  https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.81.35
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.81.35|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 798011 (779K) [binary/octet-stream]
Saving to: ‘xlnet-base-cased-spiece.model.1’


2019-08-20 08:11:41 (2.36 MB/s) - ‘xlnet-base-cased-spiece.model.1’ saved [798011/798011]



In [0]:
# Len of the sentence must be the same as the training model
# See model's 'max_position_embeddings' = 512
max_len  = 64

In [0]:
# With cased model, set do_lower_case = False
tokenizer = XLNetTokenizer(vocab_file=vocabulary,do_lower_case=True)

In [0]:
full_input_ids = []
full_input_masks = []
full_segment_ids = []

SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

UNK_ID = tokenizer.encode("<unk>")[0]
CLS_ID = tokenizer.encode("<cls>")[0]
SEP_ID = tokenizer.encode("<sep>")[0]
MASK_ID = tokenizer.encode("<mask>")[0]
EOD_ID = tokenizer.encode("<eod>")[0]

for i,sentence in enumerate(sentences):
    # Tokenize sentence to token id list
    tokens_a = tokenizer.encode(sentence)
    
    # Trim the len of text
    if(len(tokens_a)>max_len-2):
        tokens_a = tokens_a[:max_len-2]
        
        
    tokens = []
    segment_ids = []
    
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(SEG_ID_A)
        
    # Add <sep> token 
    tokens.append(SEP_ID)
    segment_ids.append(SEG_ID_A)
    
    
    # Add <cls> token
    tokens.append(CLS_ID)
    segment_ids.append(SEG_ID_CLS)
    
    input_ids = tokens
    
    # The mask has 0 for real tokens and 1 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [0] * len(input_ids)

    # Zero-pad up to the sequence length at fornt
    if len(input_ids) < max_len:
        delta_len = max_len - len(input_ids)
        input_ids = [0] * delta_len + input_ids
        input_mask = [1] * delta_len + input_mask
        segment_ids = [SEG_ID_PAD] * delta_len + segment_ids

    assert len(input_ids) == max_len
    assert len(input_mask) == max_len
    assert len(segment_ids) == max_len
    
    full_input_ids.append(input_ids)
    full_input_masks.append(input_mask)
    full_segment_ids.append(segment_ids)
    
    if 3 > i:
        print("No.:%d"%(i))
        print("sentence: %s"%(sentence))
        print("input_ids:%s"%(input_ids))
        print("attention_masks:%s"%(input_mask))
        print("segment_ids:%s"%(segment_ids))
        print("\n")

No.:0
sentence: study computed tomography chest with contrast reason for exam male year old chest pain and esophageal dilation radiation dosage supplied facility volume computed tomography dose index mgy dlp absorbed radiation per kilogram per centimeter technique transaxial imaging wa performed following intravenous administration isovue contrast material individualized dose optimization technique were used for this comparison none finding the lung are normal there demonstrated pleural abnormality normal heart and pericardium normal mediastinum normal hilar region pulmonary embolus are present right lower lobe arterial branch normal aorta arch and descending thoracic aorta normal osseous structure there demonstrated abnormality the visualized upper abdomen impression pulmonary embolus are present right lower lobe arterial branch evidence acute pulmonary mediastinal pathology
input_ids:[757, 19712, 66, 22, 98, 6336, 2876, 33, 3377, 994, 28, 6105, 2725, 119, 532, 2876, 1593, 21, 17, 202

In [0]:
# Set label embedding
tags = [tag2idx[str(lab)] for lab in labels]
print(tags[0:10])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


## Split data into train and validate

In [0]:
tr_inputs, val_inputs, tr_tags, val_tags,tr_masks, val_masks,tr_segs, val_segs = train_test_split(full_input_ids, tags,full_input_masks,full_segment_ids, random_state=4, test_size=0.3)

In [0]:
len(tr_inputs),len(val_inputs),len(tr_segs),len(val_segs)

(700, 300, 700, 300)

In [0]:
# Set data into tensor
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)
tr_segs = torch.tensor(tr_segs)
val_segs = torch.tensor(val_segs)

In [0]:
# Set batch num
batch_num = 32

In [0]:
# Set token embedding, attention embedding, segment embedding
train_data = TensorDataset(tr_inputs, tr_masks,tr_segs, tr_tags)
train_sampler = RandomSampler(train_data)

# Drop last can make batch training better for the last one
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_num,drop_last=True)
valid_data = TensorDataset(val_inputs, val_masks,val_segs, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_num)

## Train model

In [0]:
# download the config and the pretrained model for xlnet
!wget https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin
!wget https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json

--2019-08-20 08:11:46--  https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.160.93
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.160.93|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 467042463 (445M) [application/octet-stream]
Saving to: ‘xlnet-base-cased-pytorch_model.bin’


2019-08-20 08:11:57 (42.2 MB/s) - ‘xlnet-base-cased-pytorch_model.bin’ saved [467042463/467042463]

--2019-08-20 08:11:58--  https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.107.126
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.107.126|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 641 [application/json]
Saving to: ‘xlnet-base-cased-config.json’


2019-08-20 08:11:59 (13.6 MB/s) - ‘xlnet-base-cased-config.json’ saved [641/641]



In [0]:
# rename the files
!mv xlnet-base-cased-config.json config.json
!mv xlnet-base-cased-pytorch_model.bin pytorch_model.bin

In [0]:
model_file_address = './'

In [0]:
# model definition
model = XLNetForSequenceClassification.from_pretrained(model_file_address,num_labels=len(tag2idx))

In [0]:
# load model to GPU
model.to(device)

In [0]:
# Add multi GPU support
if n_gpu >1:
    model = torch.nn.DataParallel(model)

In [0]:
# Set epoch and grad max num
epochs = 50
max_grad_norm = 1.0

In [0]:
# Cacluate train optimiazaion num
num_train_optimization_steps = int( math.ceil(len(tr_inputs) / batch_num) / 1) * epochs

## Set fine tuning method

In [0]:
# True: fine tuning all the layers 
# False: only fine tuning the classifier layers
FULL_FINETUNING = True

In [0]:
if FULL_FINETUNING:
    # Fine tune model all layer parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    # Only fine tune classifier parameters
    param_optimizer = list(model.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

## Fine-tuing model

In [0]:
# TRAIN loop
model.train();

In [0]:
# training code

print("***** Running training *****")
print("  Num examples = %d"%(len(tr_inputs)))
print("  Batch size = %d"%(batch_num))
print("  Num steps = %d"%(num_train_optimization_steps))
for _ in trange(epochs,desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_segs,b_labels = batch
        
        # forward pass
        outputs = model(input_ids =b_input_ids,token_type_ids=b_segs, input_mask = b_input_mask,labels=b_labels)
        loss, logits = outputs[:2]
        if n_gpu>1:
            # When multi gpu, average it
            loss = loss.mean()
        
        # backward pass
        loss.backward()
        
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        
        # update parameters
        optimizer.step()
        optimizer.zero_grad()
        
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 700
  Batch size = 32
  Num steps = 1100


Epoch:   2%|▏         | 1/50 [00:21<17:52, 21.90s/it]

Train loss: 0.0434840577876284


Epoch:   4%|▍         | 2/50 [00:43<17:29, 21.86s/it]

Train loss: 0.02763700547317664


Epoch:   6%|▌         | 3/50 [01:05<17:06, 21.83s/it]

Train loss: 0.08767268230162915


Epoch:   8%|▊         | 4/50 [01:27<16:42, 21.79s/it]

Train loss: 0.04060234626134237


Epoch:  10%|█         | 5/50 [01:48<16:18, 21.75s/it]

Train loss: 0.03447526289771


Epoch:  12%|█▏        | 6/50 [02:10<15:57, 21.75s/it]

Train loss: 0.07827247173658439


Epoch:  14%|█▍        | 7/50 [02:32<15:35, 21.74s/it]

Train loss: 0.07294946390071086


Epoch:  16%|█▌        | 8/50 [02:54<15:13, 21.74s/it]

Train loss: 0.06283354125029984


Epoch:  18%|█▊        | 9/50 [03:15<14:50, 21.72s/it]

Train loss: 0.045368382574192116


Epoch:  20%|██        | 10/50 [03:37<14:29, 21.73s/it]

Train loss: 0.054584335624462084


Epoch:  22%|██▏       | 11/50 [03:59<14:07, 21.72s/it]

Train loss: 0.04626942590056431


Epoch:  24%|██▍       | 12/50 [04:20<13:45, 21.73s/it]

Train loss: 0.048989996269700076


Epoch:  26%|██▌       | 13/50 [04:42<13:23, 21.73s/it]

Train loss: 0.024575138775010903


Epoch:  28%|██▊       | 14/50 [05:04<13:02, 21.73s/it]

Train loss: 0.03625965269193763


Epoch:  30%|███       | 15/50 [05:25<12:38, 21.68s/it]

Train loss: 0.02548893230656783


Epoch:  32%|███▏      | 16/50 [05:47<12:16, 21.67s/it]

Train loss: 0.023971654785176117


Epoch:  34%|███▍      | 17/50 [06:09<11:55, 21.67s/it]

Train loss: 0.020384643226861954


Epoch:  36%|███▌      | 18/50 [06:30<11:33, 21.68s/it]

Train loss: 0.031213146457005115


Epoch:  38%|███▊      | 19/50 [06:52<11:12, 21.69s/it]

Train loss: 0.022357652451665627


Epoch:  40%|████      | 20/50 [07:14<10:50, 21.69s/it]

Train loss: 0.018051324323529287


Epoch:  42%|████▏     | 21/50 [07:36<10:29, 21.70s/it]

Train loss: 0.05166825182026341


Epoch:  44%|████▍     | 22/50 [07:57<10:08, 21.72s/it]

Train loss: 0.04621710827840226


Epoch:  46%|████▌     | 23/50 [08:19<09:46, 21.72s/it]

Train loss: 0.03773940199365219


Epoch:  48%|████▊     | 24/50 [08:41<09:24, 21.72s/it]

Train loss: 0.052271101490727494


Epoch:  50%|█████     | 25/50 [09:03<09:03, 21.74s/it]

Train loss: 0.03850399720526877


Epoch:  52%|█████▏    | 26/50 [09:24<08:41, 21.74s/it]

Train loss: 0.04236631995687882


Epoch:  54%|█████▍    | 27/50 [09:46<08:19, 21.74s/it]

Train loss: 0.03977356363265287


Epoch:  56%|█████▌    | 28/50 [10:08<07:58, 21.73s/it]

Train loss: 0.03367176847088905


Epoch:  58%|█████▊    | 29/50 [10:29<07:36, 21.73s/it]

Train loss: 0.04862955061807519


Epoch:  60%|██████    | 30/50 [10:51<07:14, 21.74s/it]

Train loss: 0.0406802153835694


Epoch:  62%|██████▏   | 31/50 [11:13<06:52, 21.74s/it]

Train loss: 0.029954807389350163


Epoch:  64%|██████▍   | 32/50 [11:35<06:30, 21.72s/it]

Train loss: 0.03618876024016312


Epoch:  66%|██████▌   | 33/50 [11:56<06:09, 21.72s/it]

Train loss: 0.025428134859317823


Epoch:  68%|██████▊   | 34/50 [12:18<05:47, 21.71s/it]

Train loss: 0.024632032871955915


Epoch:  70%|███████   | 35/50 [12:40<05:25, 21.70s/it]

Train loss: 0.03668307539607797


Epoch:  72%|███████▏  | 36/50 [13:01<05:03, 21.70s/it]

Train loss: 0.025823348955739112


Epoch:  74%|███████▍  | 37/50 [13:23<04:42, 21.69s/it]

Train loss: 0.02042389080105793


Epoch:  76%|███████▌  | 38/50 [13:45<04:20, 21.70s/it]

Train loss: 0.020382489831674667


Epoch:  78%|███████▊  | 39/50 [14:06<03:58, 21.70s/it]

Train loss: 0.024589041336661295


Epoch:  80%|████████  | 40/50 [14:28<03:36, 21.68s/it]

Train loss: 0.0332695757526727


Epoch:  82%|████████▏ | 41/50 [14:50<03:15, 21.68s/it]

Train loss: 0.02270529791712761


Epoch:  84%|████████▍ | 42/50 [15:12<02:53, 21.69s/it]

Train loss: 0.035145257599651814


Epoch:  86%|████████▌ | 43/50 [15:33<02:31, 21.68s/it]

Train loss: 0.02177392052752631


Epoch:  88%|████████▊ | 44/50 [15:55<02:09, 21.66s/it]

Train loss: 0.020429343960824468


Epoch:  90%|█████████ | 45/50 [16:16<01:48, 21.67s/it]

Train loss: 0.021364473116894562


Epoch:  92%|█████████▏| 46/50 [16:38<01:26, 21.66s/it]

Train loss: 0.020013450423166865


Epoch:  94%|█████████▍| 47/50 [17:00<01:05, 21.67s/it]

Train loss: 0.026364826020740327


Epoch:  96%|█████████▌| 48/50 [17:21<00:43, 21.67s/it]

Train loss: 0.02445637984644799


Epoch:  98%|█████████▊| 49/50 [17:43<00:21, 21.68s/it]

Train loss: 0.0212004031719906


Epoch: 100%|██████████| 50/50 [18:05<00:00, 21.69s/it]

Train loss: 0.021344519619430815





In [0]:
# output directory
xlnet_out_address = 'xlnet_out_model'

if not os.path.exists(xlnet_out_address):
  os.makedirs(xlnet_out_address)

In [0]:
# Save a trained model, configuration and tokenizer
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

In [0]:
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(xlnet_out_address, "pytorch_model.bin")
output_config_file = os.path.join(xlnet_out_address, "config.json")

In [0]:
# Save model into file
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(xlnet_out_address)

('xlnet_out_model/spiece.model',)

## Load model

In [0]:
model = XLNetForSequenceClassification.from_pretrained(xlnet_out_address,num_labels=len(tag2idx))

In [0]:
# Set model to GPU
model.to(device);

In [0]:
# make use of multiple gpus if present
if n_gpu >1:
    model = torch.nn.DataParallel(model)

## Eval model

In [0]:
# Evalue loop
model.eval();

In [0]:
# Set acc funtion
def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)

In [0]:
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

y_true = []
y_predict = []
print("***** Running evaluation *****")
print("  Num examples ={}".format(len(val_inputs)))
print("  Batch size = {}".format(batch_num))
for step, batch in enumerate(valid_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_segs,b_labels = batch
    
    with torch.no_grad():
        outputs = model(input_ids =b_input_ids,token_type_ids=b_segs, input_mask = b_input_mask,labels=b_labels)
        tmp_eval_loss, logits = outputs[:2]
    
    # Get textclassification predict result
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    tmp_eval_accuracy = accuracy(logits, label_ids)
#     print(tmp_eval_accuracy)
#     print(np.argmax(logits, axis=1))
#     print(label_ids)
    
    # Save predict and real label reuslt for analyze
    for predict in np.argmax(logits, axis=1):
        y_predict.append(predict)
        
    for real_result in label_ids.tolist():
        y_true.append(real_result)

    
    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy
   
    nb_eval_steps += 1
    
    
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / len(val_inputs)
loss = tr_loss/nb_tr_steps 
result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'loss': loss}
report = classification_report(y_pred=np.array(y_predict),y_true=np.array(y_true))

# Save the report into file
output_eval_file = os.path.join(xlnet_out_address, "eval_results.txt")
with open(output_eval_file, "w") as writer:
    print("***** Eval results *****")
    for key in sorted(result.keys()):
        print("  %s = %s"%(key, str(result[key])))
        writer.write("%s = %s\n" % (key, str(result[key])))
        
    print(report)
    writer.write("\n\n")  
    writer.write(report)

***** Running evaluation *****
  Num examples =300
  Batch size = 32
***** Eval results *****
  eval_accuracy = 0.8766666666666667
  eval_loss = 1.284832239151001
  loss = 0.021344519619430815
              precision    recall  f1-score   support

           0       0.88      0.87      0.88       151
           1       0.87      0.88      0.88       149

    accuracy                           0.88       300
   macro avg       0.88      0.88      0.88       300
weighted avg       0.88      0.88      0.88       300

