In [13]:
import pandas as pd
import math
import numpy as np
from sklearn.metrics import classification_report
import torch.nn.functional as F

In [14]:

import torch
import os
from tqdm import tqdm,trange
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

In [15]:
from transformers import (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)

In [19]:
df_data = pd.read_csv('awd_data.csv',sep=",",encoding = "ISO-8859-1", engine='python',names=['labels','texts'],header=0)
df_data

Unnamed: 0,labels,texts
0,0,Prof. Jean Paul Allain completed his Ph.D. deg...
1,0,Prof. Allain joined Argonne National Laborator...
2,0,Prof. Allain joined the faculty at the Univers...
3,0,He is an affiliate faculty with the Department...
4,0,During this 5 years he was working closely wit...
...,...,...
1507,0,Although I am interested in how these question...
1508,0,"At CFI, I am currently looking at ethical and ..."
1509,0,Many commentators have worried that such syste...
1510,0,My research seeks to analyse and explicate the...


In [20]:
df_data.labels.unique()

array([0, 1], dtype=int64)

In [21]:
df_data.labels.value_counts()

0    1277
1     235
Name: labels, dtype: int64

In [22]:
# Get sentence data
sentences = df_data.texts.to_list()
sentences[0]

'Prof. Jean Paul Allain completed his Ph.D. degree from the Department of Nuclear, Plasma and Radiological Engineering at the University of Illinois, Urbana-Champaign.'

In [23]:
# Get tag labels data
labels = df_data.labels.to_list()
print(labels[0])

0


In [25]:
# Set a dict for mapping id to tag name
#tag2idx = {t: i for i, t in enumerate(tags_vals)}

# Recommend to set it by manual define, good for reusing
# 0:negative, 1: positive
tag2idx={'0': 0,'1': 1}

In [26]:
# Mapping index to name
tag2name={tag2idx[key] : key for key in tag2idx.keys()}

In [28]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
n_gpu

1

In [29]:
vocabulary = 'xlnet-base-cased-spiece.model'


In [30]:
# Len of the sentence must be the same as the training model
# See model's 'max_position_embeddings' = 512
max_len  = 64

In [31]:
# With cased model, set do_lower_case = False
tokenizer = XLNetTokenizer(vocab_file=vocabulary,do_lower_case=False)

In [33]:
max_len  = 64

full_input_ids = []
full_input_masks = []
full_segment_ids = []

SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

UNK_ID = tokenizer.encode("")[0]
CLS_ID = tokenizer.encode("")[0]
SEP_ID = tokenizer.encode("")[0]
MASK_ID = tokenizer.encode("")[0]
EOD_ID = tokenizer.encode("")[0]

for i,sentence in enumerate(sentences):
    # Tokenize sentence to token id list
    tokens_a = tokenizer.encode(sentence)
    
    # Trim the len of text
    if(len(tokens_a)>max_len-2):
        tokens_a = tokens_a[:max_len-2]
        
        
    tokens = []
    segment_ids = []
    
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(SEG_ID_A)
        
    # Add  token 
    tokens.append(SEP_ID)
    segment_ids.append(SEG_ID_A)
    
    
    # Add  token
    tokens.append(CLS_ID)
    segment_ids.append(SEG_ID_CLS)
    
    input_ids = tokens
    
    # The mask has 0 for real tokens and 1 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [0] * len(input_ids)

    # Zero-pad up to the sequence length at fornt
    if len(input_ids) < max_len:
        delta_len = max_len - len(input_ids)
        input_ids = [0] * delta_len + input_ids
        input_mask = [1] * delta_len + input_mask
        segment_ids = [SEG_ID_PAD] * delta_len + segment_ids

    assert len(input_ids) == max_len
    assert len(input_mask) == max_len
    assert len(segment_ids) == max_len
    
    full_input_ids.append(input_ids)
    full_input_masks.append(input_mask)
    full_segment_ids.append(segment_ids)
    
    if 3 > i:
        print("No.:%d"%(i))
        print("sentence: %s"%(sentence))
        print("input_ids:%s"%(input_ids))
        print("attention_masks:%s"%(input_mask))
        print("segment_ids:%s"%(segment_ids))
        print("\n")

No.:0
sentence: Prof. Jean Paul Allain completed his Ph.D. degree from the Department of Nuclear, Plasma and Radiological Engineering at the University of Illinois, Urbana-Champaign.
input_ids:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16514, 9, 3290, 1209, 394, 7018, 1570, 45, 4714, 9, 417, 9, 1693, 40, 18, 760, 20, 14223, 19, 8104, 23, 661, 21, 3402, 10838, 6150, 38, 18, 315, 20, 3900, 19, 9359, 101, 13, 323, 1714, 1831, 5486, 9, 4, 3, 4, 4]
attention_masks:[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
segment_ids:[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]


No.:1
sentence: Prof. Allain joined Argonne National Laboratory as a staff scientist in 2003 and joined the faculty in

In [34]:
# Make label into id
tags = [tag2idx[str(lab)] for lab in labels]
print(tags[0])

0


In [35]:
tr_inputs, val_inputs, tr_tags, val_tags,tr_masks, val_masks,tr_segs, val_segs = train_test_split(full_input_ids, tags,full_input_masks,full_segment_ids, 
                                                            random_state=4, test_size=0.3)

In [36]:
len(tr_inputs),len(val_inputs),len(tr_segs),len(val_segs)

(1058, 454, 1058, 454)

In [37]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)
tr_segs = torch.tensor(tr_segs)
val_segs = torch.tensor(val_segs)


In [38]:
# Set batch num
batch_num = 32

In [39]:
# Set token embedding, attention embedding, segment embedding
train_data = TensorDataset(tr_inputs, tr_masks,tr_segs, tr_tags)
train_sampler = RandomSampler(train_data)
# Drop last can make batch training better for the last one
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_num,drop_last=True)

valid_data = TensorDataset(val_inputs, val_masks,val_segs, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_num)

In [40]:
# In this document, contain confg(txt) and weight(bin) files
# The folder must contain: pytorch_model.bin, config.json
model_file_address = 'models'

In [41]:
model = XLNetForSequenceClassification.from_pretrained(model_file_address,num_labels=len(tag2idx))

Some weights of the model checkpoint at models were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at models and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
# Set model to GPU,if you are using GPU machine
model.to(device);
if n_gpu >1:
    model = torch.nn.DataParallel(model)

In [44]:
epochs = 5
max_grad_norm = 1.0

In [45]:
num_train_optimization_steps = int( math.ceil(len(tr_inputs) / batch_num) / 1) * epochs

In [46]:
FULL_FINETUNING = True

In [47]:
if FULL_FINETUNING:
    # Fine tune model all layer parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    # Only fine tune classifier parameters
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

In [48]:
# TRAIN loop
model.train();

In [49]:
print("***** Running training *****")
print("  Num examples = %d"%(len(tr_inputs)))
print("  Batch size = %d"%(batch_num))
print("  Num steps = %d"%(num_train_optimization_steps))
for _ in trange(epochs,desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_segs,b_labels = batch
        
        # forward pass
        outputs = model(input_ids =b_input_ids,token_type_ids=b_segs, input_mask = b_input_mask,labels=b_labels)
        loss, logits = outputs[:2]
        if n_gpu>1:
            # When multi gpu, average it
            loss = loss.mean()
        
        # backward pass
        loss.backward()
        
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        
        # update parameters
        optimizer.step()
        optimizer.zero_grad()
        
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))

***** Running training *****
  Num examples = 1058
  Batch size = 32
  Num steps = 170


Epoch:  20%|███████████████▍                                                             | 1/5 [00:11<00:45, 11.32s/it]

Train loss: 0.2817374214981542


Epoch:  40%|██████████████████████████████▊                                              | 2/5 [00:18<00:26,  8.80s/it]

Train loss: 0.10308056566017595


Epoch:  60%|██████████████████████████████████████████████▏                              | 3/5 [00:25<00:15,  7.95s/it]

Train loss: 0.08654007317046776


Epoch:  80%|█████████████████████████████████████████████████████████████▌               | 4/5 [00:32<00:07,  7.57s/it]

Train loss: 0.06254861220562209


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [00:39<00:00,  7.85s/it]

Train loss: 0.05578007993189561





In [50]:
# Evalue loop
model.eval();


In [51]:
# Set acc funtion
def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)

In [55]:
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

y_true = []
y_predict = []
print("***** Running evaluation *****")
print("  Num examples ={}".format(len(val_inputs)))
print("  Batch size = {}".format(batch_num))
for step, batch in enumerate(valid_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_segs,b_labels = batch
    
    with torch.no_grad():
        outputs = model(input_ids =b_input_ids,token_type_ids=b_segs, input_mask = b_input_mask,labels=b_labels)
        tmp_eval_loss, logits = outputs[:2]
    
    # Get textclassification predict result
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    tmp_eval_accuracy = accuracy(logits, label_ids)
#     print(tmp_eval_accuracy)
#     print(np.argmax(logits, axis=1))
#     print(label_ids)
    
    # Save predict and real label reuslt for analyze
    for predict in np.argmax(logits, axis=1):
        y_predict.append(predict)
        
    for real_result in label_ids.tolist():
        y_true.append(real_result)

    
    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy
   
    nb_eval_steps += 1
    
    
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / len(val_inputs)
loss = tr_loss/nb_tr_steps 
result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'loss': loss}
report = classification_report(y_pred=np.array(y_predict),y_true=np.array(y_true))

# Save the report into file

with open("eval_results.txt", "w") as writer:
    print("***** Eval results *****")
    for key in sorted(result.keys()):
        print("  %s = %s"%(key, str(result[key])))
        writer.write("%s = %s\n" % (key, str(result[key])))
        
    print(report)
    writer.write("\n\n")  
    writer.write(report)

***** Running evaluation *****
  Num examples =454
  Batch size = 32
***** Eval results *****
  eval_accuracy = 0.9779735682819384
  eval_loss = 0.08050526253258189
  loss = 0.05578007993189561
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       385
           1       0.92      0.94      0.93        69

    accuracy                           0.98       454
   macro avg       0.95      0.96      0.96       454
weighted avg       0.98      0.98      0.98       454

