In [1]:
import pandas as pd
import math
import numpy as np
from sklearn.metrics import classification_report
import torch.nn.functional as F

In [2]:

import torch
import os
from tqdm import tqdm,trange
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

In [3]:
from transformers import (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)

In [4]:
df_data = pd.read_csv('int_data.csv',sep=",",encoding = "ISO-8859-1", engine='python',names=['labels','texts'],header=0)
df_data

Unnamed: 0,labels,texts
0,0,Prof. Allain joined Argonne National Laborator...
1,0,Prof. Allain joined the faculty at the Univers...
2,0,During this 5 years he was working closely wit...
3,0,"From 2003-present, Allen has a number of stude..."
4,0,In the last 10 years Allen and his students ha...
...,...,...
1430,0,I am especially interested in reasoning relate...
1431,0,Although I am interested in how these question...
1432,0,"At CFI, I am currently looking at ethical and ..."
1433,0,Many commentators have worried that such syste...


In [5]:
df_data.labels.unique()

array([0, 1], dtype=int64)

In [6]:
df_data.labels.value_counts()

0    1251
1     184
Name: labels, dtype: int64

In [7]:
# Get sentence data
sentences = df_data.texts.to_list()
sentences[0]

'Prof. Allain joined Argonne National Laboratory as a staff scientist in 2003 and joined the faculty in the School of Nuclear Engineering at Purdue University in Fall of 2007 with a courtesy appointment with the School of Materials Engineering.'

In [8]:
# Get tag labels data
labels = df_data.labels.to_list()
print(labels[0])

0


In [9]:
# Set a dict for mapping id to tag name
#tag2idx = {t: i for i, t in enumerate(tags_vals)}

# Recommend to set it by manual define, good for reusing
# 0:negative, 1: positive
tag2idx={'0': 0,'1': 1}

In [10]:
# Mapping index to name
tag2name={tag2idx[key] : key for key in tag2idx.keys()}

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
n_gpu

1

In [12]:
vocabulary = 'xlnet-base-cased-spiece.model'


In [13]:
# Len of the sentence must be the same as the training model
# See model's 'max_position_embeddings' = 512
max_len  = 64

In [14]:
# With cased model, set do_lower_case = False
tokenizer = XLNetTokenizer(vocab_file=vocabulary,do_lower_case=False)

In [15]:
max_len  = 64

full_input_ids = []
full_input_masks = []
full_segment_ids = []

SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

UNK_ID = tokenizer.encode("")[0]
CLS_ID = tokenizer.encode("")[0]
SEP_ID = tokenizer.encode("")[0]
MASK_ID = tokenizer.encode("")[0]
EOD_ID = tokenizer.encode("")[0]

for i,sentence in enumerate(sentences):
    # Tokenize sentence to token id list
    tokens_a = tokenizer.encode(sentence)
    
    # Trim the len of text
    if(len(tokens_a)>max_len-2):
        tokens_a = tokens_a[:max_len-2]
        
        
    tokens = []
    segment_ids = []
    
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(SEG_ID_A)
        
    # Add  token 
    tokens.append(SEP_ID)
    segment_ids.append(SEG_ID_A)
    
    
    # Add  token
    tokens.append(CLS_ID)
    segment_ids.append(SEG_ID_CLS)
    
    input_ids = tokens
    
    # The mask has 0 for real tokens and 1 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [0] * len(input_ids)

    # Zero-pad up to the sequence length at fornt
    if len(input_ids) < max_len:
        delta_len = max_len - len(input_ids)
        input_ids = [0] * delta_len + input_ids
        input_mask = [1] * delta_len + input_mask
        segment_ids = [SEG_ID_PAD] * delta_len + segment_ids

    assert len(input_ids) == max_len
    assert len(input_mask) == max_len
    assert len(segment_ids) == max_len
    
    full_input_ids.append(input_ids)
    full_input_masks.append(input_mask)
    full_segment_ids.append(segment_ids)
    
    if 3 > i:
        print("No.:%d"%(i))
        print("sentence: %s"%(sentence))
        print("input_ids:%s"%(input_ids))
        print("attention_masks:%s"%(input_mask))
        print("segment_ids:%s"%(segment_ids))
        print("\n")

No.:0
sentence: Prof. Allain joined Argonne National Laboratory as a staff scientist in 2003 and joined the faculty in the School of Nuclear Engineering at Purdue University in Fall of 2007 with a courtesy appointment with the School of Materials Engineering.
input_ids:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16514, 9, 394, 7018, 1274, 17, 27195, 16006, 360, 11842, 34, 24, 891, 8388, 25, 1684, 21, 1274, 18, 4429, 25, 18, 696, 20, 14223, 6150, 38, 24450, 315, 25, 7870, 20, 1327, 33, 24, 14209, 5031, 33, 18, 696, 20, 19093, 6150, 9, 4, 3, 4, 4]
attention_masks:[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
segment_ids:[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]


No.:1
sentence: Prof. Allai

In [16]:
# Make label into id
tags = [tag2idx[str(lab)] for lab in labels]
print(tags[0])

0


In [17]:
tr_inputs, val_inputs, tr_tags, val_tags,tr_masks, val_masks,tr_segs, val_segs = train_test_split(full_input_ids, tags,full_input_masks,full_segment_ids, 
                                                            random_state=4, test_size=0.3)

In [18]:
len(tr_inputs),len(val_inputs),len(tr_segs),len(val_segs)

(1004, 431, 1004, 431)

In [19]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)
tr_segs = torch.tensor(tr_segs)
val_segs = torch.tensor(val_segs)


In [20]:
# Set batch num
batch_num = 32

In [21]:
# Set token embedding, attention embedding, segment embedding
train_data = TensorDataset(tr_inputs, tr_masks,tr_segs, tr_tags)
train_sampler = RandomSampler(train_data)
# Drop last can make batch training better for the last one
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_num,drop_last=True)

valid_data = TensorDataset(val_inputs, val_masks,val_segs, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_num)

In [22]:
# In this document, contain confg(txt) and weight(bin) files
# The folder must contain: pytorch_model.bin, config.json
model_file_address = 'models'

In [24]:
model = XLNetForSequenceClassification.from_pretrained(model_file_address,num_labels=len(tag2idx))

Some weights of the model checkpoint at models were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at models and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Set model to GPU,if you are using GPU machine
model.to(device);
if n_gpu >1:
    model = torch.nn.DataParallel(model)

In [27]:
epochs = 20
max_grad_norm = 1.0

In [28]:
num_train_optimization_steps = int( math.ceil(len(tr_inputs) / batch_num) / 1) * epochs

In [29]:
FULL_FINETUNING = True

In [30]:
if FULL_FINETUNING:
    # Fine tune model all layer parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    # Only fine tune classifier parameters
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

In [31]:
# TRAIN loop
model.train();

In [32]:
print("***** Running training *****")
print("  Num examples = %d"%(len(tr_inputs)))
print("  Batch size = %d"%(batch_num))
print("  Num steps = %d"%(num_train_optimization_steps))
for _ in trange(epochs,desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_segs,b_labels = batch
        
        # forward pass
        outputs = model(input_ids =b_input_ids,token_type_ids=b_segs, input_mask = b_input_mask,labels=b_labels)
        loss, logits = outputs[:2]
        if n_gpu>1:
            # When multi gpu, average it
            loss = loss.mean()
        
        # backward pass
        loss.backward()
        
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        
        # update parameters
        optimizer.step()
        optimizer.zero_grad()
        
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))

***** Running training *****
  Num examples = 1004
  Batch size = 32
  Num steps = 640


Epoch:   5%|███▊                                                                        | 1/20 [00:08<02:35,  8.17s/it]

Train loss: 0.3345604293769406



Epoch:  10%|███████▌                                                                    | 2/20 [00:14<02:09,  7.17s/it]

Train loss: 0.20075638351901884



Epoch:  15%|███████████▍                                                                | 3/20 [00:20<01:55,  6.78s/it]

Train loss: 0.14506366422339792



Epoch:  20%|███████████████▏                                                            | 4/20 [00:27<01:44,  6.55s/it]

Train loss: 0.12473487616666863



Epoch:  25%|███████████████████                                                         | 5/20 [00:34<01:40,  6.69s/it]

Train loss: 0.10569265157532608



Epoch:  30%|██████████████████████▊                                                     | 6/20 [00:41<01:34,  6.77s/it]

Train loss: 0.08795369270768377



Epoch:  35%|██████████████████████████▌                                                 | 7/20 [00:47<01:27,  6.72s/it]

Train loss: 0.09026772210059027



Epoch:  40%|██████████████████████████████▍                                             | 8/20 [00:54<01:22,  6.88s/it]

Train loss: 0.07497744024130365



Epoch:  45%|██████████████████████████████████▏                                         | 9/20 [01:01<01:15,  6.84s/it]

Train loss: 0.048960871235918135



Epoch:  50%|█████████████████████████████████████▌                                     | 10/20 [01:08<01:09,  6.92s/it]

Train loss: 0.043751173448042885



Epoch:  55%|█████████████████████████████████████████▎                                 | 11/20 [01:15<01:01,  6.88s/it]

Train loss: 0.04206807208397696



Epoch:  60%|█████████████████████████████████████████████                              | 12/20 [01:22<00:54,  6.81s/it]

Train loss: 0.03450527352575709



Epoch:  65%|████████████████████████████████████████████████▊                          | 13/20 [01:28<00:46,  6.70s/it]

Train loss: 0.02994080425349004



Epoch:  70%|████████████████████████████████████████████████████▌                      | 14/20 [01:35<00:40,  6.68s/it]

Train loss: 0.050098093958137815



Epoch:  75%|████████████████████████████████████████████████████████▎                  | 15/20 [01:41<00:32,  6.58s/it]

Train loss: 0.027162483550660733



Epoch:  80%|████████████████████████████████████████████████████████████               | 16/20 [01:47<00:26,  6.52s/it]

Train loss: 0.030392862973742246



Epoch:  85%|███████████████████████████████████████████████████████████████▊           | 17/20 [01:54<00:19,  6.63s/it]

Train loss: 0.030029947966883606



Epoch:  90%|███████████████████████████████████████████████████████████████████▌       | 18/20 [02:01<00:13,  6.61s/it]

Train loss: 0.02511460006456717



Epoch:  95%|███████████████████████████████████████████████████████████████████████▎   | 19/20 [02:07<00:06,  6.55s/it]

Train loss: 0.025745229859650512


Epoch: 100%|███████████████████████████████████████████████████████████████████████████| 20/20 [02:14<00:00,  6.71s/it]

Train loss: 0.02187146436253604





In [33]:
# Evalue loop
model.eval();


In [34]:
# Set acc funtion
def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)

In [35]:
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

y_true = []
y_predict = []
print("***** Running evaluation *****")
print("  Num examples ={}".format(len(val_inputs)))
print("  Batch size = {}".format(batch_num))
for step, batch in enumerate(valid_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_segs,b_labels = batch
    
    with torch.no_grad():
        outputs = model(input_ids =b_input_ids,token_type_ids=b_segs, input_mask = b_input_mask,labels=b_labels)
        tmp_eval_loss, logits = outputs[:2]
    
    # Get textclassification predict result
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    tmp_eval_accuracy = accuracy(logits, label_ids)
#     print(tmp_eval_accuracy)
#     print(np.argmax(logits, axis=1))
#     print(label_ids)
    
    # Save predict and real label reuslt for analyze
    for predict in np.argmax(logits, axis=1):
        y_predict.append(predict)
        
    for real_result in label_ids.tolist():
        y_true.append(real_result)

    
    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy
   
    nb_eval_steps += 1
    
    
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / len(val_inputs)
loss = tr_loss/nb_tr_steps 
result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'loss': loss}
report = classification_report(y_pred=np.array(y_predict),y_true=np.array(y_true))

# Save the report into file

with open("eval_results3.txt", "w") as writer:
    print("***** Eval results *****")
    for key in sorted(result.keys()):
        print("  %s = %s"%(key, str(result[key])))
        writer.write("%s = %s\n" % (key, str(result[key])))
        
    print(report)
    writer.write("\n\n")  
    writer.write(report)

***** Running evaluation *****
  Num examples =431
  Batch size = 32
***** Eval results *****
  eval_accuracy = 0.9280742459396751
  eval_loss = 0.5692870940048513
  loss = 0.02187146436253604
              precision    recall  f1-score   support

           0       0.97      0.95      0.96       374
           1       0.70      0.81      0.75        57

    accuracy                           0.93       431
   macro avg       0.83      0.88      0.85       431
weighted avg       0.93      0.93      0.93       431



In [34]:
for i in range(len(y_true)):
    if y_true[i] == 0 and y_predict[i] == 1:
        print(df_data.iloc[i]['texts'])
        print("---------------------")

During this 5 years he was working closely with clinical audiologists and speech and hearing scientists, and with several hearing aid manufactures (Starkey, Phonak, Etymotic), who subsequently funded Allen's work.
---------------------
Dr. Marie Agathe Charpagne was born and studied in France, where she was a first generation college student.
---------------------
After this she joined the Energy Frontier Research Center for Inverse Design as a postdoctoral fellow developing p-type transparent conducting oxides and synthesizing missing materials.
---------------------
In another path, we develop biomaterials that can home and manipulate immune cells in vivo, and apply them to the development of cancer vaccines, cell therapies, and medical devices.
---------------------
Students can get involved in projects in various stages, through a variety of ways.
---------------------
Little is known, however, about the interpersonal mechanisms through which children's attachment and friend relati