<a href="https://colab.research.google.com/github/aliza-miller/NLP_In_Action/blob/master/BERT_Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [0]:
!pip install pandas tqdm scipy scikit-learn transformers tensorboardx simpletransformers apex

In [4]:
!pip install torch 



In [0]:
!pip install keras==2.3.1

In [6]:
!pip install sentencepiece



In [0]:
import pandas as pd
import math
import numpy as np
from sklearn.metrics import classification_report
import torch.nn.functional as F

In [0]:
import torch
import os
from tqdm import tqdm,trange
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

In [9]:
from transformers import (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)

In [10]:
# Check library version
!pip list | grep -E 'transformers|torch|Keras'

Keras                    2.3.1          
Keras-Applications       1.0.8          
Keras-Preprocessing      1.1.0          
simpletransformers       0.22.0         
torch                    1.4.0          
torchsummary             1.5.1          
torchtext                0.3.1          
torchvision              0.5.0          
transformers             2.5.1          


In [11]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving all.csv to all.csv
User uploaded file "all.csv" with length 4301161 bytes


In [25]:
import pandas as pd
import logging


logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Train and Evaluation data needs to be in a Pandas Dataframe of two columns. The second column is the text with type str, and the first column is the label with type int.
all_data = pd.read_csv('all.csv',sep=",",encoding="utf-8",names=['labels','texts'])
data_df = pd.DataFrame(all_data)
print(data_df)

       labels                                              texts
0           1  The deals come just over a year after the comp...
1           0   The NHS Personal Demographics Service, the na...
2           0                      BayAdvisor acquired GrayBoxx.
3           1  Seesmic 's Loic Le Meur, who laid off seven pe...
4           1  (Again, 1/3 seems to be the magic number – Jiv...
...       ...                                                ...
24804       0  Charlotte child-products company BRICA acquire...
24805       1  Movalia was eventually fired, but employees I ...
24806       0   Founded by Ravi Gururaj, who is also an entre...
24807       0  Ellis previously led operations at marketing n...
24808       1   Often dismissed as naughty or rude in the pas...

[24809 rows x 2 columns]


In [26]:
# Have a look at the dataset
data_df.head(n=20)

Unnamed: 0,labels,texts
0,1,The deals come just over a year after the comp...
1,0,"The NHS Personal Demographics Service, the na..."
2,0,BayAdvisor acquired GrayBoxx.
3,1,"Seesmic 's Loic Le Meur, who laid off seven pe..."
4,1,"(Again, 1/3 seems to be the magic number – Jiv..."
5,1,"eBay Drop-off Store AuctionDrop Closes Stores,..."
6,0,Digital media company BUZZMEDIA has acquired X...
7,0,Biz Stone: Google Acquires Genius Labs.
8,1,"Earlier this week, Kickstarter phenom Pebble ..."
9,1,LivingSocial is laying off half of its staff j...


In [27]:
# Have a look labels 
data_df.labels.unique()

array([1, 0])

In [28]:
# Analyse the labels distribution
data_df.labels.value_counts()

0    12457
1    12352
Name: labels, dtype: int64

In [29]:
# Get sentence data
sentences = data_df.texts.to_list()
sentences[0]

'The deals come just over a year after the company — which owns music sites like Buzznet and Lyrics.com — laid off 20 percent of its staff and announced it would slow down a plan to expand into other entertainment sectors, citing the “macroeconomic environment.”.'

In [30]:
# Get tag labels data
labels = data_df.labels.to_list()
print(labels[0])

1


In [0]:
# Set a dict for mapping id to tag name
#tag2idx = {t: i for i, t in enumerate(tags_vals)}

# Recommend to set it by manual define, good for reusing
# 0:negative, 1: positive
tag2idx={'0': 0,
 '1': 1}

In [33]:
tag2idx

{'0': 0, '1': 1}

In [0]:
# Mapping index to name
tag2name={tag2idx[key] : key for key in tag2idx.keys()}

In [34]:
tag2name

{0: '0', 1: '1'}

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [36]:
n_gpu

1

In [38]:
# Manual define vocabulary address, if you download the model in local
# The vocabulary can download from "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model"
uploaded = files.upload()
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving xlnet-base-cased-spiece.model to xlnet-base-cased-spiece.model
User uploaded file "xlnet-base-cased-spiece.model" with length 798011 bytes


In [0]:
vocabulary = 'xlnet-base-cased-spiece.model'

In [0]:
# Len of the sentence must be the same as the training model
# See model's 'max_position_embeddings' = 512
max_len  = 64

In [0]:
# With cased model, set do_lower_case = False
tokenizer = XLNetTokenizer(vocab_file=vocabulary,do_lower_case=False)

In [44]:
max_len  = 64

full_input_ids = []
full_input_masks = []
full_segment_ids = []

SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

UNK_ID = tokenizer.encode("<unk>")[0]
CLS_ID = tokenizer.encode("<cls>")[0]
SEP_ID = tokenizer.encode("<sep>")[0]
MASK_ID = tokenizer.encode("<mask>")[0]
EOD_ID = tokenizer.encode("<eod>")[0]

for i,sentence in enumerate(sentences):
    # Tokenize sentence to token id list
    tokens_a = tokenizer.encode(sentence)
    
    # Trim the len of text
    if(len(tokens_a)>max_len-2):
        tokens_a = tokens_a[:max_len-2]
        
        
    tokens = []
    segment_ids = []
    
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(SEG_ID_A)
        
    # Add <sep> token 
    tokens.append(SEP_ID)
    segment_ids.append(SEG_ID_A)
    
    
    # Add <cls> token
    tokens.append(CLS_ID)
    segment_ids.append(SEG_ID_CLS)
    
    input_ids = tokens
    
    # The mask has 0 for real tokens and 1 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [0] * len(input_ids)

    # Zero-pad up to the sequence length at fornt
    if len(input_ids) < max_len:
        delta_len = max_len - len(input_ids)
        input_ids = [0] * delta_len + input_ids
        input_mask = [1] * delta_len + input_mask
        segment_ids = [SEG_ID_PAD] * delta_len + segment_ids

    assert len(input_ids) == max_len
    assert len(input_mask) == max_len
    assert len(segment_ids) == max_len
    
    full_input_ids.append(input_ids)
    full_input_masks.append(input_mask)
    full_segment_ids.append(segment_ids)
    
    if 3 > i:
        print("No.:%d"%(i))
        print("sentence: %s"%(sentence))
        print("input_ids:%s"%(input_ids))
        print("attention_masks:%s"%(input_mask))
        print("segment_ids:%s"%(segment_ids))
        print("\n")

No.:0
sentence: The deals come just over a year after the company — which owns music sites like Buzznet and Lyrics.com — laid off 20 percent of its staff and announced it would slow down a plan to expand into other entertainment sectors, citing the “macroeconomic environment.”.
input_ids:[0, 0, 32, 4108, 280, 125, 95, 24, 119, 99, 18, 226, 17, 1559, 59, 9318, 571, 1405, 115, 23665, 1942, 21, 25627, 23, 9, 756, 17, 1559, 3514, 177, 378, 141, 20, 81, 891, 21, 709, 36, 74, 2208, 151, 24, 493, 22, 3491, 91, 86, 3956, 6819, 19, 6048, 18, 221, 661, 4736, 15108, 1536, 9, 407, 9, 4, 3, 7739, 7739]
attention_masks:[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
segment_ids:[4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [45]:
# Make label into id
tags = [tag2idx[str(lab)] for lab in labels]
print(tags[0])

1


In [0]:
tr_inputs, val_inputs, tr_tags, val_tags,tr_masks, val_masks,tr_segs, val_segs = train_test_split(full_input_ids, tags,full_input_masks,full_segment_ids, 
                                                            random_state=4, test_size=0.3)

In [47]:
len(tr_inputs),len(val_inputs),len(tr_segs),len(val_segs)

(17366, 7443, 17366, 7443)

In [0]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)
tr_segs = torch.tensor(tr_segs)
val_segs = torch.tensor(val_segs)

In [0]:
# Set batch num
batch_num = 32

In [0]:
# Set token embedding, attention embedding, segment embedding
train_data = TensorDataset(tr_inputs, tr_masks,tr_segs, tr_tags)
train_sampler = RandomSampler(train_data)
# Drop last can make batch training better for the last one
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_num,drop_last=True)

valid_data = TensorDataset(val_inputs, val_masks,val_segs, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_num)