In [1]:
import pandas as pd
import math
import numpy as np
from sklearn.metrics import classification_report
import torch.nn.functional as F
import torch
import os
from tqdm import tqdm,trange
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from transformers import (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)
from tqdm.notebook import tqdm

In [4]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Number of CUDA devices visible to this process; shown as the cell output.
n_gpu = torch.cuda.device_count()
n_gpu

0

In [2]:
# Directory handed to `from_pretrained` when the classifier is loaded.
xlnet_out_address = '../models/out/c4/'
# Presumably the number of target classes (the `tag2idx` mapping has 19 entries);
# it is not referenced by any other visible cell.
len_tag2ix = 19

In [8]:
# Class name -> integer id. The position of each name in the tuple is its id,
# so the order below must not change.
tag2idx = {
    name: idx
    for idx, name in enumerate((
        'Noble',
        'PlayboyPlaymate',
        'Engineer',
        'Astronaut',
        'MilitaryPerson',
        'Chef',
        'HorseTrainer',
        'OfficeHolder',
        'Economist',
        'Religious',
        'Ambassador',
        'BeautyQueen',
        'Model',
        'Judge',
        'Philosopher',
        'Monarch',
        'Journalist',
        'BusinessPerson',
        'Architect',
    ))
}
# Inverse lookup: integer id -> class name.
tag2name = {idx: name for name, idx in tag2idx.items()}

19

In [9]:
# Load the sequence-classification model stored under `xlnet_out_address`,
# with one output label per entry of `tag2name`.
model = XLNetForSequenceClassification.from_pretrained(xlnet_out_address,num_labels=len(tag2name))
# Move the model to the selected device; as the last expression of the cell,
# the returned module is what the notebook displays.
model.to(device)

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e

In [6]:
# Switch the model to evaluation mode before running inference.
model.eval()

def accuracy(out, labels):
    """Count how many rows of `out` have their arg-max at the labelled class.

    Args:
        out: 2-D array-like of per-class scores, one row per example.
        labels: array-like of integer class ids, one per example.

    Returns:
        The number of correct predictions (a count, not a ratio).
    """
    predicted = np.argmax(out, axis=1)
    hits = predicted == labels
    return np.sum(hits)

In [67]:
# Hand-picked sample biographies of beauty-pageant titleholders.
# NOTE: `sentences` is reassigned from `df` right after this literal, so these
# samples are not what the later cells actually evaluate.
sentences = ['Gisela Bolaños Scarton (5 April 1935 - 8 November 2013) was a Venezuelan pageant titleholder She is the Miss Venezuela titleholder for 1953, and was the official representative of Venezuela to the Miss Universe 1953 pageant held in Long Beach, California, USA, on July 17, 1953.',
 'Mallory Christina Ervin (born October 26, 1985) is an American entertainer and former beauty pageant titleholder from Morganfield, Kentucky. She held the title of Miss Kentucky 2009 and was 4th runner-up to Miss America 2010. In September 2010, it was announced that she competed on The Amazing Race 17 with her father Gary Ervin. The Amazing Race was filmed in May and June 2010 and premiered on September 26, 2010. Gary and Mallory were also one of the teams to return for The Amazing Race 18, which premiered on February 20, 2011. Ervin returned to the race to join Mark Jackson in The Amazing Race 24: All Stars, after Jackson\'s teammate William \\"Bopper\\" Minton was deemed unfit to race.',
 'Sylvie Tellier, (born 28 May 1978, in Nantes, Loire-Atlantique) is a French lawyer, actress, model and beauty pageant titleholder, She was elected Miss Lyon 2001, and Miss France 2002. Since 2007, she is the general director of both the Society Miss France and Miss Europe Organization. She got married to Mr. Camille Le Maux on 23 June 2007 in Gordes in Luberon.',
 'Lee Ji-sun (Hangul: 이지선, born April 6, 1983), also known as Sun Lee in the Western media, is a South Korean who attended Parsons The New School for Design. She is the reigning Miss Korea 2007 and represented her country at the Miss Universe 2008 pageant held in Nha Trang, Vietnam with 79 other delegates.',
 'Betina Faurbye (born 1982) is a Danish beauty pageant titleholder who was crowned Miss Universe Denmark 2006 and represented Denmark in Miss Universe 2006, placing in the Top 20.',
 'Jacqueline \\"Jackie\\" Loughery (born April 18, 1930) is an American actress and beauty queen best known as the first Miss New York USA and winner of the first Miss USA beauty pageant, in Long Beach, California. In 1952, she won the title only after a second ballot broke a first-place tie. Loughery, a red head, went on to represent the USA at the very first Miss Universe pageant, where she placed ninth. Part of her prize package as Miss USA included a contract with Universal Pictures, which led to a career in movies and television. She adopted the stage name Evelyn Avery, but is more often credited with her own name (the name she used when she won the Miss USA title). In 1951 she appeared in the short-lived variety show Seven at Eleven. In 1954, she was Johnny Carson\'s assistant in the short lived game show \\"Earn Your Vacation\\". She appeared in several films, including the 1956 comedy Pardners with Martin and Lewis and the 1957 drama The D.I. (\\"Drill Instructor\\"), with Jack Webb, whom she would marry. She was featured in 1957\'s Eighteen and Anxious and top-billed in the following year\'s The Hot Angel.']

# Evaluate on every test row labelled BeautyQueen.
# The CSV is read here so the cell also works after Restart & Run All (the
# frame was previously only loaded by a cell further down), and the gold
# labels come from the same rows so that the loss/accuracy accumulated by the
# evaluation loop are measured against the real class instead of a placeholder.
df = pd.read_csv('../data/test.csv')
beauty_queen_rows = df[df['l3'] == 'BeautyQueen']
sentences = beauty_queen_rows['text'].to_list()
labels = beauty_queen_rows['l3'].to_list()

## Set input embedding

In [68]:
# Vocabulary file (`spiece.model`) from which the tokenizer is built.
vocabulary = '../models/xlnet-base-cased/spiece.model'
# Fixed sequence length used for truncation/padding; the encoding cell assigns
# the same value again.
max_len  = 64
# do_lower_case=False: the input text is not lower-cased before tokenization.
tokenizer = XLNetTokenizer(vocab_file=vocabulary,do_lower_case=False)

In [69]:
# Turn every sentence into fixed-length model inputs.
# Layout of one encoded example, left-padded up to `max_len`:
#     [pad ... pad] text-tokens <sep> <cls>
max_len  = 64

full_input_ids = []
full_input_masks = []
full_segment_ids = []

# Segment ids attached to each position of the encoded example.
SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

# Look the special-token ids up directly in the vocabulary instead of encoding
# the token string and taking element 0, which depends on how `encode` splits
# and decorates its input.
UNK_ID = tokenizer.convert_tokens_to_ids("<unk>")
CLS_ID = tokenizer.convert_tokens_to_ids("<cls>")
SEP_ID = tokenizer.convert_tokens_to_ids("<sep>")
MASK_ID = tokenizer.convert_tokens_to_ids("<mask>")
EOD_ID = tokenizer.convert_tokens_to_ids("<eod>")

for sentence in tqdm(sentences):
    # Tokenize the raw text only. <sep>/<cls> are appended by hand below, so
    # the tokenizer must not add its own copies (short inputs would otherwise
    # end in "<sep> <cls> <sep> <cls>" on library versions that add special
    # tokens by default).
    # NOTE(review): confirm this matches the preprocessing used when the
    # checkpoint was fine-tuned.
    tokens_a = tokenizer.encode(sentence, add_special_tokens=False)

    # Trim the text so that <sep> and <cls> still fit inside max_len.
    tokens_a = tokens_a[:max_len - 2]

    # Text tokens and <sep> belong to segment A; <cls> has its own segment id.
    input_ids = tokens_a + [SEP_ID, CLS_ID]
    segment_ids = [SEG_ID_A] * (len(tokens_a) + 1) + [SEG_ID_CLS]

    # The mask has 0 for real tokens and 1 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [0] * len(input_ids)

    # Zero-pad up to the sequence length at the front.
    delta_len = max_len - len(input_ids)
    if delta_len > 0:
        input_ids = [0] * delta_len + input_ids
        input_mask = [1] * delta_len + input_mask
        segment_ids = [SEG_ID_PAD] * delta_len + segment_ids

    assert len(input_ids) == max_len
    assert len(input_mask) == max_len
    assert len(segment_ids) == max_len

    full_input_ids.append(input_ids)
    full_input_masks.append(input_mask)
    full_segment_ids.append(segment_ids)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=466.0), HTML(value='')))




In [70]:
# Translate every gold label name into its integer class id via `tag2idx`.
tags = []
for label_name in labels:
    tags.append(tag2idx[str(label_name)])

In [71]:
# Convert the per-example Python lists into tensors (one row per example).
inputs, masks, segs = (
    torch.tensor(rows)
    for rows in (full_input_ids, full_input_masks, full_segment_ids)
)
# `tags` is rebound here from a list of class ids to a tensor of class ids.
tags = torch.tensor(tags)

In [72]:
# Number of examples per evaluation batch.
batch_num = 32

# Bundle the tensors so each dataset item is (input ids, mask, segment ids, label);
# the evaluation loop unpacks batches in this same order.
data = TensorDataset(inputs, masks, segs, tags)
# Sequential sampling keeps the examples in their original order, so the
# collected predictions line up with `sentences`.
sampler = SequentialSampler(data)
dataloader = DataLoader(data, sampler=sampler, batch_size=batch_num)

In [73]:
# Run the model over every batch and collect predictions.
#   eval_loss        - sum of the per-batch mean losses
#   eval_accuracy    - running COUNT of correct predictions (see `accuracy`)
#   nb_eval_steps    - number of batches processed
#   nb_eval_examples - number of examples processed
# so the averages are eval_loss / nb_eval_steps and
# eval_accuracy / nb_eval_examples.
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

y_true = []
y_predict = []
print("***** Running evaluation *****")
print("  Num examples ={}".format(len(inputs)))
print("  Batch size = {}".format(batch_num))
for step, batch in enumerate(dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_segs, b_labels = batch

    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model(
            input_ids=b_input_ids,
            token_type_ids=b_segs,
            input_mask=b_input_mask,
            labels=b_labels,
        )
        # With labels supplied, the first two outputs are (loss, logits).
        tmp_eval_loss, logits = outputs[:2]

    # Get the text-classification prediction result as numpy arrays.
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    tmp_eval_accuracy = accuracy(logits, label_ids)

    # Save predicted and real labels for later analysis; both lists hold
    # plain Python ints.
    y_predict.extend(np.argmax(logits, axis=1).tolist())
    y_true.extend(label_ids.tolist())

    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy

    # Previously never updated, which left the example counter at 0.
    nb_eval_examples += b_input_ids.size(0)
    nb_eval_steps += 1

***** Running evaluation *****
  Num examples =466
  Batch size = 32


In [74]:
# Show the predicted class name of every evaluated example, one per line.
predicted_names = (tag2name[class_id] for class_id in y_predict)
for name in predicted_names:
    print(name)

BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
Model
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
Model
Model
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
Model
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
Model
OfficeHolder
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
Journalist
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQueen
BeautyQuee

In [75]:
# Load the test set. Other cells filter it on the `l3` column (class label)
# and read the raw text from the `text` column.
df = pd.read_csv('../data/test.csv')

In [76]:
# Display the raw text of every test row whose `l3` label is BeautyQueen.
is_beauty_queen = df['l3'] == 'BeautyQueen'
df.loc[is_beauty_queen, 'text'].to_list()

['Gisela Bolaños Scarton (5 April 1935 - 8 November 2013) was a Venezuelan pageant titleholder She is the Miss Venezuela titleholder for 1953, and was the official representative of Venezuela to the Miss Universe 1953 pageant held in Long Beach, California, USA, on July 17, 1953.',
 'Mallory Christina Ervin (born October 26, 1985) is an American entertainer and former beauty pageant titleholder from Morganfield, Kentucky. She held the title of Miss Kentucky 2009 and was 4th runner-up to Miss America 2010. In September 2010, it was announced that she competed on The Amazing Race 17 with her father Gary Ervin. The Amazing Race was filmed in May and June 2010 and premiered on September 26, 2010. Gary and Mallory were also one of the teams to return for The Amazing Race 18, which premiered on February 20, 2011. Ervin returned to the race to join Mark Jackson in The Amazing Race 24: All Stars, after Jackson\'s teammate William \\"Bopper\\" Minton was deemed unfit to race.',
 'Sylvie Tellier