In [1]:
import sys
sys.path.append('/home/ez-flow/big_data/python/')
import bigquery_etl as bq
import confusion_matrix_customized as cm_customize
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy  as np
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import random

In [2]:
## (Setp 1-1) Load Data
def convert_lowercase(df):
    df_1 =  df.apply(lambda x: x.astype(str).str.lower() if(x.dtype == 'object') else x)
    upper_list = ['reviewId','asin','size','cmpl_fc1_cd']
    cols = list(set(upper_list)& set(df_1.columns))
    df_1[cols] = df_1[cols].apply(lambda x: x.astype(str).str.upper() if(x.dtype == 'object') else x)
    return df_1

bert_train_sql = '''
            SELECT *
            FROM taxonomy.bert_train_input_reviews
            '''
df   = convert_lowercase(bq.select_query(bert_train_sql))
df   = df[['reviewId','review_text','cmpl_fc1']].drop_duplicates()
print(df.shape)

(16407, 3)


# make Label Dictionary (priority-> class number)

In [7]:

##(Step 1-2) Load complain factor priority and label
## make Label Dictionary (priority-> class number)
label_dict = {  'recovery'        : 0, 
                'durability'      : 1,
                'defect'          : 2, 
                'too hard'        : 3,  
                'too soft'        : 4, 
                'missing parts'   : 5,
                'odor'            : 6,
                'sound'           : 7,
                'uncomfortable'   : 8,
                'size issue'      : 9,
                'shipping damage' : 10,
                'delivery'        : 11,
                'fiberglass'      : 12,
                'hard to set up'  : 13, 
                'slipping'        : 14, 
                'cover issue'     : 15, 
                'customer service': 16, 
                'springs felt'    : 17,
                'overall quality' : 18,
                'no support'      : 19,
                'customer error'  : 20,
                'structure design': 21,
                'others'          : 22, 
           }

df['label'] = df['cmpl_fc1']
df = df.replace({'label':label_dict})

# Split Train / Test Set

In [322]:

##(Step 2-1) Split Train / Test set 
X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.cmpl_fc1.values, 
                                                  test_size=0.2, 
                                                  random_state=42, 
                                                  stratify=df.cmpl_fc1.values)

df['data_type'] = ['not_set']*df.shape[0]
df['label'] = df['cmpl_fc1']
df = df.replace({'label':label_dict})

df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type']   = 'val'
print(df.groupby(['cmpl_fc1', 'label', 'data_type'])['review_text'].count().reset_index().sort_values(['label','data_type'],ascending=[True,True]))

# Encoded Bert Tokenizer

In [326]:
##(Step 3-1) Encoded Bert Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].review_text.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].review_text.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


# Make Bert Dataset

In [327]:
## (Step 4-1) Make Bert input Dataset 
## hyper parameter  'batch size' default '5'
batch_size = 5

dataloader_train      = DataLoader(dataset_train, 
                                  sampler=RandomSampler(dataset_train), 
                                  batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                  sampler=SequentialSampler(dataset_val), 
                                  batch_size=batch_size)

# BertModel - bertbase

In [328]:
## (Step 5-1) Bert Model - Bertbase-uncased 
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# Optimizaion Bert

In [329]:
## (Step 6-1) Optimization Bert ( Adam Optimizer)

optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)
## (Step 6-2) Hyper parameter 'Epochs' default 5                 
epochs    = 5

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)


# Evaluate Function model performance 

In [330]:
## (Step 7-1) Evaluate Function model performance
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
device = torch.device("cpu")

def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

# Train Bert 

In [331]:
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'/home/ez-flow/big_data/model/CF_Bert_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/8 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/2989 [00:00<?, ?it/s]


Epoch 1
Training loss: 1.768855391512072
Validation loss: 1.293726212779681
F1 Score (Weighted): 0.6029686681079964


Epoch 2:   0%|          | 0/2989 [00:00<?, ?it/s]


Epoch 2
Training loss: 1.1502682848363945
Validation loss: 1.2176828917925773
F1 Score (Weighted): 0.6238240057862128


Epoch 3:   0%|          | 0/2989 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.9274944996628904
Validation loss: 1.2118226965214463
F1 Score (Weighted): 0.6190196979545188


Epoch 4:   0%|          | 0/2989 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.7837688199793761
Validation loss: 1.307875133726908
F1 Score (Weighted): 0.6150342074656724


Epoch 5:   0%|          | 0/2989 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.6719211910213488
Validation loss: 1.3995930333851256
F1 Score (Weighted): 0.6033102699322258


Epoch 6:   0%|          | 0/2989 [00:00<?, ?it/s]


Epoch 6
Training loss: 0.5825581417372903
Validation loss: 1.5368820448392544
F1 Score (Weighted): 0.5942500894734635


Epoch 7:   0%|          | 0/2989 [00:00<?, ?it/s]


Epoch 7
Training loss: 0.5131978827973285
Validation loss: 1.6291928634317456
F1 Score (Weighted): 0.5951507910912768


Epoch 8:   0%|          | 0/2989 [00:00<?, ?it/s]


Epoch 8
Training loss: 0.46755418215008054
Validation loss: 1.683635844996172
F1 Score (Weighted): 0.5903713108748617


# Evaluate (Testset)

In [6]:
from sklearn.metrics import classification_report,plot_confusion_matrix,confusion_matrix,accuracy_score
import confusion_matrix_customized as cm_customize

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    print(label_dict_inverse)
    preds_flat = np.argmax(preds, axis=1).flatten()
    
    labels_flat = labels.flatten()
    
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')
        
def make_confusion_matrix_customized(y_true,y_pred):
    cm=confusion_matrix(y_true,y_pred)
    print('================================== Evaluation Report ==================================')
    print(classification_report(y_true, y_pred,target_names=label_dict))
    print('======================================================================================')
    cm_customize._execute_confusion_matrix(y_true,y_pred,'Bert',label_dict)
    accuracy_score(y_true,y_pred)
    
    
def opt_convert_predict_class(preds):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    print(label_dict_inverse)
    preds_flat  = np.argmax(preds, axis=1).flatten()
    
    return preds_flat
    
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)
device = torch.device("cpu")
model.to(device)

## you should modify champion model directory 
model_dir = '/home/ez-flow/big_data/model/CF_Bert_Operation.model'
model.load_state_dict(torch.load(model_dir, map_location=torch.device('cpu')))


_, predictions, true_vals = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)

y_pred = opt_convert_predict_class(predictions)
y_true = true_vals
make_confusion_matrix_customized(y_true,y_pred)
