# Import Libraries

In [24]:
# !pip install accelerate -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m

In [12]:
import tqdm as notebook_tqdm
import torch
print(torch.__version__)
from datasets import load_dataset, load_metric, Dataset, DatasetDict
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback, AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
import evaluate
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import re
import emoji
import itertools
import pythainlp
print(pythainlp.__version__)
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus.common import thai_words, thai_stopwords
stopwords = list(thai_stopwords())

2.3.0
5.0.3


In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Text Preprocessing

In [3]:
def read_data(file_name: str):
    """Load a tabular data file into a pandas DataFrame.

    The reader is selected from the file extension, which is compared
    case-insensitively so that names such as ``report.XLSX`` or ``data.CSV``
    are accepted as well.

    Args:
        file_name: Path of an Excel (``.xlsx`` / ``.xls``) or CSV (``.csv``) file.

    Returns:
        The DataFrame produced by ``pd.read_excel`` or ``pd.read_csv``.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    # Only the last path component is inspected, so dots in directory names
    # cannot be mistaken for the extension separator.
    file_type = str(file_name).split("/")[-1].split(".")[-1].lower()
    if file_type in ('xlsx', 'xls'):
        return pd.read_excel(file_name)
    if file_type == 'csv':
        return pd.read_csv(file_name)
    raise ValueError("The input data must be excel or csv file")

# Patterns are compiled once when the cell runs instead of on every call.
_EMOJI_BLOCK_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)
# The character class already contains the backslash, so no separate
# backslash-stripping pass is required afterwards.
_PUNCTUATION_RE = re.compile('[' + re.escape('|!"$-#@%^_*+,/&\'()<=>?[\\]') + ']+')
_DIGITS_RE = re.compile(r'\d+')
_DOT_RUN_RE = re.compile(r'\.{2,}')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalize a raw message before tokenization.

    Steps, in order: lower-case, remove digits, remove emoji (a regex over
    common emoji ranges plus ``emoji.replace_emoji``), remove the listed
    punctuation characters, remove runs of two or more dots, and finally
    collapse whitespace.

    Args:
        text: The raw message; must be a ``str``.

    Returns:
        The cleaned string with single spaces between the remaining parts and
        no leading or trailing whitespace. May be empty.
    """
    text = text.lower()
    text = _DIGITS_RE.sub('', text)
    text = _EMOJI_BLOCK_RE.sub('', text)  # no emoji
    text = emoji.replace_emoji(text, "")
    text = _PUNCTUATION_RE.sub('', text)
    text = _DOT_RUN_RE.sub('', text)
    # Whitespace is normalized last: the removals above can leave doubled or
    # trailing spaces behind (e.g. "a 1 b" -> "a  b"), which an early
    # normalization pass would not clean up.
    return _WHITESPACE_RE.sub(' ', text).strip()

def new_word_tokenize(text):
    """Split ``text`` into words and drop stopwords.

    Tokenization is delegated to ``word_tokenize`` with whitespace tokens
    discarded; any token found in the module-level ``stopwords`` list is
    filtered out.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the remaining tokens, in their original order.
    """
    kept_tokens = []
    for token in word_tokenize(text, keep_whitespace=False):
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

def data_preprocessing(df, text_col, label_col):
    """Build the cleaned (sentence, label) frame used for modelling.

    Pipeline, applied to the ``text_col`` / ``label_col`` columns of ``df``:

    1. keep only rows whose text is a ``str``;
    2. clean the text with ``clean_text`` and drop rows that become empty;
    3. tokenize with ``new_word_tokenize`` and drop rows with no tokens left;
    4. join the tokens with spaces, tokenize that string once more and
       concatenate the result without separators.

    Args:
        df: Input DataFrame.
        text_col: Name of the column holding the raw text.
        label_col: Name of the column holding the label.

    Returns:
        A DataFrame with the columns ``sent`` and ``label_col``; the shape is
        also printed.
    """
    def _rejoin(words):
        # Space-join the tokens, re-tokenize, then glue the pieces together.
        return "".join(new_word_tokenize(" ".join(words)))

    data = (
        df[[text_col, label_col]]  # select text and class column
        .loc[lambda frame: frame[text_col].apply(lambda value: isinstance(value, str)) == True]
        .assign(cln_text=lambda frame: frame[text_col].apply(clean_text))
        .loc[lambda frame: frame['cln_text'].apply(len) > 0]
        .assign(cln_words=lambda frame: frame['cln_text'].apply(str).apply(new_word_tokenize))
        .loc[lambda frame: frame['cln_words'].apply(len) > 0]
        .assign(sent_cln=lambda frame: frame['cln_words'].apply(_rejoin))
        [['sent_cln', label_col]]
        .rename(columns={"sent_cln": "sent"})
    )
    print("Data size :", data.shape)
    return data

# Read data

In [31]:
# Load the labelled messages and keep only the text and label columns.
RAW_DATA_PATH = 'data/Re_label_row_data_5182.xlsx'
df = pd.read_excel(RAW_DATA_PATH)[['message', 'label']]
df.head()

Unnamed: 0,message,label
0,เช็คเบี้ยค่ะ,ใบเสนอราคา
1,เชคเบี้ยค่ะ,ใบเสนอราคา
2,เช็คเบี้ยคะ,ใบเสนอราคา
3,เช็คเบี้ย ป.1 ค่ะ,ใบเสนอราคา
4,เช็คเบี้ยต่ออายุค่ะ,ใบเสนอราคา


# Data Preprocessing

In [32]:
# Fit the label encoder on the raw class names and derive both lookup tables.
enc = LabelEncoder().fit(df.label)
idx2label = dict(enumerate(enc.classes_))
label2idx = {name: idx for idx, name in idx2label.items()}
for mapping in (idx2label, label2idx):
    print(mapping)

{0: 'การชำระเงิน', 1: 'การอบรม', 2: 'ข้อความที่ไม่สามารถจัดหมวดหมู่ได้', 3: 'ข้อมูลประกัน', 4: 'คอมมิชชั่น', 5: 'งานกรมธรรม์', 6: 'งานประกันและการเคลม', 7: 'สมาชิกและนายหน้า', 8: 'อื่นๆ', 9: 'ใบเสนอราคา'}
{'การชำระเงิน': 0, 'การอบรม': 1, 'ข้อความที่ไม่สามารถจัดหมวดหมู่ได้': 2, 'ข้อมูลประกัน': 3, 'คอมมิชชั่น': 4, 'งานกรมธรรม์': 5, 'งานประกันและการเคลม': 6, 'สมาชิกและนายหน้า': 7, 'อื่นๆ': 8, 'ใบเสนอราคา': 9}


In [33]:
# Replace the class names with their integer codes.
# NOTE(review): this cell reads and overwrites `df['label']`, so running it a
# second time feeds the already-encoded integers back into `enc.transform` —
# presumably an error for an encoder fitted on strings; re-run the loading cell first.
df = df.assign(label=enc.transform(df['label']))
df.head(2)

Unnamed: 0,message,label
0,เช็คเบี้ยค่ะ,9
1,เชคเบี้ยค่ะ,9


In [38]:
# Text preprocessing: clean, tokenize and stopword-filter the 'message' column.
# Rows whose text is not a string, or that end up empty, are dropped by
# `data_preprocessing`, which also prints the resulting shape.
pp_df = data_preprocessing(df, 'message', 'label')
pp_df.head()

Data size : (4979, 2)


Unnamed: 0,sent,label
0,เช็คเบี้ย,9
1,เชคเบี้ย,9
2,เช็คเบี้ย,9
3,เช็คเบี้ยป.,9
4,เช็คเบี้ยต่ออายุ,9


In [39]:
# Class distribution after preprocessing, before any resampling.
pp_df.label.value_counts()

label
9    1226
2    1075
3     670
0     525
5     525
4     301
7     276
6     252
8      83
1      46
Name: count, dtype: int64

In [40]:
# Balance the classes: draw the same number of rows (with replacement) from
# every label so that minority classes are over-sampled and majority classes
# are down-sampled.
SAMPLES_PER_CLASS = 800
SAMPLING_SEED = 9  # fixed seed so the resampled frame is reproducible

# NOTE(review): resampling with replacement happens before the train/test
# split below, so copies of the same row can land in both splits; consider
# splitting first and resampling only the training part.
# NOTE(review): the cell overwrites `pp_df`, so re-running it resamples the
# already-resampled frame; re-run the preprocessing cell first.
pp_df = pd.concat(
    [
        class_rows.sample(
            n=SAMPLES_PER_CLASS,
            replace=True,
            # A distinct but deterministic seed per class.
            random_state=SAMPLING_SEED + position,
        )
        for position, (_, class_rows) in enumerate(pp_df.groupby('label'))
    ],
    axis=0,
)

pp_df.label.value_counts()

label
0    800
1    800
2    800
3    800
4    800
5    800
6    800
7    800
8    800
9    800
Name: count, dtype: int64

In [41]:
# Stratified 80/20 split into train and test parts (no separate validation part).
split_options = dict(test_size=0.2, random_state=9, stratify=pp_df['label'])
train_data, test_data = train_test_split(pp_df, **split_options)
print(train_data.shape, test_data.shape)

(6400, 2) (1600, 2)


In [56]:
# Wrap both pandas splits as HuggingFace `Dataset` objects.
hg_train_data, hg_test_data = (
    Dataset.from_pandas(split_frame) for split_frame in (train_data, test_data)
)

# Load Pre-Trained Model

In [57]:
# Checkpoints used for the tokenizer and for the classifier backbone.
# NOTE(review): the tokenizer and the model come from two different
# checkpoints — confirm that their vocabularies are meant to be compatible.
TOKENIZER_CHECKPOINT = "poom-sci/WangchanBERTa-finetuned-sentiment"
MODEL_CHECKPOINT = "airesearch/wangchanberta-base-att-spm-uncased"

tokenizer = AutoTokenizer.from_pretrained(
    TOKENIZER_CHECKPOINT,
    model_max_length=128,  # set max length
)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=10,  # set num class
)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [58]:
# Attach the class names to the model config using the index -> label and
# label -> index mappings built in the label-encoding cell.
model.config.id2label = idx2label
model.config.label2id = label2idx
# model.config

In [59]:
# Function to tokenize data
def tokenize_dataset(data):
    """Tokenize the ``sent`` field of ``data`` with the module-level tokenizer.

    The text is truncated and padded to a fixed length of 128 tokens.

    Args:
        data: A mapping with a ``"sent"`` entry holding the text.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    tokenizer_options = {
        "max_length": 128,  # set max length
        "truncation": True,
        "padding": "max_length",
    }
    return tokenizer(data["sent"], **tokenizer_options)

In [60]:
# Apply the tokenizer function to both splits.
dataset_train, dataset_test = (
    split.map(tokenize_dataset) for split in (hg_train_data, hg_test_data)
)

Map:   0%|          | 0/3983 [00:00<?, ? examples/s]

Map:   0%|          | 0/996 [00:00<?, ? examples/s]

# Training

In [61]:
# Resize the model's token-embedding matrix to the tokenizer's vocabulary size.
# NOTE(review): the tokenizer and the model are loaded from different
# checkpoints above, which is presumably why a resize may be needed — confirm.
model.resize_token_embeddings(len(tokenizer))

Embedding(25004, 768)

In [67]:
# Set up training arguments
NUM_STEP = 500 # 1000
BATCH_SIZE = 8
EPOCH = 30 # 20

# NOTE(review): logging, saving and evaluation all use the 'epoch' strategy,
# so the *_steps values below are presumably ignored — confirm against the
# installed transformers version.
training_args = TrainingArguments(
    output_dir="./checkpoint2",
    save_total_limit=5,
    overwrite_output_dir=True,
    logging_dir='./logs2',
    logging_strategy='epoch',
    logging_steps=NUM_STEP,
    num_train_epochs=EPOCH,
    # Mixed precision is only requested when a CUDA device is present, which
    # matches the CPU fallback used when the model is moved to `device`.
    fp16=torch.cuda.is_available(),
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=1e-5, # 5e-5
    # weight_decay=0.01, # use if large model size
    save_strategy='epoch',
    save_steps=NUM_STEP,
    evaluation_strategy='epoch',
    eval_steps=NUM_STEP,
    # Saving and evaluating on the same schedule lets the best checkpoint be
    # restored when training ends.
    load_best_model_at_end=True
    )

In [68]:
# define compute metrics
def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Accuracy is computed directly with numpy, so no metric script has to be
    loaded on every evaluation pass and the function does not depend on the
    deprecated ``datasets.load_metric`` helper.

    Args:
        eval_pred: A pair ``(logits, labels)``; ``logits`` holds one row of
            class scores per example and ``labels`` the reference class ids.

    Returns:
        A dict ``{"accuracy": float}`` with the fraction of examples whose
        highest-scoring class equals the reference label.
    """
    logits, labels = eval_pred
    predictions = np.asarray(logits).argmax(axis=-1)
    references = np.asarray(labels)
    return {"accuracy": float((predictions == references).mean())}

In [69]:
# Prefer the GPU when CUDA is available, otherwise stay on the CPU, and move
# the model there.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model = model.to(device)

In [70]:
# Set up the trainer.
# NOTE(review): `dataset_test` is used as the evaluation set here, so early
# stopping and best-checkpoint selection are driven by the same split that is
# later reported as the test result.
early_stopping = EarlyStoppingCallback(early_stopping_patience=10)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [18]:
# Fine-tune the model. Run label from the original notebook: lr = 2e-5.
# NOTE(review): the TrainingArguments cell in this notebook sets
# learning_rate=1e-5, so the arguments were presumably edited between runs —
# confirm before comparing the logged results.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,2.0387,1.947074,0.299699
2,1.8254,1.822419,0.337851
3,1.7062,1.709939,0.389558
4,1.4629,1.852316,0.35241
5,1.4379,1.744033,0.443273
6,1.2685,1.645916,0.468373
7,1.1898,1.584506,0.512048
8,1.1666,1.565495,0.477912
9,1.0592,1.695978,0.443273
10,1.0017,1.660202,0.46988


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

TrainOutput(global_step=7480, training_loss=1.094575359732072, metrics={'train_runtime': 927.3272, 'train_samples_per_second': 64.422, 'train_steps_per_second': 8.066, 'total_flos': 3929845867530240.0, 'train_loss': 1.094575359732072, 'epoch': 20.0})

In [27]:
# Fine-tune the model. Run label from the original notebook: lr = 3e-5.
# NOTE(review): the TrainingArguments cell in this notebook sets
# learning_rate=1e-5, so the arguments were presumably edited between runs —
# confirm before comparing the logged results.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.1689,1.856793,0.353414
2,1.1751,1.427935,0.502008
3,1.0243,1.419886,0.522088
4,0.873,1.228252,0.640562
5,0.8099,1.258863,0.62249
6,0.7504,1.350661,0.626506
7,0.6586,1.2727,0.676707
8,0.5543,1.336737,0.656627
9,0.5008,1.438612,0.644578
10,0.4207,1.551466,0.639558


TrainOutput(global_step=9960, training_loss=0.529628101793159, metrics={'train_runtime': 1033.2553, 'train_samples_per_second': 77.096, 'train_steps_per_second': 9.639, 'total_flos': 5240233039964160.0, 'train_loss': 0.529628101793159, 'epoch': 20.0})

In [74]:
# Fine-tune the model. Run label from the original notebook: lr = 1e-5, model
# moved with model.to(device), new text cleaning.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.1065,1.356137,0.594378
2,1.0551,1.354072,0.604418
3,1.0576,1.366277,0.586345
4,0.9683,1.32673,0.61245
5,0.9317,1.313122,0.613454
6,0.8777,1.39138,0.606426
7,0.8682,1.394851,0.590361
8,0.812,1.412683,0.60241
9,0.816,1.451638,0.585341
10,0.7979,1.430578,0.604418


TrainOutput(global_step=7470, training_loss=0.8536515446551832, metrics={'train_runtime': 826.3682, 'train_samples_per_second': 144.597, 'train_steps_per_second': 18.079, 'total_flos': 3930174779973120.0, 'train_loss': 0.8536515446551832, 'epoch': 15.0})

In [75]:
# Evaluate on the test split (the same dataset that was passed to the Trainer
# as `eval_dataset`).
trainer.evaluate(dataset_test)

{'eval_loss': 1.313122034072876,
 'eval_accuracy': 0.6134538152610441,
 'eval_runtime': 4.2285,
 'eval_samples_per_second': 235.543,
 'eval_steps_per_second': 29.561,
 'epoch': 15.0}

# Save Model

In [76]:
# Persist the tokenizer and the trained model side by side in one directory.
MODEL_OUTPUT_DIR = './model2'
for save_to in (tokenizer.save_pretrained, trainer.save_model):
    save_to(MODEL_OUTPUT_DIR)
print("Save completed...")

Save completed...


# Load Model

In [None]:
# # Load model
# # tokenizer = AutoTokenizer.from_pretrained('pretrained_WangchanBERTa/model')
# # loaded_model = AutoModelForSequenceClassification.from_pretrained('pretrained_WangchanBERTa/model')
# pipe = pipeline("text-classification", model="pretrained_WangchanBERTa/model")

# Evaluation

In [30]:
# Run the model on the test split and derive probabilities and hard labels.
y_test_predict = trainer.predict(dataset_test)
y_test_logits = y_test_predict.predictions  # assumes a single array of per-class logits — TODO confirm
y_test_prob = torch.softmax(torch.tensor(y_test_logits), dim=1)
# Softmax preserves the ordering of the scores, so the arg-max is taken
# straight from the numpy logits; this keeps the labels a plain numpy array
# instead of relying on `np.argmax` being applied to a torch tensor.
y_test_pred_labels = np.argmax(y_test_logits, axis=1)
y_test_actual_labels = y_test_predict.label_ids

In [31]:
# Weighted-average F1 of the test-set predictions, computed with the "f1"
# metric loaded through the `evaluate` library.
metric_f1 = evaluate.load("f1")
metric_f1.compute(predictions=y_test_pred_labels, references=y_test_actual_labels, average="weighted")

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

{'f1': 0.6414928228685276}

In [32]:
# Plain accuracy of the test-set predictions, printed with four decimals.
acc = accuracy_score(y_test_actual_labels, y_test_pred_labels)
print(f"Accuracy : {acc:.4f}")

Accuracy : 0.6406


# Inference helper

In [None]:
# Prefer the GPU when CUDA is available, otherwise stay on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [None]:
def to_prediction(prep_text):
    """Classify preprocessed text with the module-level model and tokenizer.

    Args:
        prep_text: A single preprocessed string or a list of such strings.

    Returns:
        A list with one ``(label_name, probability)`` tuple per input text,
        where ``label_name`` comes from ``idx2label`` and ``probability`` is
        the softmax score of the predicted class. An empty list or tuple
        yields an empty list.
    """
    # Nothing to tokenize for an empty batch.
    if isinstance(prep_text, (list, tuple)) and len(prep_text) == 0:
        return []

    # Inference must run in eval mode so layers such as dropout are disabled;
    # the previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        # Inputs are moved to the device the model lives on, so the call
        # cannot fail on a device mismatch.
        inputs = tokenizer(
            prep_text, padding=True, truncation=True, return_tensors="pt"
        ).to(model.device)
        ## Perform inference
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        if was_training:
            model.train()

    ## Softmax probabilities and predicted class ids as plain Python lists
    softmax_probs = torch.softmax(logits, dim=1).cpu().tolist()
    predicted_labels = torch.argmax(logits, dim=1).tolist()
    return [
        (idx2label[label], probs[label])
        for label, probs in zip(predicted_labels, softmax_probs)
    ]