# Import Library

In [None]:
# !pip install accelerate transformers datasets evaluate pythainlp emoji -q

In [None]:
import torch
print(torch.__version__)
import tqdm as notebook_tqdm
from datasets import load_dataset, load_metric, Dataset, DatasetDict
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback, AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
import evaluate
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import re
import emoji
import itertools
import pythainlp
print(pythainlp.__version__)
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus.common import thai_words, thai_stopwords
stopwords = list(thai_stopwords())
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Text Preprocessing

In [None]:
def read_data(file_name: str):
    """Load a tabular data file into a pandas DataFrame.

    The reader is picked from the file extension, which is compared
    case-insensitively so that names such as ``data.CSV`` or ``data.XLSX``
    are accepted as well.

    Args:
        file_name: Path to an Excel (``xlsx``/``xls``) or CSV (``csv``) file.

    Returns:
        The DataFrame returned by ``pd.read_excel`` or ``pd.read_csv``.

    Raises:
        ValueError: If the extension is not ``xlsx``, ``xls`` or ``csv``.
    """
    # Text after the last "." of the final path component; a name that has
    # no "." at all is used as-is.
    file_type = file_name.split("/")[-1].rpartition(".")[2].lower()
    if file_type in ("xlsx", "xls"):
        return pd.read_excel(file_name)
    if file_type == "csv":
        return pd.read_csv(file_name)
    raise ValueError(
        f"The input data must be excel or csv file (got {file_name!r})"
    )

# Patterns used by clean_text, compiled once instead of on every call.
_EMOJI_RE = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
# Characters that are stripped outright (the set includes the backslash).
_PUNCTUATION_RE = re.compile('[' + re.escape('|!"$-#@%^_*+,/&\'()<=>?[\\]') + ']+')
_DIGITS_RE = re.compile(r'\d+')
_DOT_RUN_RE = re.compile(r'\.{2,}')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalise a raw message string.

    Steps, in order: lower-case the text, remove digit runs, remove emoji
    (first the explicit code-point ranges, then everything
    ``emoji.replace_emoji`` recognises), remove the characters of the fixed
    punctuation set, remove runs of two or more dots, and finally collapse
    whitespace runs to a single space and strip both ends.

    Whitespace is normalised last so that gaps left behind by the removed
    digits/emoji/punctuation do not survive as double or leading/trailing
    spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.lower()
    text = _DIGITS_RE.sub('', text)
    text = _EMOJI_RE.sub(r'', text)  # no emoji
    text = emoji.replace_emoji(text, "")
    text = _PUNCTUATION_RE.sub('', text)
    text = _DOT_RUN_RE.sub('', text)
    return _WHITESPACE_RE.sub(' ', text).strip()

# Hash-based copy of the stop-word list so each membership test is O(1)
# instead of a linear scan over the list.
_STOPWORD_SET = frozenset(stopwords)


def new_word_tokenize(text):
    """Tokenize *text* and drop stop words.

    Args:
        text: The string passed to ``word_tokenize`` (called with
            ``keep_whitespace=False``).

    Returns:
        A list of the tokens that are not in the stop-word collection,
        in their original order.
    """
    return [
        token
        for token in word_tokenize(text, keep_whitespace=False)
        if token not in _STOPWORD_SET
    ]

def data_preprocessing(df, text_col, label_col):
    """Clean the text column of *df* and drop rows that end up empty.

    Rows are kept only when the text value is a ``str``, the cleaned text
    (``clean_text``) is non-empty, and tokenizing the cleaned text with
    ``new_word_tokenize`` yields at least one token. The shape of the result
    is printed before returning.

    Args:
        df: Input DataFrame; it is not modified.
        text_col: Name of the column holding the raw text.
        label_col: Name of the column holding the label.

    Returns:
        A DataFrame with two columns: ``sent`` (the cleaned text) and
        *label_col*, keeping the original index of the surviving rows.
    """
    # Work on an explicit copy so column assignment never targets a view of df.
    is_text = df[text_col].map(lambda value: isinstance(value, str)).astype(bool)
    data = df.loc[is_text, [text_col, label_col]].copy()

    data["sent"] = data[text_col].map(clean_text)
    data = data[data["sent"].map(len) > 0]

    # Tokens are only needed to decide whether anything besides stop words
    # is left; the returned text is the cleaned string itself.
    has_tokens = data["sent"].map(new_word_tokenize).map(len) > 0
    data = data.loc[has_tokens, ["sent", label_col]]

    print("Data size :", data.shape)
    return data

# Read data

In [None]:
# Load the re-labelled spreadsheet and keep only the text and label columns
df = pd.read_excel('data/Re_label_row_data_5182.xlsx')[['message','label']]
df.head(2)

# Data Preprocessing

In [None]:
# Fit the label encoder, then derive both lookup tables
# (class index -> class name and class name -> class index)
enc = LabelEncoder().fit(df.label)
idx2label = dict(enumerate(enc.classes_))
label2idx = {name: index for index, name in idx2label.items()}
for mapping in (idx2label, label2idx):
    print(mapping)

In [None]:
# Replace the raw labels with the integer codes produced by the fitted encoder
df['label'] = enc.transform(df['label'])
df.head(2)  # preview the first two rows

In [None]:
# Run the text-cleaning pipeline on the message column
pp_df = data_preprocessing(df, text_col='message', label_col='label')
pp_df.head()

In [None]:
# Stratified train/test split.
# `test_size` is the fraction handed to the second returned frame, so 0.2
# keeps 80% of the rows for training and holds out 20% for testing.
TEST_SIZE = 0.2
train_data, test_data = train_test_split(
    pp_df,
    test_size=TEST_SIZE,
    random_state=31,
    stratify=pp_df['label'],
)
# test_data, valid_data = train_test_split(test_data, test_size=0.5, random_state=31, stratify=test_data['label'])
print(train_data.shape, test_data.shape)

In [None]:
# Wrap both pandas splits as Hugging Face datasets
hg_train_data, hg_test_data = (
    Dataset.from_pandas(split) for split in (train_data, test_data)
)
# hg_valid_data = Dataset.from_pandas(valid_data)

# Load Pre-Trained Model

In [None]:
# NOTE(review): the tokenizer and the model weights are loaded from two
# different checkpoints — confirm that their vocabularies are compatible.
tokenizer = AutoTokenizer.from_pretrained(
    "poom-sci/WangchanBERTa-finetuned-sentiment",
    model_max_length=128,  # set max length
)
# Size the classification head from the label mapping built from the data
# instead of a hard-coded class count, so the two can never disagree.
model = AutoModelForSequenceClassification.from_pretrained(
    "airesearch/wangchanberta-base-att-spm-uncased",
    num_labels=len(idx2label),
)

In [None]:
# Attach the class-name lookup tables to the model configuration
model.config.id2label, model.config.label2id = idx2label, label2idx
# model.config

In [None]:
# Function to tokenize data
def tokenize_dataset(data):
    """Tokenize the ``sent`` field of *data*.

    The text is truncated to, and padded up to, 128 tokens; the tokenizer's
    output is returned unchanged.
    """
    encode_options = {
        "max_length": 128,  # set max length
        "truncation": True,
        "padding": "max_length",
    }
    return tokenizer(data["sent"], **encode_options)

In [None]:
# Apply the tokenizer to every row of both splits
dataset_train, dataset_test = (
    split.map(tokenize_dataset) for split in (hg_train_data, hg_test_data)
)
# dataset_valid = hg_valid_data.map(tokenize_dataset)

# Training

In [None]:
# Resize the model's token embeddings to the tokenizer's vocabulary size
# (only has an effect if the two sizes differ)
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Set up training arguments
NUM_STEP = 500 # 1000
BATCH_SIZE = 8
EPOCH = 50 # 20

# NOTE(review): logging, saving and evaluation all use the 'epoch' strategy;
# per the TrainingArguments documentation the *_steps values apply to the
# 'steps' strategy, so NUM_STEP presumably has no effect here — confirm.
training_args = TrainingArguments(
    output_dir="./checkpoint",
    save_total_limit=3,
    # overwrite_output_dir=True,
    logging_dir='./logs',
    logging_strategy='epoch',
    logging_steps=NUM_STEP,
    num_train_epochs=EPOCH,
    # Mixed precision needs a CUDA device; fall back to full precision on
    # CPU-only machines instead of failing at start-up.
    fp16=torch.cuda.is_available(),
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=3e-5, # 5e-5
    # weight_decay=0.01, # use if large model size
    save_strategy='epoch',
    save_steps=NUM_STEP,
    evaluation_strategy='epoch',
    eval_steps=NUM_STEP,
    load_best_model_at_end=True,
)

In [None]:
# define compute metrics
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Accuracy is calculated directly with NumPy, so no metric script has to be
    loaded on every evaluation call.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            row is the argmax of ``logits`` along the last axis.

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of rows whose
        predicted class equals the label, as a Python float.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

In [None]:
# Assemble the Trainer from the model, arguments, datasets and metric function
# NOTE(review): the test split doubles as the evaluation set during training.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=compute_metrics,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)
trainer = Trainer(**trainer_options)

In [None]:
# Run the training loop configured on `trainer`
trainer.train()

In [None]:
# Evaluate the trainer's model on the tokenized test split
trainer.evaluate(dataset_test)

# Save Model

In [None]:
# Write the tokenizer files and the model into the same directory
for save_into in (tokenizer.save_pretrained, trainer.save_model):
    save_into('./model')
print("Save completed...")

# Load Model

In [None]:
# # Load model
# # tokenizer = AutoTokenizer.from_pretrained('pretrained_WangchanBERTa/model')
# # loaded_model = AutoModelForSequenceClassification.from_pretrained('pretrained_WangchanBERTa/model')
# pipe = pipeline("text-classification", model="pretrained_WangchanBERTa/model")

# Evaluation

In [None]:
# Predict on the test split and derive probabilities and hard labels
y_test_predict = trainer.predict(dataset_test)
y_test_logits = y_test_predict.predictions
y_test_prob = torch.softmax(torch.tensor(y_test_logits), dim=1) # tf.nn.softmax(y_test_logits)
# Softmax keeps the ordering of the logits, so take the argmax of the logits
# directly; this yields a plain NumPy array for the metric functions rather
# than routing a torch.Tensor through np.argmax.
y_test_pred_labels = np.argmax(y_test_logits, axis=1)
y_test_actual_labels = y_test_predict.label_ids

In [None]:
# Weighted F1 score of the test predictions
metric_f1 = evaluate.load("f1")
f1_inputs = dict(
    predictions=y_test_pred_labels,
    references=y_test_actual_labels,
    average="weighted",
)
metric_f1.compute(**f1_inputs)

In [None]:
# Plain accuracy of the test predictions
acc = accuracy_score(y_true=y_test_actual_labels, y_pred=y_test_pred_labels)
print(f"Accuracy : {acc:.4f}")

# etc

In [None]:
# Use the GPU when PyTorch can see one, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
def to_prediction(prep_text):
    """Classify one or more already-preprocessed texts.

    Args:
        prep_text: Text (or list of texts) handed to the tokenizer, which is
            called with padding and truncation enabled.

    Returns:
        A list with one ``(label_name, probability)`` tuple per input, where
        ``label_name`` is looked up in ``idx2label`` for the highest-scoring
        class and ``probability`` is that class's softmax probability.
    """
    # Inference mode: disables dropout so repeated calls give the same result.
    model.eval()
    # Put the inputs on whatever device the model currently lives on, so the
    # two can never end up on different devices.
    inputs = tokenizer(
        prep_text, padding=True, truncation=True, return_tensors="pt"
    ).to(model.device)
    ## Perform inference
    with torch.no_grad():
        logits = model(**inputs).logits
    ## Move outputs to CPU if necessary and calculate softmax probabilities
    softmax_probs = torch.softmax(logits, dim=1).cpu().tolist()
    ## Extract predicted labels
    predicted_labels = torch.argmax(logits, dim=1).tolist()
    return [
        (idx2label[label], probs[label])
        for label, probs in zip(predicted_labels, softmax_probs)
    ]