# Fine-tune Korean syllable embeddings for NMSC

## Import

In [None]:
import os
import json
from tqdm import tqdm
import fasttext
from mecab import MeCab
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, PrecisionRecallDisplay, confusion_matrix, accuracy_score, fbeta_score
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, use_memory_fs=True)

## Initialization

### Some directories and filepaths

- `dataset`: `.csv` file (This notebook expects it to include two columns, `text` and `label`)
- `pretrained_bin`: `.bin` file of the pre-trained embeddings
- `pretrained_vec`: `.vec` file of the pre-trained embeddings
- `trainset`: `.txt` file where each line follows the format `__label__<label> <text>`
- `valset`: `.txt` file for the validation split, in the same format as `trainset`

In [None]:
# Locations of every file this notebook reads or writes, keyed by role.
filepaths = {
    'dataset': './dataset.csv',
    'pretrained_bin': './fasttext.bin',
    'pretrained_vec': './fasttext.vec',
    'trainset': './train.txt',
    'valset': './val.txt',
}

### Dataset

In [None]:
# Load the labelled corpus; it must provide the `text` and `label` columns.
dataset = pd.read_csv(filepaths['dataset'])
# Number of distinct values found in the `label` column.
num_classes = dataset['label'].nunique()

In [None]:
# 70/30 train/validation split, stratified on `label` so both parts keep the
# same class proportions; the fixed seed makes the split reproducible.
train_set, val_set = train_test_split(dataset, test_size=0.3, random_state=777, stratify=dataset['label'])

In [None]:
train_set

In [None]:
val_set

### MeCab analyzer

In [None]:
# Create the MeCab analyzer with its default settings.
# NOTE(review): `mecab` is not referenced again in the visible part of this notebook.
mecab = MeCab()

### fastText

In [None]:
# Load the pre-trained fastText model from its `.bin` file so that its
# vocabulary can be inspected below.
model = fasttext.load_model(filepaths['pretrained_bin'])

#### E.g., the first 10 tokens

In [None]:
model.words[:10]

## Fine-tuning

In [None]:
def write_fasttext_dataset(dataframe, dst_path, label_map):
    """Write ``dataframe`` to ``dst_path`` in fastText's supervised format.

    Each row becomes exactly one line of the form ``<label token> <text>``.

    The lines are written directly rather than through ``to_csv``: CSV
    quoting would wrap any text containing a comma, a double quote or a line
    break in ``"``, which corrupts both the ``__label__`` prefix and the text.

    Args:
        dataframe: Frame with a ``label`` column (keys of ``label_map``) and
            a ``text`` column of strings.
        dst_path: Path of the text file to create or overwrite.
        label_map: Mapping from a value of the ``label`` column to the
            fastText label token, e.g. ``{1: '__label__Toxic'}``.

    Raises:
        KeyError: If a label is missing from ``label_map``.
    """
    # newline='\n' keeps the output identical on every platform.
    with open(dst_path, 'w', encoding='utf-8', newline='\n') as dst_file:
        for label, text in zip(dataframe['label'], dataframe['text']):
            # A sample must stay on one line, otherwise the tail of the text
            # would be read as a separate, unlabelled sample.
            single_line_text = ' '.join(text.splitlines())
            dst_file.write(f"{label_map[label]} {single_line_text}\n")

In [None]:
# Numeric class id -> fastText label token.
label_map = {
    0: '__label__Neutral',
    1: '__label__Toxic',
}

# Export both splits in fastText's `__label__<label> <text>` format.
write_fasttext_dataset(dataframe=train_set, dst_path='train.txt', label_map=label_map)
write_fasttext_dataset(dataframe=val_set, dst_path='val.txt', label_map=label_map)

In [None]:
# fastText rejects pre-trained vectors whose dimension differs from `dim`
# (default 100), so take the dimension from the `.vec` header line
# ("<vocabulary size> <dimension>") instead of relying on the default.
# Binary mode: only the ASCII header is needed, so nothing is decoded.
with open(filepaths['pretrained_vec'], 'rb') as vec_file:
    pretrained_dim = int(vec_file.readline().split()[1])

# Train the supervised classifier, initialising its input embeddings from the
# pre-trained vectors. This rebinds `model`, replacing the model loaded from
# the `.bin` file.
model = fasttext.train_supervised(
    'train.txt',
    pretrainedVectors=filepaths['pretrained_vec'],
    dim=pretrained_dim,
)

In [None]:
val_set

In [None]:
# Register tqdm with pandas; this is what provides the `progress_apply`
# method used by the cells below.
tqdm.pandas()

In [None]:
def infer(model, val_set, topk=2):
    """Run ``model.predict`` on the ``text`` of every row, with a progress bar.

    Args:
        model: Object exposing ``predict(text, k=...)``.
        val_set: DataFrame with a ``text`` column; ``progress_apply`` must
            already be available on it (see ``tqdm.pandas()``).
        topk: Number of predictions requested per row.

    Returns:
        The result of the row-wise ``progress_apply``: one ``model.predict``
        return value per row.
    """
    def predict_row(row):
        return model.predict(row['text'], k=topk)

    return val_set.progress_apply(predict_row, axis=1)

In [None]:
%%time
# Predictions for every validation row, using `infer`'s default `topk`.
validation_results = infer(model, val_set)

In [None]:
validation_results

In [None]:
# model.save_model("model_filename.bin")

## Validation

### P@1, R@1

- Only meaningful for multi-label classification; when every example has exactly one label, P@1 and R@1 are the same number (the accuracy)

In [None]:
def print_results(N, p, r):
    """Print the sample count, P@1 and R@1 as tab-separated lines.

    Args:
        N: Number of evaluated samples.
        p: Precision at 1, printed with three decimals.
        r: Recall at 1, printed with three decimals.
    """
    k = 1  # The metrics are reported at rank 1.
    rows = (
        f"N\t{N}",
        f"P@{k}\t{p:.3f}",
        f"R@{k}\t{r:.3f}",
    )
    for row in rows:
        print(row)

# `model.test` evaluates the model on the validation file; its result is
# unpacked positionally into print_results' (N, p, r).
print_results(*model.test('val.txt'))

### Precision-recall curve

In [None]:
def plot_precision_recall_curve(model, dataset):
    """Plot the precision-recall curve of ``model`` on ``dataset``.

    The score of a row is the probability attached to the label that comes
    second when the two predicted label strings are sorted, so it does not
    depend on the order in which the model ranks them.
    NOTE(review): this assumes the alphabetically later label token is the
    positive class (true for '__label__Neutral' / '__label__Toxic').

    Args:
        model: Object exposing ``predict(text, k=2)`` that returns a
            ``(labels, probabilities)`` pair.
        dataset: DataFrame with ``text`` and ``label`` columns, with
            ``progress_apply`` available.
    """
    def top_two(row):
        return model.predict(row['text'], k=2)

    def second_label_probability(prediction):
        predicted_labels, probabilities = prediction
        # Reorder the probabilities by sorted label name, then take the second.
        by_label_name = np.argsort(predicted_labels)
        return np.array(probabilities)[by_label_name][1]

    raw_predictions = dataset.progress_apply(top_two, axis=1)
    scores = raw_predictions.apply(second_label_probability)
    PrecisionRecallDisplay.from_predictions(dataset['label'], scores)

In [None]:
# Draw the precision-recall curve of the fine-tuned model on the validation split.
plot_precision_recall_curve(model, val_set)

In [None]:
def get_precision_recall_curve(model, dataset):
    """Compute the precision-recall curve of ``model`` on ``dataset``.

    The score of a row is the probability attached to the label that comes
    second when the two predicted label strings are sorted, so it does not
    depend on the order in which the model ranks them.
    NOTE(review): this assumes the alphabetically later label token is the
    positive class (true for '__label__Neutral' / '__label__Toxic').

    Args:
        model: Object exposing ``predict(text, k=2)`` that returns a
            ``(labels, probabilities)`` pair.
        dataset: DataFrame with ``text`` and ``label`` columns, with
            ``progress_apply`` available.

    Returns:
        Whatever ``sklearn.metrics.precision_recall_curve`` returns for the
        true labels and the scores.
    """
    def top_two(row):
        return model.predict(row['text'], k=2)

    def second_label_probability(prediction):
        predicted_labels, probabilities = prediction
        # Reorder the probabilities by sorted label name, then take the second.
        by_label_name = np.argsort(predicted_labels)
        return np.array(probabilities)[by_label_name][1]

    raw_predictions = dataset.progress_apply(top_two, axis=1)
    scores = raw_predictions.apply(second_label_probability)
    return precision_recall_curve(dataset['label'], scores)

In [None]:
# Keep the three arrays of the validation precision-recall curve under
# descriptive keys.
metrics = dict(zip(
    ('precision', 'recall', 'threshold'),
    get_precision_recall_curve(model, val_set),
))

### Finding an optimal threshold based on the F1 score

In [None]:
def get_fbeta_score(precision, recall, beta=1):
    """Return the F-beta score for the given precision and recall.

    Computes ``(1 + beta**2) * P * R / (beta**2 * P + R)`` element-wise.
    Where the denominator is zero (precision and recall both zero) the score
    is defined as 0 instead of NaN, so that ``argmax`` over the result is not
    captured by an undefined entry.

    Args:
        precision: Scalar or array-like of precision values.
        recall: Scalar or array-like of recall values, broadcastable against
            ``precision``.
        beta: Weight of recall relative to precision (1 gives the F1 score).

    Returns:
        A float for scalar inputs, otherwise a NumPy array of scores.
    """
    precision = np.asarray(precision, dtype=float)
    recall = np.asarray(recall, dtype=float)

    coefficient = 1 + beta**2
    numerator = precision * recall
    denominator = ((beta**2) * precision) + recall

    # Divide only where it is defined; the other entries keep the 0 from `out`.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    scores = coefficient * ratio
    return float(scores) if np.ndim(scores) == 0 else scores

In [None]:
# F1 at every point of the precision-recall curve.
f1_scores = get_fbeta_score(metrics['precision'], metrics['recall'])

# The curve ends with an extra (precision=1, recall=0) point that has no
# matching threshold, so only the first len(threshold) points are candidates.
# nanargmax ignores points where F1 is undefined (precision = recall = 0).
best_index = int(np.nanargmax(f1_scores[:len(metrics['threshold'])]))

best_threshold = metrics['threshold'][best_index]
best_precision = metrics['precision'][best_index]
best_recall = metrics['recall'][best_index]

In [None]:
f1_scores.max()

In [None]:
f1_scores.argmax()

In [None]:
best_threshold

In [None]:
best_precision

In [None]:
best_recall