# 사전 학습한 음절 임베딩으로 성인 검색어 인식 모델을 파인튜닝하기

## Import

In [None]:
import os
import json
from tqdm import tqdm
import fasttext
from mecab import MeCab
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, PrecisionRecallDisplay, confusion_matrix, accuracy_score, fbeta_score
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, use_memory_fs=True)

## 초기화

### 주요 파일 경로

### 데이터셋

In [None]:
# Load the labelled query dataset from the CSV path stored under the 'dataset' key.
# NOTE(review): `filepaths` is not defined in the visible part of this notebook
# (the file-paths section above contains no code cell) — it must be defined,
# with at least the 'dataset' and 'fastText_w_syllable' keys, before this runs.
dataset = pd.read_csv(filepaths['dataset'])

In [None]:
# Hold out 30% of the rows for validation. The split is stratified on the
# 'label' column and uses a fixed random_state so it is reproducible.
train_set, val_set = train_test_split(dataset, test_size=0.3, random_state=777, stratify=dataset['label'])

In [None]:
train_set

In [None]:
val_set

### MeCab 분석기

In [None]:
mecab = MeCab()

### 음절 임베딩을 사전학습한 fastText

In [None]:
# Load the fastText model stored at filepaths['fastText_w_syllable'].
# NOTE: `model` is rebound by fasttext.train_supervised(...) in the fine-tuning
# section below; in the visible cells this loaded object is only used for the
# token preview that follows.
model = fasttext.load_model(filepaths['fastText_w_syllable'])

#### 예시: 토큰 일부

In [None]:
model.words[:10]

## 파인튜닝

In [None]:
def write_fasttext_set(dataframe, dst_path):
    """Write ``dataframe`` to ``dst_path`` in fastText supervised format.

    One line is written per row, shaped ``__label__<Name> <text>``.

    The lines are written directly instead of going through ``to_csv``:
    CSV quoting would wrap any text containing a comma or a double quote in
    quotes (and double the inner quotes), which corrupts both the label
    prefix and the text that fastText reads.

    Args:
        dataframe: DataFrame with a ``label`` column (0 = Neutral, 1 = Toxic)
            and a ``text`` column of strings.
        dst_path: Path of the output file; it is overwritten if it exists.

    Raises:
        KeyError: If a column is missing or a label is neither 0 nor 1.
    """
    label_map = {
        0: '__label__Neutral',
        1: '__label__Toxic',
    }

    with open(dst_path, 'w', encoding='utf-8') as dst_file:
        for label, text in zip(dataframe['label'], dataframe['text']):
            # fastText treats every physical line as one example, so an
            # embedded line break would split a row into several examples.
            single_line = (
                text.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')
            )
            dst_file.write(label_map[label] + ' ' + single_line + '\n')

In [None]:
write_fasttext_set(train_set, 'train.txt')
write_fasttext_set(val_set, 'val.txt')

In [None]:
# Train a supervised fastText classifier on train.txt, passing the pretrained
# .vec file as `pretrainedVectors`. This rebinds `model`, replacing the model
# loaded earlier.
# NOTE(review): the .vec path is hard-coded here instead of coming from
# `filepaths` — confirm it belongs to the same pretrained model.
# NOTE(review): fastText presumably needs `dim` to equal the dimension of the
# pretrained vectors, and no `dim` is passed here — TODO confirm.
# NOTE(review): train.txt is built from the 'text' column, while the inference
# helpers below predict on the 'norm' column — confirm both columns carry the
# same preprocessing.
model = fasttext.train_supervised(
    'train.txt', 
    pretrainedVectors='./fasttext_syllable_pretrained/fasttext.vec',
)

In [None]:
val_set

In [None]:
# Register tqdm's pandas integration; the helpers below call
# `progress_apply`, which is presumably provided by this call.
tqdm.pandas()

In [None]:
def infer(model, val_set):
    """Predict the top-2 labels for every row of ``val_set``.

    Each row's ``'norm'`` value is passed to ``model.predict`` with ``k=2``.
    The rows are processed through ``progress_apply``, so a progress bar is
    shown while predicting.

    Returns:
        The result of ``progress_apply``: one ``model.predict`` result per row.
    """
    def predict_row(row):
        return model.predict(row['norm'], k=2)

    return val_set.progress_apply(predict_row, axis=1)

In [None]:
%%time
validation_results = infer(model, val_set)

In [None]:
validation_results

In [None]:
# model.save_model("model_filename.bin")

## 평가

### 정확도

In [None]:
def print_results(N, p, r, k=1):
    """Print a sample count with precision and recall at cutoff ``k``.

    Args:
        N: Number of evaluated samples.
        p: Precision value, printed with three decimals.
        r: Recall value, printed with three decimals.
        k: Cutoff shown in the ``P@k`` / ``R@k`` labels. Defaults to 1, which
            reproduces the previous hard-coded output; pass the same ``k``
            that was used to compute ``p`` and ``r``.
    """
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(k, p))
    print("R@{}\t{:.3f}".format(k, r))

# Evaluate the fine-tuned model on val.txt (written earlier by
# write_fasttext_set) and unpack the result into print_results' (N, p, r).
print_results(*model.test('val.txt'))

### precision-recall curve

In [None]:
def plot_precision_recall_curve(model, dataset):
    """Draw the precision-recall curve of ``model`` over ``dataset``.

    Every row's ``'norm'`` value is scored with ``model.predict(..., k=2)``.
    For each prediction, the probabilities are reordered by the sorted label
    names and the second one is kept. With the labels ``__label__Neutral``
    and ``__label__Toxic`` that is the Toxic probability. These scores are
    plotted against the ``'label'`` column.
    """
    def second_sorted_label_probability(prediction):
        predicted_labels, probabilities = prediction[0], prediction[1]
        order = np.argsort(predicted_labels)
        return np.array(probabilities)[order][1]

    labels = dataset['label']
    raw_predictions = dataset.progress_apply(
        lambda row: model.predict(row['norm'], k=2),
        axis=1,
    )
    scores = raw_predictions.apply(second_sorted_label_probability)
    PrecisionRecallDisplay.from_predictions(labels, scores)

In [None]:
plot_precision_recall_curve(model, val_set)

In [None]:
def get_precision_recall_curve(model, dataset):
    """Compute the precision-recall curve of ``model`` over ``dataset``.

    Every row's ``'norm'`` value is scored with ``model.predict(..., k=2)``.
    For each prediction, the probabilities are reordered by the sorted label
    names and the second one is kept. With the labels ``__label__Neutral``
    and ``__label__Toxic`` that is the Toxic probability.

    Returns:
        The result of ``precision_recall_curve`` for the ``'label'`` column
        and those scores.
    """
    def second_sorted_label_probability(prediction):
        predicted_labels, probabilities = prediction[0], prediction[1]
        order = np.argsort(predicted_labels)
        return np.array(probabilities)[order][1]

    labels = dataset['label']
    raw_predictions = dataset.progress_apply(
        lambda row: model.predict(row['norm'], k=2),
        axis=1,
    )
    scores = raw_predictions.apply(second_sorted_label_probability)
    return precision_recall_curve(labels, scores)

In [None]:
# Store the three outputs of get_precision_recall_curve for the validation set
# under named keys.
# NOTE(review): scikit-learn's precision_recall_curve presumably returns one
# more precision/recall value than thresholds — verify before indexing the
# three arrays with a shared index.
metrics = dict()
metrics['precision'], metrics['recall'], metrics['threshold'] = get_precision_recall_curve(model, val_set)

### f1-score를 기준으로 가장 적절한 threshold를 결정하기

In [None]:
def get_fbeta_score(precision, recall, beta=1):
    """Return the F-beta score for the given precision and recall.

    Computes ``(1 + beta**2) * P * R / (beta**2 * P + R)``. Where the
    denominator is zero (for example precision and recall are both 0) the
    score is defined as 0 rather than raising ``ZeroDivisionError`` (scalars)
    or producing NaN (arrays), so a later max/argmax over the scores is not
    derailed by undefined points.

    Args:
        precision: Scalar or array-like of precision values.
        recall: Scalar or array-like of recall values; must broadcast with
            ``precision``.
        beta: Weight of recall relative to precision. Defaults to 1 (F1).

    Returns:
        A Python float for scalar inputs, otherwise a NumPy array.
    """
    precision = np.asarray(precision, dtype=float)
    recall = np.asarray(recall, dtype=float)

    beta_squared = beta ** 2
    numerator = (1 + beta_squared) * precision * recall
    denominator = beta_squared * precision + recall

    undefined = denominator == 0
    # Divide by 1 at the undefined points so no warning or NaN is produced;
    # those entries are overwritten with 0 by the outer np.where.
    safe_denominator = np.where(undefined, 1.0, denominator)
    score = np.where(undefined, 0.0, numerator / safe_denominator)

    return score.item() if score.ndim == 0 else score