In [4]:
#imports
from datasets import load_dataset
from thai2transformers.metrics import classification_metrics
from pythainlp.ulmfit import process_thai, THWIKI_LSTM, ThaiTokenizer, pre_rules_th, post_rules_th
import pandas as pd
from fastai.text import *
from fastai.callbacks import CSVLogger, SaveModelCallback
import os

In [98]:
#parameters
class Args:
    """Notebook-wide settings; the cells below read them as attributes of `args`."""

    # --- data source -------------------------------------------------------
    # identifier handed to load_dataset; also reused to name the saved LM databunch
    dataset_name_or_path = 'prachathai67k'
    # column that holds the input text
    feature_col = 'title'

    # --- targets -----------------------------------------------------------
    # True: one target column per entry of label_cols; False: single target in label_col
    is_multilabel = True
    # only read when is_multilabel is False
    # NOTE(review): 'review_star' is not among the prachathai67k features listed
    # in the load_dataset output — confirm the value before switching modes.
    label_col = 'review_star'
    label_cols = [
        'politics', 'human_rights', 'quality_of_life',
        'international', 'social', 'environment', 'economics',
        'culture', 'labor', 'national_security', 'ict', 'education',
    ]

    # --- run settings ------------------------------------------------------
    # path argument given to os.makedirs, ItemLists, TextList.from_df and load_data
    output_dir = 'prachathai67k'
    batch_size = 64


args = Args()

In [99]:
# Create the output directory (and any missing parents). exist_ok=True makes the
# cell safe to re-run and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(args.output_dir, exist_ok=True)

In [100]:
# Load the dataset identified by args.dataset_name_or_path.
dataset = load_dataset(args.dataset_name_or_path)
# Bare expression: shows the loaded object (its splits and features) as the cell output.
dataset

Reusing dataset prachathai67k (/home/admin/.cache/huggingface/datasets/prachathai67k/prachathai67k/1.1.0/2eeb3bfaf307043e606a58f1f2af8b3d6bbf8a2d0b957d7bfafaf1dc1ef4b5ac)


DatasetDict({
    train: Dataset({
        features: ['url', 'date', 'title', 'body_text', 'politics', 'human_rights', 'quality_of_life', 'international', 'social', 'environment', 'economics', 'culture', 'labor', 'national_security', 'ict', 'education'],
        num_rows: 54379
    })
    validation: Dataset({
        features: ['url', 'date', 'title', 'body_text', 'politics', 'human_rights', 'quality_of_life', 'international', 'social', 'environment', 'economics', 'culture', 'labor', 'national_security', 'ict', 'education'],
        num_rows: 6721
    })
    test: Dataset({
        features: ['url', 'date', 'title', 'body_text', 'politics', 'human_rights', 'quality_of_life', 'international', 'social', 'environment', 'economics', 'culture', 'labor', 'national_security', 'ict', 'education'],
        num_rows: 6789
    })
})

In [101]:
# Only 'wongnai_reviews' gets a validation split carved out of its train split
# here (10%, shuffled with a fixed seed); any other dataset passes through untouched.
if args.dataset_name_or_path == 'wongnai_reviews':
    train_val_split = dataset['train'].train_test_split(
        test_size=0.1,
        shuffle=True,
        seed=2020,
    )
    dataset['train'], dataset['validation'] = train_val_split['train'], train_val_split['test']
dataset

DatasetDict({
    train: Dataset({
        features: ['url', 'date', 'title', 'body_text', 'politics', 'human_rights', 'quality_of_life', 'international', 'social', 'environment', 'economics', 'culture', 'labor', 'national_security', 'ict', 'education'],
        num_rows: 54379
    })
    validation: Dataset({
        features: ['url', 'date', 'title', 'body_text', 'politics', 'human_rights', 'quality_of_life', 'international', 'social', 'environment', 'economics', 'culture', 'labor', 'national_security', 'ict', 'education'],
        num_rows: 6721
    })
    test: Dataset({
        features: ['url', 'date', 'title', 'body_text', 'politics', 'human_rights', 'quality_of_life', 'international', 'social', 'environment', 'economics', 'culture', 'labor', 'national_security', 'ict', 'education'],
        num_rows: 6789
    })
})

In [102]:
#feature labels
# Build train_df / valid_df / test_df with the text in a 'texts' column, followed
# by either a single 'labels' column or one column per entry of args.label_cols.
if not args.is_multilabel:
    is_enth = args.dataset_name_or_path == 'generated_reviews_enth'

    #x
    texts_train = dataset['train'][args.feature_col]
    texts_valid = dataset['validation'][args.feature_col]
    texts_test = dataset['test'][args.feature_col]
    if is_enth:
        # for this dataset each feature entry is indexed by 'th' to get the text
        texts_train = [i['th'] for i in texts_train]
        texts_valid = [i['th'] for i in texts_valid]
        texts_test = [i['th'] for i in texts_test]

    #y
    labels_train = dataset['train'][args.label_col]
    labels_valid = dataset['validation'][args.label_col]
    labels_test = dataset['test'][args.label_col]
    if is_enth and args.label_col == 'review_star':
        # shift the star ratings down by one
        labels_train = [i-1 for i in labels_train]
        labels_valid = [i-1 for i in labels_valid]
        labels_test = [i-1 for i in labels_test]

    #df
    train_df = pd.DataFrame({'texts': texts_train, 'labels': labels_train})
    valid_df = pd.DataFrame({'texts': texts_valid, 'labels': labels_valid})
    test_df = pd.DataFrame({'texts': texts_test, 'labels': labels_test})
else:
    # columns to pull from each split, and the names they get afterwards
    source_cols = [args.feature_col] + args.label_cols
    output_cols = ['texts'] + args.label_cols

    train_df = pd.DataFrame(dataset['train'])[source_cols]
    valid_df = pd.DataFrame(dataset['validation'])[source_cols]
    test_df = pd.DataFrame(dataset['test'])[source_cols]

    train_df.columns = output_cols
    valid_df.columns = output_cols
    test_df.columns = output_cols

train_df.shape, valid_df.shape, test_df.shape

((54379, 13), (6721, 13), (6789, 13))

In [10]:
# Tokenizer configured with the Thai tokenizer function and the Thai pre/post rules
tt = Tokenizer(tok_func=ThaiTokenizer, lang='th', pre_rules=pre_rules_th, post_rules=post_rules_th)

# vocab=None: no vocabulary is handed to the numericalizer at this point
tokenize_step = TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False)
numericalize_step = NumericalizeProcessor(vocab=None, max_vocab=60000, min_freq=3)
processor = [tokenize_step, numericalize_step]

# language-model databunch built from the 'texts' column of the train/valid frames
lm_train = TextList.from_df(train_df, args.output_dir, cols=["texts"], processor=processor)
lm_valid = TextList.from_df(valid_df, args.output_dir, cols=["texts"], processor=processor)
lm_lists = ItemLists(args.output_dir, train=lm_train, valid=lm_valid)
data_lm = lm_lists.label_for_lm().databunch(bs=args.batch_size)

data_lm.sanity_check()
# saved so later cells can reload it with load_data and reuse its vocab
data_lm.save(f'{args.dataset_name_or_path}_lm.pkl')

In [11]:
# AWD_LSTM architecture / dropout settings for the language model
config = {
    'emb_sz': 400, 'n_hid': 1550, 'n_layers': 4, 'pad_token': 1, 'qrnn': False,
    'tie_weights': True, 'out_bias': True,
    'output_p': 0.25, 'hidden_p': 0.1, 'input_p': 0.2, 'embed_p': 0.02, 'weight_p': 0.15,
}
# extra keyword arguments forwarded to language_model_learner
trn_args = {'drop_mult': 1., 'clip': 0.12, 'alpha': 2, 'beta': 1}

# pretrained=False here: the weights come from the explicit load_pretrained call below
learn = language_model_learner(
    data_lm,
    AWD_LSTM,
    config=config,
    pretrained=False,
    **trn_args,
)

#load pretrained models
learn.load_pretrained(**THWIKI_LSTM);

In [12]:
# First language-model fine-tuning stage: one one-cycle epoch after freeze_to(-1).
# NOTE(review): freeze_to(-1) presumably leaves only the last layer group trainable — confirm against fastai.
print("training frozen")
learn.freeze_to(-1)
learn.fit_one_cycle(1, 1e-2, moms=(0.8, 0.7))

training frozen


epoch,train_loss,valid_loss,accuracy,time
0,4.770234,4.421361,0.318327,00:28


In [13]:
#train unfrozen
# Second language-model fine-tuning stage: unfreeze, then five one-cycle epochs at 1e-3.
print("training unfrozen")
learn.unfreeze()
learn.fit_one_cycle(5, 1e-3, moms=(0.8, 0.7))

training unfrozen


epoch,train_loss,valid_loss,accuracy,time
0,4.347134,4.142264,0.347872,00:38
1,4.150784,3.989359,0.359503,00:39
2,3.950324,3.895626,0.370871,00:39
3,3.784429,3.858943,0.37453,00:39
4,3.709645,3.854859,0.374904,00:39


In [14]:
# Save the fine-tuned encoder under the name "lm_enc"; the classifier cell reloads it with load_encoder("lm_enc").
learn.save_encoder("lm_enc")

In [17]:
#lm data
# reload the saved language-model databunch; its vocab is reused for the classifier
data_lm = load_data(args.output_dir, f"{args.dataset_name_or_path}_lm.pkl")
data_lm.sanity_check()

#classification data
tt = Tokenizer(tok_func=ThaiTokenizer, lang="th", pre_rules=pre_rules_th, post_rules=post_rules_th)
processor = [
    TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),
    NumericalizeProcessor(vocab=data_lm.vocab, max_vocab=60000, min_freq=3),
]

# multilabel: every column after 'texts' is a target; otherwise the single "labels" column
label_spec = list(train_df.columns[1:]) if args.is_multilabel else "labels"

cls_train = TextList.from_df(train_df, args.output_dir, cols=["texts"], processor=processor)
cls_valid = TextList.from_df(valid_df, args.output_dir, cols=["texts"], processor=processor)
data_cls = (
    ItemLists(args.output_dir, train=cls_train, valid=cls_valid)
    .label_from_df(label_spec)
    .databunch(bs=args.batch_size)
)
data_cls.sanity_check()
print(len(data_cls.vocab.itos))

11856


In [21]:
#model
# AWD_LSTM architecture / dropout settings for the classifier
config = {
    'emb_sz': 400, 'n_hid': 1550, 'n_layers': 4, 'pad_token': 1, 'qrnn': False,
    'output_p': 0.4, 'hidden_p': 0.2, 'input_p': 0.6, 'embed_p': 0.1, 'weight_p': 0.5,
}
# extra keyword arguments forwarded to text_classifier_learner
trn_args = {'bptt': 70, 'drop_mult': 0.7, 'alpha': 2, 'beta': 1, 'max_len': 500}

learn = text_classifier_learner(
    data_cls,
    AWD_LSTM,
    config=config,
    pretrained=False,
    **trn_args,
)
#load pretrained finetuned model
# "lm_enc" is the encoder saved by the language-model cells via save_encoder
learn.load_encoder("lm_enc");

In [22]:
#train
# metric watched by the checkpoint callback during the final stage
monitor_metric = 'valid_loss' if args.is_multilabel else 'accuracy'

# divisor applied to the upper learning rate to get the lower end of each slice
lr_decay = 2.6 ** 4

# warm-up schedule: (freeze_to argument, learning rate) — one epoch per stage
warmup_stages = [
    (-1, 2e-2),
    (-2, slice(1e-2 / lr_decay, 1e-2)),
    (-3, slice(5e-3 / lr_decay, 5e-3)),
]
for freeze_idx, stage_lr in warmup_stages:
    learn.freeze_to(freeze_idx)
    learn.fit_one_cycle(1, stage_lr, moms=(0.8, 0.7))

# final stage: everything unfrozen, five epochs, saving 'bestmodel' whenever the monitored metric improves
learn.unfreeze()
best_saver = SaveModelCallback(learn, every='improvement', monitor=monitor_metric, name='bestmodel')
learn.fit_one_cycle(5, slice(1e-3 / lr_decay, 1e-3), moms=(0.8, 0.7),
                   callbacks=[best_saver])

epoch,train_loss,valid_loss,time
0,0.263054,0.240299,00:20


epoch,train_loss,valid_loss,time
0,0.246976,0.22738,00:22


epoch,train_loss,valid_loss,time
0,0.234152,0.217878,00:32


epoch,train_loss,valid_loss,time
0,0.224458,0.214642,00:53
1,0.219356,0.211842,00:51
2,0.213312,0.2097,00:51
3,0.206874,0.208715,00:52
4,0.203129,0.208968,00:50


Better model found at epoch 0 with valid_loss value: 0.2146424502134323.
Better model found at epoch 1 with valid_loss value: 0.21184176206588745.
Better model found at epoch 2 with valid_loss value: 0.20970037579536438.
Better model found at epoch 3 with valid_loss value: 0.20871511101722717.


In [103]:
#thresholding
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

def best_threshold(y, probs):
    """Find the decision threshold in 0.01 .. 0.99 (step 0.01) with the highest F1.

    Args:
        y: ground-truth labels for a single label column, passed to
            ``f1_score`` as ``y_true``.
        probs: numpy array of predicted scores aligned with ``y``; a score
            strictly greater than the threshold counts as a positive prediction.

    Returns:
        ``(threshold, f1)`` for the best candidate. When several thresholds
        reach the same best F1, the lowest one is returned, so the result does
        not depend on sort-algorithm tie ordering.
    """
    candidates = [th / 100 for th in range(1, 100)]
    scores = [f1_score(y, (probs > th).astype(int)) for th in candidates]
    # max() returns the first maximal element, i.e. the lowest tied threshold
    best_idx = max(range(len(candidates)), key=scores.__getitem__)
    return candidates[best_idx], scores[best_idx]

# Multilabel only: rebuild the validation databunch, reload the 'bestmodel'
# checkpoint and tune one decision threshold per label on the validation predictions.
if args.is_multilabel:
    #databunch
    data_lm = load_data(args.output_dir, f"{args.dataset_name_or_path}_lm.pkl")
    data_lm.sanity_check()

    #classification data
    tt = Tokenizer(tok_func=ThaiTokenizer, lang="th", pre_rules=pre_rules_th, post_rules=post_rules_th)
    processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),
                NumericalizeProcessor(vocab=data_lm.vocab, max_vocab=60000, min_freq=3)]

    # This code only runs when args.is_multilabel is true, so a single-label
    # ("labels" column) variant of the databunch could never be reached here.
    data_cls = (ItemLists(args.output_dir, 
                train=TextList.from_df(train_df, args.output_dir, cols=["texts"], processor=processor),
                valid=TextList.from_df(valid_df, args.output_dir, cols=["texts"], processor=processor),)
        .label_from_df(list(train_df.columns[1:]))
        .databunch(bs=args.batch_size)
        )
    data_cls.sanity_check()
    print(len(data_cls.vocab.itos))

    #model
    config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1, qrnn=False,
                 output_p=0.4, hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5)
    trn_args = dict(bptt=70, drop_mult=0.7, alpha=2, beta=1, max_len=500)

    learn = text_classifier_learner(data_cls, AWD_LSTM, config=config, pretrained=False, **trn_args)
    learn.load("bestmodel");


    #get predictions
    probs, y_true, loss = learn.get_preds(ds_type = DatasetType.Valid, ordered=True, with_loss=True)
    y_true = y_true.numpy()
    probs = probs.numpy()
    # best_threshold returns (threshold, f1); keep only the threshold, one per label column
    best_ths = [best_threshold(y_true[:,i], probs[:,i])[0] for i in range(y_true.shape[1])]

11856


In [104]:
#test
#databunch
data_lm = load_data(args.output_dir, f"{args.dataset_name_or_path}_lm.pkl")
data_lm.sanity_check()

#classification data
tt = Tokenizer(tok_func=ThaiTokenizer, lang="th", pre_rules=pre_rules_th, post_rules=post_rules_th)
processor = [
    TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),
    NumericalizeProcessor(vocab=data_lm.vocab, max_vocab=60000, min_freq=3),
]

# multilabel: every column after 'texts' is a target; otherwise the single "labels" column
label_spec = list(train_df.columns[1:]) if args.is_multilabel else "labels"

# test_df is placed in the `valid` slot so that get_preds(DatasetType.Valid) below scores it
eval_train = TextList.from_df(train_df, args.output_dir, cols=["texts"], processor=processor)
eval_test = TextList.from_df(test_df, args.output_dir, cols=["texts"], processor=processor)
data_cls = (
    ItemLists(args.output_dir, train=eval_train, valid=eval_test)
    .label_from_df(label_spec)
    .databunch(bs=args.batch_size)
)
data_cls.sanity_check()
print(len(data_cls.vocab.itos))

#model
config = {
    'emb_sz': 400, 'n_hid': 1550, 'n_layers': 4, 'pad_token': 1, 'qrnn': False,
    'output_p': 0.4, 'hidden_p': 0.2, 'input_p': 0.6, 'embed_p': 0.1, 'weight_p': 0.5,
}
trn_args = {'bptt': 70, 'drop_mult': 0.7, 'alpha': 2, 'beta': 1, 'max_len': 500}

learn = text_classifier_learner(data_cls, AWD_LSTM, config=config, pretrained=False, **trn_args)
# reload the checkpoint written under the name 'bestmodel' during training
learn.load("bestmodel");


#get predictions
probs, y_true, loss = learn.get_preds(ds_type=DatasetType.Valid, ordered=True, with_loss=True)
# class list of the training set; the metrics cell uses it to map indices to classes
classes = learn.data.train_ds.classes

11856


In [105]:
# Turn the test-set predictions into a one-row `results` frame.
# NOTE: this cell rebinds probs / y_true to numpy arrays, so it must be preceded
# by a fresh run of the prediction cell; re-running it alone fails on .numpy().
if args.is_multilabel:
    probs = probs.numpy()
    y_true = y_true.numpy()
    # binarise each label column with its own entry of best_ths
    preds = np.zeros((y_true.shape[0], y_true.shape[1]))
    for i in range(y_true.shape[1]):
        preds[:,i] = (probs[:,i] > best_ths[i]).astype(int)
        
    #micro
    # every (row, label) cell is pooled into one flat binary problem;
    # 'accuracy' is therefore the fraction of matching cells, not of fully-correct rows
    flat_true = y_true.reshape(-1)
    flat_pred = preds.reshape(-1)
    micro_df = pd.DataFrame.from_dict({'accuracy': (preds==y_true).mean(),
    'f1_micro':f1_score(flat_true,flat_pred),
    'precision_micro':precision_score(flat_true,flat_pred),
    'recall_micro':recall_score(flat_true,flat_pred)}, orient='index').transpose()
    
    #macro
    # score each label column separately, then take the unweighted mean across labels
    test_performances = [
        {
            'f1_macro': f1_score(y_true[:,i],preds[:,i]),
            'precision_macro': precision_score(y_true[:,i],preds[:,i]),
            'recall_macro': recall_score(y_true[:,i],preds[:,i]),
        }
        for i in range(y_true.shape[1])
    ]
    macro_df = pd.DataFrame(pd.DataFrame(test_performances).mean()).transpose()
    
    #test performance
    # axis must be passed by keyword: newer pandas rejects positional arguments after `objs`
    results = pd.concat([micro_df,macro_df],axis=1)
else:
    # map label indices and argmax predictions through `classes`
    y_true = np.array([classes[i] for i in y_true.numpy()])
    preds = np.array([classes[i] for i in probs.argmax(1).numpy()])
    prob = probs.numpy()

    # minimal container exposing label_ids / predictions
    # NOTE(review): presumably these are the attributes classification_metrics reads — confirm
    class Preds:
        label_ids = y_true
        predictions = prob

    results = pd.DataFrame.from_dict(classification_metrics(Preds),orient='index').transpose()

In [106]:
# Bare expression: displays the `results` metrics frame as the cell output.
results

Unnamed: 0,accuracy,f1_micro,precision_micro,recall_micro,f1_macro,precision_macro,recall_macro
0,0.899642,0.662065,0.608263,0.726308,0.602147,0.566918,0.651016
