In [1]:
#imports
from datasets import load_dataset
from thai2transformers.metrics import classification_metrics
from pythainlp.ulmfit import process_thai, THWIKI_LSTM, ThaiTokenizer, pre_rules_th, post_rules_th
import pandas as pd
from fastai.text import *
from fastai.callbacks import CSVLogger, SaveModelCallback
import os

In [2]:
#parameters
class Args:
    """Run configuration read by the cells of this notebook."""

    # Which dataset to load, and which of its columns hold the inputs / targets.
    dataset_name_or_path: str = 'generated_reviews_enth'
    feature_col: str = 'translation'
    label_col: str = 'review_star'

    # Directory that is created up front and handed to the fastai data objects as their path.
    output_dir: str = 'generated_reviews_enth_review_star'

    # Batch size passed to every `.databunch(bs=...)` call.
    batch_size: int = 64

    # NOTE(review): no cell visible in this notebook reads `seed` -- confirm whether
    # seeding numpy/torch/random was intended.
    seed: int = 1412


args = Args()

In [3]:
# Make sure the output directory exists before anything is written to it.
# exist_ok=True lets os.makedirs perform the existence check itself, so there is
# no gap between "check" and "create" (the previous os.path.exists guard could
# race with another process) and re-running the cell is a no-op.
# Unlike the old guard, this raises FileExistsError if the path exists but is a
# regular file, instead of silently continuing with an unusable output_dir.
os.makedirs(args.output_dir, exist_ok=True)

In [4]:
# Load the dataset named in the run configuration with `datasets.load_dataset`.
# The bare `dataset` expression on the last line makes the notebook display the
# returned object (its splits, feature names and row counts).
dataset = load_dataset(args.dataset_name_or_path)
dataset

Reusing dataset generated_reviews_enth (/home/admin/.cache/huggingface/datasets/generated_reviews_enth/generated_reviews_enth/1.0.0/6118765c366071f4d7a4504d9278e331778d23edae49951eab97a174effe0b76)


DatasetDict({
    train: Dataset({
        features: ['translation', 'review_star', 'correct'],
        num_rows: 141369
    })
    validation: Dataset({
        features: ['translation', 'review_star', 'correct'],
        num_rows: 15708
    })
    test: Dataset({
        features: ['translation', 'review_star', 'correct'],
        num_rows: 17453
    })
})

In [5]:
# Only for `wongnai_reviews`: hold out 10% of the train split (shuffled, fixed
# seed 2020) and use it as the validation split. Every other dataset passes
# through this cell unchanged.
# NOTE(review): presumably wongnai_reviews has no validation split of its own -- confirm.
if args.dataset_name_or_path == 'wongnai_reviews':
    train_val_split = dataset['train'].train_test_split(
        test_size=0.1,
        shuffle=True,
        seed=2020,
    )
    dataset['train'], dataset['validation'] = train_val_split['train'], train_val_split['test']
dataset

DatasetDict({
    train: Dataset({
        features: ['translation', 'review_star', 'correct'],
        num_rows: 141369
    })
    validation: Dataset({
        features: ['translation', 'review_star', 'correct'],
        num_rows: 15708
    })
    test: Dataset({
        features: ['translation', 'review_star', 'correct'],
        num_rows: 17453
    })
})

In [6]:
#x
# Collect the input texts of the train / validation / test splits.
# For `generated_reviews_enth` each entry of the feature column is indexed by
# 'th' and only that value is kept; for any other dataset the feature column is
# taken as-is.
if args.dataset_name_or_path == 'generated_reviews_enth':
    texts_train, texts_valid, texts_test = (
        [pair['th'] for pair in dataset[split][args.feature_col]]
        for split in ('train', 'validation', 'test')
    )
else:
    texts_train, texts_valid, texts_test = (
        dataset[split][args.feature_col]
        for split in ('train', 'validation', 'test')
    )

In [7]:
#y
# Collect the targets of the train / validation / test splits.
# For the `review_star` column of `generated_reviews_enth` every label is
# shifted down by one (presumably turning a 1-based star rating into a 0-based
# class id -- confirm); otherwise the label column is taken as-is.
shift_labels = (
    args.dataset_name_or_path == 'generated_reviews_enth'
    and args.label_col == 'review_star'
)
if shift_labels:
    labels_train, labels_valid, labels_test = (
        [star - 1 for star in dataset[split][args.label_col]]
        for split in ('train', 'validation', 'test')
    )
else:
    labels_train, labels_valid, labels_test = (
        dataset[split][args.label_col]
        for split in ('train', 'validation', 'test')
    )

In [8]:
#df
# Build one two-column frame ('texts', 'labels') per split, then show the three
# shapes as a quick size check.
train_df, valid_df, test_df = (
    pd.DataFrame({'texts': split_texts, 'labels': split_labels})
    for split_texts, split_labels in (
        (texts_train, labels_train),
        (texts_valid, labels_valid),
        (texts_test, labels_test),
    )
)
train_df.shape, valid_df.shape, test_df.shape

((141369, 2), (15708, 2), (17453, 2))

In [14]:
# Tokenizer configured with the Thai tokenizer and the pre/post rules imported
# from pythainlp.ulmfit.
tt = Tokenizer(
    tok_func=ThaiTokenizer,
    lang='th',
    pre_rules=pre_rules_th,
    post_rules=post_rules_th,
)

# Two-step text pipeline: tokenize, then numericalize. No vocabulary is supplied
# here (vocab=None); the max_vocab / min_freq limits are passed alongside.
processor = [
    TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),
    NumericalizeProcessor(vocab=None, max_vocab=60000, min_freq=3),
]

# Language-model data: the 'texts' column of the train and validation frames,
# both run through the same processor list.
lm_splits = ItemLists(
    args.output_dir,
    train=TextList.from_df(train_df, args.output_dir, cols=["texts"], processor=processor),
    valid=TextList.from_df(valid_df, args.output_dir, cols=["texts"], processor=processor),
)
data_lm = lm_splits.label_for_lm().databunch(bs=args.batch_size)
data_lm.sanity_check()

# Saved under this file name; the classification-data cell reloads it with
# load_data(args.output_dir, ...) using the same name.
data_lm.save(f'{args.dataset_name_or_path}_lm.pkl')

In [15]:
# Architecture / dropout settings handed to the AWD_LSTM language model.
config = {
    'emb_sz': 400,
    'n_hid': 1550,
    'n_layers': 4,
    'pad_token': 1,
    'qrnn': False,
    'tie_weights': True,
    'out_bias': True,
    'output_p': 0.25,
    'hidden_p': 0.1,
    'input_p': 0.2,
    'embed_p': 0.02,
    'weight_p': 0.15,
}
# Extra keyword arguments forwarded to language_model_learner.
trn_args = {'drop_mult': 1., 'clip': 0.12, 'alpha': 2, 'beta': 1}

# pretrained=False: the learner is created without fastai's own pretrained
# weights; the weights described by pythainlp's THWIKI_LSTM are loaded below.
learn = language_model_learner(
    data_lm,
    AWD_LSTM,
    config=config,
    pretrained=False,
    **trn_args,
)

#load pretrained models
learn.load_pretrained(**THWIKI_LSTM);

In [16]:
# First language-model fine-tuning stage: one one-cycle epoch at max LR 1e-2.
# freeze_to(-1) follows fastai v1's convention of freezing every layer group
# before the last one, so only the final group is trained here.
print("training frozen")
learn.freeze_to(-1)
learn.fit_one_cycle(1, 1e-2, moms=(0.8, 0.7))

training frozen


epoch,train_loss,valid_loss,accuracy,time
0,3.901398,3.667264,0.314096,04:27


In [17]:
#train unfrozen
# Second language-model fine-tuning stage: unfreeze the whole model and run
# five one-cycle epochs at a lower max LR (1e-3) than the frozen stage.
print("training unfrozen")
learn.unfreeze()
learn.fit_one_cycle(5, 1e-3, moms=(0.8, 0.7))

training unfrozen


epoch,train_loss,valid_loss,accuracy,time
0,3.562119,3.389167,0.347284,06:00
1,3.425128,3.265404,0.362213,06:01
2,3.312375,3.198505,0.370227,06:01
3,3.235396,3.164119,0.374517,06:01
4,3.184817,3.157655,0.375286,06:01


In [18]:
# Save the fine-tuned language model's encoder under the name "lm_enc"; the
# classifier cell later restores it with learn.load_encoder("lm_enc").
learn.save_encoder("lm_enc")

In [9]:
#lm data
# Reload the language-model databunch saved earlier so its vocabulary can be
# reused for the classifier.
data_lm = load_data(args.output_dir, f"{args.dataset_name_or_path}_lm.pkl")
data_lm.sanity_check()

#classification data
tt = Tokenizer(
    tok_func=ThaiTokenizer,
    lang="th",
    pre_rules=pre_rules_th,
    post_rules=post_rules_th,
)
# Same tokenize -> numericalize pipeline as for the language model, but the
# numericalizer is given the language model's vocabulary (data_lm.vocab).
processor = [
    TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),
    NumericalizeProcessor(vocab=data_lm.vocab, max_vocab=60000, min_freq=3),
]

# Train / validation texts, labelled from the "labels" column of each frame.
cls_splits = ItemLists(
    args.output_dir,
    train=TextList.from_df(train_df, args.output_dir, cols=["texts"], processor=processor),
    valid=TextList.from_df(valid_df, args.output_dir, cols=["texts"], processor=processor),
)
data_cls = cls_splits.label_from_df("labels").databunch(bs=args.batch_size)

data_cls.sanity_check()
# Print the vocabulary size of the classification databunch.
print(len(data_cls.vocab.itos))

16672


In [10]:
#model
# Architecture / dropout settings handed to the AWD_LSTM classifier. The
# emb_sz / n_hid / n_layers / pad_token / qrnn values are the same as those used
# for the language model earlier in the notebook; the dropout values differ.
config = {
    'emb_sz': 400,
    'n_hid': 1550,
    'n_layers': 4,
    'pad_token': 1,
    'qrnn': False,
    'output_p': 0.4,
    'hidden_p': 0.2,
    'input_p': 0.6,
    'embed_p': 0.1,
    'weight_p': 0.5,
}
# Extra keyword arguments forwarded to text_classifier_learner.
trn_args = {'bptt': 70, 'drop_mult': 0.7, 'alpha': 2, 'beta': 1, 'max_len': 500}

learn = text_classifier_learner(
    data_cls,
    AWD_LSTM,
    config=config,
    pretrained=False,
    **trn_args,
)
#load pretrained finetuned model
learn.load_encoder("lm_enc");

In [None]:
#train
# Gradual unfreezing: one epoch each with freeze_to(-1), freeze_to(-2) and
# freeze_to(-3), then five epochs fully unfrozen. From the second stage on the
# learning rate is a slice whose lower bound is the upper bound / 2.6**4.
one_cycle_moms = (0.8, 0.7)
frozen_stages = [
    (-1, 2e-2),
    (-2, slice(1e-2 / (2.6 ** 4), 1e-2)),
    (-3, slice(5e-3 / (2.6 ** 4), 5e-3)),
]
for freeze_idx, stage_lr in frozen_stages:
    learn.freeze_to(freeze_idx)
    learn.fit_one_cycle(1, stage_lr, moms=one_cycle_moms)

learn.unfreeze()
# During the final stage, save a checkpoint named 'bestmodel' whenever the
# monitored 'accuracy' improves.
save_best = SaveModelCallback(learn, every='improvement', monitor='accuracy', name='bestmodel')
learn.fit_one_cycle(
    5,
    slice(1e-3 / (2.6 ** 4), 1e-3),
    moms=one_cycle_moms,
    callbacks=[save_best],
)

epoch,train_loss,valid_loss,accuracy,time
0,1.101346,0.986745,0.586007,02:27


epoch,train_loss,valid_loss,accuracy,time
0,0.947138,0.903645,0.618857,02:38


epoch,train_loss,valid_loss,accuracy,time


In [None]:
#test
# Restore the checkpoint written by SaveModelCallback during training.
learn.load("bestmodel")

#get predictions
# NOTE(review): this cell is labelled "#test" but predictions are requested for
# DatasetType.Valid, and test_df is not passed to any databunch visible in this
# notebook -- confirm which split the reported metrics should come from.
probs, y_true, loss = learn.get_preds(
    ds_type=DatasetType.Valid,
    ordered=True,
    with_loss=True,
)
classes = learn.data.train_ds.classes

# Map class indices (targets, and the arg-max of the predicted scores) back to
# the class values stored on the training dataset.
true_idx = y_true.numpy()
pred_idx = probs.argmax(1).numpy()
y_true = np.array([classes[idx] for idx in true_idx])
preds = np.array([classes[idx] for idx in pred_idx])

# Plain numpy copies of the score matrix and the per-item losses.
prob = probs.numpy()
loss = loss.numpy()

In [None]:
class Preds:
    """Holder for the two attributes handed to classification_metrics.

    The class itself (not an instance) is passed, so both values are stored as
    class attributes.
    """
    label_ids = y_true
    predictions = prob


# Compute the metrics and show them as a single-row table (one column per metric).
metric_dict = classification_metrics(Preds)
pd.DataFrame.from_dict(metric_dict, orient='index').T