# Vietnamese ULMFiT from scratch

In [0]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai import *
from fastai.text import *

In [0]:
# Batch size; the learning rates in later cells are rescaled relative to it.
bs=128
# Pin CUDA work to GPU index 2 (machine-specific; adjust for your hardware).
torch.cuda.set_device(2)
data_path = Config.data_path()

# Wikipedia language code. This notebook pretrains on Vietnamese Wikipedia
# (`viwiki`) and fine-tunes on Vietnamese comments, so the code must be 'vi'.
lang = 'vi'
name = f'{lang}wiki'
path = data_path/name
path.mkdir(exist_ok=True, parents=True)
# Base file names (no extension) for the pretrained LM weights and its vocab;
# reused later via `pretrained_fnames=lm_fns`.
lm_fns = [f'{lang}_wt', f'{lang}_wt_vocab']

### Download data

In [0]:
# Helpers from the local `nlputils` module for obtaining and splitting a Wikipedia dump.
from nlputils import split_wiki,get_wiki

# Presumably downloads/extracts the `lang` Wikipedia dump under `path` — confirm in nlputils.
get_wiki(path,lang)
# Peek at the first four lines of the file at `path/name`.
!head -n4 {path}/{name}

In [0]:
# `dest` is the folder later handed to `TextList.from_folder`; presumably
# `split_wiki` writes the individual articles there — confirm in nlputils.
dest = split_wiki(path,lang)

### Create pretrained model

In [0]:
# Build the language-model databunch step by step from the files under `dest`,
# holding out a random 10% (fixed seed) for validation.
wiki_items = TextList.from_folder(dest)
wiki_split = wiki_items.split_by_rand_pct(0.1, seed=42)
wiki_lm = wiki_split.label_for_lm()
data = wiki_lm.databunch(bs=bs, num_workers=1)

# Persist the processed databunch so it can be reloaded without rebuilding.
data.save(f'{lang}_databunch')

# Display vocabulary size and number of training items.
vocab_size, n_train_items = len(data.vocab.itos), len(data.train_ds)
vocab_size, n_train_items

In [0]:
# Reload the cached databunch. It was built with `TextList.from_folder(dest)` and
# saved under a relative name, so the file lives in the databunch's own folder
# (`dest`), not in `path`; loading from `path` would not find it.
data = load_data(dest, f'{lang}_databunch', bs=bs)

In [0]:
# AWD-LSTM language model with no pretrained weights (pretrained=False),
# then switched to mixed-precision training.
learn = language_model_learner(data, AWD_LSTM, drop_mult=0.5, pretrained=False)
learn = learn.to_fp16()

In [0]:
# Base learning rate, scaled by batch size relative to a reference batch of 48.
base_lr = 1e-2
lr = base_lr * (bs/48)

In [0]:
# Nothing is pretrained here, so make every layer trainable and run
# 10 one-cycle epochs at the scaled learning rate.
learn.unfreeze()
learn.fit_one_cycle(10, lr, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,3.436113,3.491434,0.366925,28:52
1,3.44124,3.544118,0.361326,28:33
2,3.571766,3.556932,0.358438,28:31
3,3.51054,3.519243,0.362278,28:27
4,3.447639,3.44932,0.369404,28:29
5,3.412284,3.406376,0.375022,28:20
6,3.286754,3.255309,0.391874,28:19
7,3.172497,3.128522,0.406803,28:37
8,3.126867,3.025249,0.419882,28:36
9,3.128793,2.991077,0.424622,28:39


Save the pretrained model and vocab:

In [0]:
# List what currently sits in the working directory before saving the model there.
path.ls()

[PosixPath('/home/jhoward/data/viwiki/docs'),
 PosixPath('/home/jhoward/data/viwiki/viwiki-latest-pages-articles.xml'),
 PosixPath('/home/jhoward/data/viwiki/vi_wt87_vocab.pkl'),
 PosixPath('/home/jhoward/data/viwiki/extract'),
 PosixPath('/home/jhoward/data/viwiki/tmp'),
 PosixPath('/home/jhoward/data/viwiki/test.csv'),
 PosixPath('/home/jhoward/data/viwiki/viwiki'),
 PosixPath('/home/jhoward/data/viwiki/log'),
 PosixPath('/home/jhoward/data/viwiki/train.csv')]

In [0]:
# Store the pretrained weights and vocab under `path/models`, using the two
# base names from `lm_fns` so the fine-tuning learner can find them later.
mdl_path = path/'models'
mdl_path.mkdir(exist_ok=True)

weights_name, vocab_name = lm_fns
# Convert back to full precision and drop optimizer state before saving.
learn.to_fp32().save(mdl_path/weights_name, with_opt=False)
learn.data.vocab.save(mdl_path/f'{vocab_name}.pkl')

## Vietnamese sentiment analysis

### Language model

- [Data](https://github.com/ngxbac/aivivn_phanloaisacthaibinhluan/tree/master/data)
- [Competition details](https://www.aivivn.com/contests/1)
- Top 3 f1 scores: 0.900, 0.897, 0.897

In [0]:
# Labelled training comments. Missing comments are replaced by the literal
# string 'NA' (presumably because `read_csv` parsed that text as missing — confirm).
train_df = pd.read_csv(path/'train.csv')
train_df['comment'] = train_df['comment'].fillna('NA')
train_df.head()

Unnamed: 0,id,comment,label
0,train_000000,Dung dc sp tot cam on \nshop Đóng gói sản phẩm...,0
1,train_000001,Chất lượng sản phẩm tuyệt vời . Son mịn nhưng...,0
2,train_000002,Chất lượng sản phẩm tuyệt vời nhưng k có hộp ...,0
3,train_000003,:(( Mình hơi thất vọng 1 chút vì mình đã kỳ vọ...,1
4,train_000004,Lần trước mình mua áo gió màu hồng rất ok mà đ...,1


In [0]:
# Unlabelled test comments, with missing comments filled the same way as the training set.
test_df = pd.read_csv(path/'test.csv')
test_df['comment'] = test_df['comment'].fillna('NA')
test_df.head()

Unnamed: 0,id,comment
0,test_000000,Chưa dùng thử nên chưa biết
1,test_000001,Không đáng tiềnVì ngay đợt sale nên mới mua n...
2,test_000002,Cám ơn shop. Đóng gói sản phẩm rất đẹp và chắc...
3,test_000003,Vải đẹp.phom oki luôn.quá ưng
4,test_000004,Chuẩn hàng đóng gói đẹp


In [0]:
# Stack train and test rows; only the `comment` column is used for
# language-model fine-tuning, so the missing test labels do not matter.
df = pd.concat([train_df,test_df], sort=False)

In [0]:
# Language-model databunch over all comments (train + test), with a random
# 10% validation split under a fixed seed.
comment_items = TextList.from_df(df, path, cols='comment')
comment_split = comment_items.split_by_rand_pct(0.1, seed=42)
comment_lm = comment_split.label_for_lm()
data_lm = comment_lm.databunch(bs=bs, num_workers=1)

In [0]:
# Start from the Wikipedia language model saved earlier: `lm_fns` holds the
# base names of the weights and vocab files written under `path/models`.
learn_lm = language_model_learner(data_lm, AWD_LSTM, pretrained_fnames=lm_fns, drop_mult=1.0)

In [0]:
# Fine-tuning learning rate, scaled by batch size relative to a reference batch of 48.
finetune_base_lr = 1e-3
lr = finetune_base_lr * (bs/48)

In [0]:
# Two epochs at 10x the base rate before the model is unfrozen in the next cell.
learn_lm.fit_one_cycle(2, lr*10, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,4.97508,4.138585,0.317773,00:07
1,4.408635,4.025489,0.326423,00:07


In [0]:
# Unfreeze everything and fine-tune the whole language model for 8 epochs at the base rate.
learn_lm.unfreeze()
learn_lm.fit_one_cycle(8, lr, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,4.142114,3.928278,0.33623,00:09
1,4.010835,3.793583,0.349972,00:09
2,3.873617,3.694702,0.35724,00:09
3,3.761377,3.632186,0.364648,00:09
4,3.679017,3.595601,0.366964,00:09
5,3.614548,3.576386,0.369224,00:09
6,3.575895,3.567496,0.370285,00:09
7,3.560278,3.566525,0.370173,00:10


In [0]:
# Save the full fine-tuned language model ...
learn_lm.save(f'{lang}fine_tuned')
# ... and its encoder separately; the classifier loads this file via `load_encoder`.
learn_lm.save_encoder(f'{lang}fine_tuned_enc')

### Classifier

In [0]:
# Classifier databunch over the labelled comments only. It reuses the
# language-model vocab so the fine-tuned encoder can be loaded into the classifier.
labelled_items = TextList.from_df(train_df, path, vocab=data_lm.vocab, cols='comment')
labelled_split = labelled_items.split_by_rand_pct(0.1, seed=42)
labelled_lists = labelled_split.label_from_df(cols='label')
data_clas = labelled_lists.databunch(bs=bs, num_workers=1)

# Persist it for the reload cells further down.
data_clas.save(f'{lang}_textlist_class')

In [0]:
# Reload the classifier databunch saved in the previous cell (lets this section be re-run on its own).
data_clas = load_data(path, f'{lang}_textlist_class', bs=bs, num_workers=1)

In [0]:
from sklearn.metrics import f1_score

In [0]:
@np_func
def f1(inp,targ):
    "F1 score (sklearn `f1_score` defaults) of the arg-max class of `inp` against `targ`."
    predicted = np.argmax(inp, axis=-1)
    return f1_score(targ, predicted)

In [0]:
# AWD-LSTM classifier reporting accuracy and F1, switched to mixed precision.
learn_c = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5, metrics=[accuracy,f1])
learn_c = learn_c.to_fp16()

# Load the encoder fine-tuned on the comments, then freeze before the first training stage.
learn_c.load_encoder(f'{lang}fine_tuned_enc')
learn_c.freeze()

In [0]:
# Classifier learning rate, scaled by batch size relative to a reference batch of 48.
clas_base_lr = 2e-2
lr = clas_base_lr * (bs/48)

In [0]:
# Stage 1: two epochs with the learner still frozen (see `learn_c.freeze()` above).
learn_c.fit_one_cycle(2, lr, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,_inner,time
0,0.33815,0.275298,0.899876,0.87843,00:02
1,0.302302,0.245949,0.902985,0.877226,00:02


In [0]:
# Stage 1 (continued): a second identical two-epoch cycle, still frozen.
learn_c.fit_one_cycle(2, lr, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,_inner,time
0,0.321768,0.255457,0.899254,0.871367,00:02
1,0.305934,0.250888,0.894901,0.872021,00:02


In [0]:
# Stage 2: `freeze_to(-2)` — presumably leaves the last two layer groups trainable (confirm in fastai docs).
learn_c.freeze_to(-2)
# Learning rates range from lr/2.6**4 up to lr.
learn_c.fit_one_cycle(2, slice(lr/(2.6**4),lr), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,_inner,time
0,0.300939,0.26108,0.893657,0.866201,00:03
1,0.26379,0.220207,0.906716,0.886115,00:03


In [0]:
# Stage 3: `freeze_to(-3)` — presumably leaves the last three layer groups trainable (confirm in fastai docs).
learn_c.freeze_to(-3)
# Same slice shape as the previous stage, with both ends halved.
learn_c.fit_one_cycle(2, slice(lr/2/(2.6**4),lr/2), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,_inner,time
0,0.282888,0.238203,0.905473,0.886483,00:04
1,0.248599,0.216489,0.918532,0.90155,00:04


In [0]:
# Final stage: unfreeze everything and run one epoch at a tenth of the stage-2 rates.
learn_c.unfreeze()
learn_c.fit_one_cycle(1, slice(lr/10/(2.6**4),lr/10), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,_inner,time
0,0.201508,0.217176,0.91107,0.890084,00:05


In [0]:
# Save the trained forward classifier; the ensemble section reloads it by this name.
learn_c.save(f'{lang}clas')

Competition top 3 f1 scores: 0.900, 0.897, 0.897. The winner used an ensemble of 4 models: TextCNN, VDCNN, HARNN, and SARNN.

## Ensemble

In [0]:
# Rebuild the forward classifier from its saved databunch and weights.
data_clas = load_data(path, f'{lang}_textlist_class', bs=bs, num_workers=1)
learn_c = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5, metrics=[accuracy,f1])
learn_c = learn_c.to_fp16()
# Trailing semicolon suppresses the learner repr in the cell output.
learn_c.load(f'{lang}clas', purge=False);

In [0]:
# Predictions and targets from the forward model; `ordered=True` is requested so
# they can later be combined element-wise with the backward model's predictions.
preds,targs = learn_c.get_preds(ordered=True)
accuracy(preds,targs),f1(preds,targs)

(tensor(0.9111), tensor(0.8952))

In [0]:
# Backward (reversed-text) classifier, loaded with `backwards=True`.
# NOTE(review): neither the '{lang}_textlist_class_bwd' databunch nor the
# '{lang}clas_bwd' weights are created in the cells visible in this notebook;
# they must already exist on disk (e.g. from a separate backward training run),
# otherwise this cell fails on a fresh run.
data_clas_bwd = load_data(path, f'{lang}_textlist_class_bwd', bs=bs, num_workers=1, backwards=True)
learn_c_bwd = text_classifier_learner(data_clas_bwd, AWD_LSTM, drop_mult=0.5, metrics=[accuracy,f1]).to_fp16()
learn_c_bwd.load(f'{lang}clas_bwd', purge=False);

In [0]:
# Predictions and targets from the backward model, requested with the same `ordered=True` flag.
preds_b,targs_b = learn_c_bwd.get_preds(ordered=True)
accuracy(preds_b,targs_b),f1(preds_b,targs_b)

(tensor(0.9092), tensor(0.8957))

In [0]:
# Ensemble by taking the element-wise mean of forward and backward predictions.
preds_avg = (preds+preds_b)/2

In [0]:
# Score the ensemble against the backward run's targets.
# NOTE(review): assumes `targs_b` lines up with `targs` (same validation items in the same order) — confirm.
accuracy(preds_avg,targs_b),f1(preds_avg,targs_b)

(tensor(0.9154), tensor(0.9016))