# Main

In [None]:
import ast
import csv
import os

import numpy as np
import pandas as pd
import torch

from tqdm import tqdm
from simpletransformers.ner import NERModel, NERArgs

In [2]:
# Ask PyTorch whether a CUDA device is usable; the bare name on the last line
# displays the resulting boolean as the cell output.
cuda_available = torch.cuda.is_available()
cuda_available

True

## Load data in simpletransformers format

In [4]:
# Read the training and validation splits (one token per row) in one pass;
# the generator yields the frames in the same order as the paths.
train_df, val_df = (
    pd.read_csv(csv_path)
    for csv_path in ('csv_data/lst20_train.csv', 'csv_data/lst20_val.csv')
)

In [5]:
# Peek at the first training rows: one token per row with its sentence_id,
# word and POS label.
train_df.head()

Unnamed: 0,sentence_id,words,labels
0,0,ธ.กสิกรไทย,NN
1,0,จับ,VV
2,0,มือ,NN
3,0,เวอริไซน์,NN
4,0,,PU


In [6]:
# Peek at the first validation rows (same columns as the training frame).
val_df.head()

Unnamed: 0,sentence_id,words,labels
0,73198,',PU
1,73198,ปอย,NN
2,73198,',PU
3,73198,โต้,VV
4,73198,,PU


In [7]:
# Sanity check: the word that looks empty in row 4 of the preview is a literal
# single-space token, not a missing value.
train_df.iloc[4]['words'] == ' '

True

## Model & Training

In [8]:
# pos_list.txt holds a Python list literal with the POS tag names.
# ast.literal_eval only accepts literals, so unlike eval() it cannot run
# arbitrary code that might be embedded in the file.
with open("ss3-final-nlp-pos-tagging/pos_list.txt", 'r', encoding='utf-8') as f:
    labels_list = ast.literal_eval(f.read())
print(len(labels_list))
print(labels_list)

16
['AJ', 'AV', 'AX', 'CC', 'CL', 'FX', 'IJ', 'NG', 'NN', 'NU', 'PA', 'PR', 'PS', 'PU', 'VV', 'XX']


In [56]:
# Training / evaluation configuration shared by the NERModel instances below.
model_args = NERArgs()
model_args.labels_list = labels_list
model_args.max_seq_length = 180
model_args.manual_seed = 42  # fix the RNG seed so training runs are repeatable

# Batch sizes and schedule.
model_args.train_batch_size = 32
model_args.eval_batch_size = 32
model_args.num_train_epochs = 10

# Evaluate on the validation data while training and allow early stopping.
model_args.evaluate_during_training = True
model_args.evaluate_during_training_verbose = True
# NOTE(review): -1 presumably disables step-based evaluation so that only the
# end-of-epoch evaluation runs -- confirm against the simpletransformers docs.
model_args.evaluate_during_training_steps = -1
model_args.use_early_stopping = True
model_args.early_stopping_consider_epochs = True

# Output locations. best_model_dir is set explicitly because the prediction
# section reloads the model from 'outputs_xlm/best_model', while the library's
# documented default for best_model_dir is 'outputs/best_model' regardless of
# output_dir.
model_args.output_dir = 'outputs_xlm'
model_args.best_model_dir = 'outputs_xlm/best_model'
model_args.overwrite_output_dir = True
# Optional toggles, left disabled:
# model_args.save_eval_checkpoints = False
# model_args.save_model_every_epoch = False

# Runtime behaviour while training.
model_args.silent = False
model_args.use_multiprocessing = True

In [None]:
# TODO: Search for the best model
# Pass the CUDA check result explicitly: NERModel defaults to use_cuda=True
# and raises when no CUDA device is available, so this keeps the cell runnable
# on CPU-only machines as well.
model = NERModel("xlmroberta", "xlm-roberta-large", args=model_args, use_cuda=cuda_available)
# model = NERModel("camembert", "airesearch/wangchanberta-base-att-spm-uncased", args=model_args)

In [None]:
# Fine-tune on the training frame; the validation frame is supplied as
# eval_data for the evaluation-during-training enabled in model_args.
model.train_model(train_df, eval_data=val_df)

## Get predictions

In [63]:
# Switch the shared args object to quiet, single-process mode for inference.
# Note that this mutates model_args in place.
model_args.silent = True
model_args.use_multiprocessing = False

# Reload the saved best checkpoint. use_cuda follows the CUDA check because
# NERModel defaults to use_cuda=True and raises when no CUDA device exists.
model = NERModel("xlmroberta", "outputs_xlm/best_model", args=model_args, use_cuda=cuda_available)
# model = NERModel("camembert", "outputs_wangchan/best_model", args=model_args)



In [64]:
# Evaluation: score the reloaded model on the validation frame. The call's
# return value is unpacked into three names: result, model_outputs, preds_list.
result, model_outputs, preds_list = model.eval_model(val_df)

  return [


In [None]:
# Display the evaluation result returned by eval_model.
result

In [66]:
# Path to the test tokens that will be tagged for the submission.
test_file = "ss3-final-nlp-pos-tagging/pos_test.txt"

In [67]:
# Read the test tokens, one per line. The file contains Thai text, so the
# encoding is pinned to UTF-8 instead of relying on the platform default.
with open(test_file, 'r', encoding='utf-8') as f:
    raw_lines = f.readlines()

# A line that is only a newline becomes a single-space token; every other line
# just loses its newline. The final element is dropped -- presumably a trailing
# blank line at the end of the file (TODO confirm).
test_list = [' ' if line == '\n' else line.replace('\n', '') for line in raw_lines][:-1]
len(test_list)

103913

In [68]:
# Preview the first ten test tokens.
test_list[:10]

['อย่างไรก็ตาม',
 'เครื่อง',
 'บิน',
 'แอร์บัส',
 ' ',
 'เอ',
 '380',
 'ได้',
 'ถึง',
 'ที่']

In [69]:
CHUNK_SIZE = 40  # number of tokens whose predictions are kept from each window
PAD_SIZE = 10    # extra neighbouring tokens passed to the model on each side

# One (chunk_begin, window_start, window_end) triple per chunk: the window is
# the chunk widened by PAD_SIZE on both sides and clamped to the token list.
chunk_list = [
    (i, max(i - PAD_SIZE, 0), min(i + CHUNK_SIZE + PAD_SIZE, len(test_list)))
    for i in range(0, len(test_list), CHUNK_SIZE)
]

all_pred = []

for i, start, end in tqdm(chunk_list):
    predictions, raw_outputs = model.predict([test_list[start:end]], split_on_space=False)

    # Keep only the predictions that belong to the chunk itself, dropping the
    # left/right padding tokens.
    i_start = i - start
    i_end = min(i_start + CHUNK_SIZE, len(predictions[0]))
    chunk_pred = predictions[0][i_start:i_end]

    # Fail loudly when the model returned fewer predictions than the chunk has
    # tokens (for example if the window was cut short by the model's sequence
    # length limit). Appending a short chunk silently would shift every later
    # row of the submission.
    expected_len = min(CHUNK_SIZE, len(test_list) - i)
    if len(chunk_pred) != expected_len:
        raise ValueError(
            f"Chunk starting at token {i}: expected {expected_len} predictions, "
            f"got {len(chunk_pred)}; predictions would be misaligned with the test tokens."
        )

    all_pred.append(chunk_pred)

100%|██████████| 2598/2598 [00:28<00:00, 92.51it/s]


In [72]:
# Flatten the per-chunk predictions into [word, tag] rows. Each prediction is
# a mapping, so .items() yields its (word, tag) pairs.
test_pred = [
    [word, tag]
    for chunk_pred in all_pred
    for token_pred in chunk_pred
    for word, tag in token_pred.items()
]
len(test_pred)

103913

In [73]:
# Build the submission table in one expression; Id is the 1-based row number.
sub_df = (
    pd.DataFrame(test_pred, columns=['Word', 'Predicted'])
    .assign(Id=lambda frame: frame.index + 1)
)

In [74]:
# Display the submission frame (Word, Predicted, Id) for a visual check.
sub_df

Unnamed: 0,Word,Predicted,Id
0,อย่างไรก็ตาม,CC,1
1,เครื่อง,NN,2
2,บิน,VV,3
3,แอร์บัส,NN,4
4,,PU,5
...,...,...,...
103908,เตรียม,VV,103909
103909,พร้อม,VV,103910
103910,รับ,VV,103911
103911,การ,FX,103912


In [75]:
# Make sure the output folder exists, then write only the Id and Predicted
# columns (in that order) without the DataFrame index.
os.makedirs('submissions', exist_ok=True)
sub_df.to_csv('submissions/sub_xlm.csv', columns=['Id', 'Predicted'], index=False)