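# Language Model Finetuner

This notebook trains a byte-level BPE tokenizer on the product-review texts, sets up a RoBERTa masked language model that uses it, and ends with a quick fill-mask check of a saved checkpoint.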

## Downloads

In [8]:
#download files
import os
if not os.path.exists('data/product_review_classification.csv'):
    !wget https://www.dropbox.com/s/wtfwlg6436offvt/product_review_classification.csv
    !mv product_review_classification.csv data/product_review_classification.csv
if not os.path.exists('data/mari-intent.zip'):
    !wget https://github.com/PyThaiNLP/truevoice-intent/raw/master/mari-intent.zip
    !unzip mari-intent.zip; mv mari* data/

## Imports

In [9]:
%load_ext tensorboard
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')
import logging
logging.basicConfig(level=logging.INFO)

#misc
import math
import csv
import numpy as np
import pandas as pd
import re
import glob
import argparse
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm.notebook import tqdm
tqdm.pandas()

#torch 
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

#lightning
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

#huggingface; only works with tokenizers==0.7.0 on mac now
from transformers import (
    AdamW, 
    get_linear_schedule_with_warmup, 
    get_constant_schedule, 
    AutoTokenizer, 
    AutoModel,
    AutoModelForSequenceClassification, 
    AutoConfig,
    Trainer, 
    TrainingArguments
)

from tokenizers import ByteLevelBPETokenizer

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data Preparation

In [10]:
#product classification
df = pd.read_csv('data/product_review_classification.csv').iloc[:,1:]
df['rating'] = df.rating.map(lambda x: int(x-1))
train_df, valid_df, test_df = np.split(df.sample(frac=1), [int(.8*len(df)), int(.9*len(df))])
print(train_df.shape, valid_df.shape, test_df.shape)

#save
!rm -r data/train_en; rm -r data/valid_en; rm -r data/test_en;
!rm -r data/train_th; rm -r data/valid_th; rm -r data/test_th; 
!mkdir data/train_en; mkdir data/valid_en; mkdir data/test_en; 
!mkdir data/train_th; mkdir data/valid_th; mkdir data/test_th;
train_df.iloc[:,[1,0]].to_csv('data/train_en/train.csv',index=False)
valid_df.iloc[:,[1,0]].to_csv('data/valid_en/valid.csv',index=False)
test_df.iloc[:,[1,0]].to_csv('data/test_en/test.csv',index=False)
train_df.iloc[:,[2,0]].to_csv('data/train_th/train.csv',index=False)
valid_df.iloc[:,[2,0]].to_csv('data/valid_th/valid.csv',index=False)
test_df.iloc[:,[2,0]].to_csv('data/test_th/test.csv',index=False)

(51808, 3) (6476, 3) (6476, 3)


In [11]:
# Re-read the reviews keeping only the columns from index 2 onward, i.e. without
# the rating column, for language-model training.
df = pd.read_csv('data/product_review_classification.csv').iloc[:,2:]

# Seeded shuffle so the 80/10/10 split is reproducible across runs.
# NOTE(review): this split is drawn independently of any earlier split of the same
# file; rows only land in matching partitions if both shuffles use the same
# random_state.
shuffled_df = df.sample(frac=1, random_state=42)
train_end, valid_end = int(.8*len(df)), int(.9*len(df))
train_df = shuffled_df.iloc[:train_end]
valid_df = shuffled_df.iloc[train_end:valid_end]
test_df = shuffled_df.iloc[valid_end:]
assert len(train_df) + len(valid_df) + len(test_df) == len(df)
print(train_df.shape, valid_df.shape, test_df.shape)

(51808, 2) (6476, 2) (6476, 2)


In [12]:
!rm -r data/train_lm; rm -r data/valid_lm; rm -r data/test_lm; 
!mkdir data/train_lm; mkdir data/valid_lm; mkdir data/test_lm;
train_df.melt().sample(frac=1).value.to_csv('data/train_lm/train.txt',index=False, header=None)
valid_df.melt().sample(frac=1).value.to_csv('data/valid_lm/valid.txt',index=False, header=None)
test_df.melt().sample(frac=1).value.to_csv('data/test_lm/test.txt',index=False, header=None)

## Train Tokenizer

### Create `vocab.json` and `merges.txt`

In [13]:
%%time
!rm -r data/tokenizer/bpe_enth_52000; mkdir data/tokenizer/bpe_enth_52000
fnames = [str(x) for x in glob.glob("data/train_lm/*.txt")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=fnames, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

# Save files to disk
tokenizer.save_model('data/tokenizer/bpe_enth_52000')

CPU times: user 2min 26s, sys: 6.16 s, total: 2min 32s
Wall time: 35.1 s


['data/tokenizer/bpe_enth_52000/vocab.json',
 'data/tokenizer/bpe_enth_52000/merges.txt']

### Manually Add Special Tokens

In [14]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Rebuild the tokenizer from the vocab/merges files on disk instead of reusing the
# in-memory object.
tokenizer_dir = "data/tokenizer/bpe_enth_52000"
tokenizer = ByteLevelBPETokenizer(f"{tokenizer_dir}/vocab.json", f"{tokenizer_dir}/merges.txt")

In [15]:
# Encode a short Thai sample with the reloaded tokenizer and display its token ids.
enc = tokenizer.encode('วันนี้วันดี')
enc.ids

[278, 269, 550, 350, 278, 269, 642, 272]

In [16]:
# Round-trip check: decode the ids back to text.
tokenizer.decode(enc.ids)

'วันนี้วันดี'

In [17]:
# Token strings for the same encoding. The tokenizer is byte-level, so the Thai
# text is shown in its byte-level representation rather than as readable characters.
enc.tokens

['à¸§', 'à¸±', 'à¸Ļà¸Ļ', 'à¸µà¹ī', 'à¸§', 'à¸±', 'à¸Ļà¸Ķ', 'à¸µ']

In [18]:
#add processing
# BertProcessing expects the ending (token, id) pair first and the starting pair second.
ending_pair = ("</s>", tokenizer.token_to_id("</s>"))
starting_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(ending_pair, starting_pair)

#same as roberta
tokenizer.enable_truncation(max_length=512)

In [19]:
# Encode the same sample again with the post-processor attached; the ids are now
# wrapped in the <s> / </s> ids.
enc = tokenizer.encode('วันนี้วันดี')
enc.ids

[0, 278, 269, 550, 350, 278, 269, 642, 272, 2]

In [20]:
# Decode the post-processed ids; the special tokens appear in the decoded text.
tokenizer.decode(enc.ids)

'<s>วันนี้วันดี</s>'

In [21]:
# Token strings for the post-processed encoding (byte-level representation,
# with <s> and </s> at the ends).
enc.tokens

['<s>',
 'à¸§',
 'à¸±',
 'à¸Ļà¸Ļ',
 'à¸µà¹ī',
 'à¸§',
 'à¸±',
 'à¸Ļà¸Ķ',
 'à¸µ',
 '</s>']

### Use `RobertaTokenizerFast`

In [22]:
from transformers import RobertaTokenizerFast

# Load the trained vocab/merges through the transformers wrapper; from here on
# `tokenizer` is a RobertaTokenizerFast rather than a raw tokenizers object.
# NOTE(review): no maximum length is passed here — confirm that downstream code
# truncates long inputs itself.
bpe_tokenizer_path = "data/tokenizer/bpe_enth_52000"
tokenizer = RobertaTokenizerFast.from_pretrained(bpe_tokenizer_path)

INFO:transformers.tokenization_utils_base:Model name 'data/tokenizer/bpe_enth_52000' not found in model shortcut name list (roberta-base, roberta-large, roberta-large-mnli, distilroberta-base, roberta-base-openai-detector, roberta-large-openai-detector). Assuming 'data/tokenizer/bpe_enth_52000' is a path, a model identifier, or url to a directory containing tokenizer files.
INFO:transformers.tokenization_utils_base:Didn't find file data/tokenizer/bpe_enth_52000/added_tokens.json. We won't load it.
INFO:transformers.tokenization_utils_base:Didn't find file data/tokenizer/bpe_enth_52000/special_tokens_map.json. We won't load it.
INFO:transformers.tokenization_utils_base:Didn't find file data/tokenizer/bpe_enth_52000/tokenizer_config.json. We won't load it.
INFO:transformers.tokenization_utils_base:Didn't find file data/tokenizer/bpe_enth_52000/tokenizer.json. We won't load it.
INFO:transformers.tokenization_utils_base:loading file data/tokenizer/bpe_enth_52000/vocab.json
INFO:transformer

In [23]:
# Sanity-check the transformers tokenizer on mixed Thai/Latin input; the result
# holds `input_ids` and `attention_mask`.
tokenizer.encode_plus('สวัสดีครับsdfsdf')

{'input_ids': [0, 3612, 269, 1137, 272, 474, 269, 292, 87, 72, 74, 87, 72, 74, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

## MLM Dataset

In [24]:
from thai2transformers.datasets import MLMDataset

In [25]:
# Build the masked-LM datasets from the train / valid text folders.
lm_train_dir, lm_valid_dir = 'data/train_lm', 'data/valid_lm'
train_dataset = MLMDataset(tokenizer, lm_train_dir)
eval_dataset = MLMDataset(tokenizer, lm_valid_dir)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))





HBox(children=(FloatProgress(value=0.0, max=103781.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))





HBox(children=(FloatProgress(value=0.0, max=12969.0), HTML(value='')))




## Data Collator

In [26]:
from transformers import DataCollatorForLanguageModeling

# Collator for masked language modeling with a masking probability of 0.15.
mlm_probability = 0.15
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=mlm_probability,
)

## Models

In [27]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=52_000,
    # RoBERTa numbers positions starting at padding_idx + 1, so inputs of up to
    # 512 tokens need 512 + 2 position embeddings; with the default of 512 a
    # full-length sequence indexes past the embedding table.
    max_position_embeddings=514,
    type_vocab_size=1,
    #roberta base
    num_hidden_layers=12,
    hidden_size=768,
    intermediate_size=3072,
    # The keyword is `num_attention_heads` (plural). A misspelt key is stored as an
    # extra attribute and ignored, so the model silently keeps the default head count.
    num_attention_heads=12,
#     #roberta large
#     num_hidden_layers=24,
#     hidden_size=1024,
#     intermediate_size=4096,
#     num_attention_heads=16
)

In [28]:
from transformers import RobertaForMaskedLM

# Instantiate the masked-LM model directly from the config (no pretrained weights
# are loaded here).
model = RobertaForMaskedLM(config)

## Trainer

In [29]:
from transformers import Trainer, TrainingArguments

# Checkpoints go to ./results; one is written every 10,000 steps and only the most
# recent one is kept.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=1,
    # `per_gpu_train_batch_size` is deprecated; the per-device name is the supported one
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=1,
)

# NOTE(review): this cell only constructs the Trainer. No `trainer.train()` call is
# visible in the notebook, so any checkpoint used later has to come from a run that
# is not shown here.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    prediction_loss_only=True,
)

INFO:transformers.training_args:PyTorch: setting up devices
INFO:transformers.trainer:You are instantiating a Trainer but W&B is not installed. To use wandb logging, run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface.


## Evaluate

In [44]:
from transformers import pipeline

# Fill-mask pipeline backed by a saved checkpoint and the tokenizer loaded above.
checkpoint_dir = "results/checkpoint-16000"
fill_mask = pipeline("fill-mask", model=checkpoint_dir, tokenizer=tokenizer)

INFO:transformers.modelcard:Model card: {
  "caveats_and_recommendations": {},
  "ethical_considerations": {},
  "evaluation_data": {},
  "factors": {},
  "intended_use": {},
  "metrics": {},
  "model_details": {},
  "quantitative_analyses": {},
  "training_data": {}
}

INFO:transformers.configuration_utils:loading configuration file results/checkpoint-16000/config.json
INFO:transformers.configuration_utils:Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "roberta",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 52000
}

INFO:transformers

In [45]:
# Ask the model for candidates at the <mask> position. The sentence is the first
# training example (decoded in the cell below) with one word replaced by <mask>,
# so this is a check on training data, not held-out text.
fill_mask("ผลิตภัณฑ์นี้ซื้อมาเป็น<mask>และบุคคลนั้นตื่นเต้นกับมันและต้องการซื้ออีกชุด")

[{'sequence': '<s>ผลิตภัณฑ์นี้ซื้อมาเป็น่และบุคคลนั้นตื่นเต้นกับมันและต้องการซื้ออีกชุด</s>',
  'score': 0.04462442547082901,
  'token': 264,
  'token_str': 'à¹Ī'},
 {'sequence': '<s>ผลิตภัณฑ์นี้ซื้อมาเป็นัและบุคคลนั้นตื่นเต้นกับมันและต้องการซื้ออีกชุด</s>',
  'score': 0.0436684675514698,
  'token': 269,
  'token_str': 'à¸±'},
 {'sequence': '<s>ผลิตภัณฑ์นี้ซื้อมาเป็น้และบุคคลนั้นตื่นเต้นกับมันและต้องการซื้ออีกชุด</s>',
  'score': 0.032831691205501556,
  'token': 268,
  'token_str': 'à¹ī'},
 {'sequence': '<s>ผลิตภัณฑ์นี้ซื้อมาเป็น.และบุคคลนั้นตื่นเต้นกับมันและต้องการซื้ออีกชุด</s>',
  'score': 0.021591097116470337,
  'token': 18,
  'token_str': '.'},
 {'sequence': '<s>ผลิตภัณฑ์นี้ซื้อมาเป็นีและบุคคลนั้นตื่นเต้นกับมันและต้องการซื้ออีกชุด</s>',
  'score': 0.021187879145145416,
  'token': 272,
  'token_str': 'à¸µ'}]

In [41]:
# Decode the first item of train_dataset back to text for reference.
# NOTE(review): the execution count shows this cell was run before the fill-mask
# cells above; re-run the notebook top to bottom to confirm the order still works.
tokenizer.decode(train_dataset[0].numpy())

'<s>ผลิตภัณฑ์นี้ซื้อมาเป็นของขวัญและบุคคลนั้นตื่นเต้นกับมันและต้องการซื้ออีกชุด\n</s>'