In [1]:
import os
import json
import math
import torch
import pickle
import shutil
import transformers

import numpy as np
import pandas as pd

In [2]:
from pathlib import Path
from itertools import chain
from tqdm import tqdm
from datetime import datetime
from datetime import timedelta

In [3]:
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import BertTokenizer
from transformers import BertModel

In [4]:
from torch import cuda
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [5]:
import nbimporter

from model import BERTClass
from model import loss_fn
from model import validation
from model import train
from model import evaluate

from flow import clean
from flow import load_or_create_model
from flow import create_or_update_best
from flow import load_best
from flow import create_or_update_last

from flow import create_or_update_loss
from flow import plot_loss
from flow import load_or_create_acc_epoch

from dataset import get_dataset

Importing Jupyter notebook from model.ipynb
Importing Jupyter notebook from flow.ipynb
Importing Jupyter notebook from dataset.ipynb


2021-04-08 18:41:54 DEBUG Starting new HTTPS connection (1): huggingface.co:443
2021-04-08 18:41:54 DEBUG https://huggingface.co:443 "HEAD /xlm-roberta-base/resolve/main/config.json HTTP/1.1" 200 0
2021-04-08 18:41:54 DEBUG Starting new HTTPS connection (1): huggingface.co:443
2021-04-08 18:41:55 DEBUG https://huggingface.co:443 "HEAD /xlm-roberta-base/resolve/main/sentencepiece.bpe.model HTTP/1.1" 200 0
2021-04-08 18:41:55 DEBUG Starting new HTTPS connection (1): huggingface.co:443
2021-04-08 18:41:56 DEBUG https://huggingface.co:443 "HEAD /xlm-roberta-base/resolve/main/tokenizer.json HTTP/1.1" 200 0
Building model [5m[33m...[0m[0m2021-04-08 18:41:56 DEBUG Starting new HTTPS connection (1): huggingface.co:443
2021-04-08 18:41:57 DEBUG https://huggingface.co:443 "HEAD /xlm-roberta-base/resolve/main/config.json HTTP/1.1" 200 0
2021-04-08 18:41:58 DEBUG Starting new HTTPS connection (1): huggingface.co:443
2021-04-08 18:41:58 DEBUG https://huggingface.co:443 "HEAD /xlm-roberta-base/r

In [6]:
# Execute data.ipynb. Per its recorded output it reports the train/test corpus
# sizes and persists data_train, X_train, X_test, y_train, y_test and
# data_train_size_mb with %store.
%run data.ipynb

(20000, 2)
(14686, 2)
34686 documents - 1.595MB (training set)
1017 documents - 0.061MB (test set)

Stored 'data_train' (DataFrame)
Stored 'X_train' (list)
Stored 'X_test' (list)
Stored 'y_train' (list)
Stored 'y_test' (list)
Stored 'data_train_size_mb' (float)


In [7]:
# Candidate deadline: 07:30:00 on the next calendar day.
tomorrow = timedelta(days=1) + datetime.now()
tomorrow_morning = tomorrow.replace(
    hour=7,
    minute=30,
    second=0,
    microsecond=0,
)
tomorrow_morning

datetime.datetime(2021, 4, 9, 7, 30)

In [8]:
# Candidate deadline: 18:00:00 on the current calendar day.
today = datetime.now()
today_afternoon = today.replace(
    hour=18,
    minute=0,
    second=0,
    microsecond=0,
)
today_afternoon

datetime.datetime(2021, 4, 8, 18, 0)

## Data

In [9]:
# Length argument handed to get_dataset() when the datasets are built.
# Alternatives that were tried before and are kept here for reference:
#   128, and int(mu_1std) (mu_1std is not defined anywhere in this notebook).
# NOTE(review): 512 is presumably the encoder's maximum input length - confirm.
MAX_LEN = 512

In [10]:
# Hugging Face model identifier. It is passed to get_dataset() and
# load_or_create_model(), and (via `flag`) also names the model directory.
model_name = 'hfl/chinese-roberta-wwm-ext'

In [11]:
# Reload the train/test texts and labels from IPython's %store database
# (data.ipynb reports storing all four as lists).
%store -r X_train
%store -r y_train
%store -r X_test
%store -r y_test

In [12]:
# Build the train and test datasets for `model_name`; MAX_LEN is the length
# argument. get_dataset() is defined in dataset.ipynb.
datasets = get_dataset(
    model_name,
    X_train,
    y_train,
    X_test,
    y_test,
    MAX_LEN,
)
training_set, testing_set = datasets

2021-04-08 18:42:12 DEBUG Starting new HTTPS connection (1): huggingface.co:443
2021-04-08 18:42:13 DEBUG https://huggingface.co:443 "HEAD /hfl/chinese-roberta-wwm-ext/resolve/main/vocab.txt HTTP/1.1" 200 0
2021-04-08 18:42:13 DEBUG Starting new HTTPS connection (1): huggingface.co:443
2021-04-08 18:42:14 DEBUG https://huggingface.co:443 "HEAD /hfl/chinese-roberta-wwm-ext/resolve/main/added_tokens.json HTTP/1.1" 200 0
2021-04-08 18:42:14 DEBUG Starting new HTTPS connection (1): huggingface.co:443
2021-04-08 18:42:14 DEBUG https://huggingface.co:443 "HEAD /hfl/chinese-roberta-wwm-ext/resolve/main/special_tokens_map.json HTTP/1.1" 200 0
2021-04-08 18:42:14 DEBUG Starting new HTTPS connection (1): huggingface.co:443
2021-04-08 18:42:15 DEBUG https://huggingface.co:443 "HEAD /hfl/chinese-roberta-wwm-ext/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
2021-04-08 18:42:15 DEBUG Starting new HTTPS connection (1): huggingface.co:443
2021-04-08 18:42:15 DEBUG https://huggingface.co:443 "HEAD

## Model

In [13]:
# Second argument of clean() below.
# NOTE(review): presumably True means "start from scratch and discard earlier
# artifacts for this run" - confirm against clean() in flow.ipynb.
first_round = True

In [14]:
# Run identifier passed to every flow helper (clean, load_or_create_model,
# create_or_update_*, load_best, plot_loss). Currently it is just the model name.
flag = model_name

In [15]:
# Directory that is listed and handed to plot_loss() further down.
# `flag` contains a '/', so this resolves to the nested directory
# ./.model/hfl/chinese-roberta-wwm-ext.
path_model = os.path.join('./.model', flag)

In [16]:
# Training must finish by `deadline`: today at 18:00 while that is still in
# the future, otherwise tomorrow at 07:30.
if today_afternoon > datetime.now():
    deadline = today_afternoon
else:
    deadline = tomorrow_morning

In [17]:
# TODO: this reset is for test runs only; drive it by a parameter instead of hard-wiring it.
# NOTE(review): clean() comes from flow.ipynb. It presumably replaces the manual
# `rm -rf model/chinese-roberta-wwm-ext` that used to live here - confirm what it removes.

clean(flag, first_round)

  {


In [18]:
# Time budget per epoch, in minutes; EPOCHS is derived from it and the deadline.
# The recorded run needed roughly 32-33 min of training plus ~20 s of
# validation per epoch, so 32 is slightly optimistic.
MINUTES_EACH_EPOCH = 32

In [19]:
# Number of whole epochs that fit between now and `deadline`.
#
# total_seconds() is used rather than .seconds: .seconds is only the sub-day
# component of the timedelta. If this cell is re-run after the deadline has
# passed, the negative delta is stored as days=-1 plus a large positive
# .seconds, which would silently schedule close to a day of training.
# The result is clamped at 0 so an expired deadline yields no epochs.
#
# For a quick smoke test, overwrite EPOCHS by hand (e.g. EPOCHS = 2).
remaining_minutes = (deadline - datetime.now()).total_seconds() / 60
EPOCHS = max(0, int(remaining_minutes // MINUTES_EACH_EPOCH))
EPOCHS

23

In [20]:
# Training and validation use the same batch size (the old inline notes list
# 4 and 8 as values tried earlier).
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 16

# Step size handed to the Adam optimizer.
LEARNING_RATE = 1e-5

In [21]:
# DataLoader settings. Training batches are reshuffled every epoch.
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0,
}

# The test loader keeps a fixed order: shuffling evaluation data brings no
# benefit and, with no seed set, changes batch composition on every pass,
# so reported test metrics would not be reproducible.
test_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0,
}

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [22]:
# Obtain the model through the flow helper and place it on `device`.
# NOTE(review): the helper's name suggests it resumes from saved state for
# `flag` when there is any and otherwise builds a fresh model - confirm in flow.ipynb.
model = load_or_create_model(flag, model_name, device)

2021-04-08 18:42:57 DEBUG Starting new HTTPS connection (1): huggingface.co:443
2021-04-08 18:42:58 DEBUG https://huggingface.co:443 "HEAD /hfl/chinese-roberta-wwm-ext/resolve/main/config.json HTTP/1.1" 200 0
2021-04-08 18:42:58 DEBUG Starting new HTTPS connection (1): huggingface.co:443
2021-04-08 18:42:58 DEBUG https://huggingface.co:443 "HEAD /hfl/chinese-roberta-wwm-ext/resolve/main/pytorch_model.bin HTTP/1.1" 302 0


In [23]:
# Show what is already stored in the model directory (stand-in for `ls -alh`).
# The listing is the cell's last expression so that it is actually displayed;
# a value evaluated inside a `try` body is never rendered.
# Only filesystem errors (e.g. the directory does not exist yet) are tolerated,
# giving an empty listing; anything else (NameError, KeyboardInterrupt, ...)
# now surfaces instead of being silently swallowed.
try:
    model_dir_entries = sorted(os.listdir(path_model))
except OSError:
    model_dir_entries = []
model_dir_entries

In [24]:
# Adam over all model parameters with the configured learning rate.
model_params = model.parameters()
optimizer = torch.optim.Adam(model_params, lr=LEARNING_RATE)

In [25]:
# One entry is appended per finished epoch by the training loop.
losses = list()

In [26]:
# Batches per pass over each split: ceil(samples / batch_size), computed with
# integer arithmetic. Passed to train()/evaluate() as the batch totals.
total_batch_train = (len(X_train) + TRAIN_BATCH_SIZE - 1) // TRAIN_BATCH_SIZE
total_batch_test = (len(X_test) + VALID_BATCH_SIZE - 1) // VALID_BATCH_SIZE

In [None]:
# Train for EPOCHS epochs, numbered from 1. After each epoch the model and its
# loss are handed to the best/last bookkeeping helpers from flow.ipynb, and
# the loss is recorded in `losses`.
epoch_numbers = range(1, EPOCHS + 1)
for epoch in epoch_numbers:
    model, loss = train(
        model,
        optimizer,
        training_loader,
        testing_loader,
        epoch,
        device,
        total_batch_train,
        total_batch_test,
    )
    create_or_update_best(flag, model, loss, epoch)
    create_or_update_last(flag, model, epoch)
    losses.append(loss)

100%|██████████| 2168/2168 [32:10<00:00,  1.12it/s]
100%|██████████| 64/64 [00:21<00:00,  3.01it/s]


epoch: 1, loss train:  0.2910189926624298, loss test: 2.241748332977295
current best, epoch accumulate: 1


  0%|          | 0/2168 [00:00<?, ?it/s]

last epoch accumulate: 1


100%|██████████| 2168/2168 [33:22<00:00,  1.08it/s]
100%|██████████| 64/64 [00:19<00:00,  3.23it/s]


epoch: 2, loss train:  -0.016070052981376648, loss test: 2.6930603981018066


  0%|          | 0/2168 [00:00<?, ?it/s]

last epoch accumulate: 2


 17%|█▋        | 376/2168 [05:40<27:44,  1.08it/s]

In [None]:
# Hand the per-epoch losses collected by the training loop to the flow helper
# (presumably it writes them to disk - confirm in flow.ipynb).
# create_or_update_last() is not repeated here: the loop already calls it after every epoch.
create_or_update_loss(flag, losses)
# NOTE(review): `epoch` is the loop variable left over from training; this
# line raises NameError if the loop body never ran.
load_or_create_acc_epoch(flag, epoch)

In [None]:
# List the entries of the model directory after training (stand-in for `ls -alh`).
os.listdir(path_model)

In [None]:
# Plot the loss history for this run via the flow helper; the trailing ';'
# suppresses the repr of the call's return value.
plot_loss(flag, path=path_model);

In [None]:
# Validate the best model: load whatever load_best() returns for `flag` and
# run evaluate() on it over the test loader.
model_best = load_best(flag)
evaluate(model_best, testing_loader, device, total_batch_test)