In [1]:
import os
import json
import math
import torch
import pickle
import shutil
import transformers

import numpy as np
import pandas as pd

In [2]:
from pathlib import Path
from itertools import chain
from tqdm import tqdm
from datetime import datetime
from datetime import timedelta

In [3]:
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import BertTokenizer
from transformers import BertModel

In [4]:
from torch import cuda
# Pick the compute device: the GPU when torch reports CUDA as available, the CPU otherwise.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [5]:
import nbimporter

from classify_comment_taptap_dataset import get_dataset

from classify_comment_taptap_model import BERTClass
from classify_comment_taptap_model import loss_fn
from classify_comment_taptap_model import validation
from classify_comment_taptap_model import train
from classify_comment_taptap_model import evaluate

from classify_comment_taptap_flow import clean
from classify_comment_taptap_flow import load_or_create_model
from classify_comment_taptap_flow import create_or_update_best
from classify_comment_taptap_flow import load_best
from classify_comment_taptap_flow import create_or_update_last

from classify_comment_taptap_flow import create_or_update_loss
from classify_comment_taptap_flow import plot_loss

Importing Jupyter notebook from classify_comment_taptap_dataset.ipynb
Importing Jupyter notebook from classify_comment_taptap_model.ipynb
Importing Jupyter notebook from classify_comment_taptap_flow.ipynb


In [6]:
# Execute the data and dataset notebooks. The output they print ends with "Stored ..." lines for
# df_comments, df_comments_length, df_comments_topics, train_dataset, test_dataset and mlb,
# i.e. those objects are saved with %store and can be restored later with `%store -r`.
%run classify_comment_taptap_data.ipynb
%run classify_comment_taptap_dataset.ipynb

0it [00:00, ?it/s]

CPU times: user 10 s, sys: 3.13 s, total: 13.2 s
Wall time: 13.2 s
(150, 5)


150it [00:15,  9.72it/s]


CPU times: user 15.1 s, sys: 2.84 s, total: 18 s
Wall time: 17.9 s
(488452, 14)
(45450, 14)
CPU times: user 992 ms, sys: 20 ms, total: 1.01 s
Wall time: 1.01 s
(30378, 2)
50
(195017, 14)
Stored 'df_comments' (DataFrame)
Stored 'df_comments_length' (Series)
Stored 'df_comments_topics' (DataFrame)
(30378,)
(27670,)
0.9108565409177695
(30378, 2)
FULL Dataset: (30378, 2)
TRAIN Dataset: (24302, 2)
TEST Dataset: (6076, 2)
Stored 'train_dataset' (DataFrame)
Stored 'test_dataset' (DataFrame)
Stored 'mlb' (MultiLabelBinarizer)


In [7]:
# Candidate training deadline: 07:30:00.000000 on the next calendar day (naive local time).
tomorrow = timedelta(days=1) + datetime.now()
tomorrow_morning = tomorrow.replace(microsecond=0, second=0, minute=30, hour=7)
tomorrow_morning  # shown as the cell output

datetime.datetime(2021, 3, 4, 7, 30)

In [8]:
# Candidate training deadline: 18:00:00.000000 on the current calendar day (naive local time).
today = datetime.now()
today_afternoon = today.replace(microsecond=0, second=0, minute=0, hour=18)
today_afternoon  # shown as the cell output

datetime.datetime(2021, 3, 3, 18, 0)

## Data

In [9]:
# Sequence-length setting handed to get_dataset below.
# Alternatives left commented out (mu_1std is not defined in this notebook's visible cells):
# MAX_LEN = 128
# MAX_LEN = int(mu_1std)
# NOTE(review): 512 is presumably the longest input the encoder accepts — confirm against the model config.
MAX_LEN = 512

In [10]:
# Pretrained checkpoint identifier; passed to get_dataset and load_or_create_model, and reused as `flag`.
model_name = 'hfl/chinese-roberta-wwm-ext'

In [11]:
# Restore the objects that the notebooks executed earlier saved with %store
# (see the "Stored 'train_dataset' / 'test_dataset' / 'mlb'" lines in their output).
%store -r train_dataset
%store -r test_dataset
%store -r mlb

In [12]:
# Build the two dataset objects that the DataLoaders wrap, using get_dataset from
# classify_comment_taptap_dataset.
# NOTE(review): presumably tokenizes with model_name's tokenizer and pads/truncates to MAX_LEN — confirm.
training_set, testing_set = get_dataset(model_name, train_dataset, test_dataset, MAX_LEN)

## Model

In [13]:
# Switch passed to clean() below.
# NOTE(review): presumably True means "start from scratch" rather than resume — confirm in classify_comment_taptap_flow.
first_round = True

In [14]:
# Key identifying this experiment: it names the checkpoint directory (path_model) and is passed to
# the flow helpers (clean, load_or_create_model, create_or_update_*, plot_loss, load_best).
# Previous value, without the 'hfl/' prefix:
# flag = 'chinese-roberta-wwm-ext'
flag = model_name

In [15]:
# Directory inspected with os.listdir in later cells. Because flag contains a '/',
# this is the nested path ./model/hfl/chinese-roberta-wwm-ext.
path_model = os.path.join('./model', flag)

In [16]:
# Wall-clock time by which training should be finished; EPOCHS is derived from it below.
# Switch between the two candidates computed at the top of the notebook:
# deadline = tomorrow_morning
deadline = today_afternoon

In [17]:
# TODO: for test only
# TODO: by param
# !rm -rf model/chinese-roberta-wwm-ext

# Calls clean() from classify_comment_taptap_flow with the experiment key and the first_round switch.
# NOTE(review): given the commented-out `rm -rf` above, this presumably removes previously saved
# artifacts for `flag` when first_round is True — confirm before running against models worth keeping.
clean(flag, first_round)

  {


In [18]:
# HOURS = 4
# Estimated wall-clock minutes per epoch, used to turn the time left before `deadline` into EPOCHS.
# For reference, the recorded run shows roughly 22 min of training plus 2 min of validation for epoch 1.
MINUTES_EACH_EPOCH = 25

In [19]:
# Echo the chosen deadline so it is recorded next to the epoch budget computed below.
deadline

datetime.datetime(2021, 3, 3, 18, 0)

In [20]:
# Echo the current time for comparison with the deadline above.
datetime.now()

datetime.datetime(2021, 3, 3, 9, 35, 12, 525182)

In [21]:
# EPOCHS = 2 # int(HOURS * 60 / 25)
# Number of epochs that fit into the time remaining before `deadline`, at MINUTES_EACH_EPOCH each.
#
# Use total_seconds(), not the `.seconds` attribute: `.seconds` is only the sub-day component
# (0..86399) of the timedelta. It silently drops whole days for a deadline more than 24 h away,
# and for a deadline already in the past (days == -1) it wraps around to a large positive value,
# which would schedule many hours of training instead of none.
seconds_left = (deadline - datetime.now()).total_seconds()
# Clamp at zero so a passed deadline yields an empty training budget rather than a negative count.
EPOCHS = max(0, int(seconds_left / 60 / MINUTES_EACH_EPOCH))
EPOCHS

20

In [22]:
# Mini-batch sizes handed to the DataLoaders; validation reuses the training batch size.
# The trailing comments are presumably smaller values tried earlier.
TRAIN_BATCH_SIZE = 16 # 4
VALID_BATCH_SIZE = TRAIN_BATCH_SIZE # 4 # 8 # 4

# Learning rate passed to the Adam optimizer below.
LEARNING_RATE = 1e-05

In [23]:
# DataLoader settings for the training split: samples are reshuffled on every pass.
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0,
}

# DataLoader settings for the held-out split. Shuffling is switched off here: evaluation does not
# benefit from a random order, a fixed order keeps runs comparable batch by batch, and it avoids
# drawing from the global RNG between training epochs.
# NOTE(review): assumes validation/evaluate aggregate over the whole loader and do not rely on
# sample order — confirm in classify_comment_taptap_model.
test_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0,
}

# Batch iterators consumed by train() and evaluate().
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [24]:
# Obtain the model through the flow helper, keyed by `flag` and placed on `device`.
# NOTE(review): presumably restores a saved checkpoint for `flag` when one exists and otherwise builds
# a fresh BERTClass on top of model_name, sized from mlb's label set — confirm in classify_comment_taptap_flow.
model = load_or_create_model(flag, model_name, mlb, device)

In [25]:
# List the checkpoint directory before training starts (empty in the recorded run).
# The commented-out shell command points at model/chinese-roberta-wwm-ext, which is not path_model
# when flag is 'hfl/chinese-roberta-wwm-ext'.
# !ls -alh model/chinese-roberta-wwm-ext
os.listdir(path_model)

[]

In [26]:
# Adam over every parameter of the model, using the learning rate configured above.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [27]:
# Per-epoch loss history; the training loop appends one entry per finished epoch.
losses = []

In [28]:
# Batches per full pass over each split (the last, possibly smaller, batch counts too).
# These totals are forwarded to train() and evaluate().
total_batch_train, total_batch_test = (
    math.ceil(len(split) / batch_size)
    for split, batch_size in (
        (train_dataset, TRAIN_BATCH_SIZE),
        (test_dataset, VALID_BATCH_SIZE),
    )
)

In [None]:
# Main training loop: run up to EPOCHS epochs, let create_or_update_best track the best checkpoint
# after each one, and record the loss returned by train().
#
# `epoch` holds the number of epochs that have FULLY completed. Initialising it up front keeps the
# calls in `finally` from raising NameError when EPOCHS == 0.
epoch = 0
try:
    for current_epoch in range(1, EPOCHS + 1):
        model, loss = train(model, optimizer, training_loader, testing_loader, current_epoch, device, total_batch_train, total_batch_test)
        create_or_update_best(flag, model, loss, current_epoch)
        losses.append(loss)
        epoch = current_epoch
finally:
    # Runs on normal completion and also when the loop is cut short (kernel interrupt, CUDA error, ...),
    # so the epochs already paid for are not lost and plot_loss(flag) has data to show.
    # NOTE(review): after an interruption `model` may already contain updates from the unfinished
    # epoch while being recorded under the last completed epoch number.
    if epoch > 0:
        create_or_update_last(flag, model, epoch)
        create_or_update_loss(flag, losses)

100%|██████████| 1519/1519 [22:15<00:00,  1.14it/s]
100%|██████████| 380/380 [01:58<00:00,  3.20it/s]


epoch: 1, loss train:  0.7646468877792358, loss test: 0.7507001757621765
current best, epoch accumulate: 1


  3%|▎         | 46/1519 [00:40<21:39,  1.13it/s]

In [None]:
# List the checkpoint directory again after training to see which files were written.
# The commented-out shell command points at model/chinese-roberta-wwm-ext, which is not path_model
# when flag is 'hfl/chinese-roberta-wwm-ext'.
# !ls -alh model/chinese-roberta-wwm-ext
os.listdir(path_model)

In [None]:
# Plot the loss history for this experiment via the flow helper; the trailing ';' suppresses the cell's repr output.
# NOTE(review): presumably reads what create_or_update_loss saved for `flag` — confirm.
plot_loss(flag);

In [None]:
# best model validation
# Load the checkpoint that load_best returns for `flag` and run evaluate() on it over the test loader.
# NOTE(review): mlb is presumably used by evaluate to map label indices back to class names in its report — confirm.
model_best = load_best(flag)
evaluate(model_best, testing_loader, device, total_batch_test, mlb)