<a href="https://colab.research.google.com/github/vinnik-dmitry07/Chatbot/blob/main/train_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvidia-smi
!pip install --quiet parlai

Thu Feb 18 17:11:22 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    31W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from pathlib import Path

# Root of the user's Google Drive as seen from the Colab filesystem.
GDRIVE_ROOT = Path('/content/drive/MyDrive/')
# Directory for model files and directory holding the chat data, both on Drive.
SAVE_DIR = GDRIVE_ROOT.joinpath('chatbot_model')
DATA_DIR = GDRIVE_ROOT.joinpath('chatbot_data')

In [None]:
import math
from datetime import timedelta

# Maximum time gap between consecutive messages of one dialogue; a larger gap
# starts a new episode. Tune this to change how the chat is cut into dialogues.
EPISODE_DT = timedelta(minutes=3)
# Fractions of the messages assigned to the train / test / validation splits.
TRAIN_PART, TEST_PART, VALID_PART = 0.996, 0.002, 0.002

# Compare with a tolerance: an exact `== 1` check on floats rejects perfectly
# valid fractions (e.g. 0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999).
assert math.isclose(TRAIN_PART + TEST_PART + VALID_PART, 1), \
    'TRAIN_PART, TEST_PART and VALID_PART must sum to 1'

In [None]:
from google.colab import drive

# Mount Google Drive at the parent of GDRIVE_ROOT (i.e. /content/drive), so that
# GDRIVE_ROOT itself points at the "MyDrive" folder inside the mount.
drive.mount(str(GDRIVE_ROOT.parent))

Mounted at /content/drive


In [None]:
import json

# Read the exported chat (UTF-8 JSON) and keep only its list of messages.
with (DATA_DIR / 'result.json').open(mode='r', encoding='utf8') as export_file:
    export = json.load(export_file)
raw_messages = export['messages']

In [None]:
from datetime import datetime

# Keep only short, plain-text messages that have a sender, and parse their
# timestamps into datetime objects. The source messages are left unmodified.
filtered_messages = []
for msg in raw_messages:
    # Skip entries without sender information and entries carrying a file
    # attachment (those have a 'mime_type' key).
    if 'from' not in msg or 'from_id' not in msg or 'mime_type' in msg:
        continue

    # Only non-empty plain strings shorter than 50 characters are accepted.
    text = msg['text']
    if not (text and isinstance(text, str) and len(text) < 50):
        continue

    parsed_date = datetime.strptime(msg['date'], '%Y-%m-%dT%H:%M:%S')
    filtered_messages.append({**msg, 'date': parsed_date})

In [None]:
import re

# Merge runs of consecutive messages from the same sender into one message,
# provided each message follows the previous kept one within EPISODE_DT.
# Message text is reduced to ASCII letters, digits and spaces; messages that
# become empty after this cleaning are dropped entirely.
#
# Fixes over the previous version:
#   * the time gap is computed as (current - previous); the reversed difference
#     is negative for chronologically ordered messages and therefore always
#     passed the `<= EPISODE_DT` check;
#   * sender and time are compared against the last message that was actually
#     kept, not against filtered_messages[i - 1], which may have been dropped
#     and may belong to a different sender than joined_messages[-1].
# NOTE(review): assumes filtered_messages is in chronological order — confirm.
joined_messages = []
last_kept = None  # most recent source message whose cleaned text was used
for msg in filtered_messages:
    alphanum_text = re.sub(r'[^A-Za-z0-9 ]+', '', msg['text']).strip()
    if not alphanum_text:
        continue

    if (
            last_kept is not None and
            last_kept['from_id'] == msg['from_id'] and
            msg['date'] - last_kept['date'] <= EPISODE_DT
    ):
        # Same sender continuing shortly after: extend the previous message.
        joined_messages[-1]['text'] += ' ' + alphanum_text
    else:
        # Copy so that the entry in filtered_messages keeps its original text.
        new_message = msg.copy()
        new_message['text'] = alphanum_text
        joined_messages.append(new_message)
    last_kept = msg

In [None]:
def partition(alist, indices):
    """Cut ``alist`` into consecutive slices at the given positions.

    Args:
        alist: Any sliceable sequence.
        indices: List of cut positions; each position is the start index of
            the next slice.

    Returns:
        A list with ``len(indices) + 1`` slices of ``alist``: from the start to
        the first position, between each pair of positions, and from the last
        position to the end.
    """
    starts = [0] + indices
    stops = indices + [None]  # None makes the final slice run to the end
    chunks = []
    for start, stop in zip(starts, stops):
        chunks.append(alist[start:stop])
    return chunks

In [None]:
def save_jsonl(messages, suffix, human_readable=False):
    """Split ``messages`` into episodes and write them to a JSON-lines file.

    A new episode starts wherever the time between two consecutive messages
    exceeds ``EPISODE_DT``. The output file is ``DATA_DIR / data_<suffix>.jsonl``
    with one JSON object per line of the form ``{"dialog": [[turn, ...]]}``,
    where each turn is ``{"id": <0 or 1>, "text": <message text>}`` and the id
    alternates with the position of the message inside its episode.

    Args:
        messages: Sequence of message dicts with ``'date'`` and ``'text'`` keys.
        suffix: Split name used in the file name and in the printed summary.
        human_readable: If true, write UTF-8 and keep non-ASCII characters
            unescaped; otherwise use the default file encoding and let
            ``json`` escape non-ASCII characters.
    """
    # Position p is a split point when message p arrives more than EPISODE_DT
    # after message p - 1.
    split_positions = [
        position
        for position, (earlier, later) in enumerate(zip(messages[:-1], messages[1:]), start=1)
        if later['date'] - earlier['date'] > EPISODE_DT
    ]
    episodes = partition(messages, split_positions)
    print(f'{suffix} episodes: {len(episodes)}, messages: {len(messages)}')

    open_options = {'encoding': 'utf8'} if human_readable else {}
    dump_options = {'ensure_ascii': False} if human_readable else {}
    with open(DATA_DIR / f'data_{suffix}.jsonl', 'w', **open_options) as outfile:
        for episode_messages in episodes:
            turns = []
            for turn_index, msg in enumerate(episode_messages):
                turns.append({'id': turn_index % 2, 'text': msg['text']})
            json.dump({'dialog': [turns]}, outfile, **dump_options)
            outfile.write('\n')

In [None]:
import numpy as np

# Cut the message sequence, in order, into train / test / valid parts sized by
# TRAIN_PART and TEST_PART (the remainder goes to valid), then write each part
# to its own jsonl file.
total_messages = len(joined_messages)
train_end = int(TRAIN_PART * total_messages)
test_end = int((TRAIN_PART + TEST_PART) * total_messages)
train, test, valid = np.split(joined_messages, [train_end, test_end])

for split_name, split_messages in (('train', train), ('test', test), ('valid', valid)):
    save_jsonl(split_messages, suffix=split_name)

NameError: ignored

In [None]:
import shutil
import subprocess
import time
import threading
from pathlib import Path


def dir_size_bytes(path):
    """Return the total size of ``path`` in bytes as reported by ``du``.

    Runs ``du --bytes --summarize <path>`` and parses the first field of its
    output. Raises ``subprocess.CalledProcessError`` if ``du`` exits with a
    non-zero status.
    """
    du_output = subprocess.check_output(['du', '--bytes', '--summarize', path])
    size_field = du_output.split()[0]  # output is "<size>\t<path>"
    return int(size_field.decode())

def check_chache(max_cache_size_gb=18, check_period_minutes=5):
    """Periodically delete oversized Google Drive ``content_cache`` directories.

    Marker files in ``/threads`` (one per watcher, named after the thread id)
    are used to keep a single watcher alive:

    * if ``/threads`` already contains any file, the function returns at once;
    * otherwise it creates its own marker and keeps looping for as long as that
      marker is the only file in ``/threads``. The marker is never removed by
      this function.

    On every pass, each ``content_cache`` directory found under
    ``/root/.config/Google/DriveFS`` whose size exceeds ``max_cache_size_gb``
    is removed with ``shutil.rmtree``.

    Args:
        max_cache_size_gb: Size limit per cache directory, in units of 10**9 bytes.
        check_period_minutes: Pause between two passes, in minutes.
    """
    bytes_per_gb = 10 ** 9
    this_id = str(threading.get_ident())
    thread_path = Path('/threads')
    thread_path.mkdir(exist_ok=True)

    def threads_ids():
        """Names of the marker files currently present in ``thread_path``."""
        names = []
        for entry in thread_path.iterdir():
            if entry.is_file():
                names.append(str(entry.name))
        return names

    # Another watcher has already registered itself: nothing to do.
    if threads_ids():
        return

    (thread_path / this_id).open(mode='w').close()
    # Run only while our marker is the one and only file in the directory.
    while threads_ids() == [this_id]:
        print(f'Thread {this_id} is checking chache.')

        drive_fs_root = Path('/root/.config/Google/DriveFS')
        for cache_path in drive_fs_root.glob('**/content_cache'):
            chache_path_str = str(cache_path)
            chache_size_gb = dir_size_bytes(chache_path_str) / bytes_per_gb
            if chache_size_gb > max_cache_size_gb:
                print(f'Deleting {chache_path_str} with size {chache_size_gb} GB.')
                shutil.rmtree(chache_path_str)
        time.sleep(check_period_minutes * 60)

False


In [12]:
# Background cache watcher, currently disabled.
# NOTE(review): the saved output of this cell shows the watcher deleting a
# DriveFS content_cache shortly before the run ended with FileNotFoundError —
# presumably related; confirm before re-enabling.
# threading.Thread(target=check_chache).start()

import os

# Export SAVE_DIR to the environment; the shell lines below reference $SAVE_DIR.
os.environ['SAVE_DIR'] = str(SAVE_DIR)
# WARNING: removes the whole model directory, including any earlier checkpoints,
# and recreates it empty, so every run of this cell starts from init_model.
!rm --recursive --force $SAVE_DIR
!mkdir --parents $SAVE_DIR


from parlai.scripts.train_model import TrainModel

# Fine-tune a ParlAI transformer generator on the chat data.
TrainModel.main(
    # Data: the 'jsonfile' task reading from DATA_DIR / 'data'.
    # Presumably, with jsonfile_datatype_extension=True, ParlAI appends
    # '_train.jsonl' / '_valid.jsonl' / '_test.jsonl' to this path, matching the
    # data_<suffix>.jsonl files written by save_jsonl — TODO confirm in ParlAI docs.
    task='jsonfile',
    jsonfile_datapath=str(DATA_DIR / 'data'),
    jsonfile_datatype_extension=True,

    # Model type and the path prefix under which model files are written.
    model='transformer/generator',
    model_file=str(SAVE_DIR / 'model'),
    
    # Initial weights come from the ParlAI model zoo tutorial generator.
    init_model='zoo:tutorial_transformer_generator/model',

    # Architecture and dictionary settings.
    # NOTE(review): these presumably have to match the zoo checkpoint given in
    # init_model / dict_file — verify before changing any of them.
    n_heads=16, n_layers=8, n_positions=512, text_truncate=512,
    label_truncate=128, ffn_size=2048, embedding_size=512,
    activation='gelu', variant='xlm',
    dict_lower=True, dict_tokenizer='bpe',
    dict_file='zoo:tutorial_transformer_generator/model.dict',
    learn_positional_embeddings=True,
    
    # Optimisation and scheduling: validate once per hour, checkpoint once per minute.
    lr=1e-5, optimizer='adam',
    warmup_updates=5000,
    validation_metric='ppl',
    validation_every_n_secs=60 * 60,  # running eval: valid
    save_every_n_secs=60,  # saving model checkpoint

    # Half-precision training.
    batchsize=12, fp16=True, fp16_impl='mem_efficient',
    
    skip_generation=True,
    
    dynamic_batching='full',

    label_turns='both',  # https://parl.ai/docs/core/teachers.html#parlai.core.teachers.ConversationTeacher
)

Thread 140129149110016 is checking chache.
10:50:57 | building data: /usr/local/lib/python3.7/dist-packages/data/models/tutorial_transformer_generator/tutorial_transformer_generator_v1.tar.gz
10:50:57 | Downloading http://parl.ai/downloads/_models/tutorial_transformer_generator/tutorial_transformer_generator_v1.tar.gz to /usr/local/lib/python3.7/dist-packages/data/models/tutorial_transformer_generator/tutorial_transformer_generator_v1.tar.gz


Downloading tutorial_transformer_generator_v1.tar.gz: 100%|██████████| 1.12G/1.12G [00:39<00:00, 28.7MB/s]


10:51:55 | building dictionary first...
10:51:55 | No model with opt yet at: /content/drive/MyDrive/chatbot_model/model(.opt)
10:51:55 | [33myour model is being loaded with opts that do not exist in the model you are initializing the weights with: allow_missing_init_opts: False,download_path: None,loglevel: info,dynamic_batching: full,verbose: False,datapath: /usr/local/lib/python3.7/dist-packages/data,eval_dynamic_batching: None,load_from_checkpoint: True,tensorboard_logdir: None,jsonfile_datapath: /content/drive/MyDrive/chatbot_data/data,jsonfile_datatype_extension: True,label_turns: both,n_encoder_layers: -1,n_decoder_layers: -1,model_parallel: False,beam_block_full_context: True,beam_length_penalty: 0.65,topk: 10,topp: 0.9,beam_delay: 30,beam_block_list_filename: None,temperature: 1.0,compute_tokenized_bleu: False,interactive_mode: False,fp16_impl: mem_efficient,force_fp16_tokens: False,adafactor_eps: (1e-30, 0.001),history_reversed: False,history_add_global_end_token: None,specia

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  exp_avg.mul_(beta1).add_(1 - beta1, grad)


10:52:17 | Overflow: setting loss scale to 32768.0
10:52:18 | Overflow: setting loss scale to 16384.0
10:52:26 | time:10s total_exs:6252 epochs:0.01
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
   .9000  4029 12081 623.5 6252             20207  11.59    .4716  4.65 6.099e-08 899.6  2697 104.6      .2877   
    total_train_updates  tpb   tps   ups  
                     30 4929 14778 2.998

10:52:36 | time:20s total_exs:10096 epochs:0.01
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss       lr  ltpb  ltps   ppl  token_acc  \
       1  5574 19834 379.9 3844             16384  13.14    .3837 4.453 1.33e-07 480.2  1709 85.92      .3137   
    total_train_updates  tpb   tps   ups  
                     66 6054 21543 3.559

10:52:46 | time:31s total_exs:12976 epochs:0.02
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss       lr  ltpb  ltps   ppl  token_acc  \
       1  6335 229



10:53:22 | time:66s total_exs:18984 epochs:0.03
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss       lr  ltpb  ltps  ppl  token_acc  \
       1  6364 13638 109.9 1692             16384  15.19    .3064 4.422 4.13e-07 246.8 528.9 83.3      .3133   
    total_train_updates  tpb   tps   ups  
                    206 6611 14166 2.143

10:53:32 | time:77s total_exs:20568 epochs:0.03
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss       lr  ltpb  ltps   ppl  token_acc  \
       1  6422 20036 154.4 1584             16384   15.5    .3068 4.466 4.77e-07 244.2 762.1 86.99      .3076   
    total_train_updates  tpb   tps  ups  
                    238 6666 20798 3.12

10:53:43 | time:87s total_exs:22072 epochs:0.03
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6126 18638 147.6 1504             16384  15.66    .3352  4.44 5.389e-07 227.1 690.8 84.78      .3039   
    total_



10:54:28 | time:133s total_exs:27576 epochs:0.04
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6410 12695 72.61 1100             16384  17.01    .3108 4.483 8.089e-07 172.5 341.5 88.47      .3098   
    total_train_updates  tpb   tps  ups  
                    404 6583 13036 1.98

10:54:35 | Overflow: setting loss scale to 16384.0
10:54:39 | time:143s total_exs:28764 epochs:0.04
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
   .9688  6206 19779 118.3 1188             16384   16.6    .3373 4.498 8.729e-07   173 551.4 89.79      .3067   
    total_train_updates  tpb   tps   ups  
                    436 6379 20331 3.188

10:54:49 | time:153s total_exs:29680 epochs:0.04
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6548 16920 91.03  916             16384  17.51    .312



10:55:35 | time:199s total_exs:33672 epochs:0.04
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6741 12448 57.77  876             16384  17.36    .3034 4.319 1.167e-06 139.7 257.9 75.13      .3283   
    total_train_updates  tpb   tps   ups  
                    583 6880 12705 1.847

10:55:45 | time:209s total_exs:34552 epochs:0.05
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6655 20544 87.62  880             16384  17.87    .3170 4.403 1.229e-06 128.1 395.5 81.66      .3218   
    total_train_updates  tpb   tps   ups  
                    614 6783 20940 3.087

Thread 140129149110016 is checking chache.
10:55:55 | time:219s total_exs:35108 epochs:0.05
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps  ppl  token_acc  \
       1  6857 14047 54.24  556             16384  17.61    .3406 4.366



10:56:41 | time:265s total_exs:38364 epochs:0.05
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6499 12495 50.39  760             16384  18.19    .3681 4.366 1.509e-06 125.2 240.7 78.69      .3184   
    total_train_updates  tpb   tps   ups  
                    754 6624 12736 1.923

10:56:51 | time:275s total_exs:39264 epochs:0.05
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps  ppl  token_acc  \
       1  6393 19802 89.92  900             16384  17.48    .3318 4.138 1.571e-06 124.7 386.4 62.7      .3504   
    total_train_updates  tpb   tps   ups  
                    785 6518 20188 3.098

10:57:01 | time:285s total_exs:39784 epochs:0.05
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps  ppl  token_acc  \
       1  6568 13296 50.13  520             16384  18.89    .3681  4.29 1.613e-06   108 218.7   73      .3332   
   



10:57:47 | time:331s total_exs:42664 epochs:0.06
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6567 11949 42.86  636             16384  18.46    .3768  4.38 1.837e-06 115.2 209.7 79.82      .3247   
    total_train_updates  tpb   tps  ups  
                    918 6682 12159 1.82

10:57:57 | time:341s total_exs:43372 epochs:0.06
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps  ppl  token_acc  \
       1  6667 19036  69.7  708             16384  17.75    .3526 4.194 1.895e-06 112.1 320.2 66.3      .3373   
    total_train_updates  tpb   tps   ups  
                    947 6779 19356 2.856

10:58:07 | time:351s total_exs:44032 epochs:0.06
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps  ppl  token_acc  \
       1  6503 17938 65.02  660             16384  17.88    .3973 4.355 1.951e-06 112.2 309.4 77.9      .3136   
    t



10:58:53 | time:397s total_exs:46580 epochs:0.06
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6275 10541 38.77  600             16384   17.5    .3903 4.258 2.173e-06 114.1 191.7 70.69      .3306   
    total_train_updates  tpb   tps  ups  
                   1086 6389 10733 1.68

10:58:54 | Overflow: setting loss scale to 16384.0
10:58:56 | Overflow: setting loss scale to 16384.0
10:59:04 | time:408s total_exs:47308 epochs:0.06
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps  ppl  token_acc  \
   .9355  6223 18577  70.1  728             16384  17.51    .4095 4.227 2.235e-06 108.2 322.9 68.5      .3251   
    total_train_updates  tpb   tps   ups  
                   1117 6331 18899 2.986

10:59:14 | time:418s total_exs:47676 epochs:0.06
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6446 



10:59:59 | time:464s total_exs:49976 epochs:0.07
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6358 11146 37.76  560             16384  18.64    .3904  4.15 2.495e-06 97.19 170.4 63.42      .3411   
    total_train_updates  tpb   tps   ups  
                   1247 6455 11316 1.753

11:00:10 | time:474s total_exs:50540 epochs:0.07
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6711 19084 55.31  564             16384   19.9    .4144 4.083 2.553e-06 91.69 260.8 59.34      .3524   
    total_train_updates  tpb   tps   ups  
                   1276 6802 19345 2.844

11:00:19 | Overflow: setting loss scale to 16384.0
11:00:20 | time:484s total_exs:51024 epochs:0.07
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
   .9600  6528 16013 47.49  484             16384   18.2    .3



11:01:05 | time:530s total_exs:53360 epochs:0.07
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6423 11944    34  512             16384  18.79    .3811 4.072 2.841e-06 78.79 146.5 58.65      .3708   
    total_train_updates  tpb   tps  ups  
                   1420 6502 12091 1.86

11:01:15 | Overflow: setting loss scale to 16384.0
11:01:15 | time:540s total_exs:53908 epochs:0.07
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
   .9667  6396 19051  54.4  548             16384  18.91    .4126 4.218 2.901e-06 85.27   254 67.91      .3307   
    total_train_updates  tpb   tps   ups  
                   1450 6482 19305 2.979

11:01:26 | time:550s total_exs:54444 epochs:0.07
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6264 16744 53.06  536             16384  19.11    .405



11:02:57 | time:641s total_exs:56516 epochs:0.07
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
   .9600  5987  2485 9.296  560             16384  16.77    .3808 4.169 3.169e-06 114.6 47.58 64.64      .3172   
    total_train_updates  tpb  tps   ups  
                   1584 6101 2532 .4150

11:03:07 | time:651s total_exs:57000 epochs:0.08
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6558 18998 48.35  484             16384  21.15    .4093 4.071 3.227e-06 73.31 212.4 58.63      .3401   
    total_train_updates  tpb   tps   ups  
                   1613 6631 19211 2.897

11:03:11 | Overflow: setting loss scale to 16384.0
11:03:17 | time:661s total_exs:57472 epochs:0.08
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
   .9655  6479 18770 47.15  472             16384  20.05    .409



11:04:02 | time:707s total_exs:59464 epochs:0.08
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6453 11615 30.67  460             16384  20.18    .3996 4.144 3.515e-06 89.78 161.6 63.05      .3296   
    total_train_updates  tpb   tps  ups  
                   1757 6542 11777  1.8

11:04:12 | Overflow: setting loss scale to 16384.0
11:04:13 | time:717s total_exs:59984 epochs:0.08
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
   .9677  6186 18604 50.44  520             16384  19.99    .3996 4.218 3.577e-06 90.65 272.6 67.86      .3142   
    total_train_updates  tpb   tps   ups  
                   1788 6277 18876 3.008

11:04:21 | Overflow: setting loss scale to 16384.0
11:04:23 | time:727s total_exs:60596 epochs:0.08
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
   .9667  612



11:05:15 | time:779s total_exs:62700 epochs:0.08
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6242  7949 24.34  516             16384  18.66    .4094 4.096 3.871e-06 97.15 123.7 60.07      .3317   
    total_train_updates  tpb  tps   ups  
                   1935 6339 8073 1.274

11:05:25 | time:789s total_exs:63144 epochs:0.08
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps  ppl  token_acc  \
       1  6176 17428  43.2  444             16384  20.51    .4094 4.135 3.929e-06 77.59 218.9 62.5      .3369   
    total_train_updates  tpb   tps   ups  
                   1964 6254 17647 2.822

11:05:35 | time:799s total_exs:63568 epochs:0.08
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6535 18283 42.36  424             16384  20.03    .4094 3.988 3.985e-06 71.46 199.9 53.94      .3478   
   



11:07:22 | time:906s total_exs:65200 epochs:0.09
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6052  2064 4.984  380             16384   21.3    .3902 3.989 4.199e-06 68.73 23.44 53.97      .3665   
    total_train_updates  tpb  tps   ups  
                   2099 6121 2087 .3410

11:07:32 | time:916s total_exs:65720 epochs:0.09
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  5837 17452 51.82  520             16384  20.36    .3856 3.995 4.259e-06  78.4 234.4 54.31      .3588   
    total_train_updates  tpb   tps  ups  
                   2129 5916 17686 2.99

11:07:35 | Overflow: setting loss scale to 16384.0
11:07:42 | time:926s total_exs:66180 epochs:0.09
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
   .9677  6135 18615 45.02  460             16384  20.16    .3997 



11:08:27 | Overflow: setting loss scale to 16384.0
11:08:27 | time:972s total_exs:68004 epochs:0.09
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
   .9286  6166 11464 28.95  436             16384   19.2    .4031 3.817 4.553e-06 65.04 120.9 45.45      .3740   
    total_train_updates  tpb   tps   ups  
                   2276 6231 11585 1.859

11:08:38 | time:982s total_exs:68504 epochs:0.09
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6133 18445  48.5  500             16384  19.67    .4094 4.051 4.615e-06 83.55 251.3 57.45      .3405   
    total_train_updates  tpb   tps   ups  
                   2307 6217 18697 3.008

11:08:48 | time:992s total_exs:68908 epochs:0.09
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6159 18246 39.89  404             16384  21.24    .3



11:10:20 | time:1084s total_exs:70664 epochs:0.09
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6209  2633 5.676  348             16384  21.44    .4094 4.201 4.909e-06    69 29.26 66.75      .3339   
    total_train_updates  tpb  tps   ups  
                   2454 6278 2662 .4241

11:10:30 | time:1094s total_exs:71044 epochs:0.09
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6235 17461 38.01  380             16384  21.18    .4124 4.103 4.965e-06 66.68 186.7 60.55      .3321   
    total_train_updates  tpb   tps   ups  
                   2482 6301 17647 2.801

11:10:32 | Overflow: setting loss scale to 16384.0
11:10:40 | time:1104s total_exs:71380 epochs:0.09
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
   .9630  6170 16512  33.3  336             16384  21.81    .



11:11:41 | time:1165s total_exs:72860 epochs:0.10
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6237  5323 11.95  364             16384  21.13    .4094 3.996 5.232e-06 64.38 54.95 54.39      .3501   
    total_train_updates  tpb  tps   ups  
                   2616 6301 5378 .8535

11:11:51 | time:1176s total_exs:73272 epochs:0.10
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6039 18170 39.98  412             16384  20.87    .4058 3.932 5.294e-06 67.13   202 51.02      .3518   
    total_train_updates  tpb   tps   ups  
                   2647 6106 18372 3.009

11:12:01 | time:1186s total_exs:73676 epochs:0.10
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps  ppl  token_acc  \
       1  6021 17992 40.23  404             16384  20.86    .3856 4.091 5.354e-06 69.17 206.6 59.8      .3214   




11:12:47 | time:1231s total_exs:75388 epochs:0.10
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss       lr  ltpb  ltps   ppl  token_acc  \
       1  6026 11447 24.97  368             16384  21.28    .3762 3.838 5.59e-06 58.11 110.4 46.42      .3835   
    total_train_updates  tpb   tps  ups  
                   2795 6084 11558  1.9

11:12:57 | time:1241s total_exs:75836 epochs:0.10
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6087 18433 43.76  448             16384  20.21    .3844 3.979 5.652e-06 69.77 211.3 53.47      .3430   
    total_train_updates  tpb   tps   ups  
                   2826 6156 18644 3.029

11:13:07 | time:1251s total_exs:76300 epochs:0.10
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  5912 17965 45.48  464             16384  20.77    .3682 3.892 5.714e-06 66.03 200.6 49.03      .3561   




11:14:41 | time:1346s total_exs:77904 epochs:0.10
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6188  2334 5.092  324             16384  21.65    .3682 3.934 5.934e-06 61.33 23.13 51.13      .3342   
    total_train_updates  tpb  tps   ups  
                   2967 6250 2357 .3772

11:14:52 | time:1356s total_exs:78360 epochs:0.10
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  5916 16809 44.67  456             16384  20.16    .3682 3.938 5.992e-06 75.79 215.3 51.32      .3453   
    total_train_updates  tpb   tps   ups  
                   2996 5992 17024 2.841

11:15:02 | time:1366s total_exs:78788 epochs:0.10
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6179 18472 42.65  428             16384  21.11    .4032 3.777 6.052e-06  61.7 184.4 43.69      .3603  



11:15:47 | time:1411s total_exs:80372 epochs:0.11
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  5942 11125 25.68  384             16384  20.82    .3682 3.963 6.288e-06 65.75 123.1 52.62      .3542   
    total_train_updates  tpb   tps   ups  
                   3144 6008 11249 1.872

Thread 140129149110016 is checking chache.
Deleting /root/.config/Google/DriveFS/107380112206456973130/content_cache with size 22.315599233 GB.
11:15:57 | time:1421s total_exs:80736 epochs:0.11
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        lr  ltpb  ltps   ppl  token_acc  \
       1  6162 17850 36.36  364             16384  21.55    .3682 3.813 6.346e-06 57.72 167.2 45.29      .3495   
    total_train_updates  tpb   tps   ups  
                   3173 6220 18017 2.897

11:16:07 | time:1432s total_exs:81132 epochs:0.11
    clip  ctpb  ctps  exps  exs  fp16_loss_scalar  gnorm  gpu_mem  loss        l



FileNotFoundError: ignored

In [13]:
# List the files currently present in the model directory on Drive.
!ls /content/drive/MyDrive/chatbot_model/

model.checkpoint	      model.checkpoint.dict.opt
model.checkpoint.dict	      model.checkpoint.opt
model.checkpoint.dict.codecs  model.checkpoint.trainstats
