<a href="https://colab.research.google.com/github/vinnik-dmitry07/Chatbot/blob/main/train_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvidia-smi
!pip install --quiet parlai

In [23]:
from pathlib import Path

GDRIVE_ROOT = Path('/content/drive/MyDrive/')
SAVE_DIR = GDRIVE_ROOT / 'chatbot_model'
DATA_DIR = GDRIVE_ROOT / 'chatbot_data'

In [24]:
from datetime import timedelta

EPISODE_DT = timedelta(minutes=3)  # change to split messages in separate dialogues if time delta is greater than EPISODE_DT
TRAIN_PART, TEST_PART, VALID_PART = 0.996, 0.002, 0.002

assert TRAIN_PART + TEST_PART + VALID_PART == 1

In [15]:
from google.colab import drive

drive.mount(str(GDRIVE_ROOT.parent))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
import json

with open(DATA_DIR / 'result.json', 'r', encoding='utf8') as f:
    raw_messages = json.load(f)['messages']

In [17]:
from datetime import datetime

filtered_messages = []
for msg in raw_messages:
    if (
            'from' in msg and
            'from_id' in msg and
            'mime_type' not in msg and
            msg['text'] and
            isinstance(msg['text'], str)
    ):
        msg['date'] = datetime.strptime(msg['date'], '%Y-%m-%dT%H:%M:%S')
        filtered_messages.append(msg)

In [18]:
joined_messages = [filtered_messages[0]]
for i in range(1, len(filtered_messages)):
    if (
            filtered_messages[i - 1]['from_id'] == filtered_messages[i]['from_id'] and
            filtered_messages[i - 1]['date'] - filtered_messages[i]['date'] <= EPISODE_DT
    ):
        joined_messages[-1]['text'] += ' ' + filtered_messages[i]['text']
    else:
        joined_messages.append(filtered_messages[i])

In [19]:
def partition(alist, indices):
    return [alist[a:b] for a, b in zip([0] + indices, indices + [None])]

In [20]:
def save_jsonl(messages, suffix, human_readable=False):
    time_diffs = [messages[i + 1]['date'] - messages[i]['date'] for i in range(len(messages) - 1)]
    split_positions = [i + 1 for i in range(len(time_diffs)) if time_diffs[i] > EPISODE_DT]
    episodes = partition(messages, split_positions)
    print(f'{suffix} episodes: {len(episodes)}, messages: {len(messages)}')

    with open(DATA_DIR / f'data_{suffix}.jsonl', 'w', **({'encoding': 'utf8'} if human_readable else {})) as outfile:
        for episode in episodes:
            dialog = [
                {
                    'id': i % 2,
                    'text': msg['text'].replace('\n', ' '),
                } for i, msg in enumerate(episode)
            ]

            episode = {'dialog': [dialog]}
            json.dump(episode, outfile, **({'ensure_ascii': False} if human_readable else {}))
            outfile.write('\n')

In [21]:
import numpy as np

train, test, valid = np.split(joined_messages, [
    int(TRAIN_PART * len(joined_messages)),
    int((TRAIN_PART + TEST_PART) * len(joined_messages)),
])

save_jsonl(train, suffix='train')
save_jsonl(test, suffix='test')
save_jsonl(valid, suffix='valid')

train episodes: 346, messages: 861230
test episodes: 1, messages: 1729
valid episodes: 1, messages: 1730


In [22]:
#import os

#os.environ['SAVE_DIR'] = str(SAVE_DIR)
#!rm --recursive --force $SAVE_DIR
#!mkdir --parents $SAVE_DIR


from parlai.scripts.train_model import TrainModel

TrainModel.main(
    task='jsonfile',
    jsonfile_datapath=str(DATA_DIR / 'data'),
    jsonfile_datatype_extension=True,

    model='transformer/generator',
    model_file=str(SAVE_DIR / 'model'),
    
    init_model='zoo:tutorial_transformer_generator/model',

    n_heads=16, n_layers=8, n_positions=512, text_truncate=512,
    label_truncate=128, ffn_size=2048, embedding_size=512,
    activation='gelu', variant='xlm',
    dict_lower=True, dict_tokenizer='bpe',
    dict_file='zoo:tutorial_transformer_generator/model.dict',
    learn_positional_embeddings=True,
    
    lr=1e-5, optimizer='adam',
    warmup_updates=5000,
    validation_metric='ppl',
    validation_every_n_secs=60 * 60,
    save_every_n_secs=10 * 60,  # saving model checkpoint

    batchsize=12, fp16=True, fp16_impl='mem_efficient',
    
    skip_generation=True,
    
    dynamic_batching='full',
)

19:31:38 | building dictionary first...
19:31:38 | [33mOverriding opt["jsonfile_datapath"] to /content/drive/MyDrive/chatbot_data/data (previously: data)[0m
19:31:38 | [33mOverriding opt["init_model"] to zoo:tutorial_transformer_generator/model (previously: /usr/local/lib/python3.6/dist-packages/data/models/tutorial_transformer_generator/model)[0m
19:31:38 | [33mOverriding opt["optimizer"] to adam (previously: mem_eff_adam)[0m
19:31:38 | [33myour model is being loaded with opts that do not exist in the model you are initializing the weights with: download_path: None,datapath: /usr/local/lib/python3.6/dist-packages/data,interactive_mode: False[0m
19:31:38 | [33myour model is being loaded with opts that differ from the model you are initializing the weights with. Add the following args to your run command to change this: 
--jsonfile-datapath data --force-fp16-tokens False --optimizer mem_eff_adam[0m
19:31:38 | Using CUDA
19:31:38 | loading dictionary from /content/drive/MyDrive

KeyboardInterrupt: ignored