In [1]:
import os

from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import numpy as np
import re
from PyPDF2 import PdfReader
import os

In [3]:


def load_dataset(file_path, tokenizer, block_size = 128):
    dataset = TextDataset(
        tokenizer = tokenizer,
        file_path = file_path,
        block_size = block_size,
    )
    return dataset

def load_data_collator(tokenizer, mlm = False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, 
        mlm=mlm,
    )
    return data_collator

def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
  tokenizer = GPT2Tokenizer.from_pretrained(model_name)
  train_dataset = load_dataset(train_file_path, tokenizer)
  data_collator = load_data_collator(tokenizer)

  tokenizer.save_pretrained(output_dir)
      
  model = GPT2LMHeadModel.from_pretrained(model_name)

  model.save_pretrained(output_dir)

  training_args = TrainingArguments(
          output_dir=output_dir,
          overwrite_output_dir=overwrite_output_dir,
          per_device_train_batch_size=per_device_train_batch_size,
          num_train_epochs=num_train_epochs,
      )

  trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=train_dataset,
  )
      
  trainer.train()
  trainer.save_model()

In [4]:
import json

In [5]:
with open('Ecommerce_FAQ_Chatbot_dataset.json', 'r') as json_file:
    data = json.load(json_file)

In [6]:
# Extract questions and answers
questions = [entry['question'] for entry in data['questions']]
answers = [entry['answer'] for entry in data['questions']]

In [7]:
# Store questions and answers in a text file
with open('train.txt', 'w') as text_file:
    for q, a in zip(questions, answers):
        text_file.write(f"[Q] {q}\n[A] {a}\n\n")

In [21]:
#train_file_path = "/content/drive/MyDrive/ColabNotebooks/data/chatbot_docs/combined_text/full_text/train.txt"
train_file_path = "train.txt"
model_name = 'gpt2'
#output_dir = '/content/drive/MyDrive/ColabNotebooks/models/chat_models/custom_full_text'
output_dir = 'custom_q_and_a'
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 100
save_steps = 50000

In [22]:
# Train
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

100%|██████████| 400/400 [02:13<00:00,  3.00it/s]


{'train_runtime': 133.2579, 'train_samples_per_second': 24.014, 'train_steps_per_second': 3.002, 'train_loss': 0.17795049667358398, 'epoch': 100.0}


In [23]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [31]:
def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return model


def load_tokenizer(tokenizer_path):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    return tokenizer

def generate_text(model_path, sequence, max_length):
    
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    decoded_text = tokenizer.decode(final_outputs[0], skip_special_tokens=True)

    # Split the decoded text by newline character '\n'
    split_text = decoded_text.split('\n')

    # Get the first [Q] and [A] pair
    first_qa_pair = split_text[:2]  # Take only the first [Q] and [A]

    # Join the [Q] and [A] pair back together with a newline character
    formatted_text = '\n'.join(first_qa_pair)
    # print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))
    return formatted_text

In [32]:
model2_path = "custom_q_and_a"
sequence2 = "[Q] Do you offer international shipping and how can I track my order? "
max_len = 100
answer = generate_text(model2_path,sequence2,max_len)

In [33]:
print(answer)

[Q] Do you offer international shipping and how can I track my order? [A] If a product is listed as'sold out,' it is currently unavailable for purchase in your country. Please check back later or sign up for notifications when it becomes available again.



# SQL FAQ

In [40]:
from datasets import load_dataset

dataset = load_dataset("b-mc2/sql-create-context")

Downloading readme: 100%|██████████| 3.35k/3.35k [00:00<00:00, 3.35MB/s]


Downloading and preparing dataset json/b-mc2--sql-create-context to C:/Users/kimwa/.cache/huggingface/datasets/b-mc2___json/b-mc2--sql-create-context-21a3552632daf3cf/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data: 100%|██████████| 21.8M/21.8M [00:13<00:00, 1.59MB/s]
Downloading data files: 100%|██████████| 1/1 [00:17<00:00, 17.41s/it]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 105.16it/s]
                                                                   

Dataset json downloaded and prepared to C:/Users/kimwa/.cache/huggingface/datasets/b-mc2___json/b-mc2--sql-create-context-21a3552632daf3cf/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00,  6.65it/s]


In [46]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'context', 'answer'],
        num_rows: 78577
    })
})

In [48]:

def train_text_generate_question_answer(data, train_name):
    # Extract questions and answers
    questions = [entry['question'] for entry in data]
    answers = [entry['answer'] for entry in data]

    # Store questions and answers in a text file
    with open(train_name, 'w') as text_file:
        for q, a in zip(questions, answers):
            text_file.write(f"[Q] {q}\n[A] {a}\n\n")

In [49]:
train_text_generate_question_answer(dataset['train'], 'sql_train.txt')

In [62]:
train_file_path = os.path.join(os.getcwd(),"sql_train.txt" )
model_name = 'gpt2'
output_dir = 'custom_sql'
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 20
save_steps = 50000

In [64]:
from utils import train
# Train
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

  0%|          | 56/72800 [00:19<6:39:35,  3.03it/s]