In [1]:
import torch
import pandas as pd

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AdamW,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")
optimizer = AdamW(model.parameters(), lr=5e-5)



In [3]:
file_path = "dataset/training.txt"
with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

df = pd.DataFrame(lines, columns=["text"])

In [4]:
class ChatDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.texts = texts
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
        }


dataset = ChatDataset(df["text"].tolist(), tokenizer)

train_dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [6]:
# Tokenize and calculate the maximum length
max_token_length = 0
for text in df["text"]:
    tokens = tokenizer.encode(text, truncation=False)  # Don't truncate yet
    max_token_length = max(max_token_length, len(tokens))

print(f"Maximum length of your dataset: {max_token_length} tokens")

Maximum length of your dataset: 770 tokens


In [7]:
# Initialize variables to accumulate token lengths
total_tokens = 0
num_rows = len(df)

# Loop through each row in the dataset
for text in df["text"]:
    tokens = tokenizer.encode(text, truncation=False)  # Don't truncate yet
    total_tokens += len(tokens)

# Calculate the average length of the tokens
average_token_length = total_tokens / num_rows
print(f"Average token length: {average_token_length:.2f} tokens")

Average token length: 56.75 tokens


In [15]:
fine_tuned_model = AutoModelForCausalLM.from_pretrained("./chatbot_model")
fine_tuned_tokenizer = AutoTokenizer.from_pretrained("./chatbot_model")

input_text = "What is an atom?"
input_ids = fine_tuned_tokenizer.encode(
    "User: " + input_text + "\nBot:", return_tensors="pt"
)
response_ids = fine_tuned_model.generate(
    input_ids,
    max_length=100,
    pad_token_id=fine_tuned_tokenizer.eos_token_id,
    no_repeat_ngram_size=2,
    top_k=50,
    top_p=0.9,
    temperature=0.7,
)
response_text = fine_tuned_tokenizer.decode(
    response_ids[:, input_ids.shape[-1] :][0], skip_special_tokens=True
)

print("Chatbot:", response_text)

Chatbot:  The atom is the smallest unit of matter. It is a single atom that has the same number of protons as all other atoms. The number is called the atomic number (N)."

