In [None]:
# Step 1: Setup Environment
!pip install fsspec==2024.10.0
!pip install transformers datasets torch onnx onnxruntime

In [None]:
# Step 2: Import Libraries
import os
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
import torch
from google.colab import drive

In [None]:
# Step 3: Mount Google Drive
# Google Drive is attached under this directory; the dataset is read from it
# and the exported models are copied back to it in later cells.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)


In [None]:
# Step 4: Load Dataset
dataset_path = '/content/drive/My Drive/generated_datasets.json'  # Update with your path
# Read explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale and can mis-decode non-ASCII text in the conversations.
with open(dataset_path, 'r', encoding='utf-8') as f:
    data = json.load(f)


In [None]:
# Convert dataset to DialoGPT format
# DialoGPT expects every dialogue turn to be terminated by the end-of-text
# token (the inference cell prompts with `user_input + tokenizer.eos_token`),
# so the training text must use the same separator rather than newlines.
train_file = "train.txt"
# Literal GPT-2/DialoGPT EOS token; the tokenizer is only loaded in a later
# cell, so it cannot be read from `tokenizer.eos_token` here.
EOS_TOKEN = "<|endoftext|>"
# Explicit UTF-8 so non-ASCII conversation text is written identically on
# every platform.
with open(train_file, "w", encoding="utf-8") as f:
    for convo in data["conversations"]:
        f.write(f"{convo['user']}{EOS_TOKEN}{convo['bot']}{EOS_TOKEN}")

In [None]:
# Step 5: Load DialoGPT Model and Tokenizer
# Both objects are loaded from the same checkpoint name so that they are
# guaranteed to refer to the same pretrained model.
model_name = "microsoft/DialoGPT-medium"
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_name)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)


In [None]:
# Step 6: Preprocessing
def create_dataset(file_path, tokenizer, block_size=512):
    """Build a ``TextDataset`` over the text file at ``file_path``.

    Args:
        file_path: Path of the plain-text training file.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Block size forwarded to ``TextDataset`` (default 512).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    dataset_options = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_options)

def create_data_collator(tokenizer):
    """Return a language-modeling collator with masked-LM disabled.

    Args:
        tokenizer: Tokenizer handed to ``DataCollatorForLanguageModeling``.

    Returns:
        A ``DataCollatorForLanguageModeling`` built with ``mlm=False``.
    """
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    return collator

# Instantiate the two objects the Trainer needs: the batch collator and the
# dataset built from the training text file written earlier.
data_collator = create_data_collator(tokenizer=tokenizer)
train_dataset = create_dataset(file_path=train_file, tokenizer=tokenizer)

In [None]:
import torch
# Report whether PyTorch can see a CUDA device before starting training.
gpu_available = torch.cuda.is_available()
print(gpu_available)  # Should return True if GPU is available


In [None]:
# Step 7: Training
# Hyperparameters are collected in one mapping so they can be inspected or
# tweaked in a single place before the TrainingArguments object is built.
training_config = {
    "output_dir": "./results",
    "overwrite_output_dir": True,
    "num_train_epochs": 3,
    "per_device_train_batch_size": 2,
    "save_steps": 500,
    "save_total_limit": 2,
    "logging_dir": "./logs",
    "logging_steps": 100,
}
training_args = TrainingArguments(**training_config)

# Wire the model, arguments, dataset and collator together, then fine-tune.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
)
trainer.train()



<IPython.core.display.Javascript object>

In [None]:
# Step 8: Save the Fine-Tuned Model
# Create one output directory per export format; existing directories are
# left as they are.
for export_dir in ("onnx", "pytorch"):
    os.makedirs(export_dir, exist_ok=True)

In [None]:
# Save PyTorch Model
# Persist only the weights (state dict), not the full pickled model object.
fine_tuned_state = model.state_dict()
torch.save(fine_tuned_state, "pytorch/dialoGPT_model.pth")


In [None]:
# Save ONNX Model
class _LogitsOnly(torch.nn.Module):
    """Wrap the language model so the exported graph has one tensor output.

    The wrapped model is called with ``use_cache=False`` and
    ``return_dict=False`` so that no cache tensors are returned and the
    result is a plain tuple whose first element is the logits.
    """

    def __init__(self, lm):
        super().__init__()
        self.lm = lm

    def forward(self, input_ids):
        return self.lm(input_ids=input_ids, use_cache=False, return_dict=False)[0]


# Training leaves the model in train mode; switch to eval so dropout is not
# active while the graph is traced.
model.eval()
# The Trainer may have moved the model to the GPU, so the dummy input must be
# created on the same device or tracing fails with a device mismatch.
export_device = next(model.parameters()).device
# A sequence longer than one token avoids tracing a degenerate length-1 case.
dummy_input = torch.zeros((1, 8), dtype=torch.long, device=export_device)
with torch.no_grad():
    torch.onnx.export(
        _LogitsOnly(model).eval(),
        dummy_input,
        "onnx/dialoGPT_model.onnx",
        input_names=["input_ids"],
        output_names=["output"],
        # The output follows the input's batch and sequence dimensions, so it
        # needs the same dynamic axes as the input.
        dynamic_axes={
            "input_ids": {0: "batch_size", 1: "sequence_length"},
            "output": {0: "batch_size", 1: "sequence_length"},
        },
    )

In [None]:
# Step 9: Save to Google Drive
!cp -r onnx /content/drive/MyDrive/fine_tuned_models/
!cp -r pytorch /content/drive/MyDrive/fine_tuned_models/


In [None]:
# Step 10: Inference
def chat_with_bot(user_input, model, tokenizer):
    """Generate a single reply to ``user_input``.

    Args:
        user_input: The user's message as plain text.
        model: Causal language model exposing ``parameters()``, ``eval()``
            and ``generate()``.
        tokenizer: Tokenizer matching ``model``; it must provide
            ``eos_token``, ``eos_token_id`` and ``decode``.

    Returns:
        The decoded reply text, i.e. only the tokens generated after the
        prompt, with special tokens removed.
    """
    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask, which generate() needs because the pad token equals the EOS token.
    encoded = tokenizer(user_input + tokenizer.eos_token, return_tensors="pt")
    # Inputs must live on the same device as the model (GPU after training).
    device = next(model.parameters()).device
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # Training leaves the model in train mode; disable dropout for inference.
    model.eval()
    chat_history_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=1000,
        pad_token_id=tokenizer.eos_token_id,
        # top_p / top_k / temperature only take effect when sampling is on.
        do_sample=True,
        top_p=0.92,
        top_k=50,
        temperature=0.7,
    )
    # Drop the prompt tokens so that only the newly generated reply remains.
    reply_ids = chat_history_ids[0, input_ids.shape[-1]:]
    response = tokenizer.decode(reply_ids, skip_special_tokens=True)
    return response

In [None]:
# Example Chat
# Type "exit" (any letter case, surrounding whitespace ignored) to stop.
while True:
    try:
        user_input = input("User: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the cell was interrupted: end the chat
        # without a traceback.
        break
    if user_input.lower() == "exit":
        break
    if not user_input:
        # Nothing typed; ask again instead of sending an empty prompt.
        continue
    bot_response = chat_with_bot(user_input, model, tokenizer)
    print(f"Bot: {bot_response}")

In [None]:
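# Weights & Biases API key: [REDACTED]. Revoke the key that was previously pasted here and supply a new one at runtime through the WANDB_API_KEY environment variable or wandb.login(); never store it in the notebook.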