In [None]:
pip install transformers

In [None]:
pip install datasets

In [None]:
pip install transformers[torch]

In [12]:
import os
from transformers import Trainer, TrainingArguments, GPT2LMHeadModel, GPT2Tokenizer
from datasets import load_dataset

# Define paths
output_dir = "C:\\Users\\USER\\Desktop\\Markopolo.ai\\LLM\\results"
data_file = "C:\\Users\\USER\\Desktop\\Markopolo.ai\\LLM\\watch_data.jsonl"

# Ensure the output directory exists
if not os.path.exists(output_dir):
    try:
        os.makedirs(output_dir)
        print(f"Directory {output_dir} created successfully.")
    except Exception as e:
        print(f"Error creating directory {output_dir}: {e}")

# Load the dataset and split it into 80% training and 20% testing
try:
    train_dataset = load_dataset('json', data_files=data_file, split='train[:80%]')
    test_dataset = load_dataset('json', data_files=data_file, split='train[80%:]')
    print("Dataset loaded and split successfully.")
except Exception as e:
    print(f"Error loading dataset: {e}")

# Load a pre-trained model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Set the pad_token to a new special token if needed
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))  # Resize the model embeddings to account for new tokens

# Define the tokenization function with labels
def tokenize_function(examples):
    combined_texts = [
        "User: " + prompt + "\nModel: " + response
        for prompt, response in zip(examples['prompt'], examples['response'])
    ]
    # Tokenize with padding and return attention mask
    tokenized_data = tokenizer(combined_texts, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
    
    # Set the labels to be the same as input_ids
    tokenized_data["labels"] = tokenized_data["input_ids"].clone()
    # Replace padding tokens in labels with -100
    tokenized_data["labels"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in labels]
        for labels in tokenized_data["labels"]
    ]
    
    return tokenized_data

# Tokenize the datasets
try:
    tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
    tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
    print("Tokenization completed successfully.")
except KeyError as e:
    print(f"KeyError during tokenization: {e}")
    exit(1)

# Set up training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir=output_dir,
    save_total_limit=3,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

# Fine-tune the model
try:
    trainer.train()
    print("Model training completed successfully.")
except Exception as e:
    print(f"Error during training: {e}")




Dataset loaded and split successfully.
Tokenization completed successfully.




  0%|          | 0/168 [00:00<?, ?it/s]

In [None]:
# Function to interact with the fine-tuned model
def chat_with_model():
    while True:
        user_input = input("You: ")
        if user_input.lower() in ['exit', 'quit']:
            break
        inputs = tokenizer.encode(user_input, return_tensors="pt")
        
        # Generate output with attention mask
        outputs = model.generate(
            inputs, 
            max_length=150, 
            pad_token_id=tokenizer.pad_token_id, 
            temperature=0.7,  # Adjust temperature for randomness
            top_k=50,  # Limit to the top-k most likely tokens
            top_p=0.9)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Model: {response}")

# Start the chat
chat_with_model()

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Model: what's the price of the Fossil watch?
The Fossil Watch is an Fossil Watch with an Automatic Chronograph Display and Automatic Watch Dial. It is an an anesthetized and Fossil Watch with an Automatic Chronograph Display and Automatic Watch Dial. It is an anesthetized and an anesthetized and an anesthetized and an anesthetized and an anesthetized and an anest Fossil Watch with an Automatic Chronograph Display and Automatic Watch Dial. It is an anesthetized and an anesthetized and an anesthetized and an anesthetized and an anesthetized and an anesthetized and an anesthetized and
