In [None]:
pip install transformers

In [None]:
pip install datasets

In [None]:
pip install transformers[torch]

In [None]:
import os
from transformers import Trainer, TrainingArguments, GPT2LMHeadModel, GPT2Tokenizer
from datasets import load_dataset

# Defining paths
output_dir = "C:\\Users\\USER\\Desktop\\Markopolo.ai\\LLM\\results"
data_file = "C:\\Users\\USER\\Desktop\\Markopolo.ai\\LLM\\watch_data.jsonl"

# Ensuring the output directory exists
if not os.path.exists(output_dir):
    try:
        os.makedirs(output_dir)
        print(f"Directory {output_dir} created successfully.")
    except Exception as e:
        print(f"Error creating directory {output_dir}: {e}")

# Loading the dataset and splitting it into 80% training and 20% testing
try:
    train_dataset = load_dataset('json', data_files=data_file, split='train[:80%]')
    test_dataset = load_dataset('json', data_files=data_file, split='train[80%:]')
    print("Dataset loaded and split successfully.")
except Exception as e:
    print(f"Error loading dataset: {e}")

In [12]:

# Loading a pre-trained model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Setting the pad_token to a new special token if needed
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))  

# Defining the tokenization function with labels
def tokenize_function(examples):
    combined_texts = [
        "User: " + prompt + "\nModel: " + response
        for prompt, response in zip(examples['prompt'], examples['response'])
    ]
    # Tokenizing with padding and return attention mask
    tokenized_data = tokenizer(combined_texts, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
    
    # Setting the labels to be the same as input_ids
    tokenized_data["labels"] = tokenized_data["input_ids"].clone()
    # Replacing padding tokens in labels with -100
    tokenized_data["labels"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in labels]
        for labels in tokenized_data["labels"]
    ]
    
    return tokenized_data

# Tokenizing the datasets
try:
    tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
    tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
    print("Tokenization completed successfully.")
except KeyError as e:
    print(f"KeyError during tokenization: {e}")
    exit(1)

# Setting up training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir=output_dir,
    save_total_limit=3,
)

# Initializing the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

# Fine-tuning the model
try:
    trainer.train()
    print("Model training completed successfully.")
except Exception as e:
    print(f"Error during training: {e}")




Dataset loaded and split successfully.
Tokenization completed successfully.




  0%|          | 0/168 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

{'eval_loss': 1.837031364440918, 'eval_runtime': 35.5068, 'eval_samples_per_second': 1.183, 'eval_steps_per_second': 0.591, 'epoch': 1.0}


  0%|          | 0/21 [00:00<?, ?it/s]

{'eval_loss': 2.083439350128174, 'eval_runtime': 51.4107, 'eval_samples_per_second': 0.817, 'eval_steps_per_second': 0.408, 'epoch': 2.0}
{'train_runtime': 1230.8234, 'train_samples_per_second': 0.273, 'train_steps_per_second': 0.136, 'train_loss': 4.656118483770461, 'epoch': 2.0}
Model training completed successfully.


In [15]:
# Function to interact with the fine-tuned model
def chat_with_model():
    while True:
        user_input = input("You: ")
        if user_input.lower() in ['exit', 'quit']:
            break
        inputs = tokenizer.encode(user_input, return_tensors="pt")
        
        # Generate output with attention mask
        outputs = model.generate(
            inputs, 
            max_length=150, 
            pad_token_id=tokenizer.pad_token_id, 
            do_sample=True,
            temperature=0.7,  # temperature for randomness
            top_k=50,  # Limit to the top-k most likely tokens
            top_p=0.9,
            repetition_penalty=1.2
            )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Model: {response}")

# Start the chat
chat_with_model()

Model: what's the price of Fossil watch?
Fossils Watch: $ The cost and specifications for this item are as follows. Color, Material (Model): Stainless Steel or Black Leather Band with Automatic Dial - N/A Retail Price : USD$39 Quantity(s) /w Display Details... Item has an approximate retail value where indicated.. DO NOT print This information is currently accurate . Information on condition Does Not Contain Mechanical Reproduction Control -- Machine wash only., Simple to clean Instructions ...~ Mint Condition Rating indicates a model that was last updated on by its manufacturer in accordance Men & Womens Chronograph Watches Series 2000 through
