In [1]:
!pip install -q transformers

In [2]:
!pip install peft


Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.13.2


In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from peft import LoraConfig, get_peft_model
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm


In [4]:
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
tokenizer.add_special_tokens({'sep_token': '[SEP]'})
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]



In [5]:
model = GPT2LMHeadModel.from_pretrained('distilgpt2')
model.resize_token_embeddings(len(tokenizer))

# Configure LoRA
lora_config = LoraConfig(
    r=4,                      
    lora_alpha=32,            
    lora_dropout=0.1,         
    target_modules=["c_attn"] 
)

model = get_peft_model(model, lora_config)


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



# Dataset tokenization

In [6]:
import pandas as pd

df = pd.read_csv('/kaggle/input/reddit-conversations/casual_data_windows.csv')

print(df.head())

   Unnamed: 0                                                  0  \
0           0            What kind of phone(s) do you guys have?   
1           1  I have a pixel. It's pretty great. Much better...   
2           2       Does it really charge all the way in 15 min?   
3           3            What kind of phone(s) do you guys have?   
4           4  Samsung Galaxy J1. It's my first cell phone an...   

                                                   1  \
0  I have a pixel. It's pretty great. Much better...   
1       Does it really charge all the way in 15 min?   
2  Pretty fast. I've never timed it, but it's und...   
3  Samsung Galaxy J1. It's my first cell phone an...   
4  What do you think of it? Anything you don't like?   

                                                   2  
0       Does it really charge all the way in 15 min?  
1  Pretty fast. I've never timed it, but it's und...  
2  cool. I've been thinking of getting one, my ph...  
3  What do you think of it? Anythi

In [8]:
def format_conversation(row):
    if pd.notna(row['2']):
        return f"{row['0']} [SEP] {row['1']} [SEP] {row['2']}"
    else:
        return f"{row['0']} [SEP] {row['1']}"

df['formatted_text'] = df.apply(format_conversation, axis=1)

print(df['formatted_text'].head())


0    What kind of phone(s) do you guys have? [SEP] ...
1    I have a pixel. It's pretty great. Much better...
2    Does it really charge all the way in 15 min? [...
3    What kind of phone(s) do you guys have? [SEP] ...
4    Samsung Galaxy J1. It's my first cell phone an...
Name: formatted_text, dtype: object


In [9]:
def tokenize_function(text):
    return tokenizer(text, return_special_tokens_mask=True, truncation=True, padding='max_length', max_length=512)

# Tokenize the dataset
tokenized_datasets = df['formatted_text'].apply(tokenize_function)


In [10]:
class ConversationDataset(Dataset):
    def __init__(self, tokenized_texts):
        self.input_ids = [torch.tensor(t['input_ids']) for t in tokenized_texts]
        self.attention_masks = [torch.tensor(t['attention_mask']) for t in tokenized_texts]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.input_ids[idx],  
        }

In [11]:
train_dataset = ConversationDataset(tokenized_datasets)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

PeftModel(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50258, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-5): 6 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lor

In [12]:
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 5
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



# Training

In [13]:
model.train()
for epoch in range(num_epochs):
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        inputs = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        scheduler.step()

100%|██████████| 14075/14075 [1:06:27<00:00,  3.53it/s]
100%|██████████| 14075/14075 [1:06:34<00:00,  3.52it/s]
100%|██████████| 14075/14075 [1:06:33<00:00,  3.52it/s]
100%|██████████| 14075/14075 [1:06:33<00:00,  3.52it/s]
100%|██████████| 14075/14075 [1:06:34<00:00,  3.52it/s]


In [25]:
model.save_pretrained("lora_fine_tuned_gpt2")


# Evaluation

In [17]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from peft import LoraConfig, get_peft_model

# Load the tokenizer
#tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
#tokenizer.add_special_tokens({'sep_token': '[SEP]'})
#tokenizer.pad_token = tokenizer.eos_token

# Load the fine-tuned model
#model = GPT2LMHeadModel.from_pretrained("lora_fine_tuned_gpt2")
#model.resize_token_embeddings(len(tokenizer))
model.eval()  # Set model to evaluation mode for inference


PeftModel(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50258, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-5): 6 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lor

# Generation

In [24]:
import torch

def generate_response(prompt, model, tokenizer, max_length=50):
    # Encode the prompt
    inputs = tokenizer(prompt, return_tensors="pt")
    
    # Move inputs to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = {key: val.to(device) for key, val in inputs.items()}
    model.to(device)

    # Generate response
    output = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,  # Prevent repetition of phrases
        top_k=50,                # Use top-k sampling to reduce randomness
        top_p=0.9,               # Use nucleus sampling for diversity
        temperature=0.7          # Lower values make the output less random
    )

    # Decode the generated tokens back to text
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Test with a sample prompt
prompt = "you are not okay at all, you dumb tincan."
response = generate_response(prompt, model, tokenizer)
print("Model response:", response)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Model response: you are not okay at all, you dumb tincan.  I'm not a fan of the game. I just want to play it.
