# Imports

In [20]:
import torch
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_scheduler, pipeline
import numpy as np
import os
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Dataset

In [None]:
import pandas as pd

df = pd.read_csv('../dataset/casual_data_windows.csv')

print(df.head())

   Unnamed: 0                                                  0  \
0           0            What kind of phone(s) do you guys have?   
1           1  I have a pixel. It's pretty great. Much better...   
2           2       Does it really charge all the way in 15 min?   
3           3            What kind of phone(s) do you guys have?   
4           4  Samsung Galaxy J1. It's my first cell phone an...   

                                                   1  \
0  I have a pixel. It's pretty great. Much better...   
1       Does it really charge all the way in 15 min?   
2  Pretty fast. I've never timed it, but it's und...   
3  Samsung Galaxy J1. It's my first cell phone an...   
4  What do you think of it? Anything you don't like?   

                                                   2  
0       Does it really charge all the way in 15 min?  
1  Pretty fast. I've never timed it, but it's und...  
2  cool. I've been thinking of getting one, my ph...  
3  What do you think of it? Anythi

In [None]:
import pandas as pd

desired_labels = {'neutral', 'disapproval', 'caring', 'annoyance', 'anger', 'excitement', 'joy'}
classifier = pipeline(task="text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None, device="cuda")

def format_conversation(row, classifier):
    # Format the conversation string
    if pd.notna(row['2']):
        formatted = f"{row['0']} [SEP] {row['1']} [SEP] {row['2']}"
    else:
        formatted = f"{row['0']} [SEP] {row['1']}"

    model_outputs = classifier(formatted)
    model_outputs = model_outputs[0]

    # Filter outputs for desired emotion labels
    filtered_outputs = [output for output in model_outputs if output['label'] in desired_labels]
    
    # Extract labels with scores > 0.2
    relevant_emotions = [output['label'].upper() for output in filtered_outputs if output['score'] > 0.2]
    
    # Default to NEUTRAL if no emotion scores are above threshold
    if not relevant_emotions:
        relevant_emotions = ["NEUTRAL"]
    
    # Combine emotion tokens with the formatted conversation
    emotion_tokens = " ".join([f"[{emotion}]" for emotion in relevant_emotions])
    return f"{emotion_tokens} {formatted}"

df['formatted_text'] = df.apply(lambda row: format_conversation(row, classifier), axis=1)

# Preview the dataset
print(df['formatted_text'].head())


config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/380 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


0    [NEUTRAL] What kind of phone(s) do you guys ha...
1    [NEUTRAL] I have a pixel. It's pretty great. M...
2    [NEUTRAL] Does it really charge all the way in...
3    [NEUTRAL] What kind of phone(s) do you guys ha...
4    [NEUTRAL] Samsung Galaxy J1. It's my first cel...
Name: formatted_text, dtype: object


In [22]:
class ConversationDataset(Dataset):
    def __init__(self, conversations, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.conversations = conversations
        self.max_length = max_length

    def __len__(self):
        return len(self.conversations)

    def __getitem__(self, idx):
        text = self.conversations[idx]
        encoded = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': encoded['input_ids'].squeeze(0)
        }


In [25]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]', 'sep_token': '[SEP]'})

special_tokens = ["[NEUTRAL]", "[DISAPPROVAL]", "[CARING]", "[ANNOYANCE]", "[ANGER]", "[EXCITEMENT]", "[JOY]"]
tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})

7

In [26]:
train_dataset = ConversationDataset(dataset, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Model

In [None]:
# Load GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer)) 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

# Training

In [28]:
# Training hyperparameters
epochs = 3
learning_rate = 5e-5
batch_size = 2
warmup_steps = 100
max_seq_len = 128

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=warmup_steps, num_training_steps=len(train_dataloader) * epochs
)

# Loss logging
loss_fn = torch.nn.CrossEntropyLoss()



In [None]:
# Training loop
model.train()
for epoch in range(epochs):
    epoch_loss = 0
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()

        epoch_loss += loss.item()
    
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss / len(train_dataloader)}")

# Save the fine-tuned model
model.save_pretrained("conversation_gpt2_with_emotions")
tokenizer.save_pretrained("conversation_gpt2_with_emotions")


100%|██████████| 14075/14075 [23:41<00:00,  9.90it/s]


Epoch 1, Loss: 1.131512241463043


100%|██████████| 14075/14075 [23:33<00:00,  9.96it/s]


Epoch 2, Loss: 0.8778699633622042


100%|██████████| 14075/14075 [23:29<00:00,  9.99it/s]


Epoch 3, Loss: 0.7936823577334572


('conversation_gpt2_with_emotions/tokenizer_config.json',
 'conversation_gpt2_with_emotions/special_tokens_map.json',
 'conversation_gpt2_with_emotions/vocab.json',
 'conversation_gpt2_with_emotions/merges.txt',
 'conversation_gpt2_with_emotions/added_tokens.json')

# Generation

In [None]:
def generate_response(prompt, model, tokenizer, max_length=50, top_k=50):
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            max_length=max_length,
            do_sample=True,
            top_k=top_k,
            eos_token_id=tokenizer.eos_token_id,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.pad_token_id,
            num_return_sequences=1
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

prompt = "girl this is crazy [SEP]"
response = generate_response(prompt, model, tokenizer)
print(response)


girl this is crazy  You guys have amazing friends who love you  😀😊☺🙌👍👊  😂😂 😆
🬬 LOL How do


In [None]:
def generate_response_with_emotions(prompt, emotion, model, tokenizer, max_length=30, top_k=50):
    model.eval()
    prompt_with_emotion = f"[{emotion.upper()}] {prompt}"
    input_ids = tokenizer.encode(prompt_with_emotion, return_tensors='pt').to(device)
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            max_length=max_length,
            do_sample=True,
            top_k=top_k,
            eos_token_id=tokenizer.eos_token_id,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.pad_token_id,
            num_return_sequences=1
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

prompt = "surely this will go so well [SEP] "
emotions = {"neutral", "disapproval", "caring", "annoyance", "anger", "excitement", "joy"}
for emotion in emotions:
    response = generate_response_with_emotions(prompt, emotion, model, tokenizer)
    print(emotion + ": " + response + "\n")


anger:  surely this will go so well  ~~you're right~~ there's nothing wrong with that.   Yup?

excitement:  surely this will go so well   I hope so It is :w The good kind I'm

joy:  surely this will go so well  ~~it'll go well~~  I still think so, and I'm glad I didn't

neutral:  surely this will go so well   it will indeed  It’s not gonna happen, it just won’t

disapproval:  surely this will go so well   This just made me get a little upset that my friend didn't reply so much as I needed

caring:  surely this will go so well   I think he might be joking.  It will. Just remember that in the end the world

annoyance:  surely this will go so well   I doubt it but it's ok :)  Ah! How long have you been treating yourself?



In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Path to the saved model and tokenizer
model_path = "../model/conversation-gpt2-with-emotions"

tokenizer = GPT2Tokenizer.from_pretrained(model_path)

model = GPT2LMHeadModel.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

print("Model and tokenizer loaded successfully!")


Model and tokenizer loaded successfully!
