In [None]:
import torch
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM
from datasets import load_dataset
from peft import get_peft_model, AutoPeftModelForCausalLM, LoraConfig
from sklearn.model_selection import train_test_split

def tokenize_batch(tokenizer, batch, text_key='verse_text'):
    return tokenizer(batch[text_key], return_tensors="pt", padding=True, truncation=True)

# Load an appropriate dataset
dataset_name = "poem_sentiment"  # Replaced with the actual dataset name
dataset = load_dataset(dataset_name)

# Assuming that the dataset has a 'train' key, update this if needed
train_dataset = dataset['train']

# Check the structure of your dataset and find the correct key for the text
print("Dataset Structure:", train_dataset.features)

# Use the correct key for the text in your dataset
text_key = 'verse_text'  # Replace with the actual key for the text
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Set padding token to EOS token
tokenizer.pad_token = tokenizer.eos_token
train_inputs = tokenize_batch(tokenizer, train_dataset, text_key)

# Split the dataset into train and test sets
train_dataset, test_dataset = train_test_split(train_dataset, test_size=0.1, random_state=42)

# Load the Lora model
pretrained_model_name = "facebook/opt-350m"  # You can choose another model suitable for your task
lora_config = LoraConfig(
    r=16,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
    lora_alpha=32,
    lora_dropout=0.05,
    base_model_name_or_path=pretrained_model_name
)

# Load the pre-trained model
pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name)

# Use get_peft_model to apply LoRA
lora_model = get_peft_model(pretrained_model, lora_config)

# Print trainable parameters
print("Trainable Parameters Before Fine-Tuning:")
lora_model.print_trainable_parameters()

# Define training arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    per_device_train_batch_size=8,
    num_train_epochs=1,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    save_total_limit=1,
    save_steps=500,
)

# Tokenize and preprocess the datasets for training
train_inputs = tokenize_batch(tokenizer, train_dataset, text_key)
test_inputs = tokenize_batch(tokenizer, test_dataset, text_key)

# Fine-tune the LoRA model
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_inputs,
    eval_dataset=test_inputs,
)
trainer.train()

# Save the fine-tuned LoRA model
lora_model.save_pretrained("./fine_tuned_model")

# Load the fine-tuned LoRA model for inference
fine_tuned_model = AutoPeftModelForCausalLM.from_pretrained("./fine_tuned_model")

# Print trainable parameters after fine-tuning
print("\nTrainable Parameters After Fine-Tuning:")
fine_tuned_model.print_trainable_parameters()

# Tokenize and evaluate with the fine-tuned model
input_text = test_dataset[0][text_key]  # Use any text example from your test dataset
inputs = tokenizer(input_text, return_tensors="pt")
outputs_fine_tuned = fine_tuned_model(**inputs)

# Perform any necessary evaluation based on your task
print("Fine-Tuned Model Output:", outputs_fine_tuned)


In [None]:
import torch
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM
from datasets import load_dataset
from peft import get_peft_model, AutoPeftModelForCausalLM, LoraConfig
from sklearn.model_selection import train_test_split

def tokenize_batch(tokenizer, batch, id_key='id', text_key='verse_text', label_key='label'):
    inputs = tokenizer(batch[text_key], return_tensors="pt", padding=True, truncation=True)
    inputs['labels'] = batch[label_key]
    return inputs

# Load an appropriate dataset
dataset_name = "poem_sentiment"  # Replaced with the actual dataset name
dataset = load_dataset(dataset_name)

# Assuming that the dataset has a 'train' key, update this if needed
train_dataset = dataset['train']

# Check the structure of your dataset and find the correct key for the text
print("Dataset Structure:", train_dataset.features)

# Use the correct key for the text in your dataset
text_key = 'verse_text'  # Replace with the actual key for the text
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Set padding token to EOS token
tokenizer.pad_token = tokenizer.eos_token
train_inputs = tokenize_batch(tokenizer, train_dataset, text_key)

# Split the dataset into train and test sets
train_dataset, test_dataset = train_test_split(train_dataset, test_size=0.1, random_state=42)

# Load the Lora model
pretrained_model_name = "facebook/opt-350m"  # You can choose another model suitable for your task
lora_config = LoraConfig(
    r=16,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
    lora_alpha=32,
    lora_dropout=0.05,
    base_model_name_or_path=pretrained_model_name
)

# Load the pre-trained model
pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name)

# Use get_peft_model to apply LoRA
lora_model = get_peft_model(pretrained_model, lora_config)

# Print trainable parameters
print("Trainable Parameters Before Fine-Tuning:")
lora_model.print_trainable_parameters()

# Define training arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    per_device_train_batch_size=8,
    num_train_epochs=1,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    save_total_limit=1,
    save_steps=500,
)

# Tokenize and preprocess the datasets for training
train_inputs = tokenize_batch(tokenizer, train_dataset, text_key, 'label')
test_inputs = tokenize_batch(tokenizer, test_dataset, text_key, 'label')

# Fine-tune the LoRA model
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_inputs,
    eval_dataset=test_inputs,
)
trainer.train()

# Save the fine-tuned LoRA model
lora_model.save_pretrained("./fine_tuned_model")

# Load the fine-tuned LoRA model for inference
fine_tuned_model = AutoPeftModelForCausalLM.from_pretrained("./fine_tuned_model")

# Print trainable parameters after fine-tuning
print("\nTrainable Parameters After Fine-Tuning:")
fine_tuned_model.print_trainable_parameters()

# Tokenize and evaluate with the fine-tuned model
input_text = test_dataset[0][text_key]  # Use any text example from your test dataset
inputs = tokenizer(input_text, return_tensors="pt")
outputs_fine_tuned = fine_tuned_model(**inputs)

# Perform any necessary evaluation based on your task
print("Fine-Tuned Model Output:", outputs_fine_tuned)


In [None]:
import torch
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM
from datasets import load_dataset
from peft import get_peft_model, AutoPeftModelForCausalLM, LoraConfig
from sklearn.model_selection import train_test_split

def tokenize_batch(tokenizer, batch, input_ids='idx', text_key='verse_text', label_key='label'):
    inputs = tokenizer(batch[text_key], return_tensors="pt", padding=True, truncation=True)
    inputs['labels'] = batch[label_key]
    return inputs

# Load an appropriate dataset
dataset_name = "poem_sentiment"  # Replaced with the actual dataset name
dataset = load_dataset(dataset_name)

# Add this before creating the Trainer
for batch in train_inputs:
    print(batch)


# Assuming that the dataset has a 'train' key, update this if needed
train_dataset = dataset['train']

# Check the structure of your dataset and find the correct key for the text
print("Dataset Structure:", train_dataset.features)

# Use the correct key for the text in your dataset
text_key = 'verse_text'  # Replace with the actual key for the text
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Set padding token to EOS token
tokenizer.pad_token = tokenizer.eos_token
train_inputs = tokenize_batch(tokenizer, train_dataset, text_key=text_key)  # Specify text_key here

# Split the dataset into train and test sets
train_dataset, test_dataset = train_test_split(train_dataset, test_size=0.1, random_state=42)

# Load the Lora model
pretrained_model_name = "facebook/opt-350m"  # You can choose another model suitable for your task
lora_config = LoraConfig(
    r=16,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
    lora_alpha=32,
    lora_dropout=0.05,
    base_model_name_or_path=pretrained_model_name
)

# Load the pre-trained model
pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name)

# Use get_peft_model to apply LoRA
lora_model = get_peft_model(pretrained_model, lora_config)

# Print trainable parameters
print("Trainable Parameters Before Fine-Tuning:")
lora_model.print_trainable_parameters()

# Define training arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    per_device_train_batch_size=8,
    num_train_epochs=1,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    save_total_limit=1,
    save_steps=500,
)

# Tokenize and preprocess the datasets for training
train_inputs = tokenize_batch(tokenizer, train_dataset, input_ids='idx', text_key=text_key, label_key='label')  # Specify text_key and label_key here
test_inputs = tokenize_batch(tokenizer, test_dataset, input_ids='idx', text_key=text_key, label_key='label')  # Specify text_key and label_key here

# Fine-tune the LoRA model
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_inputs,
    eval_dataset=test_inputs,
)
trainer.train()

# Save the fine-tuned LoRA model
lora_model.save_pretrained("./fine_tuned_model")

# Load the fine-tuned LoRA model for inference
fine_tuned_model = AutoPeftModelForCausalLM.from_pretrained("./fine_tuned_model")

# Print trainable parameters after fine-tuning
print("\nTrainable Parameters After Fine-Tuning:")
fine_tuned_model.print_trainable_parameters()

# Tokenize and evaluate with the fine-tuned model
input_text = test_dataset[0][text_key]  # Use any text example from your test dataset
inputs = tokenizer(input_text, return_tensors="pt")
outputs_fine_tuned = fine_tuned_model(**inputs)

# Perform any necessary evaluation based on your task
print("Fine-Tuned Model Output:", outputs_fine_tuned)


In [None]:
import torch
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM
from datasets import load_dataset
from peft import get_peft_model, AutoPeftModelForCausalLM, LoraConfig
from sklearn.model_selection import train_test_split

def tokenize_batch(tokenizer, batch, text_key='text', label_key='label'):
    inputs = tokenizer(batch[text_key], return_tensors="pt", padding=True, truncation=True)
    inputs['labels'] = batch[label_key]
    return inputs

# Load an appropriate dataset
dataset = load_dataset("ethos", "binary")

# Add this before creating the Trainer
for batch in train_inputs:
    print(batch)


# Assuming that the dataset has a 'train' key, update this if needed
train_dataset = dataset['train']

# Check the structure of your dataset and find the correct key for the text
print("Dataset Structure:", train_dataset.features)

# Use the correct key for the text in your dataset
text_key = 'text'  # Replace with the actual key for the text
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Set padding token to EOS token
tokenizer.pad_token = tokenizer.eos_token
train_inputs = tokenize_batch(tokenizer, train_dataset, text_key=text_key)  # Specify text_key here

# Split the dataset into train and test sets
train_dataset, test_dataset = train_test_split(train_dataset, test_size=0.1, random_state=42)

# Load the Lora model
pretrained_model_name = "facebook/opt-350m"  # You can choose another model suitable for your task
lora_config = LoraConfig(
    r=16,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
    lora_alpha=32,
    lora_dropout=0.05,
    base_model_name_or_path=pretrained_model_name
)

# Load the pre-trained model
pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name)

# Use get_peft_model to apply LoRA
lora_model = get_peft_model(pretrained_model, lora_config)

# Print trainable parameters
print("Trainable Parameters Before Fine-Tuning:")
lora_model.print_trainable_parameters()

# Define training arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    per_device_train_batch_size=8,
    num_train_epochs=1,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    save_total_limit=1,
    save_steps=500,
)

# Tokenize and preprocess the datasets for training
train_inputs = tokenize_batch(tokenizer, train_dataset, text_key=text_key, label_key='label')  # Specify text_key and label_key here
test_inputs = tokenize_batch(tokenizer, test_dataset, text_key=text_key, label_key='label')  # Specify text_key and label_key here

# Fine-tune the LoRA model
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_inputs,
    eval_dataset=test_inputs,
)
trainer.train()

# Save the fine-tuned LoRA model
lora_model.save_pretrained("./fine_tuned_model")

# Load the fine-tuned LoRA model for inference
fine_tuned_model = AutoPeftModelForCausalLM.from_pretrained("./fine_tuned_model")

# Print trainable parameters after fine-tuning
print("\nTrainable Parameters After Fine-Tuning:")
fine_tuned_model.print_trainable_parameters()

# Tokenize and evaluate with the fine-tuned model
input_text = test_dataset[0][text_key]  # Use any text example from your test dataset
inputs = tokenizer(input_text, return_tensors="pt")
outputs_fine_tuned = fine_tuned_model(**inputs)

# Perform any necessary evaluation based on your task
print("Fine-Tuned Model Output:", outputs_fine_tuned)


In [None]:
import torch
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM
from datasets import load_dataset
from peft import get_peft_model, AutoPeftModelForCausalLM, LoraConfig
from sklearn.model_selection import train_test_split

# Define the splits and the percentage of data to be used
splits = ['train', 'test']
sizes_percent = {'train': 0.1, 'test': 0.1}  # Using only a fraction of the data

# Load the poem_sentiment full dataset for specified splits
poem_sentiment = {split: load_dataset('poem_sentiment', split=split, trust_remote_code=True) for split in splits}

# Thin out the dataset to reduce computational resources
for split in splits:
    # Shuffle and select a fraction of the data
    sampled_size = int(poem_sentiment[split].shape[0] * sizes_percent[split])
    poem_sentiment[split] = poem_sentiment[split].shuffle(seed=42).select(range(sampled_size))
    
def tokenize_batch(tokenizer, batch, id_key='id', text_key='verse_text', label_key='label'):
    inputs = tokenizer(batch[text_key], return_tensors="pt", padding=True, truncation=True)
    inputs['labels'] = batch[label_key]
    return inputs

# Assuming that the dataset has a 'train' key, update this if needed
train_dataset = poem_sentiment['train']

# Check the structure of your dataset and find the correct key for the text
print("Dataset Structure:", train_dataset.features)

# Use the correct key for the text in your dataset
text_key = 'text'  # Replace with the actual key for the text
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Set padding token to EOS token
tokenizer.pad_token = tokenizer.eos_token
train_inputs = tokenize_batch(tokenizer, train_dataset, text_key=text_key)  # Specify text_key here

# Split the dataset into train and test sets
train_dataset, test_dataset = train_test_split(train_dataset, test_size=0.1, random_state=42)

# Load the Lora model
pretrained_model_name = "facebook/opt-350m"  # comparing to facebook word model
lora_config = LoraConfig(
    r=16,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
    lora_alpha=32,
    lora_dropout=0.05,
    base_model_name_or_path=pretrained_model_name
)

# Load the pre-trained model
pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name)

# Use get_peft_model to apply LoRA
lora_model = get_peft_model(pretrained_model, lora_config)

# Print trainable parameters
print("Trainable Parameters Before Fine-Tuning:")
lora_model.print_trainable_parameters()

# Define training arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    per_device_train_batch_size=8,
    num_train_epochs=1,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    save_total_limit=1,
    save_steps=500,
)

# Tokenize and preprocess the datasets for training
train_inputs = tokenize_batch(tokenizer, train_dataset, text_key=text_key, label_key='label')  # Specify text_key and label_key here
test_inputs = tokenize_batch(tokenizer, test_dataset, text_key=text_key, label_key='label')  # Specify text_key and label_key here

# Fine-tune the LoRA model
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_inputs,
    eval_dataset=test_inputs,
)
trainer.train()

# Save the fine-tuned LoRA model
lora_model.save_pretrained("./fine_tuned_model")

# Load the trained Lora model for further evaluation
lora_reloaded = AutoPeftModelForCausalLM.from_pretrained("./fine_tuned_model")

# Print trainable parameters after fine-tuning
print("\nTrainable Parameters After Fine-Tuning:")
lora_reloaded.print_trainable_parameters()

# Tokenize and evaluate with the fine-tuned model
input_text = test_dataset[0][text_key]  # Use any text example from your test dataset
inputs = tokenizer(input_text, return_tensors="pt")
outputs_fine_tuned = lora_reloaded(**inputs)

# Perform any necessary evaluation based on your task
print("Fine-Tuned Model Output:", outputs_fine_tuned)


In [5]:

import torch
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM
from datasets import load_dataset
from peft import get_peft_model, AutoPeftModelForCausalLM, LoraConfig
from sklearn.model_selection import train_test_split

def tokenize_batch(tokenizer, batch, text_key='verse_text', label_key='label'):
    inputs = tokenizer(batch[text_key], return_tensors="pt", padding=True, truncation=True)
    inputs['labels'] = batch[label_key]
    return inputs

# Load an appropriate dataset
dataset_name = "poem_sentiment"  # Replaced with the actual dataset name
dataset = load_dataset(dataset_name)

# Assuming that the dataset has a 'train' key, update this if needed
train_dataset = dataset['train']

# Check the structure of your dataset and find the correct key for the text
print("Dataset Structure:", train_dataset.features)

# Use the correct key for the text in your dataset
text_key = 'verse_text'  # Replace with the actual key for the text
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Set padding token to EOS token
tokenizer.pad_token = tokenizer.eos_token
train_inputs = tokenize_batch(tokenizer, train_dataset, text_key=text_key)

# Split the dataset into train and test sets
train_dataset, test_dataset = train_test_split(train_dataset, test_size=0.1, random_state=42)

# Tokenize and preprocess the datasets for training
train_inputs = tokenize_batch(tokenizer, train_dataset, text_key=text_key, label_key='label')  # Specify text_key and label_key here
test_inputs = tokenize_batch(tokenizer, test_dataset, text_key=text_key, label_key='label')  # Specify text_key and label_key here

# Load the Lora model
pretrained_model_name = "facebook/opt-350m"  # You can choose another model suitable for your task
lora_config = LoraConfig(
    r=16,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
    lora_alpha=32,
    lora_dropout=0.05,
    base_model_name_or_path=pretrained_model_name
)

# Load the pre-trained model
pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name)

# Use get_peft_model to apply LoRA
lora_model = get_peft_model(pretrained_model, lora_config)

# Print trainable parameters
print("Trainable Parameters Before Fine-Tuning:")
lora_model.print_trainable_parameters()

# Define training arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    per_device_train_batch_size=8,
    num_train_epochs=1,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    save_total_limit=1,
    save_steps=500,
)

# Fine-tune the LoRA model
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_inputs,
    eval_dataset=test_inputs,
)
trainer.train()

# Save the fine-tuned LoRA model
lora_model.save_pretrained("./fine_tuned_model")

# Load the fine-tuned LoRA model for inference
fine_tuned_model = AutoPeftModelForCausalLM.from_pretrained("./fine_tuned_model")

# Print trainable parameters after fine-tuning
print("\nTrainable Parameters After Fine-Tuning:")
fine_tuned_model.print_trainable_parameters()

# Tokenize and evaluate with the fine-tuned model
input_text = test_dataset[0][text_key]  # Use any text example from your test dataset
inputs = tokenizer(input_text, return_tensors="pt")
outputs_fine_tuned = fine_tuned_model(**inputs)

# Perform any necessary evaluation based on your task
print("Fine-Tuned Model Output:", outputs_fine_tuned)


Dataset Structure: {'id': Value(dtype='int32', id=None), 'verse_text': Value(dtype='string', id=None), 'label': ClassLabel(names=['negative', 'positive', 'no_impact', 'mixed'], id=None)}
Trainable Parameters Before Fine-Tuning:
trainable params: 1,572,864 || all params: 332,769,280 || trainable%: 0.472659014678278


ValueError: The batch received was empty, your model won't be able to train on it. Double-check that your training dataset contains keys expected by the model: input_ids,attention_mask,head_mask,past_key_values,inputs_embeds,labels,use_cache,output_attentions,output_hidden_states,return_dict,label,labels,label_ids.