In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from torch.utils.data import Dataset

In [2]:
# Load the raw table from a CSV file; the relative path is resolved against the
# kernel's current working directory.
data = pd.read_csv('Finance_data.csv')

In [3]:
# Discard, in place, every row that is missing either the savings-objective
# answer or the chosen avenue; both columns are turned into text fields below.
data.dropna(
    axis='index',
    how='any',
    subset=['What are your savings objectives?', 'Avenue'],
    inplace=True,
)

In [4]:
# Derive the normalised text columns: lower-cased, with surrounding whitespace
# removed. 'input_text' comes from the savings-objective answer and
# 'output_text' from the chosen avenue.
data = data.assign(
    input_text=lambda frame: frame['What are your savings objectives?'].str.lower().str.strip(),
    output_text=lambda frame: frame['Avenue'].str.lower().str.strip(),
)

In [5]:
# Keep only rows where both text fields are non-empty after stripping, then
# collapse repeated (input_text, output_text) pairs to their first occurrence.
data = data[
    (data['input_text'].str.strip() != '') & (data['output_text'].str.strip() != '')
].drop_duplicates(subset=['input_text', 'output_text'])

In [6]:
# Fetch the tokenizer that belongs to the 'tiiuae/falcon-7b' checkpoint; the
# same checkpoint name is used when the model itself is loaded further down.
tokenizer = AutoTokenizer.from_pretrained('tiiuae/falcon-7b')

In [7]:
# Use the end-of-sequence token as the padding token so that the
# padding='max_length' calls in tokenize_data have a pad token to insert.
# NOTE(review): presumably this tokenizer ships without a pad token - confirm.
# Pad positions now share the EOS id, so they can only be told apart from a
# genuine EOS through the attention mask.
tokenizer.pad_token = tokenizer.eos_token

In [8]:
def tokenize_data(data, max_length=128):
    """Tokenize the 'input_text' and 'output_text' columns of ``data``.

    Both columns are encoded with the module-level ``tokenizer`` using the
    same settings: every sequence is truncated and padded to exactly
    ``max_length`` tokens and returned as PyTorch tensors.

    Args:
        data: Mapping-like object (e.g. a DataFrame) whose 'input_text' and
            'output_text' entries are iterables of strings.
        max_length: Fixed token length of every encoded sequence. Defaults to
            128, the value that was previously hard-coded.

    Returns:
        A ``(input_encodings, output_encodings)`` tuple, in that order.

    NOTE(review): inputs and targets are tokenized independently of each
    other. For causal-LM fine-tuning the prompt and answer are commonly
    encoded as one sequence - confirm that this pairing is intended.
    """

    def _encode(column):
        # One place for the tokenizer settings so inputs and targets cannot
        # drift apart.
        return tokenizer(
            list(data[column]),
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            max_length=max_length,
        )

    return _encode('input_text'), _encode('output_text')

In [9]:
# Tokenize the cleaned frame; the first result holds the encoded 'input_text'
# column and the second the encoded 'output_text' column.
train_encodings, train_labels = tokenize_data(data)

In [10]:
class FinancialAdviceDataset(Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized targets.

    Args:
        encodings: Mapping of field name to a per-example indexable (as built
            by ``tokenize_data`` for the input text). Must contain 'input_ids'.
        labels: Mapping holding the tokenized targets under 'input_ids' and,
            optionally, their 'attention_mask'.

    Each item is a dict with one entry per field in ``encodings`` plus a
    'labels' entry taken from ``labels['input_ids']``.
    """

    # Label value used for positions that must not contribute to the loss
    # (the default ignore index of PyTorch's cross-entropy loss).
    IGNORE_INDEX = -100

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the model inputs and the padding-masked labels for ``idx``."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        label_ids = self.labels['input_ids'][idx]
        # Targets are padded to a fixed length with the EOS id, so padding
        # cannot be recognised by token id. Use the attention mask instead and
        # blank out padded positions so the loss skips them. masked_fill
        # returns a new tensor, leaving the stored labels untouched.
        # Non-tensor labels, or labels without a mask, pass through unchanged.
        if 'attention_mask' in self.labels and hasattr(label_ids, 'masked_fill'):
            is_padding = self.labels['attention_mask'][idx] == 0
            label_ids = label_ids.masked_fill(is_padding, self.IGNORE_INDEX)
        item['labels'] = label_ids
        return item

    def __len__(self):
        """Return the number of examples."""
        # Key access works for plain dicts as well as tokenizer outputs,
        # matching the mapping-style access used in __getitem__.
        return len(self.encodings['input_ids'])

In [11]:
# Wrap the tokenized inputs and targets in the Dataset class defined above.
train_dataset = FinancialAdviceDataset(train_encodings, train_labels)

In [None]:
# Load the causal-LM weights for the same checkpoint name the tokenizer was
# loaded from.
# NOTE(review): no dtype or device arguments are passed, so the library
# defaults apply; the checkpoint name suggests roughly 7B parameters - confirm
# the machine has enough memory before running this cell.
model = AutoModelForCausalLM.from_pretrained('tiiuae/falcon-7b')