# Fine-Tuning TinyLlama_v1.1_math_code on Quantitative Finance StackExchange Dataset

## Library Imports

In [None]:
from dotenv import load_dotenv
import os
import pandas as pd
import numpy as np
from datasets import load_dataset, DatasetDict
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
import mlflow
import os
from huggingface_hub import HfApi, Repository, create_repo

## Settings

In [None]:
# override default Huggingface Cache Location (C: drive)
# NOTE(review): this assignment runs after the import cell has already imported
# transformers / huggingface_hub; if those libraries resolve HF_HUB_CACHE at import
# time the override will not take effect -- confirm, or set it before the imports.
os.environ['HF_HUB_CACHE'] = 'models/.HF_HUB_CACHE'
# Load variables from a .env file into the process environment
# (HF_TOKEN is read later through os.getenv when building the training arguments).
load_dotenv()

In [3]:
# Seed passed to both train_test_split calls so the splits are reproducible.
RANDOM_STATE = 7
# train_size of the first split (train+validation pool vs. held-out test set).
TRAIN_PROP_1 = 0.9
# train_size of the second split (training vs. validation, taken from the pool).
TRAIN_PROP_2 = 0.9
# Passed to TrainingArguments as num_train_epochs.
NUM_EPOCH = 5
# Per-device batch size used for both training and evaluation.
BATCH_SIZE = 32
# GRAD_ACCUMULATE = 1
# max_length handed to the tokenizer in tokenize_function.
MAX_TOKEN_LENGTH = 512
# Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Clean and Format Data

In [4]:
# Load the Quant StackExchange posts dataset by its Hugging Face Hub identifier.
quant_SE = load_dataset('theblackcat102/quant-stackexchange-posts')

In [None]:
# Switch the loaded dataset to pandas output so that slicing returns DataFrames.
quant_SE.set_format(type='pandas')
# Materialise the whole 'train' split as a single DataFrame.
quant_SE_df = quant_SE['train'][:]
# Preview the first rows.
quant_SE_df.head()

In [6]:
# Persist an untouched copy of the raw posts for later reference.
# Create the target directory first: to_csv raises OSError when it is missing.
os.makedirs('data/raw', exist_ok=True)
quant_SE_df.to_csv('data/raw/theblackcat102-quant-stackexchange-posts.csv', index = False)

In [None]:
# Left side of the upcoming self-join: the key pointing at a post's accepted
# answer (AcceptedAnswerId) plus ParentId, Title, Body and Score.
quant_SE_left = quant_SE_df.loc[:,["AcceptedAnswerId", "ParentId","Title", "Body", "Score"]]
# NOTE(review): this Body is labelled "Answer", yet these are the rows that carry
# AcceptedAnswerId/Title -- presumably question posts -- so the label looks swapped; confirm.
quant_SE_left = quant_SE_left.rename(columns={"Body": "Answer"})
# Display the result.
quant_SE_left

In [None]:
# Right side of the self-join: every post's Id together with its Body.
quant_SE_right = quant_SE_df.loc[:,["Id", "Body"]]
# Display the result.
quant_SE_right

In [None]:
# NOTE(review): the merge that follows matches this frame's Id against AcceptedAnswerId,
# so this Body is presumably the accepted answer's text even though it is labelled
# "Question"; confirm.
quant_SE_right = quant_SE_right.rename(columns={"Body": "Question"})
# Display the result.
quant_SE_right

In [10]:
# Inner self-join: keep only the rows whose AcceptedAnswerId matches some post's Id,
# attaching that matched post's Body (column "Question") to the row.
quant_SE_clean = quant_SE_left.merge(quant_SE_right, left_on="AcceptedAnswerId", right_on="Id", how="inner")

In [None]:
# Display the merged frame for inspection.
quant_SE_clean

In [None]:
# Inspect the column dtypes of the merged frame.
quant_SE_clean.dtypes

In [None]:
# Convert Score to a numeric column, downcast to the smallest integer dtype that
# fits, then re-check the dtypes.
quant_SE_clean["Score"] = pd.to_numeric(quant_SE_clean["Score"], downcast='integer')
quant_SE_clean.dtypes

In [None]:
# Keep only the pairs whose Score is non-negative.
# .copy() detaches the result from the unfiltered frame, so the column assignment
# performed on it afterwards does not trigger pandas' SettingWithCopyWarning.
quant_SE_clean = quant_SE_clean.loc[quant_SE_clean["Score"] >= 0, :].copy()
# Display the filtered frame.
quant_SE_clean

In [None]:
# Build one training string per Q&A pair in reading order:
#   title, then question body, then accepted-answer body.
# The column labels are swapped relative to their content: "Answer" was renamed from
# the Body of the left-hand rows (the ones carrying Title/AcceptedAnswerId, i.e. the
# question posts), while "Question" is the Body of the post whose Id equals
# AcceptedAnswerId (the accepted answer). Concatenating Title + "Question" + "Answer"
# therefore put the answer before its question; the order below corrects that.
# assign() returns a new frame, which also avoids writing into a filtered slice.
quant_SE_clean = quant_SE_clean.assign(
    text=quant_SE_clean["Title"]
    .str.cat(quant_SE_clean["Answer"], sep=' ', na_rep='')    # question body
    .str.cat(quant_SE_clean["Question"], sep=' ', na_rep='')  # accepted-answer body
)
# Display the frame with the new "text" column.
quant_SE_clean

In [None]:
# Discard every column except the combined text; selecting one label yields a Series.
quant_SE_clean = quant_SE_clean["text"]
quant_SE_clean.head()

In [None]:
# Check whether quant_SE_clean is still a DataFrame after the single-column selection.
isinstance(quant_SE_clean, pd.DataFrame)

In [None]:
# Wrap the text back into a DataFrame so the later steps operate on a frame.
quant_SE_clean = pd.DataFrame(quant_SE_clean)
# Preview the first rows.
quant_SE_clean.head()

In [None]:
# Remove rows that contain missing values.
quant_SE_clean = quant_SE_clean.dropna()
# Preview the first rows.
quant_SE_clean.head()

In [None]:
# Report (rows, columns) of the cleaned data.
quant_SE_clean.shape

## Training-Testing Split

In [21]:
# First split: hold out the test set. TRAIN_PROP_1 of the rows stay in the
# train(+validation) pool; the split is seeded with RANDOM_STATE.
quant_SE_clean_train, quant_SE_clean_test = train_test_split(
    quant_SE_clean, train_size=TRAIN_PROP_1, random_state=RANDOM_STATE
)

In [22]:
# Second split: carve the validation set out of the pool. TRAIN_PROP_2 of the pooled
# rows become the final training set; the split is seeded with RANDOM_STATE.
quant_SE_clean_training, quant_SE_clean_validation = train_test_split(
    quant_SE_clean_train, train_size=TRAIN_PROP_2, random_state=RANDOM_STATE
)

In [None]:
# Preview the test split.
quant_SE_clean_test.head()

In [None]:
# Report (rows, columns) of the test split.
quant_SE_clean_test.shape

In [None]:
# Preview the training split.
quant_SE_clean_training.head()

In [None]:
# Report (rows, columns) of the training split.
quant_SE_clean_training.shape

In [None]:
# Preview the validation split.
quant_SE_clean_validation.head()

In [None]:
# Report (rows, columns) of the validation split.
quant_SE_clean_validation.shape

In [29]:
# Write the three splits to disk so they can be reloaded as CSV datasets.
# Create the output directory first: to_csv raises OSError when it is missing.
os.makedirs('data/prepped', exist_ok=True)
quant_SE_clean_training.to_csv('data/prepped/quant_SE_clean_training.csv', index=False)
quant_SE_clean_validation.to_csv('data/prepped/quant_SE_clean_validation.csv', index=False)
quant_SE_clean_test.to_csv('data/prepped/quant_SE_clean_test.csv', index=False)

## Model Training

In [30]:
# Hugging Face Hub identifier of the base checkpoint to fine-tune.
model_name = "TinyLlama/TinyLlama_v1.1_math_code"

In [31]:
# Load the pretrained tokenizer and causal-LM weights, then move the model to `device`.
# HF_HUB_CACHE is assigned only after transformers/huggingface_hub were imported, and
# huggingface_hub resolves its cache path at import time, so relying on the environment
# variable alone leaves the default cache location in use. Passing cache_dir explicitly
# makes the override effective; None falls back to the library default.
hf_cache_dir = os.environ.get('HF_HUB_CACHE')
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=hf_cache_dir)
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=hf_cache_dir).to(device)

In [None]:
# Display the tokenizer's configuration.
tokenizer

In [None]:
# Display the model architecture.
model

In [34]:
# Reuse the end-of-sequence token as the padding token; tokenize_function pads every
# row to max_length, which needs a pad token (presumably this tokenizer ships without
# one -- confirm).
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch into fixed-length rows.

    Args:
        examples: Batch mapping with a ``'text'`` entry, as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer's output for the batch. Every row is truncated/padded to
        ``MAX_TOKEN_LENGTH`` tokens, overflowing tokens are requested as
        additional rows, and PyTorch tensors are requested as the return type.
    """
    tokenizer_options = dict(
        max_length=MAX_TOKEN_LENGTH,
        truncation=True,
        padding="max_length",
        return_overflowing_tokens=True,
        return_tensors="pt",
    )
    return tokenizer(examples['text'], **tokenizer_options)

In [None]:
# Reload each prepared CSV with the 'csv' loader; each call is given a single file
# and its rows are read back from the 'train' key when the DatasetDict is assembled.
quant_SE_training = load_dataset('csv', data_files="data/prepped/quant_SE_clean_training.csv")
quant_SE_validation = load_dataset('csv', data_files="data/prepped/quant_SE_clean_validation.csv")
quant_SE_test = load_dataset('csv', data_files="data/prepped/quant_SE_clean_test.csv")

In [36]:
# Combine the three separately loaded files into one DatasetDict with the
# conventional split names; each source object exposes its rows under 'train'.
quant_SE_dataset = DatasetDict({
    'train': quant_SE_training['train'],
    'validation': quant_SE_validation['train'],
    'test': quant_SE_test['train']
})

In [None]:
# Display the split names and sizes.
quant_SE_dataset

In [None]:
# Tokenize every split in batches and drop the raw "text" column.
# tokenize_function requests overflowing tokens, so a batch may presumably yield more
# rows than it received; the original column could then not be kept alongside them.
quant_SE_tokenized = quant_SE_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

In [None]:
# Display the tokenized splits and their columns.
quant_SE_tokenized

In [40]:
# Training configuration for the Hugging Face Trainer.
# Best-checkpoint selection is made explicit: with load_best_model_at_end=True and no
# metric_for_best_model the Trainer compares evaluation loss, so greater_is_better must
# be False; leaving it True would reload the checkpoint with the *highest* loss.
# Mixed precision is only enabled when CUDA is available, because fp16 training is
# rejected on CPU and the notebook otherwise supports a CPU fallback.
training_args = TrainingArguments(
    output_dir="models",                      # Where to save the model
    num_train_epochs=NUM_EPOCH,               # Total number of epochs
    learning_rate=2e-5,                       # Fine-tuning learning rate
    per_device_train_batch_size=BATCH_SIZE,   # Batch size per device for training
    per_device_eval_batch_size=BATCH_SIZE,    # Batch size per device for evaluation
    weight_decay=0.01,                        # Regularization to prevent overfitting
    eval_strategy="steps",                    # Evaluate every `eval_steps`
    eval_steps=500,                           # Evaluate every 500 steps
    save_strategy="steps",                    # Save model every `save_steps`
    save_steps=500,                           # Save model every 500 steps (matches eval_steps)
    save_total_limit=2,                       # Cap on the number of checkpoints kept on disk
    logging_steps=100,                        # Log every 100 steps
    warmup_steps=500,                         # Warmup steps for learning rate scheduler
    load_best_model_at_end=True,              # Load the best model at the end
    metric_for_best_model="eval_loss",        # Rank checkpoints by validation loss
    greater_is_better=False,                  # Lower loss is better
    fp16=torch.cuda.is_available(),           # Mixed precision only when a GPU is present
    report_to="mlflow",                       # Report training metrics to MLflow
    push_to_hub=True,                         # Push the model to Hugging Face Hub
    hub_model_id="wicakson0/TinyLlama_v1.1_math_code_finetuned_quant_SE", # Hugging Face model ID
    hub_token=os.getenv("HF_TOKEN"),          # Hugging Face authentication token
    logging_dir="logs",                       # Directory to store logs
)

In [None]:
# Point MLflow at the tracking store. The MLFLOW_TRACKING_URI environment variable
# (which may come from the .env file) takes precedence, so the notebook runs on other
# machines without editing; the original local path remains the default.
mlflow.set_tracking_uri(
    os.getenv(
        "MLFLOW_TRACKING_URI",
        "file:///E:/Current_Workdir/llm-quantstackexchange-finetune/report",
    )
)
mlflow.set_experiment("TinyLlama_v1.1_math_code_finetuned_quant_SE")  # Experiment name

In [42]:
# Collator that assembles tokenized rows into training batches for the Trainer,
# configured for causal (not masked) language modelling.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Causal LM: MLM (Masked Language Modeling) is set to False
)

In [43]:
# Wire the model, training configuration, tokenized train/validation splits and the
# collator together. The tokenized 'test' split is not passed to the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=quant_SE_tokenized['train'],
    eval_dataset=quant_SE_tokenized['validation'],
    data_collator=data_collator,
)

In [44]:
# Drop the notebook-level names of the three objects loaded from CSV.
# NOTE(review): their 'train' splits were placed in quant_SE_dataset, so the underlying
# data presumably stays referenced and little memory is actually freed -- confirm.
del quant_SE_training, quant_SE_validation, quant_SE_test
# Ask PyTorch to release cached, currently unused GPU memory before training starts.
torch.cuda.empty_cache()

In [None]:
# Run fine-tuning inside an explicit MLflow run (training_args sets report_to="mlflow");
# the run is closed when the with-block exits.
with mlflow.start_run():
    trainer.train()

In [None]:
# Save the fine-tuned model and the tokenizer side by side in one local directory.
# NOTE(review): this writes under "model/" whereas output_dir and the HF cache live
# under "models/" -- confirm the differing directory name is intended.
model.save_pretrained("model/finetuned_model")
tokenizer.save_pretrained("model/finetuned_model")

In [None]:
# Upload the trained model to the Hub repository configured in training_args
# (hub_model_id); blocking=True is passed so the call waits for the upload.
trainer.push_to_hub(
    commit_message="Fine-tuned TinyLlama/TinyLlama_v1.1_math_code using cleaned theblackcat102/quant-stackexchange-posts",
    blocking=True,
)