# Fine-Tuning TinyLlama_v1.1_math_code on Quantitative Finance StackExchange Dataset

## Library Imports

In [38]:
import os
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split

## Settings

In [3]:
# Point the Hugging Face Hub cache at a project-relative folder instead of the
# default location (which is on the C: drive).
# NOTE(review): Hugging Face libraries appear to read their cache environment
# variables at import time, and `datasets` is imported in the imports cell
# above -- confirm this override actually takes effect (it may need to be set
# before that import).
os.environ.update({'HF_HUB_CACHE': '../../models/.HF_HUB_CACHE'})

In [50]:
# Seed passed to both train_test_split calls so the splits are reproducible.
RANDOM_STATE = 7
# Fraction of rows kept on the training side at each splitting stage:
# TRAIN_PROP_1 for the first split (train vs. test) and TRAIN_PROP_2 for the
# second split (training vs. validation).
TRAIN_PROP_1, TRAIN_PROP_2 = 0.9, 0.9

## Clean and Format Data

In [5]:
# Load the Quantitative Finance StackExchange posts dataset by its Hub identifier.
quant_SE = load_dataset(path='theblackcat102/quant-stackexchange-posts')

Downloading readme: 100%|██████████| 1.05k/1.05k [00:00<?, ?B/s]
Downloading data: 100%|██████████| 29.6M/29.6M [00:04<00:00, 6.71MB/s]
Generating train split: 100%|██████████| 46554/46554 [00:00<00:00, 282667.51 examples/s]


In [7]:
# Switch the loaded dataset's output format to pandas (this changes quant_SE in
# place), then slice the whole 'train' split out as a single frame.
quant_SE.set_format('pandas')
quant_SE_df = quant_SE['train'][:]
quant_SE_df.head(n=5)

Unnamed: 0,Id,PostTypeId,AcceptedAnswerId,ParentId,Score,ViewCount,Body,Title,ContentLicense,FavoriteCount,CreationDate,LastActivityDate,LastEditDate,LastEditorUserId,OwnerUserId,Tags
0,1,1,,,26,5651.0,To get the ball rolling... I will answer this ...,What are some good technical and non-technical...,CC BY-SA 2.5,,2011-01-31T21:02:03.567,2011-01-31T21:39:02.880,2011-01-31T21:13:15.943,23.0,6,"[learning, finance, books, quantitative, analy..."
1,2,2,,1.0,6,,I like Statistics and Data Analysis for Financ...,,CC BY-SA 2.5,,2011-01-31T21:05:44.007,2011-01-31T21:05:44.007,,,33,
2,3,1,,,12,1800.0,"I want to start learning quantitative finance,...",What blogs or articles online should I read to...,CC BY-SA 2.5,,2011-01-31T21:07:18.193,2011-01-31T21:53:08.947,2017-04-13T12:46:23.000,-1.0,27,"[learning, finance]"
3,4,2,,1.0,5,,"John C. Hull's [""Options, Futures, and \nOther...",,CC BY-SA 2.5,,2011-01-31T21:08:35.820,2011-01-31T21:08:35.820,,,17,
4,5,1,120.0,,19,899.0,How do you model concentration risk of credit ...,Concentration risk in credit portfolio,CC BY-SA 2.5,,2011-01-31T21:08:50.550,2014-07-16T15:52:43.620,2011-02-01T16:53:40.297,40.0,40,"[risk, credit]"


In [13]:
# Keep an on-disk copy of the raw posts before any cleaning.
# to_csv does not create missing parent directories, so make sure the target
# folder exists first (a no-op when it is already there).
os.makedirs('data/raw', exist_ok=True)
quant_SE_df.to_csv('data/raw/theblackcat102-quant-stackexchange-posts.csv', index=False)

In [24]:
# Keep only the post body text; selecting one column this way yields a Series.
quant_SE_clean = quant_SE_df["Body"]
quant_SE_clean.head(n=5)

0    To get the ball rolling... I will answer this ...
1    I like Statistics and Data Analysis for Financ...
2    I want to start learning quantitative finance,...
3    John C. Hull's ["Options, Futures, and \nOther...
4    How do you model concentration risk of credit ...
Name: Body, dtype: object

In [27]:
# Sanity check: is the single-column selection a DataFrame? (The recorded
# output is False, which is why the next cell wraps it in pd.DataFrame.)
isinstance(quant_SE_clean, pd.DataFrame)

False

In [28]:
# Wrap the selected column in a DataFrame so it carries a named column.
quant_SE_clean = pd.DataFrame(data=quant_SE_clean)
quant_SE_clean.head(n=5)

Unnamed: 0,Body
0,To get the ball rolling... I will answer this ...
1,I like Statistics and Data Analysis for Financ...
2,"I want to start learning quantitative finance,..."
3,"John C. Hull's [""Options, Futures, and \nOther..."
4,How do you model concentration risk of credit ...


In [35]:
# Rename the single column from "Body" to "text".
quant_SE_clean = quant_SE_clean.rename({"Body": "text"}, axis="columns")
quant_SE_clean.head(n=5)

Unnamed: 0,text
0,To get the ball rolling... I will answer this ...
1,I like Statistics and Data Analysis for Financ...
2,"I want to start learning quantitative finance,..."
3,"John C. Hull's [""Options, Futures, and \nOther..."
4,How do you model concentration risk of credit ...


In [36]:
# Discard any rows that contain a missing value.
quant_SE_clean = quant_SE_clean.dropna(axis=0, how="any")
quant_SE_clean.head(n=5)

Unnamed: 0,text
0,To get the ball rolling... I will answer this ...
1,I like Statistics and Data Analysis for Financ...
2,"I want to start learning quantitative finance,..."
3,"John C. Hull's [""Options, Futures, and \nOther..."
4,How do you model concentration risk of credit ...


In [37]:
# (rows, columns) of the cleaned frame after dropping missing values.
quant_SE_clean.shape

(46554, 1)

## Training-Validation-Testing Split

In [51]:
# Stage 1: hold out a test set. TRAIN_PROP_1 of the cleaned rows go to the
# train side; the remainder becomes the test split.
quant_SE_clean_train, quant_SE_clean_test = train_test_split(
    quant_SE_clean, train_size=TRAIN_PROP_1, random_state=RANDOM_STATE
)

In [52]:
# Stage 2: split the stage-1 train portion again. TRAIN_PROP_2 of it is kept
# for training and the remainder becomes the validation split.
quant_SE_clean_training, quant_SE_clean_validation = train_test_split(
    quant_SE_clean_train, train_size=TRAIN_PROP_2, random_state=RANDOM_STATE
)

In [53]:
# Preview the first rows of the held-out test split.
quant_SE_clean_test.head(n=5)

Unnamed: 0,text
17163,I have a multipart question about futures and ...
45846,How are SOFR implied vols calculated? Are they...
5010,the convention for most market makers of optio...
1566,I tried a search with google but I can't find ...
36389,Look at this thesis which provide algebra and ...


In [54]:
# (rows, columns) of the test split.
quant_SE_clean_test.shape

(4656, 1)

In [55]:
# Preview the first rows of the final training split.
quant_SE_clean_training.head(n=5)

Unnamed: 0,text
13610,Let $P>K$ and $r=0$.\nThen you can short the p...
31921,I am constructing equity factors and I am give...
8894,I am reading a report which talks about season...
45890,Is it possible for a volatility surface $\sigm...
9072,"Assuming a set portfolio optimization problem,..."


In [56]:
# (rows, columns) of the training split.
quant_SE_clean_training.shape

(37708, 1)

In [57]:
# Preview the first rows of the validation split.
quant_SE_clean_validation.head(n=5)

Unnamed: 0,text
23313,From a mathematical perspective and under the ...
29093,On top of @Gordon answers which gives the math...
29763,If you apply a simulation Scheme (log-Euler di...
45925,The question mentions that 1 million Bernoulli...
7733,I have $\frac{dS_t}{S_t} = rdt + \sigma dW_t$ ...


In [58]:
# (rows, columns) of the validation split.
quant_SE_clean_validation.shape

(4190, 1)

In [59]:
# Persist the three splits as CSV files for the fine-tuning step.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first (a no-op when it is already there).
os.makedirs('data/prepped', exist_ok=True)
# index=False: the original row labels are not written to the files.
quant_SE_clean_training.to_csv('data/prepped/quant_SE_clean_training.csv', index=False)
quant_SE_clean_validation.to_csv('data/prepped/quant_SE_clean_validation.csv', index=False)
quant_SE_clean_test.to_csv('data/prepped/quant_SE_clean_test.csv', index=False)