# Simulate data

In this notebook we generate simulated data, following the instructions in part 4, "Simulation experiments", of the paper.

Parameters | Distribution:
---    | ---
'S' - spot price    |     Unif[500, 1500] (note: the code below currently samples S from Unif[500, 1000])
'sigma' - volatility|     Unif[0.1, 1]
'tau' - time to maturity (in years) |     Unif[2 weeks, 2 years]
'r'  - risk-free rate  |   Unif[0.1%, 5%]
'K'  - strike price  |   S/z; z ~ N(1, 0.1), where 0.1 is the variance

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Configuration for the simulation: everything a reader might want to tune.

# Seed passed to np.random.seed so that the generated data is reproducible.
seed = 123456

# [lower, upper] bounds of the uniform distribution each parameter is drawn from.
param_bounds = dict(
    # Spot price.
    # NOTE(review): the table at the top of this notebook lists Unif[500, 1500];
    # the upper bound used here is 1000 - confirm which one is intended.
    S=[500, 1000],
    # Volatility.
    sigma=[0.1, 1],
    # Time to maturity.
    # NOTE(review): presumably "two weeks" expressed in years, but 14 days are
    # divided by a 252-day year - confirm the day-count convention.
    tau=[14/252, 2],
    # Risk-free rate, written as percentages divided by 100.
    r=[0.1/100, 5/100],
)

# Number of strikes generated for every sampled (S, sigma, tau, r) tuple.
strikes_per_S = 4

# Number of rows in each of the samples the generated data is split into.
sample_sizes = [
    5_000,
    10_000,
    20_000,
    50_000,
    100_000,
    200_000,
]

For each sample of (S, sigma, tau, r) we generate 'strikes_per_S' strikes. That is why each tuple (S, sigma, tau, r) will be repeated 'strikes_per_S' times.

In [3]:
# Number of distinct (S, sigma, tau, r) tuples to draw; every tuple is shared
# by strikes_per_S consecutive rows.
total_num_samples = int(sum(sample_sizes)/strikes_per_S)

# Seed NumPy's global generator before any draw is made.
np.random.seed(seed)

# Draw each bounded parameter from its uniform distribution, then repeat every
# drawn value strikes_per_S times in a row.
columns = {}
for param, bounds in param_bounds.items():
    lower, upper = bounds[0], bounds[1]
    draws = np.random.uniform(lower, upper, total_num_samples)
    columns[param] = np.repeat(draws, strikes_per_S)

# One z per row, drawn from a normal distribution with mean 1 and variance 0.1
# (np.random.normal takes the standard deviation, hence the square root).
# The strike is then K = S / z.
# NOTE(review): z is unbounded, so it can be non-positive or very close to
# zero, which yields negative or extremely large strikes - confirm that this
# is acceptable for the code consuming this data.
z = np.random.normal(1, np.sqrt(0.1), total_num_samples*strikes_per_S)
columns['K'] = columns['S']/z

data = pd.DataFrame(columns)
data.head(50)

Unnamed: 0,S,sigma,tau,r,K
0,563.484917,0.283213,0.103238,0.02412,658.109584
1,563.484917,0.283213,0.103238,0.02412,609.198525
2,563.484917,0.283213,0.103238,0.02412,1605.928372
3,563.484917,0.283213,0.103238,0.02412,466.900744
4,983.358919,0.800311,1.738972,0.026128,708.879406
5,983.358919,0.800311,1.738972,0.026128,980.936243
6,983.358919,0.800311,1.738972,0.026128,1004.086964
7,983.358919,0.800311,1.738972,0.026128,1232.26043
8,630.238003,0.925717,0.702539,0.023512,421.161253
9,630.238003,0.925717,0.702539,0.023512,756.810893


In [4]:
# Tag every row with the id of the sample it belongs to. Samples are laid out
# back to back: the first sample_sizes[0] rows get id 0, the next
# sample_sizes[1] rows get id 1, and so on. Rows are matched by index value;
# data carries the default 0..n-1 index created by pd.DataFrame.
# Having this label makes it easy to split the dataframe into the samples later.
data['sample_id'] = 0
start = 0
for sample_id, size in enumerate(sample_sizes):
    stop = start + size
    in_sample = (data.index >= start) & (data.index < stop)
    data.loc[in_sample, 'sample_id'] = sample_id
    start = stop

In [5]:
# Show the rows labelled as sample 0 (the first and smallest sample).
is_sample_0 = data['sample_id'] == 0
data.loc[is_sample_0]

Unnamed: 0,S,sigma,tau,r,K,sample_id
0,563.484917,0.283213,0.103238,0.024120,658.109584,0
1,563.484917,0.283213,0.103238,0.024120,609.198525,0
2,563.484917,0.283213,0.103238,0.024120,1605.928372,0
3,563.484917,0.283213,0.103238,0.024120,466.900744,0
4,983.358919,0.800311,1.738972,0.026128,708.879406,0
...,...,...,...,...,...,...
4995,931.343599,0.227621,0.402856,0.005217,871.756917,0
4996,715.839504,0.677604,1.057343,0.049729,1099.618927,0
4997,715.839504,0.677604,1.057343,0.049729,620.406587,0
4998,715.839504,0.677604,1.057343,0.049729,608.257981,0


In [6]:
# Show the rows labelled as sample 1 (the second sample).
is_sample_1 = data['sample_id'] == 1
data.loc[is_sample_1]

Unnamed: 0,S,sigma,tau,r,K,sample_id
5000,967.136855,0.742754,1.662679,0.045813,629.602381,1
5001,967.136855,0.742754,1.662679,0.045813,798.610539,1
5002,967.136855,0.742754,1.662679,0.045813,1006.548993,1
5003,967.136855,0.742754,1.662679,0.045813,1069.223069,1
5004,796.731921,0.944887,0.573266,0.005583,681.409184,1
...,...,...,...,...,...,...
14995,885.361414,0.610179,1.347572,0.032117,1225.846969,1
14996,888.929481,0.530073,0.062902,0.033095,930.364466,1
14997,888.929481,0.530073,0.062902,0.033095,1009.502498,1
14998,888.929481,0.530073,0.062902,0.033095,1313.356772,1


In [7]:
# Show the rows labelled as sample 5 (the last and largest sample).
is_sample_5 = data['sample_id'] == 5
data.loc[is_sample_5]

Unnamed: 0,S,sigma,tau,r,K,sample_id
185000,850.151328,0.640552,0.206725,0.049158,670.441712,5
185001,850.151328,0.640552,0.206725,0.049158,978.090855,5
185002,850.151328,0.640552,0.206725,0.049158,1423.101687,5
185003,850.151328,0.640552,0.206725,0.049158,701.064168,5
185004,536.998290,0.180625,1.360713,0.036101,496.816154,5
...,...,...,...,...,...,...
384995,953.651240,0.898391,0.935096,0.038669,1351.468217,5
384996,695.038669,0.739430,1.129417,0.039003,1026.783793,5
384997,695.038669,0.739430,1.129417,0.039003,670.102103,5
384998,695.038669,0.739430,1.129417,0.039003,639.638656,5


In [8]:
# Write the labelled dataset to CSV; the row index is not written to the file.
output_path = '../data/simulated/simulation_1.csv'
data.to_csv(output_path, index=False)