In [1]:
from src.dataset.create_dataset import *
from src.dataset.get_act_volatility import *
import pandas as pd
import numpy as np

In [7]:
# Load the detailed ETH-USD weekly return/volatility CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='./data/ETH-USD_weekly_return_volatility_detailed.csv')

In [8]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Date,Year,Month,Day,Weekday,Week_Number,Year_Week,Open,High,Low,Close,Volume,Adj Close,Return,Short_MA,Long_MA,mean_return,volatility
0,2017-11-09,2017,11,9,Thursday,45,2017-45,308.64,329.45,307.06,320.88,893249984,320.88,0.0,320.88,320.88,-0.528333,5.966071
1,2017-11-10,2017,11,10,Friday,45,2017-45,320.67,324.72,294.54,299.25,885985984,299.25,-6.741,310.065,310.065,-0.528333,5.966071
2,2017-11-11,2017,11,11,Saturday,45,2017-45,298.59,319.45,298.19,314.68,842300992,314.68,5.156,311.603333,311.603333,-0.528333,5.966071
3,2017-11-12,2017,11,12,Sunday,46,2017-46,314.69,319.15,298.51,307.91,1613479936,307.91,-2.152,310.68,310.68,1.477,3.277433
4,2017-11-13,2017,11,13,Monday,46,2017-46,307.02,328.42,307.02,316.72,1041889984,316.72,2.861,311.888,311.888,1.477,3.277433


#### Preprocessing

I will drop the years 2017 and 2022 because they do not contain a full year of data. I will use the data from 2018 and 2019 as my training set and the data from 2020 and 2021 as my testing set.

In [152]:
# Keep only rows whose Year is strictly between 2017 and 2022 (2018-2021).
# .copy() detaches the result from the frame it was sliced from, so later
# cells can add or overwrite columns on `df` without triggering pandas'
# SettingWithCopyWarning.
df = df[(df['Year'] > 2017) & (df['Year'] < 2022)].copy()

In [153]:
# Sanity check: list the distinct years that remain after filtering.
pd.unique(df['Year'])

array([2018, 2019, 2020, 2021], dtype=int64)

In [154]:
# Shift week numbers up by one so that numbering starts at 1 instead of 0.
# NOTE: this cell is not idempotent -- re-running it shifts the numbers again.
df['Week_Number'] = df['Week_Number'].astype(int).add(1)

### Labeling

We need to label the data, so we can use it for training. There are many ways to do this, but we will use the following: 

Label the datapoint as green (1) if the week's mean_return is greater than -0.1 (the threshold used in the code below) and red (0) otherwise.

The rule is simple and easy to understand. I can also come up with more complex rules, but I will not do that here.

In [155]:
# Label each row from its week's mean_return: 1 (green) when the value is
# strictly greater than the threshold, 0 (red) otherwise. NaN compares as
# False and is therefore labelled 0.
# The comparison is vectorized rather than calling a Python lambda per row.
LABEL_THRESHOLD = -0.1
df['label'] = (df['mean_return'] > LABEL_THRESHOLD).astype('int64')

In [156]:
# Preview the first two rows to confirm the new label column.
df.head(n=2)

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Return,Date,Week_Number,Year,Day,Weekday,mean_return,volatility,label
53,755.757019,782.530029,742.004028,772.640991,772.640991,2595760128,2.102,2018-01-01,1,2018,1,Monday,5.571833,5.133605,1
54,772.346008,914.830017,772.346008,884.44397,884.44397,5783349760,14.47,2018-01-02,1,2018,2,Tuesday,5.571833,5.133605,1


I applied labels based on mean_return of the week. This is because the mean_return is the average return of the week and I want to trade on the weeks that are green.


In [157]:
# Persist the labeled frame to CSV; the index is not written.
df.to_csv(
    path_or_buf='./data/ETH-USD_weekly_return_volatility_detailed_labeled.csv',
    index=False,
)

In [158]:
# Preview the first two rows of the frame that was just saved.
df.head(n=2)

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Return,Date,Week_Number,Year,Day,Weekday,mean_return,volatility,label
53,755.757019,782.530029,742.004028,772.640991,772.640991,2595760128,2.102,2018-01-01,1,2018,1,Monday,5.571833,5.133605,1
54,772.346008,914.830017,772.346008,884.44397,884.44397,5783349760,14.47,2018-01-02,1,2018,2,Tuesday,5.571833,5.133605,1


### Preparing train/test datasets

In [159]:
def _weekly_features(frame):
    """Collapse daily rows into one row per (Year, Week_Number, label).

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'Year', 'Week_Number', 'label', 'Return'
        and 'Adj Close'.

    Returns
    -------
    pandas.DataFrame
        Columns: Year, Week_Number, label, mean_return (mean of Return),
        volatility (sample standard deviation of Return) and mean_adj_close
        (mean of Adj Close). NaN values -- e.g. the standard deviation of a
        group with a single row -- are replaced with 0.
    """
    return (
        frame
        .groupby(['Year', 'Week_Number', 'label'])
        # String aggregators instead of np.mean / np.std: passing the numpy
        # callables to .agg is deprecated in pandas 2.x, and 'std' pins the
        # sample (ddof=1) standard deviation that pandas has been using.
        .agg(
            mean_return=('Return', 'mean'),
            volatility=('Return', 'std'),
            mean_adj_close=('Adj Close', 'mean'),
        )
        .reset_index()
        .fillna(0)
    )


# Columns needed to build the weekly features.
feature_cols = ['Year', 'Week_Number', 'Return', 'label', 'Adj Close']

# 2018-2019 form the training set, 2020-2021 the testing set.
df_train = df.loc[df['Year'].isin([2018, 2019]), feature_cols]
df_test = df.loc[df['Year'].isin([2020, 2021]), feature_cols]

df_train_gp = _weekly_features(df_train)
df_train_gp.to_csv('./data/train.csv', index=False)

df_test_gp = _weekly_features(df_test)
df_test_gp.to_csv('./data/test.csv', index=False)
