In [0]:
# Core data-handling and plotting libraries.
import pandas as pd
import matplotlib.pyplot as plt
# Install the third-party packages into the runtime before importing them.
# NOTE(review): `arch` is installed here but is not imported in the cells visible in this notebook.
!pip install arch
! pip install pyflux
# pyflux provides the ARIMAX model fitted further down.
import pyflux as pf

Collecting arch
[?25l  Downloading https://files.pythonhosted.org/packages/f5/9e/7628ae53df0494a08d256dcdccc2519aa058c7c7ccb0fc6ca4a380b0bc86/arch-4.11-cp36-cp36m-manylinux1_x86_64.whl (711kB)
[K     |▌                               | 10kB 17.4MB/s eta 0:00:01[K     |█                               | 20kB 2.2MB/s eta 0:00:01[K     |█▍                              | 30kB 3.2MB/s eta 0:00:01[K     |█▉                              | 40kB 2.1MB/s eta 0:00:01[K     |██▎                             | 51kB 2.6MB/s eta 0:00:01[K     |██▊                             | 61kB 3.1MB/s eta 0:00:01[K     |███▏                            | 71kB 3.6MB/s eta 0:00:01[K     |███▊                            | 81kB 4.1MB/s eta 0:00:01[K     |████▏                           | 92kB 4.5MB/s eta 0:00:01[K     |████▋                           | 102kB 3.5MB/s eta 0:00:01[K     |█████                           | 112kB 3.5MB/s eta 0:00:01[K     |█████▌                          | 122kB 3.5MB

In [0]:
# Fetch the project repository; the CSV read in the next code cell lives under its data/ directory.
!git clone https://github.com/vmeylan/machine_learning_for_finance.git

### Data Analysis

In [0]:
# Columns present in the preprocessed CSV that are excluded from the analysis frame.
ENJ_CSV_PATH = 'machine_learning_for_finance/data/preprocessed_data/enj.csv'
COLUMNS_TO_DROP = [
    'Unnamed: 0',
    '_INTERCEPT',
    '_REALIZED_VOL',
    '_VOL_PROXY',
    '_NEGATIVE_RETURNS',
    '_BTC_REALIZED_VOL',
    '_BTC_VOL_PROXY',
    'BTC_low',
    'BTC_high',
    'high',
    'low',
    '_BTC_NEGATIVE_RETURNS',
    'time',
]

# Load the ENJ dataset and discard the excluded columns in one step.
data = pd.read_csv(ENJ_CSV_PATH).drop(columns=COLUMNS_TO_DROP)

In [0]:
# Preview the first rows of the loaded frame.
data.head()

In [0]:
# List the column labels currently held by the frame.
data.columns

In [0]:
# Exponentially weighted means (half-life of 10 rows) of two squared series:
# squared returns become the target `VOL`, squared from-exchange transactions a feature.
squared_returns = data['_RETURNS'] ** 2
squared_from_exchange = data['_FROM_EXCHANGE_TRANSACTIONS'] ** 2

data.loc[:, 'VOL'] = squared_returns.ewm(halflife=10).mean()
data.loc[:, '_FROM_EXCHANGE_EWMA'] = squared_from_exchange.ewm(halflife=10).mean()

In [0]:
# Exponentially weighted mean of squared returns (half-life 10), drawn against the frame's index.
ew_squared_returns = (data['_RETURNS'] ** 2).ewm(halflife=10).mean()

plt.plot(ew_squared_returns)
plt.title('EW volatility vs. number of trading days elapsed of ENJ coin')
plt.xticks(ticks=[0, 200, 400, 600, 800])

In [0]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="white")

# Pairwise correlation between every column of the frame.
corr = data.corr()

# Boolean mask hiding the upper triangle (diagonal included) so each pair is drawn once.
# The builtin `bool` is used because the `np.bool` alias was deprecated in NumPy 1.20
# and removed in NumPy 1.24, where it raises AttributeError.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Diverging colormap so that positive and negative correlations get opposite hues.
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask, a colour scale fixed to [-1, 1] and centred on 0,
# and square cells.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, center=0, vmin=-1,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.suptitle('Correlation matrix of the variables, ENJ token')

# Trailing no-op so the cell does not echo the repr of the last call.
pass

In [0]:
# Keep only the rows in which every column is observed (drop any row containing a NaN).
data = data.dropna(axis=0, how='any')

In [0]:
# Target: the `VOL` column. Features: every remaining column of the frame.
y = data['VOL']
X = data.drop(columns='VOL')

In [0]:
import sklearn
from sklearn.model_selection import train_test_split

# Shuffled 67/33 train/test partition with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    shuffle=True,
    random_state=2,
)

### *) Statistical Significance

In [0]:
# Univariate F-test of each feature in X against the target y; the p-values show which
# features are individually statistically significant.

from sklearn.feature_selection import f_regression

# f_regression returns (F statistics, p-values); only the p-values are kept.
p_val = f_regression(X, y)[1]

# The p-values come back in the column order of X, so they must be labelled with
# X.columns. Labelling them with `data.drop(['_RETURNS'], axis=1).columns` (a list of the
# same length but a different order, since X drops 'VOL' rather than '_RETURNS')
# attached p-values to the wrong feature names.
p_val_table = pd.DataFrame({'p_val': p_val,
                            'column_name': X.columns}).sort_values('p_val')
display('Statistical significance to predict returns^2')
print(p_val_table.to_latex(index=False))

In [0]:
import sklearn
from sklearn.model_selection import train_test_split

# Shuffled 67/33 train/test partition with a fixed seed (random_state=2).
# NOTE(review): this repeats the split already performed in an earlier cell with the same
# arguments, and the resulting X_train/X_test/y_train/y_test are not used in the visible cells.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=2, shuffle=True)

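With only about 800 observations, a model that uses all of these variables has too many parameters relative to the amount of data: it fits noise in the sample instead of structure that generalizes, which is known as overfitting. We will therefore fit simpler models.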

In [0]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

print('R2 for a Linear Regression with one variable only')
print('=================================================')


def LR_results(variable):
  """Return the in-sample R^2 of a one-feature linear regression on the target.

  The regressor is the column `variable` of the notebook-level `data` frame and the
  target is the notebook-level `y`. The model is fitted and scored on the same
  observations, so the value is a training-set R^2.
  """
  # scikit-learn expects a 2-D design matrix, hence the reshape to a single column.
  single_feature = data[variable].values.reshape(-1, 1)
  fitted = LinearRegression().fit(single_feature, y)
  return fitted.score(single_feature, y)


# One "R^2 - name" line per candidate regressor; the target column itself is skipped.
for variable in data.columns:
  if variable != 'VOL':
    print(' {:.2f} - {}'.format(LR_results(variable), variable))

In [0]:
df = pd.DataFrame([[0.32, 'BTC_close'],
 [0.10,'BTC_volume'],
 [0.00, '_BTC_RETURNS'],
 [0.00, '_TO_EXCHANGE_TRANSACTIONS'],
 [0.24, '_FROM_EXCHANGE_TRANSACTIONS'],
 [0.04, '_ONCHAIN_TRANSACTIONS'],
 [0.03, '_ONCHAIN_VOLUME'],
 [0.10 ,'close'],
 [0.03, 'volume']])
print(df.to_latex(index=False))

In [0]:
# Scatter of the target against two regressors, each on its own figure.
vol = data['VOL']

plt.plot(data['BTC_close'], vol, 'r+')
plt.title('VOL vs. BTC_close')

# Open a second figure so the two scatters do not share axes.
plt.figure()
plt.plot(data['_FROM_EXCHANGE_TRANSACTIONS'], vol, 'r+')
plt.title('VOL vs. _FROM_EXCHANGE_TRANSACTIONS')
# Restrict the x-axis of the second scatter to [0, 1500].
plt.xlim(0, 1500)
pass

### *) Comparison with a volatility ARIMA model

In [0]:
# Display the working frame as it stands after the NaN rows were removed.
data

In [0]:
# Show the dimensions of the frame and of the target side by side.
data.shape, y.shape

In [0]:
# Lagged value minus current value of VOL, i.e. the negated first difference.
r = data.VOL.shift(1) - data.VOL
# shift(1) leaves the first element undefined. Zero it by position: `r[0] = 0` is
# label-based and, if index label 0 was removed by the NaN filter, would append a new
# element instead of overwriting the leading NaN.
r.iloc[0] = 0

In [0]:
# The VOL process looks integrated, so its correlograms are shown twice: first for the
# level of VOL, then for its first difference.
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.graphics.tsaplots import plot_acf

# Lags 1..47; lag 0 is left out of the plots.
acf_lags = list(range(1, 48))

# --- Level of VOL ---
r = data.VOL

plot_pacf(r, lags=acf_lags)
plt.title('Partial Autocorrelation of VOL (starting at 1 lag)')
plt.show()
plot_acf(r, lags=acf_lags)
plt.title('Autocorrelation of VOL (starting at 1 lag)')
plt.show()

# --- First difference of VOL (lagged value minus current value) ---
r = data.VOL.shift(1) - data.VOL
# shift(1) leaves the first element undefined. Zero it by position: `r[0] = 0` is
# label-based and would append a new element (leaving the NaN in place) whenever index
# label 0 is no longer present in the frame.
r.iloc[0] = 0

plot_pacf(r, lags=acf_lags)
plt.title('Partial Autocorrelation of first difference of VOL (starting at 1 lag)')
plt.show()
plot_acf(r, lags=acf_lags)
plt.title('Autocorrelation of first difference of VOL (starting at 1 lag)')
plt.show()


In [0]:
# ARIMAX model with the most correlated variables.
import  copy
data_temp = copy.copy(data)
data_temp.loc[:,'_RETURNS_2'] = y**2
df = data_temp
normalized_df=(df-df.mean())/df.std()
model = pf.ARIMAX(data=normalized_df, formula='VOL~1+_TO_EXCHANGE_TRANSACTIONS+_FROM_EXCHANGE_TRANSACTIONS+BTC_close+volume+_BTC_RETURNS',
                  ar=1,ma=1,integ=1, family=pf.Normal())
x = model.fit("MLE")
x.summary()

In [0]:
# ARIMAX model with the volume.
import copy

# Work on a copy so the extra column does not leak into `data`.
data_temp = copy.copy(data)
# Regression target: squared returns. This column was previously filled with `y**2`, but
# `y` is the VOL series (already an average of squared returns), so the model was being
# fitted to VOL squared rather than to the squared returns its name and formula refer to.
data_temp.loc[:, '_RETURNS_2'] = data_temp['_RETURNS'] ** 2
df = data_temp

# Standardize every column (zero mean, unit standard deviation) before fitting.
normalized_df = (df - df.mean()) / df.std()

# AR(1) on squared returns with an intercept and traded volume as the only regressor.
model = pf.ARIMAX(data=normalized_df, formula='_RETURNS_2~1+volume',
                  ar=1, ma=0, family=pf.Normal())
# Maximum-likelihood fit, then print the parameter table.
x = model.fit("MLE")
x.summary()

### *) Random Forest 

In [0]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Seed the forest so the grid-search scores are reproducible from run to run; an unseeded
# forest gives a different best score on every execution. The seed value matches the one
# used for the train/test split.
rfr = RandomForestRegressor(random_state=2)

# Exhaustive search over ensemble size and tree depth, using GridSearchCV's default
# cross-validation and the estimator's default score (R^2).
cv = GridSearchCV(rfr, {'n_estimators': [5, 10, 20, 40, 80],
                        'max_depth': [1, 2, 3, 4, 10]})

cv.fit(X, y)



In [0]:
# Mean cross-validated score of the best parameter combination found by the grid search.
best_cv_r2 = cv.best_score_
print('The best Cross validated R2 score is {:.2f}'.format(best_cv_r2))

# The random forest overfits the data.

In [0]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Baseline: ridge regression with its default penalty, fitted and scored on the full sample.
rid = Ridge()
rid.fit(X, y)
train_r2 = rid.score(X, y)
print('R^2 train score '+str(train_r2))

# 3-fold grid search over ten penalties spaced logarithmically between 1e-5 and 1e5.
alpha_grid = {'alpha': np.logspace(-5, 5, 10)}
cv = GridSearchCV(rid, alpha_grid, cv=3)

cv.fit(X, y)
print('The best Cross validated R2 score is {:.2f}'.format(cv.best_score_))

