# Loading modules and libraries

In [1]:
# Data manipulation 
import pandas as pd
# Dates handling
from datetime import datetime, timedelta
import pytz
from dateutil.relativedelta import relativedelta
# Chart modules
import matplotlib.pyplot as plt
from bokeh.plotting import figure
from bokeh.io import push_notebook,show, output_notebook
from bokeh.models import HoverTool
output_notebook()

# Linear algrebra
import numpy as np

# ML and Metrics module

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Data APIs
import yfinance as yf
from pytrends.request import TrendReq

# Technical indicators
from ta.momentum import rsi, stochrsi_d,stochrsi_k
from ta.trend import ema_indicator, macd_diff, vortex_indicator_neg, vortex_indicator_pos, adx, cci
from ta.volatility import bollinger_hband, bollinger_lband
from ta.volume import ease_of_movement

from modules.Utils.utils import loadFromDB, strategyTester
from modules.Utils.indicators import addIndicators

# Remove warnings
import warnings

warnings.filterwarnings('ignore')

# Utils functions

In [2]:
   
def is_far_from_level(value, levels, df):
    ave =  np.mean(df['High'] - df['Low'])
    return np.sum([abs(value - level) < ave for _, level in levels]) == 0

def detectSupportAndResistance(df)->list:
    """This function detect using window approach all supports and resistance from a dataframe over a time period.

    Args:
        df (pd.DataFrame): The DataFrame that we want to extract supports and resistances, it must containe OHLC at least.

    Returns:
        list: The list of all supports and resistances timestamped
    """
    pivots = []
    max_list = []
    min_list = []
    for i in range(5, len(df)-5):
        high_range = df['High'][i-5:i+4]
        current_max = high_range.max()

        if current_max not in max_list:
            max_list = []
        max_list.append(current_max)
        if len(max_list) == 5 and is_far_from_level(current_max, pivots, df):
            pivots.append((high_range.idxmax(), current_max))
        
        low_range = df['Low'][i-5:i+5]
        current_min = low_range.min()
        if current_min not in min_list:
            min_list = []
        min_list.append(current_min)
        if len(min_list) == 5 and is_far_from_level(current_min, pivots, df):
            pivots.append((low_range.idxmin(), current_min))
    return list({x:y} for x, y in pivots)



# Load the data

In [3]:
def prepareDataFrame(symbol='ETH',google_trend=True,interval='15m')->pd.DataFrame:
    """Get Datafram BTC and symbol data from yFinance and optionnaly google_trend data.

    Args:
        symbol (str, optional): The Symbol you want to predict from BTC. Defaults to 'ETH'.
        google_trend (bool, optional): Whether or not you want to add Google Trend Data to the Dataframe, is that case you need to put interval to 1h. Defaults to True.
        interval (str, optional): The interval to construct candlesticks : 1h, 15m, 1m... Defaults to '15m'.

    Returns:
        pd.DataFrame: The Dataframe containing all the data and the features.
    """
    n = 2
    current_date = datetime.today()
    past_date = current_date - relativedelta(days=n)

    hist_BTC = loadFromDB('BTC',interval)
    hist_ETH = loadFromDB('ETH',interval)
    
    if google_trend==True and interval=='1h':
        merged_BTC_ETH = pd.merge(hist_BTC,hist_ETH, left_on='Timestamp',right_on='Timestamp',how='inner',suffixes=('',f'_{symbol}')).dropna()
        starting_date = datetime.fromtimestamp(merged_BTC_ETH['Timestamp'].iloc[0])
        ending_date = datetime.fromtimestamp(merged_BTC_ETH['Timestamp'].iloc[-1])

        pytrends = TrendReq(hl='en-US', tz=360) 
        kw_list = ["BTC","Blockchain","Bitcoin"] # list of keywords to get data 

        pytrends.build_payload(kw_list, cat=0, timeframe='today 12-m') 
        data = pytrends.get_historical_interest(kw_list, year_start=starting_date.year, month_start=starting_date.month, day_start=starting_date.day, hour_start=starting_date.hour, year_end=ending_date.year, month_end=ending_date.month, day_end=ending_date.day, hour_end=ending_date.hour, cat=0, sleep=0)
        data = data.reset_index() 
        data.set_index('date',inplace=True)
        data.BTC = data.BTC + data.Bitcoin

        data.drop(columns=['Bitcoin','isPartial'],inplace=True)
        data.dropna(inplace=True)

        merged_BTC_ETH_Gtrend = pd.merge(merged_BTC_ETH,data, left_on='Timestamp',right_on='Timestamp',how='outer').dropna()
        merged_BTC_ETH_Gtrend = addIndicators(merged_BTC_ETH_Gtrend)
        return merged_BTC_ETH_Gtrend
    else:
        return  addIndicators(pd.merge(hist_BTC,hist_ETH, left_on='Timestamp',right_on='Timestamp',how='inner',suffixes=('',f'_{symbol}')).dropna())
        
    

In [4]:
symbol = 'KDA'

merged_BTC_ETH_Gtrend = prepareDataFrame(symbol,False, '1h')
merged_BTC_ETH_Gtrend.head()

Unnamed: 0,Open,High,Low,Close,Volume,Timestamp,Open_KDA,High_KDA,Low_KDA,Close_KDA,...,Stoch_RSI,Vortex,Bollinger_low,Bollinger_high,ADX,ATR,CCI,OVB,OVB_EMA200,EVM
199,9693.04,9706.87,9668.08,9681.75,1169.428501,1591686000,243.21,243.82,242.94,243.33,...,0.091276,-0.078257,9747.159182,9656.024818,14.63841,11.372941,-74.047531,-12339.553255,4195.571642,-10747100.0
200,9681.81,9690.62,9665.64,9683.46,1380.376751,1591689600,243.3,243.73,242.64,243.22,...,0.11112,-0.095971,9745.232203,9654.347797,15.155003,11.538378,-88.823293,-10959.176504,4044.778128,-16911190.0
201,9683.47,9685.0,9642.1,9669.7,1463.923988,1591693200,243.21,243.3,242.34,242.77,...,0.0,-0.067844,9745.886488,9651.461512,16.096837,11.884231,-126.960174,-12423.100492,3880.91864,-42726400.0
202,9669.7,9679.0,9624.53,9676.0,1891.550257,1591696800,242.76,242.96,241.25,242.38,...,0.07883,-0.049198,9745.597558,9652.250442,17.290842,12.349042,-125.923458,-10531.550235,3737.510989,-33936660.0
203,9675.99,9728.0,9663.57,9721.24,1943.34339,1591700400,242.37,243.39,242.24,243.32,...,0.578773,0.019189,9747.322184,9652.098816,16.902081,12.255926,43.206339,-8588.206845,3614.867031,145944800.0


# Choosing a SHIFT

In [5]:
SHIFT = 20
p = figure(title=f"BTC vs {symbol} with Shiffting", x_axis_label=f'{symbol} value ($)', y_axis_label='BTC value ($)',width=1500, height=600,)

p.circle(merged_BTC_ETH_Gtrend[f'Close_{symbol}'],merged_BTC_ETH_Gtrend['Close'],color='blue',legend_label=f'BTC vs {symbol}',alpha=0.3,size=4)
p.circle(merged_BTC_ETH_Gtrend[f'Close_{symbol}'].iloc[:-SHIFT].values,merged_BTC_ETH_Gtrend['Close'].shift(SHIFT).dropna().values,color='red',legend_label=f'BTC vs {symbol} shiffted',alpha=0.5,size=4)

p.legend.location = "top_left"

# show the results
show(p)

# Separing Features and Labels

In [6]:
X = merged_BTC_ETH_Gtrend.drop(columns=[f'Open_{symbol}',f'Close_{symbol}',f'High_{symbol}',f'Low_{symbol}',f'Volume_{symbol}','Timestamp','EVM']).iloc[:-SHIFT].values
y = merged_BTC_ETH_Gtrend[f'Close_{symbol}'].iloc[SHIFT:].values
X_train, X_test, y_train, y_test =  train_test_split(X,y,shuffle=True,test_size=0.5,)

# Training models

## Linear regression

In [7]:
LR_model = LinearRegression().fit(X_train,y_train)
print(f'Final score for Linear Regression on test set {round(LR_model.score(X_test,y_test)*100)} %')
y_pred = LR_model.predict(X_test)
print(f'Final MAE for Linear Regression : {mean_absolute_error(y_test, y_pred)}')
print(f'Final MSE for Linear Regression : {mean_squared_error(y_test, y_pred,squared=False)}')


Final score for Linear Regression on test set 84 %
Final MAE for Linear Regression : 362.57674399181985
Final MSE for Linear Regression : 493.771005528378


## Random Forest

In [28]:
from multiprocessing import cpu_count
RF_model = RandomForestRegressor(n_estimators=75,max_depth=20,n_jobs=cpu_count()).fit(X_train,y_train)
print(f'\nFinal score for Random Forest on test set {round(RF_model.score(X_test,y_test)*100)} %')
y_pred = RF_model.predict(X_test)
print(f'Final MAE for Random Forest  : {mean_absolute_error(y_test, y_pred)}')
print(f'Final MSE for Random Forest  : {mean_squared_error(y_test, y_pred,squared=False)}')


Final score for Random Forest on test set 100 %
Final MAE for Random Forest  : 25.54443929885624
Final MSE for Random Forest  : 46.6781913591222


# Evalutate the model

In [32]:
df_ETH_with_pred = merged_BTC_ETH_Gtrend.iloc[SHIFT:].dropna()
df_ETH_with_pred['Close_pred_RF'] = RF_model.predict(X)
df_ETH_with_pred['Close_pred_LR'] = LR_model.predict(X)
p = figure(title=f"Prediction on {symbol} from BTC", x_axis_label='value ($)', y_axis_label='value ($)',width=1500, height=600,)
# Add the HoverTool to the figure
p.add_tools(HoverTool(
    tooltips=[
    ("Price", "@y{0.00} $"),
    ("Price", "@y{0.00} $"),
], formatters={
        '$x': 'printf',
        '$y' : 'printf',
    },
    mode='vline'
))
p.line(df_ETH_with_pred.index, df_ETH_with_pred[f'Close_{symbol}'], legend_label="real data", line_width=2,)
p.line(df_ETH_with_pred.index, df_ETH_with_pred.Close_pred_RF, legend_label="prediction data RF", line_width=2, color='orange')
#p.line(df_ETH_with_pred.index, df_ETH_with_pred.Close_pred_LR, legend_label="prediction data LR", line_width=2, color='red')
p.legend.location = "top_left"

show(p)