In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import auc
from sklearn.model_selection import train_test_split

from constants import download_dir, output_dir

In [2]:
import mlflow.sklearn

In [3]:
# Point MLflow at the tracking server. The MLFLOW_TRACKING_URI environment
# variable, when set, overrides the local default so the notebook is not tied
# to a single machine.
mlflow.set_tracking_uri(
    os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000"))

In [4]:
# Build the paths to the extracted BTCUSDT kline CSVs under `output_dir`.
# NOTE: the variable names say "1m" but both paths point at 15m data; the
# names are kept because the loading cell below reads `path_monthly_1m`.
print(output_dir)
path_monthly_1m = os.path.join(output_dir, "BTCUSDT-15m-monthly_data",
                               "BTCUSDT-15m.csv")
path_daily_1m = os.path.join(output_dir, "BTCUSDT-15m-daily_data",
                             "BTCUSDT-15m.csv")
print(path_monthly_1m)

D:\KISHORE\Binance-Data-Downloader\extracted_data
D:\KISHORE\Binance-Data-Downloader\extracted_data\BTCUSDT-15m-monthly_data\BTCUSDT-15m.csv


In [79]:
# Load the monthly BTCUSDT CSV into the working frame used by every cell below.
df = pd.read_csv(path_monthly_1m)

In [80]:
# Remove the two timestamp columns; they are not used as features.
df = df.drop(columns=["open_time", "close_time"])

In [81]:
# Preview the first rows after the timestamp columns were dropped.
df.head()

Unnamed: 0,open,high,low,close,volume,quote_volume,count,taker_buy_volume,taker_buy_quote_volume,entry,...,CDLUPSIDEGAP2CROWS,CDLXSIDEGAP3METHODS,BETA,CORREL,LINEARREG,LINEARREG_ANGLE,LINEARREG_INTERCEPT,LINEARREG_SLOPE,TSF,VAR
0,7182.43,7182.44,7178.75,7179.01,70.909,509145.78482,140,32.597,234063.27884,,...,0,0,,,,,,,,
1,7179.01,7179.01,7175.25,7177.93,99.42,713539.55348,148,16.311,117066.92118,7179.01,...,0,0,,,,,,,,
2,7177.77,7182.6,7177.0,7181.11,69.33,497793.35929,104,43.723,313920.02981,7177.93,...,0,0,,,,,,,,
3,7179.1,7179.1,7172.94,7175.25,97.368,698627.39382,193,36.616,262734.68999,7181.11,...,0,0,,,,,,,,
4,7174.71,7177.14,7173.28,7175.61,33.725,241979.5088,124,15.885,113984.45772,7175.25,...,0,0,,,,,,,,4.746816


In [82]:
# Percentage of missing values per column, sorted from most to least missing.
column_names = df.columns

missing_df = (
    df.isnull()
    .mean()
    .mul(100)
    .rename_axis("name")
    .reset_index(name="missing")
    .sort_values("missing", ascending=False)
)

In [83]:
# Show the ten columns with the largest share of missing values.
missing_df.head(10)

Unnamed: 0,name,missing
19,entered_before,99.888391
17,longs_win_after,55.971509
16,shorts_win_after,55.041241
25,EMA-200,0.012655
63,TRIX,0.005596
35,TEMA,0.005533
70,HT_DCPHASE,0.004006
26,HT_TRENDLINE,0.004006
23,DEMA,0.003689
24,EMA-50,0.003116


In [84]:
# Discard the three columns that top the missing-value table above
# (roughly 55% to 99.9% missing).
df = df.drop(
    columns=["entered_before", "longs_win_after", "shorts_win_after"])

In [85]:
# Fill every remaining NaN with the sentinel -500.
# NOTE(review): presumably chosen to sit outside the range of real feature
# values so the trees can isolate "missing" — confirm no indicator can
# legitimately reach -500.
df = df.fillna(-500)

In [86]:
# Class balance of the long label before recoding (values seen: -1, 1, 0).
df.if_long.value_counts()

-1    880043
 1    692326
 0        81
Name: if_long, dtype: int64

In [87]:
# Turn both labels into binary targets by mapping -1 to 0.
# NOTE(review): if_long already contained a small number of 0 rows (81 in the
# counts above); after this step they are indistinguishable from the former
# -1 rows — confirm that merging them is intended.
for label_col in ("if_long", "if_short"):
    df[label_col] = df[label_col].replace(-1, 0)

In [88]:
# Class balance of the long label after mapping -1 to 0.
df.if_long.value_counts()

0    880124
1    692326
Name: if_long, dtype: int64

In [89]:
# Class balance of the short label after mapping -1 to 0.
df.if_short.value_counts()

0    865496
1    706954
Name: if_short, dtype: int64

In [90]:
# Cast every int64 column to float so the whole frame shares one numeric dtype.
# NOTE(review): presumably done for MLflow's input-schema inference — confirm.
int_cols = df.select_dtypes(include='int64').columns
df = df.astype(dict.fromkeys(int_cols, float))

In [91]:
# Group column names by dtype name to verify the cast above.
dtype_dict = {}
for col in df.columns:
    dtype_dict.setdefault(df[col].dtype.name, []).append(col)
print(dtype_dict)

{'float64': ['open', 'high', 'low', 'close', 'volume', 'quote_volume', 'count', 'taker_buy_volume', 'taker_buy_quote_volume', 'entry', 'if_short', 'if_long', 'long_target', 'short_target', 'long_stop_loss', 'short_stop_loss', 'dual_loss', 'BB_upper', 'BB_middle', 'BB_lower', 'DEMA', 'EMA-50', 'EMA-200', 'HT_TRENDLINE', 'KAMA', 'MA', 'MIDPOINT', 'MIDPRICE', 'SAR', 'SAREXT', 'SMA', 'T3', 'TEMA', 'TRIMA', 'WMA', 'ADX', 'ADXR', 'APO', 'AROON_up', 'AROON_down', 'AROONOSC', 'BOP', 'CCI', 'CMO', 'DX', 'MACD', 'MACD_signal', 'MACD_hist', 'MFI', 'MINUS_DI', 'MINUS_DM', 'MOM', 'PLUS_DI', 'PLUS_DM', 'PPO', 'ROC', 'ROCP', 'ROCR', 'ROCR100', 'RSI', 'TRIX', 'ULTOSC', 'WILLR', 'AD', 'ADOSC', 'OBV', 'HT_DCPERIOD', 'HT_DCPHASE', 'HT_PHASOR_inphase', 'HT_PHASOR_quadrature', 'HT_TRENDMODE', 'AVGPRICE', 'MEDPRICE', 'TYPPRICE', 'WCLPRICE', 'ATR', 'NATR', 'TRANGE', 'CDL2CROWS', 'CDL3BLACKCROWS', 'CDL3INSIDE', 'CDL3LINESTRIKE', 'CDL3OUTSIDE', 'CDL3STARSINSOUTH', 'CDL3WHITESOLDIERS', 'CDLABANDONEDBABY', 'CDLA

## Long Model

In [92]:
# Turn on MLflow autologging for scikit-learn before any estimator is fitted.
mlflow.sklearn.autolog()

In [93]:
# Long model: start from the full frame without the short label.
long_df = df.drop(columns="if_short")

In [94]:
# Target vector for the long model (binary after the -1 -> 0 recode).
long_y = df["if_long"]

In [95]:
# Feature matrix for the long model: everything except the target itself.
long_x = long_df.drop(columns="if_long")

In [96]:
# Raw candle/volume columns excluded from the feature matrices of both models.
# NOTE(review): columns such as 'entry', 'long_target', 'short_target',
# 'long_stop_loss', 'short_stop_loss' and 'dual_loss' are still present in the
# frame (see the dtype listing above) and therefore remain features. If any of
# them is derived from future price action this is label leakage — confirm
# how they are computed.
columns_to_drop = [
    'open', 'high', 'low', 'close', 'volume', 'quote_volume', 'count',
    'taker_buy_volume', 'taker_buy_quote_volume'
]

In [97]:
# Strip the raw candle/volume columns from the long feature matrix.
long_x = long_x.drop(columns=columns_to_drop)

In [98]:
# Hold out 10% of the rows for evaluating the long model.
# random_state pins the shuffle so the split (and every metric computed from
# it) is reproducible across kernel restarts; 0 matches the classifier seed.
# NOTE(review): the rows are consecutive 15m candles, so a shuffled split puts
# neighbouring bars on both sides of the split. Consider a time-ordered split
# (shuffle=False) if the score is meant to reflect out-of-sample performance.
X_train, X_test, y_train, y_test = train_test_split(long_x,
                                                    long_y,
                                                    test_size=0.1,
                                                    random_state=0)

In [99]:
# Random forest for the long label.
# - random_state=0 makes the forest reproducible.
# - oob_score=True also computes an out-of-bag accuracy estimate during fit.
# - n_jobs=-1 builds trees on all cores; for a fixed random_state the fitted
#   model is the same as with the single-core default, only faster.
long_clf = RandomForestClassifier(max_depth=100,
                                  random_state=0,
                                  oob_score=True,
                                  n_jobs=-1)

In [100]:
# Train the long model; with autologging enabled MLflow records this fit as a
# run (see the log line in the output).
long_clf.fit(X_train, y_train)

2023/01/24 02:38:08 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '3239a2ba800d42f6bb0689113f4836d7', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


RandomForestClassifier(max_depth=100, oob_score=True, random_state=0)

In [32]:
# Persist the fitted long model to "long_clf.pkl" in the current working
# directory. `pickle` is imported in the notebook's top import cell.
# NOTE: only unpickle this file from a trusted location, because
# pickle.load executes arbitrary code embedded in the file.
with open("long_clf.pkl", "wb") as model_file:
    pickle.dump(long_clf, model_file)

In [None]:
# ---- Short model ----
# Mirror of the long pipeline: start from the full frame without the long label.
short_df = df.drop(columns="if_long")

In [None]:
# Target vector for the short model (binary after the -1 -> 0 recode).
short_y = df["if_short"]

In [None]:
# Feature matrix for the short model: everything except the target itself.
short_x = short_df.drop(columns="if_short")

In [None]:
# Strip the raw candle/volume columns from the short feature matrix.
short_x = short_x.drop(columns=columns_to_drop)

In [None]:
# Hold out 10% of the rows for evaluating the short model.
# random_state pins the shuffle so the split is reproducible across runs.
# NOTE: this rebinds X_train / X_test / y_train / y_test, replacing the
# long-model split created earlier; evaluate the long model before running
# this cell.
# NOTE(review): rows are consecutive 15m candles; consider a time-ordered
# split (shuffle=False) to avoid neighbouring bars landing on both sides.
X_train, X_test, y_train, y_test = train_test_split(short_x,
                                                    short_y,
                                                    test_size=0.1,
                                                    random_state=0)

In [None]:
# Random forest for the short label, with the same depth, seed and OOB
# settings as the long model.
# n_jobs=-1 builds trees on all cores; for a fixed random_state the fitted
# model is the same as with the single-core default, only faster.
short_clf = RandomForestClassifier(max_depth=100,
                                   random_state=0,
                                   oob_score=True,
                                   n_jobs=-1)

In [None]:
# Train the short model on the split built from short_x / short_y.
short_clf.fit(X_train, y_train)