In [None]:
# !pip install pyarrow

In [None]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import pickle

#import xgbtune 

In [None]:
# Read the prepared dataset from disk and display it for a quick visual check.
DATASET_PATH = "../data/dataset.parquet"
df = pd.read_parquet(DATASET_PATH)
df

In [None]:
# Column the model is trained to predict; the commented-out lines are the
# alternative targets that were tried.
# target = "target_close"
target = "close_percent_change_1"
# target = "up_down_close_1"

# NOTE(review): the model fitted later in this notebook is an XGBClassifier and
# is scored with classification metrics -- confirm that this target column
# holds class labels rather than a continuous percentage change.
# NOTE(review): only `target` is removed from the features. If the dataset also
# contains the alternative target columns named above, they stay in `x` and
# would leak label information -- TODO confirm against the dataset schema.

# Index everything by date once, then separate the features from the target.
dated = df.set_index("date")
x = dated.drop(columns=[target])
y = dated[[target]]
y

In [None]:
# List the columns that are not int, float, bool or category, i.e. the ones the
# conversion step has to cast or drop.
# The records are collected first and the frame is built once: growing a
# DataFrame with pd.concat inside a loop copies it on every iteration and gives
# every row the same index label (0).
non_numeric_columns = [
    {"Column Name": col, "Data Type": dtype}
    for col, dtype in x.dtypes.items()
    if dtype not in ["int", "float64", "bool", "category"]
]

# Explicit column names keep the table shape stable even when nothing is listed.
table_info = pd.DataFrame(non_numeric_columns, columns=["Column Name", "Data Type"])

table_info

In [None]:
# Cast every column that is not int, float, bool or category to float
# (XGBoost models cannot use strings, but accept categories as enumerated values).
# Columns that cannot be cast -- e.g. datetimes, which raise
# "Cannot cast DatetimeArray to dtype float64" -- are dropped and reported.
for col in x.columns:
    dtype = x[col].dtype
    if dtype not in ["int", "float64", "bool", "category"]:
        try:
            x[col] = x[col].astype("float")
        except (TypeError, ValueError):
            # Only conversion failures are handled here (TypeError for
            # datetime-like data, ValueError for non-numeric strings); anything
            # else, including KeyboardInterrupt, is allowed to propagate.
            # Rebinding `x` is safe: the loop iterates the column index that
            # was captured before the first drop.
            x = x.drop(columns=[col])

            print(f"Dropped Column: {col}")

In [None]:
# # Normalizing the features between 0 and 1
# y_scaler = MinMaxScaler()
# y = y_scaler.fit_transform(y.values.reshape(-1, 1)).flatten()

# x_scaler = MinMaxScaler()
# x = x_scaler.fit_transform(x)

# y

In [None]:
#    All Features   | Target
# +-----------------+---------+
# | x_train         | y_train | <- 80% of the data which is used for training
# |                 |         |
# +-----------------+---------+
# | x_test          | y_test  | <- 20% of the data which is used for testing
# +-----------------+---------+

# Previously tried splits:
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, shuffle=False)
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, shuffle=True)

# A fixed random_state makes the shuffled split -- and every metric computed
# from it -- reproducible across kernel restarts.
# NOTE(review): the rows are indexed by date, and a shuffled split mixes earlier
# and later rows between train and test. Confirm that is intended; shuffle=False
# keeps the split chronological.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, shuffle=True, random_state=42
)
# x_train

In [None]:
# y_test

In [None]:
# Inspect the training-feature split returned by train_test_split.
x_train

In [None]:
import numpy

from itertools import count, takewhile


def frange(start, stop, step):
    """Lazily yield start, start + step, start + 2*step, ... while the value is < stop.

    Each value is computed as ``start + i * step`` instead of by repeated
    addition. Repeated addition of a float step accumulates rounding error:
    adding 0.1 ten times gives 0.9999999999999999, which is still < 1, so the
    additive form produced an extra 11th element for ``frange(0, 1, 0.1)``.

    Args:
        start: first value of the sequence.
        stop: exclusive upper bound.
        step: increment between consecutive values; it must be positive for the
            sequence to terminate when start < stop.

    Returns:
        A lazy ``takewhile`` iterator over the values.
    """
    return takewhile(lambda value: value < stop, (start + i * step for i in count()))


# list(frange(0, 1, 0.1))
numpy.linspace(5, 10, num=6).tolist()

In [None]:
# Inspect the training-target split returned by train_test_split.
y_train

In [None]:
from xgboost import XGBClassifier

# Earlier regressor configurations, kept for reference:
# model = xgb.XGBRegressor(n_estimators=100, max_depth=7, eta=0.1, subsample=1, colsample_bytree=.3)
# model = xgb.XGBRegressor(n_estimators=150, max_depth=7, eta=0.05, subsample=1, colsample_bytree=.3)
# model = xgb.XGBRegressor(n_estimators=100, max_depth=7, eta=0.05, subsample=1, colsample_bytree=.3)

# NOTE(review): a classifier needs class labels in y_train -- confirm that the
# selected target column is categorical/binary and not a continuous value.
model = XGBClassifier(n_estimators=500, max_depth=5, eta=0.05, subsample=0.9, colsample_bytree=0.4)
model.fit(x_train, y_train)

# Save model as Python pickle object. The context manager closes the file
# handle (flushing the bytes to disk) even if pickling raises; the previous
# bare open() call left the handle open.
with open("../model/xgboost_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

# Also save the model through the model's own save_model method, as JSON.
model.save_model("../model/xgboost_model.json")

In [None]:
# print(model.best_params_)
# model


In [None]:
# Predict the class of every held-out row.
predicted = model.predict(x_test)

# y_test was carved out of `y` by train_test_split; writing a column into it in
# place can trigger pandas' SettingWithCopyWarning. `assign` builds a new frame
# with the extra column instead, preserving row order, and re-running the cell
# simply replaces the column.
y_test = y_test.assign(predicted=predicted)
y_test

In [None]:
import sklearn.metrics as metrics

# Threshold-based metrics: compare the true labels with the hard class predictions.
accuracy = metrics.accuracy_score(y_test[target], y_test["predicted"])
print("accuracy:", accuracy)

balanced_accuracy_score = metrics.balanced_accuracy_score(y_test[target], y_test["predicted"])
print("balanced_accuracy_score:", balanced_accuracy_score)

recall = metrics.recall_score(y_test[target], y_test["predicted"])
print("recall:", recall)

precision = metrics.precision_score(y_test[target], y_test["predicted"])
print("precision:", precision)

# Average precision summarises the whole precision-recall curve, so it needs a
# continuous score per row. Hard 0/1 predictions collapse the curve to a single
# operating point, which is why the positive-class probability is used here.
# Assumes a binary target whose positive class is the second predict_proba
# column -- TODO confirm; the default recall/precision/f1 calls above already
# rely on binary labels.
positive_class_scores = model.predict_proba(x_test)[:, 1]
average_precision = metrics.average_precision_score(y_test[target], positive_class_scores)
print("average_precision:", average_precision)

f1 = metrics.f1_score(y_test[target], y_test["predicted"])
print("f1:", f1)

# more metrics: https://scikit-learn.org/stable/modules/model_evaluation.html


accuracy: 0.5153922542204568
recall: 0.6316793893129771
precision: 0.5287539936102237
f1: 0.5756521739130435

In [None]:
# Confidence interval: bootstrap estimate of the spread of the test accuracy
from sklearn.utils import resample
from sklearn.metrics import accuracy_score

# configure bootstrap
n_iterations = 1000
# Each replicate draws half as many rows as the test set (with replacement).
# NOTE(review): replicates smaller than the test set give a wider interval than
# full-size replicates would -- confirm the 0.5 factor is intended.
n_size = int(len(y_test) * 0.5)
# Seeded generator so the replicates, the histogram and the interval are
# reproducible across kernel restarts.
rng = np.random.RandomState(42)

# run bootstrap
stats = list()

for _ in range(n_iterations):
    # resample the (true label, prediction) pairs of the test set
    test = resample(y_test, n_samples=n_size, random_state=rng)
    # Score the replicate. The value is appended directly so the global
    # `accuracy` computed for the full test set is not overwritten.
    stats.append(accuracy_score(test[target], test["predicted"]))

# 95% percentile interval of the bootstrap accuracies
lower, upper = np.percentile(stats, [2.5, 97.5])
print(f"95% confidence interval for accuracy: [{lower:.4f}, {upper:.4f}]")

# plot scores
plt.hist(stats)
plt.xlabel("accuracy")
plt.ylabel("bootstrap replicates")
plt.title("Bootstrap distribution of test accuracy")
plt.show()

In [None]:
# Classification report: text summary comparing the true labels with the predictions
from sklearn.metrics import classification_report

report_text = classification_report(y_true=y_test[target], y_pred=y_test["predicted"])
print(report_text)
# print(classification_report(y_test, predicted))

In [None]:
# Confusion matrix of true labels against predicted labels
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test[target], y_pred=y_test["predicted"])


In [None]:
# Feature importance
importances = model.feature_importances_
# argsort is ascending; reversing it orders the feature positions from most to
# least important.
indices = np.argsort(importances)[::-1]
# indices

# Print the feature ranking, one line per training column.
print("Feature ranking:")
feature_names = x_train.columns
for rank, position in enumerate(indices[: x_train.shape[1]], start=1):
    print(f"{rank}. feature {feature_names[position]} ({importances[position]})")

In [None]:
# Save predicted values: order the test rows by their date index, then write
# them to parquet.
predictions_by_date = y_test.sort_index()
predictions_by_date.to_parquet("../data/predicted.parquet")