In [403]:
import pandas as pd
import numpy as np
import lightgbm as lgb
pd.options.mode.chained_assignment = None  # default='warn'
from sklearn.metrics import accuracy_score
from sklearn.model_selection import TimeSeriesSplit,GridSearchCV
import plotly.graph_objects as go
# import seaborn as sns

In [404]:
import talib
from talib import MA_Type

In [423]:
class Model(object):
    def __init__(self, df_stocks_train, df_stocks_val, df_stocks_test, features):
        self.df_stocks_train = df_stocks_train
        self.df_stocks_val = df_stocks_val
        self.df_stocks_test = df_stocks_test
        self.features = features
        
    def split_data(self):
        X_train, y_train = self.df_stocks_train[self.features], self.df_stocks_train["target"]
        X_val, y_val = self.df_stocks_val[self.features], self.df_stocks_val["target"]
        X_test, y_test = self.df_stocks_test[self.features], self.df_stocks_test["target"]
        return X_train, y_train, X_val, y_val, X_test, y_test
    
    def lgbm(self,sample_weight, params):
        
        X_train, y_train, X_val, y_val, X_test, y_test = self.split_data()
        
        sample_weights_train = np.abs(self.df_stocks_train["return"])*100
        sample_weights_val = np.abs(self.df_stocks_val["return"])*100
        sample_weights_test = np.abs(self.df_stocks_test["return"])*100
        
        train_data = lgb.Dataset(X_train, label=y_train, 
                                 weight=sample_weights_train if sample_weight else None
                                )
        valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data,
                                 weight=sample_weights_val if sample_weight else None
                                )
        test_data = lgb.Dataset(X_test, label=y_test, reference=train_data,
                                weight=sample_weights_test if sample_weight else None
                                )
        train_data.returns = np.array(sample_weights_train)
        valid_data.returns = np.array(sample_weights_val)
        test_data.returns = np.array(sample_weights_test) 
        
        eval_result ={}
        model = lgb.train(
                            params=params,
                            train_set=train_data,
                            valid_sets=[train_data, valid_data, test_data],
                            valid_names=['train', 'valid', 'test'],
                            # num_boost_round=50,
                            callbacks=[
                                lgb.log_evaluation(1),
                                lgb.record_evaluation(eval_result)
                            ],
                            feval=custom_profit_metric
                        )  
        valid_profit = eval_result["valid"]["custom_profit"]
        test_profit = eval_result["test"]["custom_profit"]
        correlation = np.corrcoef(valid_profit, test_profit)[0, 1]
        best_iteration = np.argmax(valid_profit)
        best_score = valid_profit[best_iteration].round(5)
        
        return best_iteration, best_score, correlation, model
        

In [405]:
def custom_profit_metric(y_pred, train_data):
    y_true = train_data.get_label()
    # print("y_true",len(y_true))
    if train_data.get_weight() is None:
        # returns = np.ones(len(y_true))
        returns = train_data.returns
    else:
        returns = np.asarray(train_data.get_weight(), dtype=np.float32)

    # returns = np.asarray(train_data.get_group(), dtype=np.float32)
    # print("returns",returns)    
    y_pred_rounded = np.where(y_pred >= 0.5, 1, 0)
    

    # profit = (y_true * returns - (1 - y_true) * returns) * (2 * y_true - 1) * (2 * y_pred_rounded - 1)
    profit = returns* (2 * y_true - 1) * (2 * y_pred_rounded - 1)
    profit = np.sum(profit)
    # profit = 5 
    return 'custom_profit', profit, True

In [406]:
# df_stocks = pd.read_csv('starbucks.csv')
df_stocks = pd.read_csv('apple.csv')
df_stocks["return"] = df_stocks["Close"].pct_change()
df_stocks["volume_change"] = df_stocks["Volume"].pct_change()
df_stocks = df_stocks.dropna()
df_stocks = df_stocks[df_stocks["Date"] < "2023-10-01"]

In [407]:
# output = talib.ROCP(df_stocks["Close"], timeperiod=60).shift(1)

In [408]:
# ATR                  Average True Range for returns
# df_stocks['ATR'] = talib.ATR(df_stocks['High'], df_stocks['Low'], df_stocks['Close'], timeperiod=14).shift(1)
# ax = df_stocks['ATR'].plot(figsize=(15, 5))
# ax2 = ax.twinx()
# df_stocks['Close'].plot(ax=ax2, color='r')

In [409]:
train_start_date = '2015-07-01'
train_end_date = '2021-01-01'
val_end_date = '2021-07-01'
test_end_date = '2021-10-01'
df_stocks_train = df_stocks.query('Date >= @train_start_date & Date < @train_end_date')

In [410]:
median_returns = df_stocks_train['return'].median()
df_stocks["median_return_train"] = median_returns
df_stocks["target"] = (df_stocks["return"] > df_stocks["median_return_train"]).astype(int)
df_stocks.drop(["median_return_train"], axis=1, inplace=True)

In [411]:
for i in range(1, 15):
    df_stocks[f"return_t-{i}"] = df_stocks["return"].shift(i)
    df_stocks[f"volume_t-{i}"] = df_stocks["Volume"].shift(i)
    df_stocks[f"volume_change_t-{i}"] = df_stocks["volume_change"].shift(i)
    # df_stocks[f"open_t-{i}"] = df_stocks["Open"].shift(-i)
    # df_stocks[f"high_t-{i}"] = df_stocks["High"].shift(-i)
    # df_stocks[f"low_t-{i}"] = df_stocks["Low"].shift(-i)
    # df_stocks[f"close_t-{i}"] = df_stocks["Close"].shift(-i)

ma_list = [7,14,30,60,90]
for i in ma_list:
    df_stocks[f'return_MA_{i}'] = df_stocks['return_t-1'].rolling(window=i).mean()
    
    df_stocks[f'ROCP_{i}'] = talib.ROCP(df_stocks["Close"], timeperiod=i).shift(1)
    
df_stocks["upper_band"], df_stocks["middle_band"], df_stocks["lower_band"] = talib.BBANDS(df_stocks["return_t-1"].values, matype=MA_Type.T3)
df_stocks['ATR_14'] = talib.ATR(df_stocks['High'], df_stocks['Low'], df_stocks['Close'], timeperiod=14).shift(1)

# df_stocks['prev_consecutive_returns'] = 0

# # Identify consecutive positive returns
# mask_positive = df_stocks['return_t-1'] >= 0
# df_stocks.loc[mask_positive, 'prev_consecutive_returns'] = df_stocks['return'].apply(lambda x: 1 if x >= 0 else 0).groupby((mask_positive != mask_positive.shift(-1)).cumsum()).cumsum().shift(fill_value=5)

# # Identify consecutive negative returns
# mask_negative = df_stocks['return_t-1'] < 0
# df_stocks.loc[mask_negative, 'prev_consecutive_returns'] = df_stocks['return'].apply(lambda x: -1 if x < 0 else 0).groupby((mask_negative != mask_negative.shift(-1)).cumsum()).cumsum().shift(fill_value=5)
df_stocks = df_stocks.dropna()
df_stocks.reset_index(drop=True, inplace=True)

In [412]:
df_stocks

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Ticker,Company_name,return,...,return_MA_30,ROCP_30,return_MA_60,ROCP_60,return_MA_90,ROCP_90,upper_band,middle_band,lower_band,ATR_14
0,2010-05-14,9.11,9.16,8.91,9.06,7.68,759362800,AAPL,Apple,-0.018418,...,0.003489,0.100119,0.004264,0.276625,0.002303,0.208115,0.087079,0.007389,-0.072301,0.393241
1,2010-05-17,9.10,9.15,8.85,9.08,7.70,762834800,AAPL,Apple,0.002208,...,0.002717,0.074733,0.003911,0.249655,0.002070,0.182768,0.074156,0.005502,-0.063152,0.388010
2,2010-05-18,9.18,9.23,8.94,9.01,7.64,782678400,AAPL,Apple,-0.007709,...,0.002434,0.065728,0.004063,0.261111,0.002283,0.205843,0.032927,0.003106,-0.026716,0.381723
3,2010-05-19,8.91,9.03,8.74,8.87,7.52,1025726800,AAPL,Apple,-0.015538,...,0.002021,0.052570,0.004027,0.258380,0.002212,0.198138,0.028975,0.000380,-0.028216,0.375172
4,2010-05-20,8.64,8.71,8.44,8.49,7.20,1282915200,AAPL,Apple,-0.042841,...,0.001386,0.032596,0.004047,0.259943,0.001965,0.171731,0.011505,-0.003156,-0.017817,0.369088
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3363,2023-09-25,174.20,176.97,174.15,176.08,175.85,46172700,AAPL,Apple,0.007380,...,-0.000493,-0.017868,-0.001231,-0.076407,0.000251,0.015808,0.023903,-0.001823,-0.027548,3.544152
3364,2023-09-26,174.82,175.20,171.66,171.96,171.73,64588900,AAPL,Apple,-0.023398,...,-0.000213,-0.009618,-0.001138,-0.071259,0.000333,0.023304,0.020510,-0.000924,-0.022359,3.492427
3365,2023-09-27,172.62,173.04,169.05,170.43,170.21,66921800,AAPL,Apple,-0.008897,...,-0.001306,-0.041792,-0.001913,-0.113471,0.000033,-0.004227,0.022381,-0.002692,-0.027765,3.558682
3366,2023-09-28,169.34,172.03,167.62,170.69,170.47,56294400,AAPL,Apple,0.001526,...,-0.001229,-0.039560,-0.001931,-0.114465,-0.000218,-0.026392,0.017256,-0.004979,-0.027215,3.589490


In [413]:
column_index = df_stocks.columns.get_loc("target")
features = df_stocks.columns[column_index+1:]
# features = features.append(pd.Index(['return']))
features

Index(['return_t-1', 'volume_t-1', 'volume_change_t-1', 'return_t-2',
       'volume_t-2', 'volume_change_t-2', 'return_t-3', 'volume_t-3',
       'volume_change_t-3', 'return_t-4', 'volume_t-4', 'volume_change_t-4',
       'return_t-5', 'volume_t-5', 'volume_change_t-5', 'return_t-6',
       'volume_t-6', 'volume_change_t-6', 'return_t-7', 'volume_t-7',
       'volume_change_t-7', 'return_t-8', 'volume_t-8', 'volume_change_t-8',
       'return_t-9', 'volume_t-9', 'volume_change_t-9', 'return_t-10',
       'volume_t-10', 'volume_change_t-10', 'return_t-11', 'volume_t-11',
       'volume_change_t-11', 'return_t-12', 'volume_t-12',
       'volume_change_t-12', 'return_t-13', 'volume_t-13',
       'volume_change_t-13', 'return_t-14', 'volume_t-14',
       'volume_change_t-14', 'return_MA_7', 'ROCP_7', 'return_MA_14',
       'ROCP_14', 'return_MA_30', 'ROCP_30', 'return_MA_60', 'ROCP_60',
       'return_MA_90', 'ROCP_90', 'upper_band', 'middle_band', 'lower_band',
       'ATR_14'],
      d

In [414]:
# for feature in features:
#     df_stocks[feature] = pd.qcut(df_stocks[feature], q=5, labels=False)

In [415]:
df_stocks = df_stocks.query('Date >= @train_start_date').reset_index(drop=True)
df_stocks_train = df_stocks.query('Date >= @train_start_date & Date < @train_end_date')
df_stocks_val = df_stocks.query("Date >= @train_end_date & Date < @val_end_date")
# df_stocks_test = df_stocks.query("Date >= @val_end_date")
df_stocks_test = df_stocks.query("Date >= @val_end_date & Date < @test_end_date")

In [416]:
sample_weights_train = np.abs(df_stocks_train["return"])*100
sample_weights_val = np.abs(df_stocks_val["return"])*100
sample_weights_test = np.abs(df_stocks_test["return"])*100

In [417]:
X_train, y_train = df_stocks_train[features], df_stocks_train["target"]
X_val, y_val = df_stocks_val[features], df_stocks_val["target"]
X_test, y_test = df_stocks_test[features], df_stocks_test["target"]

In [400]:
tscv = TimeSeriesSplit(n_splits=10,test_size=62)

# Split the data into training and testing sets using time series split
for train_index, val_index in tscv.split(df_stocks):
    X_train, X_val = df_stocks[features].iloc[train_index], df_stocks[features].iloc[val_index]
    y_train, y_val = df_stocks["target"].iloc[train_index], df_stocks["target"].iloc[val_index]
    
    # Now you can train and evaluate your model on X_train, y_train, X_test, y_test
    # For example, print the indices of training and testing sets
    print("Train indices:", train_index, "Validation indices:", val_index)

Train indices: [   0    1    2 ... 1454 1455 1456] Validation indices: [1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470
 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484
 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498
 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512
 1513 1514 1515 1516 1517 1518]
Train indices: [   0    1    2 ... 1516 1517 1518] Validation indices: [1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532
 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546
 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560
 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574
 1575 1576 1577 1578 1579 1580]
Train indices: [   0    1    2 ... 1578 1579 1580] Validation indices: [1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594
 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608
 1609 1610 16

In [427]:
from itertools import product
# Define the elements
params = {
    'learning_rate': [0.01, 0.025, 0.05, 0.1, 0.15, 0.2],
    'max_depth': [-1,5, 7, 9, 11, 13],
    "n_estimators": [100]
}

# Get all combinations
combinations = list(product(*params.values()))
number_of_splits = 5
test_size = 120
combination_length = len(combinations)
tscv = TimeSeriesSplit(n_splits=number_of_splits,test_size=test_size)

# Display the combinations
print(f"Fitting {number_of_splits} folds for each of {combination_length} candidates, totalling {number_of_splits * combination_length} fits")
for combo in combinations:
    current_params = dict(zip(params.keys(), combo))
    current_split = 1
    val_current_score = 0
    test_current_score = 0
    for train_index, test_index in tscv.split(df_stocks):
        # X_train, X_val = df_stocks[features].iloc[train_index], df_stocks[features].iloc[val_index]
        # y_train, y_val = df_stocks["target"].iloc[train_index], df_stocks["target"].iloc[val_index]
        df_stocks_train, df_stocks_val, df_stocks_test = df_stocks[features].iloc[train_index[:-test_size]], df_stocks[features].iloc[train_index[-test_size:]], df_stocks[features].iloc[test_index]
        lgbm_modell = Model(df_stocks_train, df_stocks_val, df_stocks_test, features)
        
        print(f"[CV {current_split}/{tscv.n_splits}], {current_params};, val_score: {val_current_score}, test_score: {test_current_score}")
        current_split += 1
    # print(current_params)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5], {'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 100};, val_score: 0, test_score: 0
[CV 2/5], {'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 100};, val_score: 0, test_score: 0
[CV 3/5], {'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 100};, val_score: 0, test_score: 0
[CV 4/5], {'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 100};, val_score: 0, test_score: 0
[CV 5/5], {'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 100};, val_score: 0, test_score: 0
[CV 1/5], {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100};, val_score: 0, test_score: 0
[CV 2/5], {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100};, val_score: 0, test_score: 0
[CV 3/5], {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100};, val_score: 0, test_score: 0
[CV 4/5], {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100};, val_score: 0, test_score: 0
[CV 5/5], {'learning_rat

In [418]:
import warnings
warnings.filterwarnings("ignore")
# Base parameters with early stopping
base_params = {"objective": "binary", "metric": ["auc"], 
               }

base_model = lgb.LGBMClassifier(**base_params)

# Hyperparameter grid (without n_estimators)
param_grid = {
    "n_estimators": [100, 200,250],
    'learning_rate': [0.05, 0.1,],
    'max_depth': [5, 7, 9, 11, 13],
    # Add other hyperparameters as needed
}

# Time series cross-validation
tscv = TimeSeriesSplit(n_splits=5, test_size=62)

# Grid search with time-based cross-validation
grid_search = GridSearchCV(
    base_model,
    param_grid,
    scoring='roc_auc',  # Use the metric you are interested in optimizing
    verbose=3,
    cv=tscv  # Use TimeSeriesSplit for cross-validation
    # early_stopping_rounds=10
)

# Fit the grid search to the data
grid_search.fit(X_train, y_train,verbose=3)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best Score:", grid_search.best_score_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END learning_rate=0.05, max_depth=5, n_estimators=100;, score=nan total time=   0.0s
[CV 2/5] END learning_rate=0.05, max_depth=5, n_estimators=100;, score=nan total time=   0.0s
[CV 3/5] END learning_rate=0.05, max_depth=5, n_estimators=100;, score=nan total time=   0.0s
[CV 4/5] END learning_rate=0.05, max_depth=5, n_estimators=100;, score=nan total time=   0.0s
[CV 5/5] END learning_rate=0.05, max_depth=5, n_estimators=100;, score=nan total time=   0.0s
[CV 1/5] END learning_rate=0.05, max_depth=5, n_estimators=200;, score=nan total time=   0.0s
[CV 2/5] END learning_rate=0.05, max_depth=5, n_estimators=200;, score=nan total time=   0.0s
[CV 3/5] END learning_rate=0.05, max_depth=5, n_estimators=200;, score=nan total time=   0.0s
[CV 4/5] END learning_rate=0.05, max_depth=5, n_estimators=200;, score=nan total time=   0.0s
[CV 5/5] END learning_rate=0.05, max_depth=5, n_estimators=200;, score=nan total time=   0.0

ValueError: 
All the 150 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "d:\miniconda\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
TypeError: fit() got an unexpected keyword argument 'verbose'


In [366]:
train_data = lgb.Dataset(X_train, label=y_train, 
                        #  weight=sample_weights_train
                         )
valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data,
                        #  weight=sample_weights_val
                         )
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data,
                        # weight=sample_weights_test
                        )
train_data.returns = np.array(sample_weights_train)
valid_data.returns = np.array(sample_weights_val)
test_data.returns = np.array(sample_weights_test)

eval_result ={}
params = {"objective": "binary", 
          "metric": [None], 
          "verbose": 1,
          # "learning_rate":0.1,
          "learning_rate":0.05,
          # "max_depth" :11,
          "num_leaves": 2**5-1,
          # "colsample_bytree":0.8,
          # "lambda_l1":0.1,
          "lambda_l2":0.1,
          # "early_stopping_rounds": 40,
          }
    
model = lgb.train(params, train_data, valid_sets=[train_data, valid_data, test_data], valid_names=['train', 'valid', 'test'], num_boost_round=50,
                  callbacks=[lgb.log_evaluation(1),
                             lgb.record_evaluation(eval_result)],
                  feval=custom_profit_metric
                  )
# print(f"Best iteration: {model.best_iteration}", f"Best score: {model.best_score['valid_1']['custom_profit'].round(5)}")
valid_profit = eval_result["valid"]["custom_profit"]
test_profit = eval_result["test"]["custom_profit"]
correlation = np.corrcoef(valid_profit, test_profit)[0, 1]
best_iteration = np.argmax(valid_profit)
best_score = valid_profit[best_iteration].round(5)
print(f"Best iteration: {best_iteration+1}", f"Best score: {best_score}", f"Correlation: {correlation.round(5)}")

[LightGBM] [Info] Number of positive: 693, number of negative: 694
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000649 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14280
[LightGBM] [Info] Number of data points in the train set: 1387, number of used features: 56
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499640 -> initscore=-0.001442
[LightGBM] [Info] Start training from score -0.001442
[1]	train's custom_profit: 754.595	valid's custom_profit: 18.2156	test's custom_profit: -12.4202
[2]	train's custom_profit: 916.012	valid's custom_profit: 9.26656	test's custom_profit: 6.52912
[3]	train's custom_profit: 960.974	valid's custom_profit: 10.0081	test's custom_profit: -2.01901
[4]	train's custom_profit: 1016.69	valid's custom_profit: -10.1308	test's custom_profit: 9.49288
[5]	train's custom_profit: 1136.89	valid's custom_profit: -3.16891	test's custom_profit: -5.72678
[6]	train's custom_profit

In [344]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=df_stocks_train['Date'], y=df_stocks_train['Close'], mode='lines', name='Train'))
fig.add_trace(go.Scatter(x=df_stocks_val['Date'], y=df_stocks_val['Close'], mode='lines', name='Validation'))
fig.add_trace(go.Scatter(x=df_stocks_test['Date'], y=df_stocks_test['Close'], mode='lines', name='Test'))

fig.update_layout(title='Close Price over Time', xaxis_title='Date', yaxis_title='Close')

fig.show()

In [345]:
best_iteration = np.argmax(eval_result["valid"]["custom_profit"]) + 1
# best_iteration = 91
best_iteration

17

In [346]:
# eval_result["valid"]["custom_profit"]
# eval_results_df = pd.DataFrame(eval_result["valid"])
# eval_results_df

In [347]:
eval_result["valid"]["custom_profit"]
eval_results_df = pd.DataFrame(eval_result["valid"])
eval_results_df = eval_results_df.sort_values(by="custom_profit", ascending=False)
eval_results_df.index += 1
eval_results_df = eval_results_df.drop_duplicates(subset=["custom_profit"])
eval_results_df.head(10)
# eval_results_df = eval_results_df[(eval_results_df["custom_profit"] > 15) & (eval_results_df["custom_profit"] < 25)]

Unnamed: 0,custom_profit
17,26.659565
18,26.57814
12,25.849257
13,25.349255
2,23.068471
9,22.154657
3,21.782084
14,21.195422
20,16.686449
1,16.164002


In [349]:
importances = model.feature_importance(importance_type='split' ,iteration=best_iteration)
normalized_importances = (importances / importances.sum()) * 100
normalized_importances = np.round(normalized_importances, 2)
list(sorted(zip(features, normalized_importances), key=lambda xx: xx[1], reverse=True))

[('volume_change_t-3', 4.9),
 ('return_t-9', 4.9),
 ('return_t-4', 4.51),
 ('volume_t-14', 4.31),
 ('return_t-1', 3.92),
 ('return_t-7', 3.92),
 ('volume_t-4', 3.33),
 ('return_t-8', 3.33),
 ('ATR_14', 3.33),
 ('return_t-5', 2.94),
 ('volume_change_t-7', 2.94),
 ('volume_t-3', 2.75),
 ('volume_change_t-12', 2.55),
 ('volume_change_t-13', 2.55),
 ('volume_change_t-14', 2.55),
 ('return_MA_90', 2.55),
 ('return_t-12', 2.35),
 ('return_t-6', 2.16),
 ('volume_t-11', 2.16),
 ('ROCP_90', 2.16),
 ('return_t-3', 1.96),
 ('volume_change_t-8', 1.76),
 ('volume_change_t-4', 1.57),
 ('volume_change_t-5', 1.57),
 ('volume_t-9', 1.57),
 ('return_t-10', 1.57),
 ('return_t-14', 1.57),
 ('ROCP_60', 1.57),
 ('upper_band', 1.57),
 ('volume_t-5', 1.37),
 ('return_t-13', 1.37),
 ('return_MA_7', 1.37),
 ('return_MA_14', 1.37),
 ('volume_change_t-10', 1.18),
 ('return_t-11', 1.18),
 ('volume_change_t-1', 0.98),
 ('volume_t-8', 0.98),
 ('volume_change_t-9', 0.98),
 ('volume_t-10', 0.98),
 ('volume_t-12', 0.98

In [350]:
# predictions = model.predict(X_val)
predictions = model.predict(X_val, num_iteration = best_iteration)
train_pred_rounded = (predictions >= 0.5).astype(int)
df_stocks_val["prediction"] = train_pred_rounded
df_stocks_val["prediction"] = df_stocks_val["prediction"].map({1: 1, 0: -1})
df_stocks_val["investment"] = df_stocks_val["prediction"] * df_stocks_val["return"]
# # df_stocks_val["correct"] = (df_stocks_val["prediction"] == df_stocks_val["label"]).astype(int)
# df_stocks_val.groupby('correct')['profit_loss'].sum()
df_stocks_val["investment"].sum()

0.2667545220123476

In [351]:
# Ensemble predictions for the validation set
iterations = eval_results_df.head(5).index
predictions = []

for iteration in iterations:
    pred = model.predict(X_val, num_iteration=iteration)
    predictions.append(pred)

# Average the predictions
ensemble_predictions = np.mean(predictions, axis=0)
train_pred_rounded = (ensemble_predictions >= 0.5).astype(int)
df_stocks_val["prediction"] = train_pred_rounded
df_stocks_val["prediction"] = df_stocks_val["prediction"].map({1: 1, 0: -1})
df_stocks_val["investment"] = df_stocks_val["prediction"] * df_stocks_val["return"]
# # df_stocks_val["correct"] = (df_stocks_val["prediction"] == df_stocks_val["label"]).astype(int)
# df_stocks_val.groupby('correct')['profit_loss'].sum()
df_stocks_val["investment"].sum()

0.2278328659030412

In [352]:
accuracy_score(y_val, train_pred_rounded)

0.5080645161290323

In [353]:
# Ensemble predictions for the test set
iterations = eval_results_df.head(5).index
predictions = []

for iteration in iterations:
    pred = model.predict(X_test, num_iteration=iteration)
    predictions.append(pred)
ensemble_predictions = np.mean(predictions, axis=0)
train_pred_rounded = (ensemble_predictions >= 0.5).astype(int)
df_stocks_test["prediction"] = train_pred_rounded
df_stocks_test["prediction"] = df_stocks_test["prediction"].map({1: 1, 0: -1})
df_stocks_test["investment"] = df_stocks_test["prediction"] * df_stocks_test["return"]
# # df_stocks_test["correct"] = (df_stocks_test["prediction"] == df_stocks_test["label"]).astype(int)
# df_stocks_test.groupby('correct')['profit_loss'].sum()
df_stocks_test["investment"].sum()

0.11536222455203626

In [354]:
predictions = model.predict(X_test, num_iteration =best_iteration)
train_pred_rounded = (predictions >= 0.5).astype(int)
df_stocks_test["prediction"] = train_pred_rounded
df_stocks_test["prediction"] = df_stocks_test["prediction"].map({1: 1, 0: -1})
df_stocks_test["investment"] = df_stocks_test["prediction"] * df_stocks_test["return"]
# # df_stocks_test["correct"] = (df_stocks_test["prediction"] == df_stocks_test["label"]).astype(int)
# df_stocks_test.groupby('correct')['profit_loss'].sum()
df_stocks_test["investment"].sum()

0.1181830990231223

In [355]:
accuracy_score(y_test, train_pred_rounded)

0.578125

In [356]:
pd.Series(train_pred_rounded).value_counts()

1    44
0    20
dtype: int64

In [288]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit

# Sample data
data = {
    'Date': pd.date_range(start='2023-01-01', periods=100, freq='D'),
    'Features': [i for i in range(100)],
    'Label': [1 if i % 2 == 0 else 0 for i in range(100)]
}

df = pd.DataFrame(data)

# Assuming your features are in X and labels in y
X = df[['Features']]
y = df['Label']

# Time series split
tscv = TimeSeriesSplit(n_splits=3,test_size=10)

# Split the data into training and testing sets using time series split
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Now you can train and evaluate your model on X_train, y_train, X_test, y_test
    # For example, print the indices of training and testing sets
    print("Train indices:", train_index, "Test indices:", test_index)

Train indices: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69] Test indices: [70 71 72 73 74 75 76 77 78 79]
Train indices: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79] Test indices: [80 81 82 83 84 85 86 87 88 89]
Train indices: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89] Test indices: [90 91 92 93 94 95 96 97 98 99]
