#### Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.base import clone
import xgboost as xgb
import matplotlib.pyplot as plt
import joblib


#### Preprocessing

In [2]:
# Read the feature table and order it chronologically so that every later
# positional split respects time.
df = (
    pd.read_csv('features.csv')
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .sort_values('timestamp')
    .reset_index(drop=True)
)

# Predictors are every column except the timestamp and the raw target
X = df.drop(columns=['timestamp', 'target'])
y = df['target']

# Binary label: 1 when the raw target is strictly positive, 0 otherwise
y_binary = np.where(y.gt(0), 1, 0)

# Chronological hold-out: the first 80% of rows train, the remainder tests
split_idx = int(len(df) * 0.8)
X_train = X.iloc[:split_idx]
X_test = X.iloc[split_idx:]
y_train, y_test = y_binary[:split_idx], y_binary[split_idx:]

# Standardize using statistics learned from the training rows only, then
# apply the same transform to the test rows (no test-set leakage).
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Persist the fitted scaler next to the notebook
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

#### Hyperparameter tuning and model training

In [3]:
# Parallelism is handled by GridSearchCV (n_jobs=-1 below), so the estimator is
# pinned to a single thread. Leaving XGBoost at its default thread count while
# the search also fans out across all cores creates thread contention and
# slows the search down.
# NOTE: the persisted best_model therefore also carries n_jobs=1; call
# best_model.set_params(n_jobs=...) after loading if multi-threaded
# prediction is wanted.
base_model = xgb.XGBClassifier(random_state=11, n_jobs=1)

# Two candidate values per hyperparameter -> 2**7 = 128 combinations
param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [3, 9],
    'learning_rate': [0.05, 0.2],
    'subsample': [0.7, 1.0],
    'colsample_bytree': [0.7, 1.0],
    'reg_alpha': [0, 0.2],
    'reg_lambda': [0, 0.2]
}

# Expanding-window folds: each validation fold lies strictly after the rows
# it was trained on, which keeps the evaluation free of look-ahead.
ts_split = TimeSeriesSplit(n_splits=3)
grid_search = GridSearchCV(estimator=base_model, param_grid=param_grid, cv=ts_split,
                           scoring='accuracy', n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)

# Best configuration as exposed by the finished search; persist it to disk
best_model = grid_search.best_estimator_
joblib.dump(best_model, 'xgboost_model.pkl')

Fitting 3 folds for each of 128 candidates, totalling 384 fits


['xgboost_model.pkl']

#### Model results
The model's performance is unexceptional. This may be due to the small amount of training data (only ~1600 samples), or it may reflect how little information can be extracted from our current features.

Possible next steps: either (1) continue to train the model on new data after deployment, or (2) try to extract more features.

In [4]:
# Per-fold accuracy: fit a fresh, unfitted copy of the tuned model on each
# time-series split and score it on that split's evaluation rows.
results = []
for fold_no, (fit_rows, val_rows) in enumerate(ts_split.split(X_train)):
    fold_model = clone(best_model)
    fold_model.fit(X_train.iloc[fit_rows], y_train[fit_rows])
    fold_acc = accuracy_score(y_train[val_rows], fold_model.predict(X_train.iloc[val_rows]))
    results.append(f"Split {fold_no}: {fold_acc:.4f}")

# Accuracy of the already-fitted best_model on the held-out test rows
y_pred = best_model.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
results.append(f"Test: {test_acc:.4f}")

# One summary block: a line per split followed by the test score
print("\nResults:\n" + "\n".join(results))


Results:
Split 0: 0.5354
Split 1: 0.5666
Split 2: 0.5609
Test: 0.5311
