In [7]:
import pandas as pd
import numpy as np

df = pd.read_csv("./train.csv")
df

Unnamed: 0,id,date,hour,bc_price,bc_demand,ab_price,ab_demand,transfer,bc_price_evo
0,0,0.452060,1.000000,0.074096,0.578846,0.005029,0.494821,0.489912,UP
1,1,0.455555,0.574468,0.033025,0.349003,0.001554,0.264889,0.829386,DOWN
2,2,0.027521,0.617021,0.098325,0.533918,0.003467,0.422915,0.414912,UP
3,3,0.455732,0.936170,0.041822,0.588515,0.002860,0.448731,0.525000,UP
4,4,0.000044,0.255319,0.051489,0.302440,0.003467,0.422915,0.414912,UP
...,...,...,...,...,...,...,...,...,...
28850,28850,0.026503,1.000000,0.082232,0.427551,0.003467,0.422915,0.414912,UP
28851,28851,0.451927,0.574468,0.033626,0.564564,0.002198,0.624806,0.553947,DOWN
28852,28852,0.907482,0.893617,0.055872,0.329664,0.003695,0.316416,0.602193,DOWN
28853,28853,0.915800,0.936170,0.044884,0.355549,0.003072,0.241326,0.420614,DOWN


In [8]:
# Drop id column
df = df.drop(columns=['id'], errors='ignore')

# Encode target
df['bc_price_evo'] = df['bc_price_evo'].map({'UP': 1, 'DOWN': 0})

In [9]:
# Process hour
df['hour_sin'] = np.sin(2 * np.pi * df['hour'])
df['hour_cos'] = np.cos(2 * np.pi * df['hour'])
df = df.drop(columns=['hour'], errors='ignore')

# Feature engineering: differences and ratios
df['price_diff'] = df['bc_price'] - df['ab_price']
df['demand_diff'] = df['bc_demand'] - df['ab_demand']
df['price_demand_ratio_bc'] = df['bc_price'] / (df['bc_demand'] + 1e-6)
df['price_demand_ratio_ab'] = df['ab_price'] / (df['ab_demand'] + 1e-6)

In [10]:
# Drop rows with NaN created
df = df.dropna().reset_index(drop=True)

In [11]:
from sklearn.preprocessing import StandardScaler

# Scale numeric features (for LogisticRegression / SVM)
numeric_cols = ['bc_price', 'bc_demand', 'ab_price', 'ab_demand', 'transfer',
                'price_diff', 'demand_diff', 'price_demand_ratio_bc', 'price_demand_ratio_ab']

scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])


In [13]:
X = df.drop(columns=['bc_price_evo'])
y = df['bc_price_evo']

print(X.shape)
print(y.shape)


(28855, 12)
(28855,)


In [14]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score

X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

In [97]:
gb_param_dist = {
    'n_estimators': uniform(100, 800),
    'learning_rate': uniform(0.01, 0.2),
    'max_depth': randint(2, 10),
    'subsample': uniform(0.6, 0.4),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10)
}

gb_model = GradientBoostingClassifier(random_state=42)

gb_random_search = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=gb_param_dist,
    n_iter=50,
    scoring='accuracy',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

gb_random_search.fit(X, y)
best_gb = gb_random_search.best_estimator_

print("Best GB params:", gb_random_search.best_params_)
print("Best GB CV accuracy:", gb_random_search.best_score_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best GB params: {'learning_rate': np.float64(0.10877874303668693), 'max_depth': 9, 'min_samples_leaf': 4, 'min_samples_split': 9, 'n_estimators': 491, 'subsample': np.float64(0.9396893641976711)}
Best GB CV accuracy: 0.9094438748464556


In [122]:
y_real_test = gb_random_search.predict(df_test)
print(y_real_test.shape)

(9619,)


In [123]:
test = pd.read_csv('./test.csv')

submission = pd.DataFrame({
    'id': test['id'],
    'bc_price_evo': np.where(y_real_test == 1, 'UP', 'DOWN')
})

submission.to_csv('submission.csv', index=False)
print("Submission file saved as submission.csv")

Submission file saved as submission.csv


#Futher testing

In [125]:
gb_param_dist = {
    'n_estimators': randint(100, 1000),
    'learning_rate': uniform(0.01, 0.2),
    'max_depth': randint(2, 10),
    'subsample': uniform(0.6, 0.4),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10)
}

gb_model = GradientBoostingClassifier(random_state=42)

gb_random_search = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=gb_param_dist,
    n_iter=200,
    scoring='accuracy',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

gb_random_search.fit(X, y)
best_gb = gb_random_search.best_estimator_

print("Best GB params:", gb_random_search.best_params_)
print("Best GB CV accuracy:", gb_random_search.best_score_)

Fitting 3 folds for each of 200 candidates, totalling 600 fits
Best GB params: {'learning_rate': np.float64(0.12489462263598237), 'max_depth': 9, 'min_samples_leaf': 8, 'min_samples_split': 4, 'n_estimators': 972, 'subsample': np.float64(0.9002007699061254)}
Best GB CV accuracy: 0.9119738101229881
[CV] END learning_rate=0.010155753168202867, max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=559, subsample=0.8446612641953124; total time=  33.3s
[CV] END learning_rate=0.012652992231973307, max_depth=2, min_samples_leaf=2, min_samples_split=3, n_estimators=876, subsample=0.6063865008880857; total time=  16.9s
[CV] END learning_rate=0.04467293070155442, max_depth=2, min_samples_leaf=4, min_samples_split=3, n_estimators=971, subsample=0.902144564127061; total time=  26.9s
[CV] END learning_rate=0.16502656467222293, max_depth=3, min_samples_leaf=4, min_samples_split=7, n_estimators=962, subsample=0.9687496940092467; total time=  40.6s
[CV] END learning_rate=0.088976303635113

In [None]:
from scipy.stats import randint, uniform
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# --- Parameter search space ---
rf_param_dist = {
    'n_estimators': randint(100, 1000),        # number of trees
    'max_depth': randint(3, 30),               # tree depth
    'min_samples_split': randint(2, 10),       # min samples to split
    'min_samples_leaf': randint(1, 10),        # min samples per leaf
    'max_features': uniform(0.5, 0.5),         # fraction of features (0.5–1.0)
    'bootstrap': [True, False],                # whether to bootstrap samples
    'criterion': ['gini', 'entropy', 'log_loss']  # splitting criterion
}

# --- Model & search ---
rf_model = RandomForestClassifier(random_state=42)

rf_random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=rf_param_dist,
    n_iter=200,              # number of random combinations
    scoring='accuracy',
    cv=3,                    # 3-fold cross validation
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# --- Fit search ---
rf_random_search.fit(X, y)

# --- Results ---
best_rf = rf_random_search.best_estimator_
print("Best RF params:", rf_random_search.best_params_)
print("Best RF CV accuracy:", rf_random_search.best_score_)


Fitting 3 folds for each of 200 candidates, totalling 600 fits
[CV] END bootstrap=True, criterion=gini, max_depth=17, max_features=0.8659969709057025, min_samples_leaf=5, min_samples_split=8, n_estimators=221; total time=  20.0s
[CV] END bootstrap=False, criterion=entropy, max_depth=23, max_features=0.8087407548138583, min_samples_leaf=6, min_samples_split=6, n_estimators=847; total time= 2.1min
[CV] END bootstrap=True, criterion=entropy, max_depth=28, max_features=0.7248770666848828, min_samples_leaf=4, min_samples_split=7, n_estimators=962; total time= 1.4min
[CV] END bootstrap=True, criterion=log_loss, max_depth=3, max_features=0.8029799873905057, min_samples_leaf=7, min_samples_split=3, n_estimators=140; total time=   3.6s
[CV] END bootstrap=False, criterion=log_loss, max_depth=11, max_features=0.811649063413779, min_samples_leaf=2, min_samples_split=2, n_estimators=147; total time=  17.3s
[CV] END bootstrap=True, criterion=log_loss, max_depth=7, max_features=0.9858560476945519, mi



In [None]:
from scipy.stats import randint, uniform
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

# --- Parameter search space ---
xgb_param_dist = {
    'n_estimators': randint(100, 1000),          # number of boosting rounds
    'max_depth': randint(3, 15),                 # tree depth
    'learning_rate': uniform(0.01, 0.3),         # shrinkage
    'subsample': uniform(0.6, 0.4),              # fraction of samples per tree (0.6–1.0)
    'colsample_bytree': uniform(0.6, 0.4),       # fraction of features per tree
    'gamma': uniform(0, 0.5),                    # minimum loss reduction for further split
    'reg_lambda': uniform(0.5, 2.0),             # L2 regularization
    'reg_alpha': uniform(0, 1.0),                # L1 regularization
    'min_child_weight': randint(1, 10)           # minimum sum of weights in child
}

# --- Model & search ---
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',   # avoid warning
    random_state=42,
    use_label_encoder=False,
    n_jobs=-1
)

xgb_random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=xgb_param_dist,
    n_iter=200,              # number of random combinations
    scoring='accuracy',
    cv=3,                    # 3-fold cross-validation
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# --- Fit search ---
xgb_random_search.fit(X, y)

# --- Results ---
best_xgb = xgb_random_search.best_estimator_
print("Best XGB params:", xgb_random_search.best_params_)
print("Best XGB CV accuracy:", xgb_random_search.best_score_)
