In [2]:
###libraries that i can remember, adding as i work 
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.decomposition import PCA
import os
import pickle
import matplotlib.pyplot as plt

# Directory that receives every artefact written further down (plots, pickles,
# score file, feature-importance CSV). `exist_ok=True` creates it when missing
# and is a no-op otherwise, which avoids the check-then-create race of a
# separate os.path.exists() test.
output_dir = "best_model_outputs"
os.makedirs(output_dir, exist_ok=True)

# Load the output of the preprocessing step. The path is relative to the
# current working directory.
df = pd.read_csv('train_preprocessed.csv')

# Convert the time column to seconds
def time_to_seconds(time_str):
    """Convert a colon-separated clock string into a number of seconds.

    Accepts either 'H:M:S' or 'H:M' (seconds are then taken as 0). Every
    field is parsed with int().

    Args:
        time_str: String such as '06:45:00' or '06:45'.

    Returns:
        hours * 3600 + minutes * 60 + seconds as an int.

    Raises:
        ValueError: If a field is not an integer or the string does not have
            exactly two or three colon-separated fields.
    """
    fields = [int(piece) for piece in time_str.split(':')]
    if len(fields) == 2:
        # 'H:M' form: pad the missing seconds field with zero.
        fields.append(0)
    if len(fields) != 3:
        raise ValueError(f"expected 'H:M' or 'H:M:S', got {time_str!r}")
    hours, minutes, seconds = fields
    return hours * 3600 + minutes * 60 + seconds

# Turn the clock-time strings into a numeric feature.
df['time_seconds'] = df['time'].apply(time_to_seconds)

# Features and target. Per the original author's note, the 'bg' column was
# renamed to 'target' during preprocessing. 'time_seconds' was added to df
# just above, so it is already part of X after the drop.
X = df.drop(columns=['p_num', 'id', 'target', 'time'])
y = df['target']

# Decide the 80/20 train/test partition BEFORE fitting PCA, so the held-out
# rows never influence the principal components (avoids data leakage).
# Splitting row positions with the same seed and test_size yields the same
# partition as splitting the frames directly, because the shuffle depends only
# on the number of rows and random_state.
row_positions = np.arange(len(X))
train_pos, test_pos = train_test_split(row_positions, test_size=0.2, random_state=42)

# Fit PCA on the training rows only, then project every row with it.
# NOTE(review): PCA is run on unscaled columns, so large-valued features such
# as time_seconds may dominate the components -- consider standardising first.
# NOTE(review): assumes every remaining column of X is numeric -- confirm.
pca = PCA(n_components=5, random_state=123)
pca.fit(X.iloc[train_pos])
pca_df = pd.DataFrame(
    pca.transform(X),
    columns=[f'PC{i+1}' for i in range(5)],
    index=X.index,  # keep rows aligned with X regardless of df's index
)

# Original features plus the top 5 principal components.
X_combined = pd.concat([X, pca_df], axis=1)

# Materialise the train/test frames from the partition chosen above.
X_train, X_test = X_combined.iloc[train_pos], X_combined.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# Search space sampled by RandomizedSearchCV (50 draws, see n_iter below).
param_dist = {
    'max_depth': np.arange(1, 50),  # max_depth values for XGBoost
    'learning_rate': np.linspace(0.01, 0.3, 10),  # Learning rate for XGBoost
    'n_estimators': [100, 200, 300, 400, 500],  # Number of trees
    'subsample': np.linspace(0.5, 1.0, 5),  # Subsample ratio of the training instances
    'colsample_bytree': np.linspace(0.5, 1.0, 5)  # Subsample ratio of columns
}

# Resource limits. Running one search worker per core (n_jobs=-1) while every
# XGBoost fit also spawns its own threads oversubscribes the machine and keeps
# one full set of training buffers alive per core; the OS can then kill
# workers (TerminatedWorkerError / SIGKILL). Parallelise at the search level
# only, with a bounded number of workers, and keep each fit single-threaded.
# NOTE(review): if memory is still tight, lower search_n_jobs or narrow the
# upper end of max_depth -- very deep trees are the most expensive candidates.
search_n_jobs = max(1, min(4, (os.cpu_count() or 1) // 2))

# XGBoost model; n_jobs=1 avoids nested parallelism inside the search workers.
xgb = XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=1)

# RandomizedSearchCV with return_train_score enabled so train and validation
# R-squared can be compared afterwards. pre_dispatch='n_jobs' stops joblib
# from queueing extra tasks (and their data copies) ahead of the workers.
random_search = RandomizedSearchCV(
    xgb,
    param_distributions=param_dist,
    n_iter=50,
    scoring='r2',
    cv=5,
    random_state=42,
    n_jobs=search_n_jobs,
    pre_dispatch='n_jobs',
    return_train_score=True,
)
random_search.fit(X_train, y_train)

# Results from the random search, one row per sampled candidate.
results = pd.DataFrame(random_search.cv_results_)


def _plot_mean_scores_by_param(cv_results, param_column, xlabel, title, filename):
    """Plot mean train / test R-squared against one searched hyper-parameter.

    Candidates sharing the same value of `param_column` are averaged, and the
    x-axis is sorted by parameter value so the lines read left to right
    instead of zig-zagging in sampling order.

    Args:
        cv_results: DataFrame built from `cv_results_`; must contain
            `param_column`, 'mean_train_score' and 'mean_test_score'.
        param_column: Name of the parameter column, e.g. 'param_max_depth'.
        xlabel: Label for the x-axis.
        title: Figure title.
        filename: File name (inside `output_dir`) for the saved figure.

    Returns:
        DataFrame indexed by the sorted parameter values with the averaged
        'mean_train_score' and 'mean_test_score' columns.
    """
    # cv_results_ may store parameter values as Python objects; cast to float
    # so grouping and sorting are numeric.
    param_values = cv_results[param_column].astype(float)
    mean_scores = (
        cv_results[['mean_train_score', 'mean_test_score']]
        .groupby(param_values)
        .mean()
        .sort_index()
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(mean_scores.index, mean_scores['mean_train_score'], label="Train R-squared", marker='o')
    ax.plot(mean_scores.index, mean_scores['mean_test_score'], label="Test R-squared", marker='o')
    ax.set_xlabel(xlabel)
    ax.set_ylabel("R-squared")
    ax.set_title(title)
    ax.legend()
    ax.grid()
    fig.savefig(os.path.join(output_dir, filename))
    plt.close(fig)  # free the figure; it is only needed on disk
    return mean_scores


# R-squared vs max_depth for train and test ("test" here is the CV held-out
# fold score reported by RandomizedSearchCV, not the final test split).
_depth_scores = _plot_mean_scores_by_param(
    results,
    'param_max_depth',
    "Max Depth",
    "R-squared vs Max Depth (Random Search with Top 5 PCs + Original Features, XGBoost)",
    "r_squared_vs_max_depth_random_search_xgboost_combined.png",
)
depth_values = _depth_scores.index.to_numpy()
train_scores_depth = _depth_scores['mean_train_score'].tolist()
test_scores_depth = _depth_scores['mean_test_score'].tolist()

# R-squared vs colsample_bytree for train and test.
_colsample_scores = _plot_mean_scores_by_param(
    results,
    'param_colsample_bytree',
    "Colsample by Tree",
    "R-squared vs Colsample by Tree (Random Search with Top 5 PCs + Original Features, XGBoost)",
    "r_squared_vs_colsample_bytree_random_search_xgboost_combined.png",
)
features_values = _colsample_scores.index.to_numpy()
train_scores_features = _colsample_scores['mean_train_score'].tolist()
test_scores_features = _colsample_scores['mean_test_score'].tolist()

# Destinations for everything this step writes, all inside output_dir.
model_filename = os.path.join(output_dir, "best_model.pkl")
score_filename = os.path.join(output_dir, "best_model_score.txt")
feature_importances_filename = os.path.join(output_dir, "feature_importances_best_model.csv")

# Take the refitted winner of the random search and score it on the held-out
# split (for a regressor, .score() is R-squared).
best_model = random_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Best model test set score:", test_score)

# Persist the fitted estimator.
with open(model_filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)

# Record the held-out score next to the model.
with open(score_filename, 'w') as score_file:
    score_file.write(f"Best model test set score: {test_score}\n")

# Feature importances, most important first (descending order).
feature_importances = pd.DataFrame(
    {
        'feature': X_combined.columns,
        'importance': best_model.feature_importances_,
    }
)
feature_importances = feature_importances.sort_values(by='importance', ascending=False)
feature_importances.to_csv(feature_importances_filename, index=False)

print(f"Model, score, and feature importances saved in {output_dir}.")


import pickle
import os



def _pickle_to(obj, path):
    """Serialise `obj` with pickle into the file at `path` (overwriting it)."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


# Store the fitted PCA alongside the model in the output directory.
# Requires `pca`, `best_model` and `output_dir` from the earlier steps to be
# defined in the running kernel.
pca_filename = os.path.join(output_dir, "best_model_pca.pkl")
_pickle_to(pca, pca_filename)

print(f"PCA model saved as {pca_filename}.")


# Also drop copies of both pickles into the current working directory.
for _artifact, _target in ((best_model, "best_model.pkl"), (pca, "best_model_pca.pkl")):
    _pickle_to(_artifact, _target)

print(f"Model and PCA saved in both {output_dir} and the working directory.")

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

NameError: name 'pca' is not defined