In [5]:
# ------------------------------
# Full XGBoost Pipeline with Hyperparameter Tuning
# ------------------------------

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report

# ------------------------------
# Step 1: Load data and encode target
# ------------------------------
# Read the startup dataset and turn the textual outcome into a binary label:
# closed -> 0 (negative class), acquired -> 1 (positive class).
data=pd.read_csv('startup_final_dataset.csv')
data["status"] = data["status"].map({"closed": 0, "acquired": 1})

# Series.map() yields NaN for every value that is not a key of the mapping.
# Fail here with a clear message instead of deep inside the split or model fit.
if data["status"].isna().any():
    raise ValueError(
        "Column 'status' contains values other than 'closed'/'acquired' "
        "(or missing values); cannot encode the target."
    )

# ------------------------------
# Step 2: Split features and target
# ------------------------------
# The loaded frame carries an "Unnamed: 0" column (visible in data.head()).
# NOTE(review): this looks like a row index written out by an earlier to_csv()
# call — confirm. A row counter says nothing about a startup, so any such
# "Unnamed: N" column is kept out of the feature matrix. `data` itself is left
# unchanged.
index_artifact_cols = [col for col in data.columns if str(col).startswith("Unnamed:")]
X = data.drop(columns=["status", *index_artifact_cols])
y = data["status"]

# 80/20 split, stratified on the label so both parts keep the same class ratio;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# ------------------------------
# Step 3: Compute scale_pos_weight
# ------------------------------
# Ratio of negative (label 0) to positive (label 1) training samples,
# computed on the training split only.
class_counts = y_train.value_counts()
n_negative = class_counts[0]
n_positive = class_counts[1]
scale_pos_weight = n_negative / n_positive
print("Scale_pos_weight for XGBoost:", scale_pos_weight)

# ------------------------------
# Step 4: Define pipeline
# ------------------------------
# Two-step pipeline: standardise the features, then fit a binary XGBoost
# classifier. The step names ('scaler', 'xgb') are the prefixes used when
# addressing step parameters, e.g. 'xgb__max_depth'.
#
# `use_label_encoder` is deliberately not passed: the installed XGBoost reports
# it as an unused parameter and emits a warning on every fit.
xgb_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Optional for XGBoost
    ('xgb', XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        random_state=42,
        scale_pos_weight=scale_pos_weight  # class-imbalance weight from Step 3
    ))
])

# ------------------------------
# Step 5: Define hyperparameter grid
# ------------------------------
# Candidate values for each tuned hyperparameter. Every key carries the
# 'xgb__' prefix so that it addresses the 'xgb' step of the pipeline.
param_grid = dict(
    xgb__n_estimators=[100, 300, 500],
    xgb__max_depth=[3, 5, 7],
    xgb__learning_rate=[0.01, 0.05, 0.1],
    xgb__subsample=[0.6, 0.8, 1.0],
    xgb__colsample_bytree=[0.6, 0.8, 1.0],
    xgb__gamma=[0, 1, 5],
    xgb__reg_alpha=[0, 0.1, 0.5],
    xgb__reg_lambda=[1, 1.5, 2],
)

# ------------------------------
# Step 6: RandomizedSearchCV
# ------------------------------
# Randomized search over param_grid: 25 sampled combinations, each scored by
# ROC-AUC under 5-fold cross-validation. The fixed random_state keeps the
# sampled combinations reproducible.
search = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=param_grid,
    scoring='roc_auc',
    n_iter=25,  # number of random combinations
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

# ------------------------------
# Step 7: Fit pipeline
# ------------------------------
# Run the hyperparameter search on the training split only.
search.fit(X_train, y_train)

# ------------------------------
# Step 8: Best hyperparameters
# ------------------------------
print("Best hyperparameters:", search.best_params_)

# ------------------------------
# Step 9: Evaluate on test set
# ------------------------------
# Positive-class probabilities feed ROC-AUC; hard class predictions feed the
# per-class precision/recall report.
y_prob = search.predict_proba(X_test)[:, 1]
y_pred = search.predict(X_test)

test_auc = roc_auc_score(y_test, y_prob)
print("ROC-AUC:", test_auc)
print(classification_report(y_test, y_pred))


Scale_pos_weight for XGBoost: 0.5471698113207547
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best hyperparameters: {'xgb__subsample': 0.6, 'xgb__reg_lambda': 1.5, 'xgb__reg_alpha': 0, 'xgb__n_estimators': 100, 'xgb__max_depth': 3, 'xgb__learning_rate': 0.05, 'xgb__gamma': 0, 'xgb__colsample_bytree': 0.8}
ROC-AUC: 0.83
              precision    recall  f1-score   support

           0       0.58      0.71      0.64        65
           1       0.82      0.72      0.77       120

    accuracy                           0.72       185
   macro avg       0.70      0.72      0.70       185
weighted avg       0.74      0.72      0.72       185



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [6]:
import joblib

# Persist the best pipeline found by the search (all pipeline steps together)
# so it can be reloaded later with joblib.load.
joblib.dump(value=search.best_estimator_, filename="xgb_pipeline.pkl")


['xgb_pipeline.pkl']

In [7]:
# Preview the first five rows of the loaded frame ('status' is already 0/1 here).
# NOTE(review): the 'Unnamed: 0' column looks like a saved row index — confirm.
data.head(n=5)

Unnamed: 0,relationships,funding_rounds,funding_total_usd,milestones,has_VC,has_angel,avg_participants,status,startup_age,execution_velocity,rounds_per_year
0,3,3,375000,3,0,1,1.0,1,3,0.75,0.75
1,9,4,40100000,1,1,0,4.75,1,9,0.1,0.4
2,5,1,2600000,2,0,0,4.0,1,1,1.0,0.5
3,5,3,40000000,1,0,0,3.3333,1,5,0.166667,0.5
4,2,2,1300000,1,1,1,1.0,0,2,0.333333,0.666667
