In [2]:
%pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   - -------------------------------------- 5.5/150.0 MB 37.2 MB/s eta 0:00:04
   ---- ----------------------------------- 17.0/150.0 MB 48.8 MB/s eta 0:00:03
   ------- -------------------------------- 29.4/150.0 MB 51.8 MB/s eta 0:00:03
   ----------- ---------------------------- 41.9/150.0 MB 54.4 MB/s eta 0:00:02
   -------------- ------------------------- 54.5/150.0 MB 55.1 MB/s eta 0:00:02
   ----------------- ---------------------- 64.5/150.0 MB 53.4 MB/s eta 0:00:02
   -------------------- ------------------- 75.8/150.0 MB 53.1 MB/s eta 0:00:02
   ----------------------- ---------------- 87.0/150.0 MB 53.4 MB/s eta 0:00:02
   -------------------------- ------------- 99.1/150.0 MB 53.6 MB/s eta 0:00:01
   ---------------------------- ---------- 110.4/150.0 MB 53.8

In [4]:
import time
import numpy as np
import pandas as pd
from patsy import dmatrices
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [5]:
df_patient = pd.read_csv('dfdata.csv')

In [6]:
print(df_patient.shape)

(100000000, 9)


In [7]:
formula = 'outcome ~ pregnant + glucose + pressure + triceps + insulin + mass + pedigree + age'

# Define frac values for different sample sizes, starting large and going down
frac_values = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]

# Initialize results table
results_table = pd.DataFrame(columns=['Model Description', 'Sample Size', 'Test Accuracy', 'Time Taken (s)'])

#loop through each frac value
for frac in frac_values:
    # Subsample the dataset
    df_patient_sub = df_patient.sample(frac=frac, random_state=32)
    sample_size = len(df_patient_sub)


    try:
        #patsy dmatrices
        Y, X = dmatrices(formula, df_patient_sub)

        # Train/test split
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            np.ravel(Y),
            test_size=0.25,
            random_state=42
        )

        # Start timing
        start_time = time.time()

        # Train model
        xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
        xgb_model.fit(X_train, y_train)

        # Predict & evaluate
        y_pred = xgb_model.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_pred)

        # Stop timing
        end_time = time.time()
        time_taken = end_time - start_time

        # Append to results table
        results_table = pd.concat([
            results_table,
            pd.DataFrame([{
                'Model Description': f'XGBoost (frac={frac})',
                'Sample Size': sample_size,
                'Test Accuracy': round(test_accuracy, 4),
                'Time Taken (s)': round(time_taken, 2)
            }])
        ], ignore_index=True)

        print(f"Done: frac={frac}, sample_size={sample_size}, test_acc={test_accuracy:.4f}")

    except Exception as e:
        print(f"Error processing frac={frac}: {e}")

# Final Results
print("\n=== Results Summary ===")
print(results_table)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  results_table = pd.concat([


Done: frac=1e-05, sample_size=1000, test_acc=0.9640


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Done: frac=0.0001, sample_size=10000, test_acc=0.9784


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Done: frac=0.001, sample_size=100000, test_acc=0.9864


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Done: frac=0.01, sample_size=1000000, test_acc=0.9916


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Done: frac=0.1, sample_size=10000000, test_acc=0.9931
Error processing frac=1: Unable to allocate 763. MiB for an array with shape (100000000, 1) and data type float64

=== Results Summary ===
       Model Description Sample Size  Test Accuracy  Time Taken (s)
0   XGBoost (frac=1e-05)        1000         0.9640            0.15
1  XGBoost (frac=0.0001)       10000         0.9784            0.14
2   XGBoost (frac=0.001)      100000         0.9864            0.35
3    XGBoost (frac=0.01)     1000000         0.9916            3.50
4     XGBoost (frac=0.1)    10000000         0.9931           34.76
