In [1]:
import time
import numpy as np
import pandas as pd
from patsy import dmatrices
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


In [5]:
df_patient = pd.read_csv('dfdata.csv')

In [6]:
print(df_patient.head())
print(df_patient.shape)

   pregnant  glucose  pressure  triceps  insulin  mass  pedigree   age  \
0         1    106.0      74.0     14.0    140.0  29.3     0.326  21.0   
1         9    126.0      48.0     19.0    145.0  29.0     0.160  35.0   
2        15     81.0      50.0     18.0    106.0  37.4     0.361  46.0   
3         1    107.0      76.0     34.0    105.0  26.4     0.262  24.0   
4         2    118.0      78.0     15.0    255.0  28.4     0.199  21.0   

   outcome  
0      0.0  
1      0.0  
2      0.0  
3      0.0  
4      0.0  
(9636650, 9)


In [7]:
# Replace with your actual formula, e.g., 'outcome ~ age + gender + bmi'
formula = 'outcome ~ pregnant + glucose + pressure + triceps + insulin + mass + pedigree + age'

# Define frac values for different sample sizes, starting large and going down
frac_values = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]

# Initialize results table
results_table = pd.DataFrame(columns=['Model Description', 'Sample Size', 'Test Accuracy', 'Time Taken (s)'])

# Loop through each frac value
for frac in frac_values:
    # Subsample the dataset
    df_patient_sub = df_patient.sample(frac=frac, random_state=32)
    sample_size = len(df_patient_sub)


    try:
        # Patsy dmatrices
        Y, X = dmatrices(formula, df_patient_sub)

        # Train/test split
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            np.ravel(Y),
            test_size=0.25,
            random_state=42
        )

        # Start timing
        start_time = time.time()

        # Train model
        xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
        xgb_model.fit(X_train, y_train)

        # Predict & evaluate
        y_pred = xgb_model.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_pred)

        # Stop timing
        end_time = time.time()
        time_taken = end_time - start_time

        # Append to results table
        results_table = pd.concat([
            results_table,
            pd.DataFrame([{
                'Model Description': f'XGBoost (frac={frac})',
                'Sample Size': sample_size,
                'Test Accuracy': round(test_accuracy, 4),
                'Time Taken (s)': round(time_taken, 2)
            }])
        ], ignore_index=True)

        print(f"Done: frac={frac}, sample_size={sample_size}, test_acc={test_accuracy:.4f}")

    except Exception as e:
        print(f"Error processing frac={frac}: {e}")

# Final Results
print("\n=== Results Summary ===")
print(results_table)

Parameters: { "use_label_encoder" } are not used.

  results_table = pd.concat([


Done: frac=1e-05, sample_size=96, test_acc=0.8750


Parameters: { "use_label_encoder" } are not used.



Done: frac=0.0001, sample_size=964, test_acc=0.9419


Parameters: { "use_label_encoder" } are not used.



Done: frac=0.001, sample_size=9637, test_acc=0.9759


Parameters: { "use_label_encoder" } are not used.



Done: frac=0.01, sample_size=96366, test_acc=0.9873


Parameters: { "use_label_encoder" } are not used.



Done: frac=0.1, sample_size=963665, test_acc=0.9918


Parameters: { "use_label_encoder" } are not used.



Done: frac=1, sample_size=9636650, test_acc=0.9931

=== Results Summary ===
       Model Description Sample Size  Test Accuracy  Time Taken (s)
0   XGBoost (frac=1e-05)          96         0.8750            0.16
1  XGBoost (frac=0.0001)         964         0.9419            0.04
2   XGBoost (frac=0.001)        9637         0.9759            0.14
3    XGBoost (frac=0.01)       96366         0.9873            0.75
4     XGBoost (frac=0.1)      963665         0.9918            8.60
5       XGBoost (frac=1)     9636650         0.9931           86.38
