In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import cross_val_score, StratifiedKFold
import time

In [4]:
# Read the source CSV (path is relative to the working directory) into a DataFrame.
csv_path = 'dfdata_10M.csv'
data = pd.read_csv(csv_path)

In [5]:
# Show the frame's dimensions as a (rows, columns) tuple.
n_rows, n_cols = data.shape
(n_rows, n_cols)

(10000000, 9)

In [6]:
# Count the missing (NaN) entries in each column.
missing_per_column = data.isna().sum()
missing_per_column

pregnant    0
glucose     0
pressure    0
triceps     0
insulin     0
mass        0
pedigree    0
age         0
outcome     0
dtype: int64

In [8]:
from xgboost import XGBClassifier

# Benchmark: for random subsamples of `data` of increasing size, run stratified
# k-fold cross-validation of an XGBoost classifier and record the mean
# accuracy together with the elapsed time of the whole cross-validation call.
SEED = 2021    # shared by the row sampler and the fold shuffler
N_SPLITS = 5   # number of stratified CV folds
dataset_sizes = [100, 1000, 10000, 100000, 1000000, 10000000]
results = []
for sz in dataset_sizes:
    # DataFrame.sample draws without replacement by default and raises
    # ValueError when asked for more rows than exist, so skip such sizes
    # instead of aborting the whole benchmark.
    if sz > len(data):
        print(f'Skipping dataset size {sz}: only {len(data)} rows available')
        continue
    subsample = data.sample(n=sz, random_state=SEED)
    X = subsample.drop(columns=['outcome'])
    y = subsample['outcome']
    # Drop rows whose label is missing; the same mask keeps X and y aligned.
    mask = ~y.isna()
    X = X[mask]
    y = y[mask]
    # NOTE(review): `use_label_encoder` is reportedly deprecated in newer
    # XGBoost releases — confirm against the installed version.
    model = XGBClassifier(use_label_encoder=False, eval_metric='error', verbosity=0)
    cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock changes.
    start = time.perf_counter()
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    end = time.perf_counter()
    # The timed region is the entire cross_val_score call, i.e. fitting and
    # scoring on every fold, not a single model fit. The column name is kept
    # unchanged so anything consuming this table still works.
    # NOTE(review): the first iteration may also include one-time library
    # start-up overhead — consider an untimed warm-up run if that matters.
    results.append({
        'Method': 'XGBoost in Python via scikit-learn and 5-fold CV',
        'Dataset Size': sz,
        'Testing-set predictive performance': round(scores.mean(), 4),
        'Time taken for the model to be fit': round(end - start, 2)
    })
# Final results: one row per dataset size that was actually run.
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Method,Dataset Size,Testing-set predictive performance,Time taken for the model to be fit
0,XGBoost in Python via scikit-learn and 5-fold CV,100,0.93,0.73
1,XGBoost in Python via scikit-learn and 5-fold CV,1000,0.952,0.3
2,XGBoost in Python via scikit-learn and 5-fold CV,10000,0.9753,0.51
3,XGBoost in Python via scikit-learn and 5-fold CV,100000,0.9869,1.46
4,XGBoost in Python via scikit-learn and 5-fold CV,1000000,0.9918,16.58
5,XGBoost in Python via scikit-learn and 5-fold CV,10000000,0.9931,193.31
