**XGBoost in Python via scikit-learn and 5-fold CV**

In [43]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold

In [45]:
# Read the modelling table from the CSV that sits next to the notebook and
# preview its first five rows as the cell's rich output.
dfdata = pd.read_csv(filepath_or_buffer="dfdata.csv")
dfdata.head(n=5)

Unnamed: 0,pregnant,glucose,pressure,triceps,insulin,mass,pedigree,age,outcome
0,7,139,70,42,440,33.1,0.323,34,0
1,2,123,70,31,180,28.7,0.347,24,0
2,1,123,70,21,37,28.7,0.176,37,0
3,8,118,62,19,485,34.0,0.472,33,0
4,1,143,68,28,200,31.0,1.353,45,1


In [50]:
# Benchmark: 5-fold stratified CV accuracy and wall-clock cost of an XGBoost
# classifier on bootstrap resamples of `dfdata` at increasing sizes.

# Full feature matrix / target of the loaded frame. The loop below does not use
# them (it re-derives features and target from each resampled frame); they are
# kept so any other cell relying on `X` / `y` keeps working.
X = dfdata.drop(columns=['outcome'])
y = dfdata['outcome']

SEED = 123  # one seed for both resampling and fold assignment -> reproducible runs
sizes = [100, 1000, 10000, 100000, 1000000, 10000000]

# The splitter does not depend on the sample size, so build it once. With an
# integer random_state each split() call reshuffles from the same seed.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

results = []
for sz in sizes:
    # Resample WITH replacement so sizes larger than the source frame are possible.
    # Seeding the draw makes the sampled rows (and thus the scores) repeatable.
    # NOTE(review): once sz exceeds the number of distinct rows in dfdata, identical
    # rows necessarily land in both the training and the held-out folds, so the
    # reported accuracy is optimistic and tends to rise with sz -- read it as a
    # scaling benchmark, not as an estimate of generalisation performance.
    sampled_data = dfdata.sample(n=sz, replace=True, random_state=SEED)
    X_sample = sampled_data.drop(columns=['outcome'])
    y_sample = sampled_data['outcome']

    # `learning_rate` is the scikit-learn wrapper's name for XGBoost's `eta`.
    model = XGBClassifier(
        objective="binary:logistic",
        eval_metric="error",
        max_depth=2,
        learning_rate=0.3,
    )

    # perf_counter() is a monotonic, high-resolution clock intended for measuring
    # elapsed time; time.time() can jump if the system clock is adjusted.
    # The timed region covers all five fits plus scoring on the held-out folds.
    start_time = time.perf_counter()
    cv_scores = cross_val_score(model, X_sample, y_sample, cv=kf, scoring='accuracy')
    time_taken = time.perf_counter() - start_time

    avg_accuracy = np.mean(cv_scores)
    results.append({
        'Method used': 'XGBoost in Python via scikit-learn and 5-fold CV',
        'Dataset size': sz,
        'Testing-set predictive performance': avg_accuracy,
        'Time taken for the model to be fit': time_taken
    })

# One row per dataset size; displayed as the cell's output.
sk_learn_cv = pd.DataFrame(results)
sk_learn_cv

Unnamed: 0,Method used,Dataset size,Testing-set predictive performance,Time taken for the model to be fit
0,XGBoost in Python via scikit-learn and 5-fold CV,100,0.89,0.34478
1,XGBoost in Python via scikit-learn and 5-fold CV,1000,0.969,0.28067
2,XGBoost in Python via scikit-learn and 5-fold CV,10000,0.9748,0.203745
3,XGBoost in Python via scikit-learn and 5-fold CV,100000,0.98024,1.606005
4,XGBoost in Python via scikit-learn and 5-fold CV,1000000,0.982015,14.266764
5,XGBoost in Python via scikit-learn and 5-fold CV,10000000,0.982007,132.343916
