In [3]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
import time

# Function to load the dataset for a given size
def load_data(dataset_size):
    # Define the file path based on dataset size
    file_path = f"Data_{dataset_size}.csv"
    # Load the dataset
    data = pd.read_csv(file_path)
    return data

# Function to train XGBoost and return accuracy and time taken
def train_xgboost(data, labels, dataset_size):
    start_time = time.time()

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

    # Create the XGBoost model
    model = XGBClassifier(use_label_encoder=False, eval_metric="logloss")

    # Perform 5-fold cross-validation
    accuracy = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy').mean()

    # Measure the time taken to train the model
    end_time = time.time()
    time_taken = end_time - start_time

    return accuracy, time_taken

# List of dataset sizes to train on
dataset_sizes = [100, 1000, 10000, 100000, 1000000]

# Create empty lists to store results
accuracies = []
time_taken_list = []

# Loop over each dataset size
for dataset_size in dataset_sizes:
    # Load the data
    data = load_data(dataset_size)
    labels = data['outcome']  # Assuming 'outcome' is the target variable
    features = data.drop('outcome', axis=1)

    # Train the XGBoost model and get results
    accuracy, time_taken = train_xgboost(features, labels, dataset_size)

    # Store the results
    accuracies.append(accuracy)
    time_taken_list.append(time_taken)

    # Print the results for this dataset size
    print(f"Dataset Size: {dataset_size}")
    print(f"Accuracy: {accuracy:.4f}, Time Taken: {time_taken:.4f} seconds\n")

# Optionally, create a DataFrame to display the results
results_df = pd.DataFrame({
    'Dataset Size': dataset_sizes,
    'Accuracy': accuracies,
    'Time Taken (s)': time_taken_list
})

print("\nSummary of Results:")
print(results_df)


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Dataset Size: 100
Accuracy: 0.8875, Time Taken: 0.5786 seconds



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Dataset Size: 1000
Accuracy: 0.9412, Time Taken: 2.0458 seconds



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Dataset Size: 10000
Accuracy: 0.9727, Time Taken: 1.3518 seconds



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Dataset Size: 100000
Accuracy: 0.9861, Time Taken: 4.6678 seconds



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Dataset Size: 1000000
Accuracy: 0.9914, Time Taken: 46.9417 seconds


Summary of Results:
   Dataset Size  Accuracy  Time Taken (s)
0           100  0.887500        0.578555
1          1000  0.941250        2.045814
2         10000  0.972750        1.351846
3        100000  0.986125        4.667815
4       1000000  0.991364       46.941744
