#### BrisT1D Blood Glucose Prediction Competition
This notebook uses historical blood glucose readings, insulin dosage, carbohydrate intake, and smartwatch activity data to predict future blood glucose levels.

Kaggle competition page: <https://www.kaggle.com/competitions/brist1d>

In [1]:
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor

In [2]:
# Load the competition train and test CSVs (paths are relative to the working
# directory) as AutoGluon TabularDataset objects.
train_data = TabularDataset('train.csv')
test_data = TabularDataset('test.csv')

In [3]:
# Preview the first rows of the training data to inspect the column layout
train_data.head()

Unnamed: 0,id,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,...,activity-0:40,activity-0:35,activity-0:30,activity-0:25,activity-0:20,activity-0:15,activity-0:10,activity-0:05,activity-0:00,bg+1:00
0,p01_0,p01,06:10:00,,,9.6,,,9.7,,...,,,,,,,,,,13.4
1,p01_1,p01,06:25:00,,,9.7,,,9.2,,...,,,,,,,,,,12.8
2,p01_2,p01,06:40:00,,,9.2,,,8.7,,...,,,,,,,,,,15.5
3,p01_3,p01,06:55:00,,,8.7,,,8.4,,...,,,,,,,,,,14.8
4,p01_4,p01,07:10:00,,,8.4,,,8.1,,...,,,,,,,,,,12.7


In [4]:
# (rows, columns) of the training data
train_data.shape

(177024, 508)

In [5]:
# (rows, columns) of the test data
test_data.shape

(3644, 507)

In [6]:
def optimize_memory(df):
    """Shrink the numeric columns of ``df`` by downcasting them with ``pd.to_numeric``.

    Float columns are handled first, then integer columns; columns of any
    other dtype are left untouched.

    Args:
        df: DataFrame whose numeric columns should be downcast.

    Returns:
        The same DataFrame object that was passed in; its columns are
        reassigned in place, so the caller's frame is modified.
    """
    # Each pass pairs a select_dtypes selector with the matching to_numeric downcast mode.
    passes = (('float', 'float'), ('int', 'integer'))
    for selector, downcast_to in passes:
        matching_columns = df.select_dtypes(include=[selector]).columns
        for name in matching_columns:
            df[name] = pd.to_numeric(df[name], downcast=downcast_to)
    return df

In [7]:
# Optional memory optimization, currently disabled: uncomment to downcast the
# numeric columns of both frames with optimize_memory() before training.
# train_data = optimize_memory(train_data)
# test_data = optimize_memory(test_data)

In [8]:
# Name of the label column in the training data that the model learns to predict;
# reused later when building the submission file.
target = 'bg+1:00'

In [None]:
# Train an AutoGluon regression predictor on the full training table.
# NOTE: this cell is long-running; `time_limit` below allows up to 11 hours.
predictor = TabularPredictor(
    label=target,                   # Target column for blood glucose level prediction
    eval_metric='rmse',             # Evaluation metric as required by the competition
    problem_type='regression'       # Set to regression for continuous value prediction
).fit(
    train_data,
    presets='best_quality',         # Best quality preset for more accurate predictions
    time_limit=3600 * 11,           # Training time limit in seconds (11 hours, adjust as needed)
    verbosity=2,                    # Detailed logs for tracking
    excluded_model_types=['KNN'],   # Exclude KNN for efficiency on large datasets
    # Per-model fit constraints: CPU count and memory budget (no GPU setting is made here).
    # `max_memory_usage_ratio` is the documented ag_args_fit key; the previously used
    # 'memory_ratio' is not a documented option and appears to have been silently ignored.
    ag_args_fit={'num_cpus': 10, 'max_memory_usage_ratio': 0.8}
)

# Summarize results after training
results = predictor.fit_summary()

No path specified. Models will be saved in: "AutogluonModels\ag-20241114_175825"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.8.19
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          20
Memory Avail:       22.22 GB / 31.82 GB (69.8%)
Disk Space Avail:   633.57 GB / 953.00 GB (66.5%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfit

[36m(_ray_fit pid=20692)[0m [1000]	valid_set's rmse: 1.70294
[36m(_ray_fit pid=20692)[0m [2000]	valid_set's rmse: 1.6239[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
[36m(_ray_fit pid=19292)[0m [3000]	valid_set's rmse: 1.57209[32m [repeated 2x across cluster][0m


In [None]:
# Feature-only view of the test data: remove the label column when it is present
# (errors='ignore' makes this a no-op if the column is absent)
test_data_no_target = test_data.drop(labels=[target], axis='columns', errors='ignore')

In [None]:
# Predict the target for every test row. No `model=` argument is given, so
# AutoGluon's default model selection applies.
predictions = predictor.predict(test_data_no_target)

In [None]:
# Pair each test row id with its predicted value and write the result to disk
submission = pd.DataFrame(data={'id': test_data['id'], target: predictions})
submission.to_csv(path_or_buf="submission.csv", index=False)
print("Submission file saved as submission.csv.")

In [None]:
# Display the fit summary that was captured right after training
results

In [None]:
# Build the model leaderboard; silent=True keeps the call itself from printing the table
leaderboard = predictor.leaderboard(silent=True)

# Show the leaderboard as the cell's rich output
leaderboard

In [None]:
# Model names from the first ten rows of the leaderboard, in leaderboard order
top_10_models = leaderboard['model'].iloc[:10].to_list()
top_10_models

In [None]:
# Print the hyperparameters of each of the top 10 models.
# predictor.info() is fetched once up front: it does not depend on the loop
# variable, so calling it on every iteration only repeats the same work.
model_info = predictor.info()['model_info']
for model in top_10_models:
    hyperparameters = model_info[model]['hyperparameters']
    print(f"Model: {model}")
    print("Hyperparameters:")
    print(hyperparameters)
    print("-" * 50)

In [None]:
# Get the name of the best model.
# `model_best` is the property that replaces the deprecated `get_model_best()` method.
best_model = predictor.model_best
print(f"Best Model: {best_model}")

# Get the full predictor info dictionary (includes per-model hyperparameters and other metadata)
best_model_info = predictor.info()
print(best_model_info)

In [None]:
# Feature importance.
# No data is passed on purpose: the AutoGluon documentation warns against passing
# the training data here because the resulting scores are biased by overfitting.
# With no argument, the data cached during fit() is used instead (this relies on
# the default cache_data=True).
feature_importance = predictor.feature_importance()
print("Feature Importance:\n", feature_importance)

In [None]:
# Sanity-check the first rows of the submission frame
submission.head()

In [None]:
# Show the feature importance table as rich output
feature_importance

In [None]:
# Recompute the leaderboard, the model names in its first ten rows, and the
# test features without the target column
leaderboard = predictor.leaderboard(silent=True)
top_10_models = leaderboard['model'].iloc[:10].to_list()
test_data_no_target = test_data.drop(labels=[target], axis='columns', errors='ignore')

# Write one submission CSV per model, with the model name embedded in the file name
for model in top_10_models:
    submission_file = f"submission_{model}.csv"
    predictions = predictor.predict(test_data_no_target, model=model)
    submission = pd.DataFrame(data={'id': test_data['id'], target: predictions})
    submission.to_csv(path_or_buf=submission_file, index=False)
    print(f"Submission file saved for model {model} as {submission_file}.")