In [None]:
import pandas as pd
import pandera as pa
from pandera import Column, DataFrameSchema, Check
from sklearn.model_selection import train_test_split
from pycaret.regression import (
    setup,
    compare_models,
    plot_model,
    finalize_model,
    save_model,
    predict_model,
)

In [None]:
# Load the cleaned half-marathon dataset from the processed-data folder.
# The path is relative to the directory the notebook is executed from.
cleaned_csv_path = '../data/processed/halfmarathon_cleaned.csv'
halfmarathon_df = pd.read_csv(filepath_or_buffer=cleaned_csv_path)

In [None]:
# Quick visual sanity check on five random rows. The seed is fixed (same value
# used for the split and the PyCaret session) so the preview is identical on
# every Restart & Run All instead of changing between executions.
# For dtypes and null counts, run `halfmarathon_df.info()` instead.
halfmarathon_df.sample(5, random_state=88)


In [None]:
def _bounded_column(dtype, lower, upper, *, nullable):
    """Build a coercing Column whose values must lie in the closed range [lower, upper].

    Args:
        dtype: pandera dtype the column is coerced to.
        lower: smallest accepted value (inclusive).
        upper: largest accepted value (inclusive).
        nullable: whether missing values are tolerated in the column.

    Returns:
        A pandera ``Column`` with ``coerce=True`` and a single range check.
    """
    return Column(
        dtype,
        checks=Check.in_range(min_value=lower, max_value=upper),
        coerce=True,
        nullable=nullable,
    )


# Data contract for the modelling frame. Only the two 5 km split columns may
# contain missing values; every other column must be fully populated.
# NOTE(review): the `_sec` suffixes suggest the time columns are in seconds — confirm.
schema = DataFrameSchema(
    columns={
        'gender': _bounded_column(pa.Int, 0, 1, nullable=False),
        'pace_5k_sec': _bounded_column(pa.Float, 0, 765, nullable=True),
        'time_5k_sec': _bounded_column(pa.Float, 0, 3825, nullable=True),
        'finish_time_sec': _bounded_column(pa.Int, 3550, 10550, nullable=False),
        'age': _bounded_column(pa.Int, 6, 120, nullable=False),
    }
)

In [None]:
# Validate the frame against the schema and keep the *returned* frame:
# `validate` does not modify its input, so the dtype coercion requested via
# `coerce=True` only exists on the return value. Rebinding is idempotent
# (re-running the cell re-validates an already-coerced frame) and, unlike a
# bare expression, does not dump the whole DataFrame into the cell output.
# Raises a pandera schema error if any column violates its checks.
halfmarathon_df = schema.validate(halfmarathon_df)

In [None]:
# Separate the regression target from the predictor columns.
y = halfmarathon_df['finish_time_sec']
X = halfmarathon_df.drop(columns='finish_time_sec')

# Hold out 10% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=88,
)

In [None]:
# Re-attach the target to each feature split so that both frames carry the
# 'finish_time_sec' column alongside the predictors (PyCaret's setup takes a
# single frame plus the target column name).
train_df = pd.concat(objs=[X_train, y_train], axis='columns')
test_df = pd.concat(objs=[X_test, y_test], axis='columns')

In [None]:
# Initialise the PyCaret regression experiment on the training split only;
# `test_df` is never passed to this experiment's setup.
# 'gender' is declared categorical explicitly rather than left to type inference.
exp = setup(
    data=train_df,
    target='finish_time_sec',
    categorical_features=['gender'],
    session_id=88,
)

# Alternative experiment with z-score normalisation, kept for reference:
# normalized_exp = setup(data=halfmarathon_df, target='finish_time_sec' ,session_id=123, normalize=True, normalize_method='zscore',categorical_features=['gender'])

In [None]:
# Cross-validate the candidate regressors available to the experiment and keep
# the five best ones, ranked by mean squared error.
best_models = exp.compare_models(
    n_select=5,
    sort='MSE',
)
# Variant ranked by MAE, kept for reference:
# best_models_normalized = exp.compare_models(sort='MAE', n_select=5)

In [None]:
# Tune each shortlisted model in turn, optimising for RMSE; the resulting list
# is in the same order as `best_models`.
tuned_best_models = []
for candidate in best_models:
    tuned_best_models.append(exp.tune_model(candidate, optimize='RMSE'))

In [None]:
# Pit the untuned shortlist against the tuned versions and keep the single
# best estimator by RMSE (tuning is not guaranteed to beat the defaults).
all_candidates = [*best_models, *tuned_best_models]
best_model = exp.compare_models(include=all_candidates, sort="RMSE")

In [None]:
# Show the feature-importance plot for the winning model.
# Other diagnostics that can be swapped in: plot='error', plot='manifold'.
exp.plot_model(estimator=best_model, plot='feature')

In [None]:
# Finalise the winning model within the training experiment (`exp`), so it can
# be scored on the untouched `test_df`.
best_final_model = exp.finalize_model(estimator=best_model)

In [None]:
# Score the finalised model on the held-out test split.
# Called on `exp` explicitly (like the other modelling cells) instead of the
# module-level `predict_model`, which dispatches to whichever experiment was
# set up most recently — re-running this cell after the full-data `setup`
# cell would otherwise silently evaluate under the wrong experiment.
exp.predict_model(best_final_model, data=test_df)

In [None]:
# Stack the train and test rows back into a single frame so the model that
# gets shipped can be refit on all available data.
final_train = pd.concat(objs=[train_df, test_df], axis='index')

In [None]:
# Second experiment over all rows (train + test), used only to refit the
# selected model before export. `categorical_features` is passed here as well
# so the preprocessing matches the experiment in which the model was selected
# and evaluated; without it 'gender' would be left to PyCaret's type inference.
exp_all = setup(
    data=final_train,
    target='finish_time_sec',
    session_id=88,
    categorical_features=['gender'],
)

In [None]:
# Refit the selected estimator on the full dataset via the all-data experiment.
# The call is bound to `exp_all` explicitly: the module-level `finalize_model`
# uses whichever experiment was set up last, so running cells out of order
# could silently refit on the training split only.
final_model = exp_all.finalize_model(best_model)

In [None]:
# Persist the full-data model under ../models/.
# Bound to `exp_all` so the export goes through the all-data experiment rather
# than whichever experiment happens to be the current global one.
exp_all.save_model(final_model, '../models/halfmarathon_predictor')