In [1]:
import boto3
import pandas as pd
from io import BytesIO
import os
from dotenv import load_dotenv
from pycaret.regression import *

# Load settings from a local .env file so the os.getenv(...) lookups used
# further down can see them (python-dotenv).
load_dotenv()

True

In [2]:
load_dotenv()

# Connection configuration: every setting comes from environment variables
session = boto3.session.Session()
spaces_settings = {
    'region_name': os.getenv('DO_SPACES_REGION'),
    'endpoint_url': os.getenv('DO_SPACES_ENDPOINT'),
    'aws_access_key_id': os.getenv('DO_SPACES_KEY'),
    'aws_secret_access_key': os.getenv('DO_SPACES_SECRET'),
}
client = session.client('s3', **spaces_settings)

BUCKET_NAME = os.getenv('DO_SPACES_BUCKET')

# Load the cleaned data: fetch the object, read its body fully into memory,
# then parse the bytes as CSV
obj = client.get_object(
    Bucket=BUCKET_NAME,
    Key='data/processed/halfmarathon_clean.csv',
)
csv_payload = obj['Body'].read()
df = pd.read_csv(BytesIO(csv_payload))

print(f"Wczytano {len(df)} rekordów")
print(df.head())

Wczytano 17730 rekordów
  sex   age  time_5km_s  time_half_s
0   M  32.0       877.0       3899.0
1   M  38.0       888.0       3983.0
2   M  28.0       946.0       4104.0
3   M  36.0       971.0       4216.0
4   M  29.0       972.0       4227.0


In [3]:
from datetime import datetime

# PyCaret setup: experiment options are collected in one place and then
# passed to setup() as keyword arguments
setup_options = dict(
    data=df,
    target='time_half_s',
    session_id=123,
    normalize=True,
    categorical_features=['sex'],
    numeric_features=['age', 'time_5km_s'],
    fold=5,
    verbose=False,
)
reg = setup(**setup_options)

# Compare models, sorted by MAE, keeping the top three
best_models = compare_models(n_select=3, sort='MAE')

# Pick the best one (first entry of the returned list)
best = best_models[0]

# Finalize the selected model
final_model = finalize_model(best)

print(f"\nNajlepszy model: {best}")

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
par,Passive Aggressive Regressor,291.1228,174513.9472,417.4171,0.8812,0.0579,0.0376,0.018
huber,Huber Regressor,291.2947,170852.8964,413.013,0.8837,0.057,0.0377,0.02
gbr,Gradient Boosting Regressor,295.1658,164210.0858,405.0585,0.8881,0.0513,0.0385,0.11
lasso,Lasso Regression,296.6818,167422.3432,408.8384,0.886,0.0559,0.0388,0.28
llar,Lasso Least Angle Regression,296.6819,167422.247,408.8383,0.886,0.0559,0.0388,0.016
lr,Linear Regression,296.6972,167418.7697,408.8322,0.886,0.0559,0.0388,0.318
ridge,Ridge Regression,296.6972,167418.74,408.8323,0.886,0.0559,0.0388,0.192
lar,Least Angle Regression,296.6972,167418.7697,408.8322,0.886,0.0559,0.0388,0.024
br,Bayesian Ridge,296.6973,167418.7894,408.8323,0.886,0.0559,0.0388,0.014
omp,Orthogonal Matching Pursuit,296.8369,167682.5806,409.1586,0.8858,0.0561,0.0388,0.018



Najlepszy model: PassiveAggressiveRegressor(random_state=123)


In [4]:
import pickle
from io import BytesIO

# Serialize the model into an in-memory buffer and rewind it
model_buffer = BytesIO()
pickle.dump(final_model, model_buffer)
model_buffer.seek(0)

# File name carries the current date (YYYYMMDD) as its version tag
model_date = datetime.now().strftime('%Y%m%d')
model_filename = f'halfmarathon_model_v{model_date}.pkl'

# Upload to Spaces twice: first under the dated name, then as latest.pkl.
# getvalue() returns the whole buffer regardless of the current position,
# so no rewind is needed between the two uploads.
for object_key in (f'models/{model_filename}', 'models/latest.pkl'):
    client.put_object(
        Bucket=BUCKET_NAME,
        Key=object_key,
        Body=model_buffer.getvalue(),
        ContentType='application/octet-stream'
    )

print(f"Model zapisany jako: models/{model_filename} i models/latest.pkl")


Model zapisany jako: models/halfmarathon_model_v20260103.pkl i models/latest.pkl


In [6]:
# Also write the finalized model to a local file. The `with` block guarantees
# the file handle is flushed and closed as soon as the dump finishes, instead
# of whenever the interpreter happens to collect an anonymous file object.
with open('../models/latest.pkl', 'wb') as model_file:
    pickle.dump(final_model, model_file)