In [1]:
# Importowanie bibliotek
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Wczytanie danych
bike_data = pd.read_csv('daily-bike-share.csv')

# Usunięcie brakujących danych
bike_data = bike_data.dropna()

# Definicja zmiennych
numeric_features = ['temp', 'atemp', 'hum', 'windspeed']
categorical_features = ['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']
target = 'rentals'

# Poprawa zmiennych numerycznych dla PowerTransformer
for col in numeric_features:
    if bike_data[col].min() <= 0:
        bike_data[col] += abs(bike_data[col].min()) + 1

X = bike_data[numeric_features + categorical_features].copy()
y = bike_data[target].copy()

# Podział na zbiory treningowy i testowy
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Pipeline dla zmiennych numerycznych
numeric_transformer = Pipeline(steps=[
    ('powertransformer', PowerTransformer()),
    ('scaler', StandardScaler()),
    ('polynomialfeatures', PolynomialFeatures(degree=2))
])

# Pipeline dla zmiennych kategorycznych
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Łączenie przetwarzania w ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Pipeline z modelem ElasticNet
pipeline_en = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', ElasticNet(max_iter=20000, tol=1e-4))
])

# Parametry dla ElasticNet
params_en = {
    'regressor__alpha': [0.1, 1, 10, 100],
    'regressor__l1_ratio': [0.1, 0.5, 0.9]
}

grid_search_en = GridSearchCV(pipeline_en, params_en, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

grid_search_en.fit(X_train, y_train)

# Najlepszy model ElasticNet
best_model_en = grid_search_en.best_estimator_

# Predykcje ElasticNet
y_test_pred_en = best_model_en.predict(X_test)

# Metryki ElasticNet
mae_test_en = mean_absolute_error(y_test, y_test_pred_en)
rmse_test_en = np.sqrt(mean_squared_error(y_test, y_test_pred_en))
r2_test_en = r2_score(y_test, y_test_pred_en)

# Pipeline końcowy z modelem Random Forest
pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=0))
])

# Parametry do GridSearch dla Random Forest
param_grid_rf = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [10, 20, None],
    'regressor__min_samples_split': [2, 5, 10]
}

grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

grid_search_rf.fit(X_train, y_train)

# Najlepszy model Random Forest
best_model_rf = grid_search_rf.best_estimator_

# Predykcje Random Forest
y_test_pred_rf = best_model_rf.predict(X_test)

# Metryki Random Forest
mae_test_rf = mean_absolute_error(y_test, y_test_pred_rf)
rmse_test_rf = np.sqrt(mean_squared_error(y_test, y_test_pred_rf))
r2_test_rf = r2_score(y_test, y_test_pred_rf)

# Wyświetlenie wyników
print("Metryki modelu ElasticNet:")
print(f"MAE: {mae_test_en:.2f}")
print(f"RMSE: {rmse_test_en:.2f}")
print(f"R^2: {r2_test_en:.2f}")

print("\nMetryki modelu Random Forest:")
print(f"MAE: {mae_test_rf:.2f}")
print(f"RMSE: {rmse_test_rf:.2f}")
print(f"R^2: {r2_test_rf:.2f}")


Metryki modelu ElasticNet:
MAE: 326.24
RMSE: 454.40
R^2: 0.58

Metryki modelu Random Forest:
MAE: 258.47
RMSE: 358.68
R^2: 0.74
