In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [41]:

df = pd.read_csv('SeoulBikeData.csv')

In [42]:

df.columns = [col.strip().lower().replace(' ', '_').replace('(', '').replace(')', '') 
              for col in df.columns]

In [43]:

numeric_cols = ['hour', 'temperature_c', 'humidity_percent', 
                'wind_speed_ms', 'visibility_10m', 
                'dew_point_temperature_c', 'solar_radiation_mjm2',
                'rainfall_mm', 'snowfall_cm']
categorical_cols = ['seasons', 'holiday', 'functioning_day']

In [44]:

numeric_cols = [col for col in numeric_cols if col in df.columns]
categorical_cols = [col for col in categorical_cols if col in df.columns]

In [45]:

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

In [46]:

models = {
    "Linear Regression": Pipeline([
        ('preprocessor', preprocessor),
        ('model', LinearRegression())
    ]),
    "Random Forest": Pipeline([
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(n_estimators=100, random_state=42))
    ])
}

In [47]:

y = df['rented_bike_count'] 

In [48]:
results = []
for name, pipeline in models.items():
    try:
        scores = cross_val_score(
            pipeline, df.drop('rented_bike_count', axis=1), y,
            cv=5,
            scoring='neg_mean_squared_error',
            n_jobs=-1
        )
        results.append({
            'Model': name,
            'Mean MSE': f"{-np.mean(scores):.1f}",
            'Std MSE': f"{np.std(scores):.1f}"
        })
    except Exception as e:
        print(f"Error with {name}: {str(e)}")

In [49]:

if results:
    results_df = pd.DataFrame(results)
    print("\nCross-Validation Results:")
    print(results_df.to_markdown(index=False))
else:
    print("No models completed successfully")


Cross-Validation Results:
| Model             |   Mean MSE |   Std MSE |
|:------------------|-----------:|----------:|
| Linear Regression |     297960 |    134906 |
| Random Forest     |     242286 |    130088 |


In [50]:

print("\nKey Improvements:")
print("- Proper handling of categorical features (like 'Winter') via OneHotEncoder")
print("- Robust pipeline prevents data leakage")
print("- Automatic column name cleaning")


Key Improvements:
- Proper handling of categorical features (like 'Winter') via OneHotEncoder
- Robust pipeline prevents data leakage
- Automatic column name cleaning
