# In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# RMSLE function definition
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error.

    Negative predictions are clipped to zero before ``log1p`` is applied.

    Args:
        y_true: Array-like of observed target values.
        y_pred: Array-like of predicted values, same length as ``y_true``.

    Returns:
        The RMSLE as a NumPy float.
    """
    # Coerce to ndarrays so lists/tuples work as well as arrays and Series
    # (comparing a plain list with 0 would raise TypeError).
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.maximum(np.asarray(y_pred, dtype=float), 0)  # avoid negative predictions
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))

# Read the training and test tables from the working directory.
train = pd.read_csv('biketrain.csv')
test = pd.read_csv('biketest.csv')

# Parse the timestamp column, then derive hour-of-day and month from it.
# NOTE(review): the format string expects values like '010111 0000'
# (ddmmyy, a space, then HHMM, no separators) - confirm against the CSV.
train = train.assign(
    datetime=lambda df: pd.to_datetime(df['datetime'], format='%d%m%y %H%M'),
    hour=lambda df: df['datetime'].dt.hour,
    month=lambda df: df['datetime'].dt.month,
)

# Features are every column except the target and the raw timestamp.
X = train.drop(columns=['count', 'datetime'])
y = train['count']

# Define columns (adjust if necessary)
# Only the columns listed here reach the models: the ColumnTransformer built
# from these lists uses the default remainder='drop', so anything unlisted is
# discarded. 'hour' and 'month' are derived from 'datetime' earlier in the
# script and are treated as numeric, like the other calendar fields.
# NOTE(review): 'year', 'day' and 'isweekend' are not created in this script -
# presumably they already exist in the CSV; confirm.
numeric_cols = [
    'temp', 'atemp', 'humidity', 'windspeed', 'year', 'day', 'hour', 'month',
]
cat_cols = ['season', 'weather', 'holiday', 'workingday', 'isweekend']

# Preprocessing pipelines
# Numeric columns: fill missing values with the median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical columns: fill missing values with the mode, then one-hot encode.
# The encoder's `sparse=` keyword was renamed `sparse_output` in scikit-learn
# 1.2 and removed in 1.4, so neither spelling is passed here; dense output is
# requested on the ColumnTransformer instead, which works on every version.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
# sparse_threshold=0 forces the stacked output to be a dense array, which the
# StandardScaler steps placed after PolynomialFeatures further down rely on.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, cat_cols),
    ],
    sparse_threshold=0,
)

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Candidate models keyed by display name. Every entry is a Pipeline that
# starts with the shared preprocessor; the polynomial variants expand the
# features to degree 2 and re-scale them before the linear estimator.
models = {}

models['Linear'] = Pipeline(steps=[
    ('pre', preprocessor),
    ('model', LinearRegression()),
])

models['Ridge'] = Pipeline(steps=[
    ('pre', preprocessor),
    ('model', RidgeCV(alphas=[0.1, 1.0, 10.0])),
])

models['Polynomial Regression (Degree 2)'] = Pipeline(steps=[
    ('pre', preprocessor),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),  # ITSR fine tuning
    ('model', LinearRegression()),
])

models['Lasso Polynomial Regression'] = Pipeline(steps=[
    ('pre', preprocessor),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),  # ITSR fine tuning
    # Increased max_iter, ITSR fine tuning
    ('model', Lasso(alpha=0.001, max_iter=10000)),
])

models['RandomForest'] = Pipeline(steps=[
    ('pre', preprocessor),
    ('model', RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)),
])

# The log-transformed GradientBoosting model is added to this dict further down.

# ITSR fine tuning: GradientBoosting trained on a log-transformed target and
# tuned by grid search.
# (TransformedTargetRegressor is already imported at the top of the file; this
# repeat is a harmless duplicate.)
from sklearn.compose import TransformedTargetRegressor

gb_pipe = Pipeline([
    ('pre', preprocessor),
    ('model', GradientBoostingRegressor(random_state=42)),
])

# Fit on log1p(y) and map predictions back to the original scale with expm1.
regressor_log = TransformedTargetRegressor(
    regressor=gb_pipe,
    func=np.log1p,
    inverse_func=np.expm1,
)

# Hyperparameter grid. Names are routed through the wrapper ('regressor__')
# to the pipeline's 'model' step. 3 * 3 * 3 * 3 = 81 candidates, each
# cross-validated on 5 folds.
param_grid = {
    'regressor__model__n_estimators': [100, 200, 300],
    'regressor__model__max_depth': [3, 4, 5],
    'regressor__model__learning_rate': [0.01, 0.1, 0.2],
    'regressor__model__min_samples_split': [2, 5, 10],
}

grid_search = GridSearchCV(
    regressor_log,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_log_error',
    n_jobs=-1,
    verbose=2,
)

grid_search.fit(X_train, y_train)

print("Best parameters for GradientBoosting:", grid_search.best_params_)

# With the default refit=True this is the best candidate, refitted on all of
# X_train.
best_gb_model = grid_search.best_estimator_

# Register the tuned, fitted estimator under its display name so the entry in
# `models` really is the tuned model rather than the untuned template.
models['GradientBoosting (Tuned Log-Transform)'] = best_gb_model

# Score every candidate on the validation split with RMSLE and keep the
# scores by model name.
results = {}
for name, pipe in models.items():
    if name != 'GradientBoosting (Tuned Log-Transform)':
        # Ordinary pipelines are fitted here on the training split.
        y_pred = pipe.fit(X_train, y_train).predict(X_val)
    else:
        # The grid search has already fitted this one; reuse it as-is.
        y_pred = best_gb_model.predict(X_val)
    score = rmsle(y_val, y_pred)
    results[name] = score
    print(f"{name} RMSLE: {score}")

# Predict on the test set with the tuned model and write the submission file.
# The test frame gets the same datetime-derived columns as the training frame.
test = test.assign(
    datetime=lambda df: pd.to_datetime(df['datetime'], format='%d%m%y %H%M'),
    hour=lambda df: df['datetime'].dt.hour,
    month=lambda df: df['datetime'].dt.month,
)
X_test = test.drop(columns=['datetime'])

# Negative predictions are floored at zero, then rounded to whole numbers.
test_preds = best_gb_model.predict(X_test)
test_preds = np.where(test_preds < 0, 0, test_preds)
test_preds_rounded = np.round(test_preds).astype(int)

# NOTE(review): 'datetime' is the parsed timestamp column at this point, so
# the CSV holds pandas' default timestamp rendering rather than the original
# text from biketest.csv - confirm that is what the submission checker expects.
submission = pd.DataFrame({
    'datetime': test['datetime'],
    'count': test_preds_rounded,
})
submission.to_csv('submission.csv', index=False)

print("Saved submission to submission.csv")


# Cell output: FileNotFoundError: [Errno 2] No such file or directory: 'biketrain.csv'