In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# ---------------------------------------------
# RMSLE Function (Corrected)
# ---------------------------------------------
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values; anything below zero is clipped to zero before the
        logarithm is taken.

    Returns
    -------
    float
        sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2)).
    """
    # Convert to plain float arrays so the comparison is strictly positional.
    # Subtracting two pandas Series aligns them on index labels instead, which
    # silently produces NaN when the indexes differ (e.g. a shuffled
    # validation split versus freshly indexed predictions).
    actual = np.asarray(y_true, dtype=float)
    predicted = np.maximum(0, np.asarray(y_pred, dtype=float))  # log1p needs values >= 0 here
    log_gap = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt(np.mean(log_gap ** 2))




In [4]:
# ---------------------------------------------
# Load Data
# ---------------------------------------------
# Both CSV paths are relative, so they are resolved against the notebook's
# current working directory.
train = pd.read_csv(filepath_or_buffer="bike_train.csv")
test = pd.read_csv(filepath_or_buffer="bike_test.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'bike_train.csv'

In [None]:
# ---------------------------------------------
# Q4: Feature Engineering
# ---------------------------------------------
def add_datetime_features(df):
    """Return a copy of ``df`` with calendar features derived from ``datetime``.

    The ``datetime`` column is parsed with ``pd.to_datetime`` and four integer
    columns are added: ``hour``, ``weekday``, ``month`` and ``year``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``datetime`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left untouched so the cell can be
        re-run safely and other references to the raw data stay raw.

    Raises
    ------
    KeyError
        If ``df`` has no ``datetime`` column.
    """
    # Work on a copy: writing the new columns straight into the caller's
    # frame would change it behind the caller's back.
    enriched = df.copy()
    enriched['datetime'] = pd.to_datetime(enriched['datetime'])
    stamp = enriched['datetime'].dt
    enriched['hour'] = stamp.hour
    enriched['weekday'] = stamp.weekday
    enriched['month'] = stamp.month
    enriched['year'] = stamp.year
    return enriched

# Derive hour / weekday / month / year columns for both data sets.
train = add_datetime_features(train)
test = add_datetime_features(test)

target = "count"

# Columns fed to every model; anything not listed here is ignored.
features = ['season','holiday','workingday','weather','temp','atemp','humidity','windspeed',
            'hour','weekday','month','year']

X = train[features]
y = train[target]
X_test_final = test[features]

# Hold out 20% of the training rows for validation (fixed seed for repeatability).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# ---------------------------------------------
# MODEL 1: Linear Regression
# ---------------------------------------------
lr = LinearRegression()
lr.fit(X_train, y_train)
pred_lr = lr.predict(X_valid)
score_lr = rmsle(y_valid, pred_lr)


# ---------------------------------------------
# MODEL 2: Polynomial Regression (Degree 2)
# ---------------------------------------------
# The expansion is fitted on the training split only and then reused for the
# validation split (and, if needed, for the final test features below).
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_valid_poly = poly.transform(X_valid)

poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)
pred_poly = poly_model.predict(X_valid_poly)
score_poly = rmsle(y_valid, pred_poly)

# ---------------------------------------------
# MODEL 3: Lasso Regression with Polynomial Features
# ---------------------------------------------
# NOTE(review): the polynomial features are not scaled; Lasso may be slow to
# converge on them — consider adding a scaler if warnings appear.
lasso = Lasso(alpha=0.001)
lasso.fit(X_train_poly, y_train)
pred_lasso = lasso.predict(X_valid_poly)
score_lasso = rmsle(y_valid, pred_lasso)


# ---------------------------------------------
# MODEL 4: Random Forest
# ---------------------------------------------
rf = RandomForestRegressor(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_valid)
score_rf = rmsle(y_valid, pred_rf)

# ---------------------------------------------
# Compare Results
# ---------------------------------------------
print("RMSLE Scores:")
print(f"Linear Regression:        {score_lr:.5f}")
print(f"Polynomial Regression:    {score_poly:.5f}")
print(f"Lasso Polynomial:         {score_lasso:.5f}")
print(f"Random Forest:            {score_rf:.5f}")

# ---------------------------------------------
# Choose best model for test set prediction
# ---------------------------------------------
# Each entry: (validation RMSLE, fitted model, whether it was trained on the
# polynomial expansion). Selecting automatically keeps the choice in sync with
# the scores above and guarantees the test features get the same
# transformation the winning model was trained on.
candidates = {
    "Linear Regression": (score_lr, lr, False),
    "Polynomial Regression": (score_poly, poly_model, True),
    "Lasso Polynomial": (score_lasso, lasso, True),
    "Random Forest": (score_rf, rf, False),
}


def _selection_key(model_name):
    """Rank by validation RMSLE, treating a non-finite score as worst."""
    score = candidates[model_name][0]
    return score if np.isfinite(score) else np.inf


best_name = min(candidates, key=_selection_key)
_, best_model, best_needs_poly = candidates[best_name]
print(f"Best model on validation: {best_name}")

# Polynomial models expect the expanded feature matrix, not the raw columns.
X_test_best = poly.transform(X_test_final) if best_needs_poly else X_test_final

final_predictions = best_model.predict(X_test_best)
final_predictions = np.maximum(0, final_predictions)  # counts cannot be negative

# Save submission file
submission = pd.DataFrame({
    "datetime": test["datetime"],
    "count": final_predictions
})

submission.to_csv("bike_predictions.csv", index=False)
print("Prediction file saved as bike_predictions.csv")