In [None]:
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import mean_squared_log_error\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.impute import SimpleImputer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read both competition splits from CSV files in the working directory.\n",
    "train, test = (pd.read_csv(csv_path) for csv_path in ('bike_train.csv', 'bike_test.csv'))\n",
    "\n",
    "# Snapshot the untouched test timestamps now: they key the submission rows,\n",
    "# and the 'datetime' column itself is removed from `test` during feature engineering.\n",
    "test_ids = test['datetime'].copy()\n",
    "\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calendar parts derived from the timestamp, in the order they are appended as columns.\n",
    "TIME_PARTS = ('hour', 'day', 'month', 'year', 'weekday')\n",
    "\n",
    "\n",
    "def add_time_features(frame, extra_drop=()):\n",
    "    \"\"\"Return a new frame with calendar features replacing the raw timestamp.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    frame : pandas.DataFrame\n",
    "        Table holding a 'datetime' column that `pd.to_datetime` can parse.\n",
    "        It is not modified.\n",
    "    extra_drop : iterable of str, optional\n",
    "        Further columns to remove; names that are absent are ignored.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        `frame` without 'datetime' and `extra_drop`, plus one integer column per\n",
    "        entry of TIME_PARTS. For a frame that was already converted (no\n",
    "        'datetime', every TIME_PARTS column present) only `extra_drop` is applied.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    KeyError\n",
    "        If 'datetime' is missing and the frame has not been converted yet.\n",
    "    \"\"\"\n",
    "    already_converted = (\n",
    "        'datetime' not in frame.columns\n",
    "        and set(TIME_PARTS).issubset(frame.columns)\n",
    "    )\n",
    "    if already_converted:\n",
    "        # Re-running this cell used to raise KeyError('datetime') because the\n",
    "        # first run drops the column; pass converted frames through instead.\n",
    "        return frame.drop(columns=list(extra_drop), errors='ignore')\n",
    "\n",
    "    stamps = pd.to_datetime(frame['datetime'])\n",
    "    parts = {part: getattr(stamps.dt, part) for part in TIME_PARTS}\n",
    "    # assign() builds a new frame, so the frame displayed by the loading cell is not mutated.\n",
    "    return frame.assign(**parts).drop(columns=['datetime', *extra_drop], errors='ignore')\n",
    "\n",
    "\n",
    "# 'casual' and 'registered' are removed from the training frame as leakage columns\n",
    "# (presumably components of 'count' -- confirm against the data dictionary).\n",
    "train = add_time_features(train, extra_drop=('casual', 'registered'))\n",
    "test = add_time_features(test)\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Separate the predictors from the regression target ('count').\n",
    "X = train.drop(columns=['count'])\n",
    "y = train['count']\n",
    "\n",
    "# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.\n",
    "X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)\n",
    "\n",
    "def rmsle(y_true, y_pred):\n",
    "    \"\"\"Root mean squared logarithmic error between targets and predictions.\n",
    "\n",
    "    Predictions below zero are raised to zero before scoring, so a model that\n",
    "    outputs negative values can still be evaluated with the log-based metric.\n",
    "    \"\"\"\n",
    "    floored_pred = np.maximum(y_pred, 0)\n",
    "    msle = mean_squared_log_error(y_true, floored_pred)\n",
    "    return np.sqrt(msle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column groups: the first list is imputed and standardised, the second is one-hot encoded.\n",
    "# NOTE(review): 'hour', 'month' and 'weekday' enter the linear model as plain numbers;\n",
    "# confirm that a single linear trend over them is intended.\n",
    "numeric_features = [\n",
    "    'temp', 'atemp', 'humidity', 'windspeed',\n",
    "    'hour', 'day', 'month', 'year', 'weekday',  # calendar parts derived from the timestamp\n",
    "]\n",
    "categorical_features = ['season', 'holiday', 'workingday', 'weather']\n",
    "\n",
    "# Numeric branch: fill gaps with the column median, then standardise.\n",
    "numeric_transformer = Pipeline([\n",
    "    ('imputer', SimpleImputer(strategy='median')),\n",
    "    ('scaler', StandardScaler()),\n",
    "])\n",
    "\n",
    "# Categorical branch: fill gaps with the most frequent value, then one-hot encode.\n",
    "# Categories not seen during fit are ignored at transform time instead of raising.\n",
    "categorical_transformer = Pipeline([\n",
    "    ('imputer', SimpleImputer(strategy='most_frequent')),\n",
    "    ('onehot', OneHotEncoder(handle_unknown='ignore')),\n",
    "])\n",
    "\n",
    "# Route each column group through its branch; columns in neither list are\n",
    "# not passed on (ColumnTransformer's default remainder is 'drop').\n",
    "preprocess = ColumnTransformer([\n",
    "    ('num', numeric_transformer, numeric_features),\n",
    "    ('cat', categorical_transformer, categorical_features),\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Baseline: preprocessing followed by an ordinary least-squares regressor.\n",
    "lin_model = Pipeline([\n",
    "    ('preprocess', preprocess),\n",
    "    ('model', LinearRegression()),\n",
    "])\n",
    "\n",
    "# Fit on the training split, then score the held-out validation rows.\n",
    "lin_pred_valid = lin_model.fit(X_train, y_train).predict(X_valid)\n",
    "print('Validation RMSLE (Linear Regression):', rmsle(y_valid, lin_pred_valid))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Refit the same pipeline on every labelled row before scoring the test set.\n",
    "lin_model.fit(X, y)\n",
    "\n",
    "# Predict the test rows, raising any negative output to zero.\n",
    "test_pred = np.maximum(lin_model.predict(test), 0)\n",
    "\n",
    "# Pair each original test timestamp with its prediction.\n",
    "submission = test_ids.to_frame(name='datetime').assign(Count_Predicted=test_pred)\n",
    "\n",
    "submission.to_csv('my_linear_regression_submission.csv', index=False)\n",
    "submission.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
