In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the raw inputs: weather observations and bridge bicycle counts.
weather_data = pd.read_csv('BicycleWeather.csv')
traffic_data = pd.read_csv('FremontBridge.csv')

# Parse the traffic timestamps with an explicit format. Without one, pandas
# has to guess the layout; when it cannot, it warns and falls back to slow
# per-row parsing.
# NOTE(review): the format assumes timestamps such as "10/03/2012 12:00:00 AM"
# -- confirm against the CSV. If the file does not match, the ValueError
# handler below restores the original inference-based parsing.
TRAFFIC_DATE_FORMAT = '%m/%d/%Y %I:%M:%S %p'
try:
    traffic_timestamps = pd.to_datetime(traffic_data['Date'], format=TRAFFIC_DATE_FORMAT)
except ValueError:
    traffic_timestamps = pd.to_datetime(traffic_data['Date'])
# Keep only the date part so rows can be aggregated per day.
traffic_data['Date'] = traffic_timestamps.dt.date

# Per-row total across both sidewalks. A NaN in either column makes that row's
# total NaN, and the per-day sum below skips NaN values.
traffic_data['Total Traffic'] = (
    traffic_data['Fremont Bridge East Sidewalk']
    + traffic_data['Fremont Bridge West Sidewalk']
)
daily_traffic = traffic_data.groupby('Date')['Total Traffic'].sum().reset_index()

# Weather dates are stored as YYYYMMDD; reduce them to plain dates as well so
# both frames share the same key type for the merge.
weather_data['DATE'] = pd.to_datetime(weather_data['DATE'], format='%Y%m%d').dt.date

# Inner join: keep only the days present in both data sets.
merged_data = pd.merge(daily_traffic, weather_data, left_on='Date', right_on='DATE', how='inner')

# -9999 is treated as a missing-value marker; turn it into NaN so the
# imputer inside each pipeline can fill it.
merged_data.replace(-9999, np.nan, inplace=True)

features = ['PRCP', 'SNWD', 'SNOW', 'TMAX', 'TMIN', 'AWND']
X = merged_data[features]
y = merged_data['Total Traffic']


def _build_pipeline(model):
    """Return a pipeline that mean-imputes, standardizes, then fits *model*."""
    return Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('model', model),
    ])


pipelines = {
    'LinearRegression': _build_pipeline(LinearRegression()),
    'Lasso': _build_pipeline(Lasso()),
    'Ridge': _build_pipeline(Ridge()),
}

# Only the regularized models have a hyper-parameter to tune: 100 candidate
# alpha values spaced logarithmically between 1e-4 and 1e4.
param_distributions = {
    name: {'model__alpha': np.logspace(-4, 4, 100)}
    for name in ('Lasso', 'Ridge')
}

# Scores are negated mean squared errors, so values closer to zero are better.
SCORING = 'neg_mean_squared_error'

cv = KFold(n_splits=10, shuffle=True, random_state=42)
results = {}

# Perform cross-validation and parameter tuning
for model_name, pipeline in pipelines.items():
    search_space = param_distributions.get(model_name)

    if search_space is None:
        # Nothing to tune: report the mean score over the folds.
        scores = cross_val_score(pipeline, X, y, cv=cv, scoring=SCORING)
        results[model_name] = {'Best Score': scores.mean(), 'Best Alpha': None}
        continue

    # Try 20 randomly drawn alpha candidates and keep the best one.
    search = RandomizedSearchCV(pipeline, search_space, n_iter=20, cv=cv,
                                scoring=SCORING, random_state=42)
    search.fit(X, y)
    best_score = search.best_score_
    best_alpha = search.best_params_['model__alpha']
    results[model_name] = {'Best Score': best_score, 'Best Alpha': best_alpha}

results


  traffic_data['Date'] = pd.to_datetime(traffic_data['Date'])


{'LinearRegression': {'Best Score': -787963.9539958232, 'Best Alpha': None},
 'Lasso': {'Best Score': -785162.5410673141, 'Best Alpha': 45.34878508128591},
 'Ridge': {'Best Score': -787783.1320172554, 'Best Alpha': 1.9179102616724888}}