In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [6]:
# load bike traffic and weather data 
fremont = pd.read_csv('fremont.csv',index_col='Date',parse_dates=True)
fremont.columns = ['total','east','west'] # rename columns
weather = pd.read_csv('weather.csv',index_col='DATE',parse_dates=True)

# compute daily traffic
fremont = fremont.resample('d').sum()

# day of the week/month/year/covid
fremont['day_of_week'] = fremont.index.dayofweek
fremont['month'] = fremont.index.month
fremont['year'] = fremont.index.year
# covid column: 0 = pre covid times; 1 = pos covid times
fremont['covid'] = 0
fremont.loc['03-01-2020':,'covid'] = 1 

# holidays
from pandas.tseries.holiday import USFederalHolidayCalendar
calendar = USFederalHolidayCalendar()
holidays = calendar.holidays('10-03-2012','08-31-2023')
fremont['holidays'] = pd.Series(1,index=holidays,name='holidays')
fremont.fillna(0,inplace=True)

# hours of daylight
def get_hoursdaylight(date):
    axis = np.radians(23.44) # tilt of Earth's axis
    latitude = np.radians(47.61) # Seattle's latidude 
    days = (date-pd.to_datetime('2000-12-31')).days
    m = (1 - np.tan(latitude)*np.tan(axis*np.cos(days*2*np.pi/365.25)))
    return 24*np.degrees(np.arccos(1-m))/180
fremont['hours_daylight'] = fremont.index.map(get_hoursdaylight)

# fix TAVG column
weather.TAVG.fillna(0.5*(weather.TMAX+weather.TMIN),inplace=True)

# add weather columns
fremont['PRCP'] = weather.PRCP
fremont['TAVG'] = weather.TAVG
fremont['SNOW'] = weather.SNOW
fremont['AWND'] = weather.AWND

fremont

Unnamed: 0_level_0,total,east,west,day_of_week,month,year,covid,holidays,hours_daylight,PRCP,TAVG,SNOW,AWND
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2012-10-03,3521.0,1760.0,1761.0,2,10,2012,0,0.0,11.863813,0.00,56.0,0.0,16.33
2012-10-04,3475.0,1708.0,1767.0,3,10,2012,0,0.0,11.804946,0.00,56.5,0.0,14.54
2012-10-05,3148.0,1558.0,1590.0,4,10,2012,0,0.0,11.746107,0.00,59.5,0.0,12.75
2012-10-06,2006.0,1080.0,926.0,5,10,2012,0,0.0,11.687302,0.00,60.5,0.0,11.41
2012-10-07,2142.0,1191.0,951.0,6,10,2012,0,0.0,11.628541,0.00,60.5,0.0,2.91
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-27,2169.0,936.0,1233.0,6,8,2023,1,0.0,14.020464,0.00,72.0,0.0,6.26
2023-08-28,3027.0,1026.0,2001.0,0,8,2023,1,0.0,13.967394,0.00,66.0,0.0,6.71
2023-08-29,2767.0,842.0,1925.0,1,8,2023,1,0.0,13.913949,0.12,61.0,0.0,6.49
2023-08-30,3944.0,1263.0,2681.0,2,8,2023,1,0.0,13.860146,0.01,62.0,0.0,6.49


## Linear Regression pipeline

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, PolynomialFeatures

In [9]:
# regression pipeline

num_features = ['hours_daylight','year','PRCP','TAVG','SNOW','AWND','covid','holidays']
cat_features = ['day_of_week']


num_processor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
    ])

cat_processor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder())
    ])



feature_processor = ColumnTransformer(transformers=[
    ('num_processor',num_processor,num_features),
    ('cat_processor',cat_processor, cat_features),
])

pipe = Pipeline(steps=[
    ('feature_processor',feature_processor),
    ('poly_features',PolynomialFeatures(degree=1)),
    ('reg',LinearRegression())
])

pipe