In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Set the `savefig.dpi` entry of matplotlib's rcParams to 144. The rcParams
# mapping is reached here through the `matplotlib` attribute of pyplot.
plt.matplotlib.rcParams['savefig.dpi'] = 144
import seaborn

## Anomaly Detection Lab: Incorporating Weather Data

In this lab, we go ahead and add weather data to our already-well-performing model. It seems reasonable to assume that weather affects the usage of the CitiBike system. The `weatherdata/nycp.csv` file contains daily National Weather Service records for Central Park. Add features from these records to your model.

The code here is essentially copy-and-pasted from the `anomaly2.ipynb` notebook, and includes the feature extraction pipelines for both our time-based features and our historical count features. 

Your task is the following:
1. Engineer a new feature (or features) for the day's average temperature (as reported at Central Park).
2. Combine the time-based, historical, and (your new) weather-based features into a `FeatureUnion`.
3. Fit a linear regression model and evaluate what the performance improvement is over our previous best model.

In [None]:
import zipfile
import pandas as pd
import numpy as np
import glob
from sklearn import base
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LinearRegression
from sklearn import metrics

def load_counts(fn):
    """Count trips per calendar day from one zipped trip-data CSV.

    Parameters
    ----------
    fn : str
        Path to a zip archive. Only the first member of the archive is read;
        it must be a CSV with a ``starttime`` column whose values have the
        form ``"<date> <time>"``, where the date is either ``YYYY-MM-DD`` or
        ``MM/DD/YYYY``.

    Returns
    -------
    pandas.Series
        Number of rows per date, indexed by a ``DatetimeIndex`` and sorted
        chronologically.
    """
    # Context managers make sure both the archive and the member handle are
    # closed once the CSV has been parsed.
    with zipfile.ZipFile(fn, 'r') as zf:
        with zf.open(zf.namelist()[0]) as member:
            df = pd.read_csv(member)

    # Keep only the date part of each timestamp. `n` is passed by keyword
    # because recent pandas releases no longer accept it positionally.
    dates = df['starttime'].str.split(' ', n=1).str[0]
    counts = dates.value_counts()

    # The date format is detected from the first label: dashes mean ISO
    # dates, otherwise US-style month/day/year is assumed.
    if '-' in counts.index[0]:
        counts.index = pd.to_datetime(counts.index, format='%Y-%m-%d')
    else:
        counts.index = pd.to_datetime(counts.index, format='%m/%d/%Y')
    return counts.sort_index()

# Collect every archive whose name starts with six digits (presumably YYYYMM,
# so that a lexical sort is also chronological -- confirm against the data).
fns = glob.glob('tripdata/[0-9][0-9][0-9][0-9][0-9][0-9]-citibike-tripdata.zip')
# Build one daily-count Series per archive and stitch them end to end.
counts = pd.concat(list(map(load_counts, sorted(fns))))

In [None]:
class FourierComponents(base.BaseEstimator, base.TransformerMixin):
    """Encode dates as a sine/cosine pair with a fixed period.

    Parameters
    ----------
    period : number
        Length of one full cycle, in days.
    """

    def __init__(self, period):
        self.period = period

    def fit(self, X, y=None):
        """Store the first element of ``X`` as the phase origin ``X0``."""
        self.X0 = X[0]
        return self

    def transform(self, X):
        """Return a two-column array ``[sin(phase), cos(phase)]``.

        The phase is the number of whole days between each element of ``X``
        and ``X0``, scaled so that ``period`` days correspond to ``2 * pi``.
        """
        elapsed_days = (X - self.X0).days
        phase = elapsed_days * 2 * np.pi / self.period
        sine, cosine = np.sin(phase), np.cos(phase)
        return np.c_[sine, cosine]

class DayofWeek(base.BaseEstimator, base.TransformerMixin):
    """One-hot encode the day of the week of each date.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    only reads the ``dayofweek`` attribute of its input.
    """

    def fit(self, X, y=None):
        """No-op; present to satisfy the transformer interface."""
        return self

    def day_vector(self, day):
        """Return a length-7 float vector with a 1 at position ``day``."""
        v = np.zeros(7)
        v[day] = 1
        return v

    def transform(self, X):
        """Return an ``(n, 7)`` array with one one-hot row per element of ``X``.

        ``X`` must expose a ``dayofweek`` attribute that iterates over integer
        day numbers in ``0..6``.
        """
        # np.stack needs a real sequence; passing a generator is rejected by
        # current NumPy releases, so the rows are collected in a list first.
        rows = [self.day_vector(d) for d in X.dayofweek]
        if not rows:
            # np.stack raises on an empty list; keep the column count stable.
            return np.zeros((0, 7))
        return np.stack(rows)

class QuadBackground(base.BaseEstimator, base.TransformerMixin):
    """Linear and quadratic trend features, in days since the first fitted date."""

    def fit(self, X, y=None):
        """Store the first element of ``X`` as the time origin ``X0``."""
        self.X0 = X[0]
        return self

    def transform(self, X):
        """Return a two-column array ``[days, days ** 2]``.

        ``days`` is the number of whole days between each element of ``X``
        and the origin stored by ``fit``.
        """
        elapsed = (X - self.X0).days
        elapsed_squared = elapsed**2
        return np.c_[elapsed, elapsed_squared]
    
class ColumnExtractor(base.BaseEstimator, base.TransformerMixin):
    """Stateless transformer that selects ``cols`` from its input.

    Parameters
    ----------
    cols : object
        Key passed to ``X[...]`` in ``transform``; in this notebook it is a
        list of column names.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        """No-op; present to satisfy the transformer interface."""
        return self

    def transform(self, X):
        """Return ``X[self.cols]``."""
        selected = X[self.cols]
        return selected
    
class IndexExtractor(base.BaseEstimator, base.TransformerMixin):
    """Stateless transformer that returns the ``index`` attribute of its input."""

    def fit(self, X, y=None):
        """No-op; present to satisfy the transformer interface."""
        return self

    def transform(self, X):
        """Return ``X.index``."""
        index = X.index
        return index

In [None]:
# Time-based features: a quadratic trend, four sine/cosine pairs whose periods
# are 365 days divided by 1, 2, 12 and 8, and a one-hot day of week.
union = FeatureUnion(transformer_list=[
    ('date', QuadBackground()),
    ('fourier-y', FourierComponents(365)),
    ('fourier-2', FourierComponents(365 / 2.)),
    ('fourier-m', FourierComponents(365 / 12.)),
    ('fourier-8', FourierComponents(365 / 8.)),
    ('dayofweek', DayofWeek()),
])

# The time transformers work on the index, so it is extracted first.
time_pipe = Pipeline(steps=[
    ('index', IndexExtractor()),
    ('features', union),
])

In [None]:
# Target plus two history-based predictors, all aligned on the index of `counts`:
#   previous - the count from one row earlier
#   rolling  - the mean of the five rows before the current one
# Shifting leaves NaNs at the start of each column; `.bfill()` fills them with
# the next available value. It replaces the deprecated `fillna(method='bfill')`.
counts_df = pd.DataFrame({
    'counts': counts,
    'previous': counts.shift(1).bfill(),
    'rolling': counts.rolling(window=5).mean().shift(1).bfill(),
})

# Historical features are used as-is, so the pipeline only selects the columns.
hist_pipe = Pipeline([('previous', ColumnExtractor(['previous', 'rolling']))])

### The Weather Data
Here's the weather dataset. We've already added a column with the dates parsed for you. Explore it to determine which columns actually track temperature on a given day.

In [None]:
weather = pd.read_csv("weatherdata/nycp.csv")
# Here, we'll even give you a nice datetime column to work with: the DATE
# values are turned into strings and parsed as YYYYMMDD.
date_text = weather["DATE"].apply(str)
weather["DT"] = pd.to_datetime(date_text, format="%Y%m%d")
weather.head()

### What now?

Some hints:
- The first thing you'll need to do is line up the dates in the `weather` dataframe with the `counts_df` dataframe (i.e. join the two tables).
- You can use the `ColumnExtractor` transformer we've already written to pull out temperature as a feature
- From there, set up a `FeatureUnion` (combining the pipelines already here with your new feature extraction)
- Last but not least, train a Linear Regression model and compute the RMSE. How did it do?

Once you've gotten that all done, a fun one: how much does ridership increase for every degree Fahrenheit? (Hint: The coefficients of the linear model are stored in the .coef_ attribute of a LinearRegression object.)

Next steps: There are a few other interesting columns in this dataset. Add those into the regression as well to see how they do.