In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from pytz import timezone
from sklearn.ensemble.forest import RandomForestRegressor 
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score
import itertools
import seaborn as sns
import os, sys
import json
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "white" style to the plots drawn by later cells.
sns.set_style("white")

In [3]:
def setup_datetable(mindate, futuredate):
    '''Build a calendar table with one row per day from mindate to futuredate.

    Both end points are included (they are handed straight to
    ``pd.date_range`` with a daily frequency).

    Returns a DataFrame with the columns:
      daterange_str  -- the date formatted as 'YYYY-MM-DD'
      datetime       -- the date as a pandas timestamp
      dayofmonth     -- day of the month
      dayofweek      -- pandas weekday number (Monday == 0)
      dayssincestart -- 0, 1, 2, ... counted from mindate
    '''
    days = pd.date_range(mindate, futuredate, freq='D')
    day_labels = days.strftime('%Y-%m-%d').tolist()

    columns = {
        'daterange_str': day_labels,
        'datetime': days,
        'dayofmonth': days.day,
        'dayofweek': days.dayofweek,
        'dayssincestart': np.arange(len(day_labels)),
    }
    return pd.DataFrame(columns)

In [None]:
def make_regression_features(df):
    '''Create calendar-based regression features from a date table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per day with the columns ``datetime``, ``dayofmonth``,
        ``dayofweek`` (Monday == 0 ... Sunday == 6) and ``dayssincestart``,
        i.e. the layout produced by ``setup_datetable``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df``; columns, in order:
          * ``dayssincestart`` (linear trend)
          * ``dayofweek_*`` and ``dayofmonth_*`` one-hot dummies
          * ``week1`` .. ``week4`` (day of month 1-7, 8-15, 16-22, 23-end)
          * ``isweekend`` (Sat/Sun), ``istueswed``, ``isfrisat``
          * ``x_isholiday`` (US federal holiday calendar)
          * ``x_isholidaywknd`` (member of a run of >= 3 weekend/holiday days)
          * ``week{1-4}tueswed``, ``week{1-4}frisat``, ``week{1-4}wknd``
            interactions

    Notes
    -----
    * Every indicator is built on ``df.index`` so the final concat lines rows
      up even when ``df`` does not carry a default RangeIndex.
    * The interaction columns are named explicitly; unnamed Series would be
      labelled 0, 1, 2, ... by ``pd.concat``.
    * The long-weekend scan starts at the first row, so a long weekend that
      opens the table is flagged too.
    * NOTE(review): ``week2`` spans eight days (8-15) while ``week1`` and
      ``week3`` span seven; kept as-is, confirm this is intended.
    '''

    from pandas.tseries.holiday import USFederalHolidayCalendar

    row_index = df.index
    dayofmonth = df['dayofmonth']
    dayofweek = df['dayofweek']

    def _flag(mask, name):
        # 0/1 integer indicator aligned to df's own index.
        return pd.Series(np.asarray(mask).astype(int), index=row_index, name=name)

    def _both(left, right, name):
        # Interaction (logical AND) of two 0/1 indicators with an explicit name.
        combined = left & right
        combined.name = name
        return combined

    x_dayssincestart = df['dayssincestart']

    # astype(int) keeps the dummies numeric on pandas versions where
    # get_dummies returns booleans.
    x_dayofweek = pd.get_dummies(dayofweek, prefix='dayofweek').astype(int)
    x_dayofmonth = pd.get_dummies(dayofmonth, prefix='dayofmonth').astype(int)

    weeks = [
        _flag(dayofmonth < 8, 'week1'),
        _flag((dayofmonth >= 8) & (dayofmonth < 16), 'week2'),
        _flag((dayofmonth >= 16) & (dayofmonth < 23), 'week3'),
        _flag(dayofmonth >= 23, 'week4'),
    ]

    x_isweekend = _flag(dayofweek.isin([5, 6]), 'isweekend')
    x_istueswed = _flag(dayofweek.isin([1, 2]), 'istueswed')
    x_isfrisat = _flag(dayofweek.isin([4, 5]), 'isfrisat')

    # get holidays (membership is tested once, vectorised, instead of
    # rebuilding a Python list for every row)
    calendar = USFederalHolidayCalendar()
    holidays = calendar.holidays(start=df['datetime'].min(), end=df['datetime'].max())
    x_isholiday = _flag(df['datetime'].isin(holidays), 'x_isholiday')

    # find 3-day weekends: any three consecutive rows that are all either a
    # weekend day or a holiday. run_starts holds the first row of each run.
    day_off = (x_isweekend.values + x_isholiday.values) > 0
    run_starts = np.flatnonzero(day_off[:-2] & day_off[1:-1] & day_off[2:])
    holidaywknd = np.zeros(len(df))  # float column, as before
    for offset in range(3):
        holidaywknd[run_starts + offset] = 1
    x_isholidaywknd = pd.Series(holidaywknd, index=row_index, name='x_isholidaywknd')

    # week-of-month x day-type interactions, grouped by day type
    interactions = [
        _both(week, day_flag, week.name + suffix)
        for day_flag, suffix in ((x_istueswed, 'tueswed'),
                                 (x_isfrisat, 'frisat'),
                                 (x_isweekend, 'wknd'))
        for week in weeks
    ]

    X_vars = pd.concat([x_dayssincestart, x_dayofweek, x_dayofmonth]
                       + weeks
                       + [x_isweekend, x_istueswed, x_isfrisat,
                          x_isholiday, x_isholidaywknd]
                       + interactions,
                       axis=1)

    return X_vars

In [7]:
def divide_by_dates(X_vars, startdate, enddate, date_frame=None):
    '''Return the rows of X_vars that fall between two dates (inclusive).

    Parameters
    ----------
    X_vars : pandas.DataFrame
        Feature table whose index labels match those of ``date_frame``.
    startdate, enddate :
        Values compared for equality against ``date_frame['datetime']``.
    date_frame : pandas.DataFrame, optional
        Calendar table with a ``datetime`` column. When omitted, a
        module-level variable named ``date_frame`` is used, which was the
        only behaviour before this parameter existed. Passing it explicitly
        is preferred because it does not rely on leftover kernel state.

    Returns
    -------
    (X_vars_cur, X) : the sliced DataFrame and its ``.values`` array
    (the array form is what scikit-learn estimators are fed).

    Raises
    ------
    NameError
        No ``date_frame`` argument was given and no global of that name exists.
    IndexError
        ``startdate`` or ``enddate`` does not occur in ``date_frame``.
    '''
    if date_frame is None:
        date_frame = globals().get('date_frame')
        if date_frame is None:
            raise NameError("name 'date_frame' is not defined; "
                            "pass it as divide_by_dates(..., date_frame=...)")

    def _row_label(date):
        # Index label of the first calendar row whose datetime equals `date`.
        labels = date_frame.loc[date_frame['datetime'] == date].index.tolist()
        if not labels:
            raise IndexError('date %s not found in date_frame' % (date,))
        return labels[0]

    # .loc slicing is label based and includes both end points
    X_vars_cur = X_vars.loc[_row_label(startdate):_row_label(enddate)]

    #reformat for sklearn
    X = X_vars_cur.values

    return X_vars_cur, X

In [8]:
def lassoCV_regression(stop_type, zip_of_interest, futuredate='2016-01-30',
                       outdir='', random_state=0):
    '''Fit a cross-validated Lasso on calendar features and plot a forecast.

    Parameters
    ----------
    stop_type : str
        One of 'Pickups and Deliveries', 'Deliveries' or 'Pickups'; selects
        which pickled count table is loaded.
    zip_of_interest : str
        'all' to sum every column of the table, or a comma-separated list of
        zip codes (e.g. '94110,94103'); the selected columns are summed.
    futuredate : str, optional
        Last day to forecast. Defaults to the previously hard-coded
        '2016-01-30'.
    outdir : str, optional
        Directory holding the pickle files (default: current directory).
    random_state : int or None, optional
        Seed for the shuffled 10-fold split so that repeated runs select the
        same model. Pass None for an unseeded shuffle.

    Returns
    -------
    pandas.DataFrame
        Columns ``var`` and ``coef_weight`` for coefficients whose absolute
        value exceeds 1e-5, sorted by weight in descending order.

    Raises
    ------
    ValueError
        Unknown ``stop_type``, a requested zip that is not a column of the
        table, ``futuredate`` earlier than the last observed date, or a
        table whose row count differs from the number of calendar days it
        spans.

    Side effects: draws observed (blue), fitted (red) and forecast (green)
    series on the current matplotlib axes.

    NOTE(review): the table is presumably indexed by date with one row per
    calendar day and one column per zip code -- confirm against the pickles.
    '''

    from sklearn.linear_model import LassoCV

    #start getting data!
    pickle_by_stop_type = {
        'Pickups and Deliveries': 'di_subandord_allcounts.pkl',
        'Deliveries': 'di_orders_allcounts.pkl',
        'Pickups': 'di_submissions_allcounts.pkl',
    }
    if stop_type not in pickle_by_stop_type:
        raise ValueError('Unknown stop_type %r; expected one of %s'
                         % (stop_type, sorted(pickle_by_stop_type)))

    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    tbl = pd.read_pickle(os.path.join(outdir, pickle_by_stop_type[stop_type]))

    #get rid of zero value on july 4th
    # A single .loc call writes into tbl itself; the chained form
    # tbl.loc[...][:] = nan may only modify a temporary copy.
    if '2015-07-04' in tbl.index:
        tbl.loc['2015-07-04', :] = np.nan
    tbl = tbl.interpolate()

    #change zipcodes to strings
    # NOTE(review): if the column labels are floats, str() yields e.g.
    # '94110.0' -- verify against the stored tables.
    tbl = tbl.rename(columns={col: str(col) for col in tbl.columns})

    #extract just zipcodes of interest
    if zip_of_interest == 'all':
        y = tbl.sum(axis=1)
    else:
        zip_input_list = [z.strip() for z in zip_of_interest.split(',') if z.strip()]
        if not zip_input_list or any(z not in tbl.columns for z in zip_input_list):
            raise ValueError('Zip not found in table')
        if len(zip_input_list) > 1:
            y = tbl[zip_input_list].sum(axis=1)
        else:
            # a plain 1-D Series (not a one-column DataFrame) for the regressor
            y = tbl[zip_input_list[0]].copy()

    y.name = 'y'

    mindate = tbl.index.min()
    maxdate = tbl.index.max()
    if pd.to_datetime(futuredate) < pd.to_datetime(maxdate):
        raise ValueError('futuredate %s is earlier than the last observed date %s'
                         % (futuredate, maxdate))

    date_frame = setup_datetable(mindate, futuredate)

    X_vars_all = make_regression_features(date_frame)

    def _rows_between(startdate, enddate):
        # Feature rows from startdate to enddate inclusive, located through
        # this call's own date_frame rather than a notebook-level global.
        first = date_frame.loc[date_frame['datetime'] == startdate].index.tolist()[0]
        last = date_frame.loc[date_frame['datetime'] == enddate].index.tolist()[0]
        rows = X_vars_all.loc[first:last]
        return rows, rows.values

    X_vars, X = _rows_between(mindate, maxdate)
    # NOTE(review): the forecast window begins at maxdate itself, so it
    # overlaps the last training day (unchanged from the original).
    X_vars_future, X_future = _rows_between(maxdate, pd.to_datetime(futuredate))

    if len(y) != len(X):
        raise ValueError('table has %d rows but spans %d calendar days; '
                         'one row per day is required' % (len(y), len(X)))

    #make cross-validated model
    try:
        from sklearn.model_selection import KFold
        kf = KFold(n_splits=10, shuffle=True, random_state=random_state)
    except ImportError:
        # scikit-learn releases that predate sklearn.model_selection
        from sklearn.cross_validation import KFold
        kf = KFold(len(y), n_folds=10, shuffle=True, random_state=random_state)
    lasso = LassoCV(cv=kf)
    lasso.fit(X, y)

    y_pred = lasso.predict(X)
    y_pred_future = lasso.predict(X_future)

    lasso_coefs = pd.DataFrame({'var': X_vars.columns, 'coef_weight': lasso.coef_})
    lasso_coefs = lasso_coefs[lasso_coefs.coef_weight.abs() > 0.00001]
    lasso_coefs = lasso_coefs.sort_values(by='coef_weight', ascending=False)

    # Plot all three series against the same x axis (days since start) so
    # that they overlay regardless of how matplotlib treats a date index.
    plt.plot(X_vars.dayssincestart, np.asarray(y), 'b', label='observed')
    plt.plot(X_vars.dayssincestart, y_pred, 'r', label='fitted')
    plt.plot(X_vars_future.dayssincestart, y_pred_future, 'g', label='forecast')
    plt.xlabel('days since start')
    plt.legend(loc='best')

    return lasso_coefs