In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error, mean_absolute_error
plt.style.use('fivethirtyeight')
from datetime import datetime
from sklearn.model_selection import train_test_split
from pandas.tseries.holiday import USFederalHolidayCalendar
import scipy.stats as stats
pd.options.mode.chained_assignment = None  # default='warn'

____

In [None]:
# Single source of truth for every seeded operation in this notebook.
RANDOM_SEED = 42
# Seed NumPy's legacy global generator with that value.
np.random.seed(seed=RANDOM_SEED)

In [None]:
# Load the weather table, discard the stray CSV index column, and normalise
# the two join keys: zip code as text and the timestamp as UTC-aware datetime.
master_weather_data = (
    pd.read_csv('../data/master_weather_data.csv')
    .drop(columns='Unnamed: 0')
    .assign(
        weather_zip=lambda frame: frame['weather_zip'].astype(str),
        date_time=lambda frame: pd.to_datetime(frame['date_time'], utc=True),
    )
)

In [None]:
# Load the ride records, keep the first row per RIDE_ID, and discard the
# leftover index columns written by earlier CSV round-trips.
df = (
    pd.read_csv('../data/with_weather.csv')
    .drop_duplicates(subset='RIDE_ID')
    .drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1'])
)

In [None]:
# Parse the ride start times as UTC-aware timestamps and store them back
# through a DatetimeIndex, in one expression.
df['started_on'] = pd.DatetimeIndex(
    pd.to_datetime(df['started_on'], utc=True)
)

In [None]:
# Truncate every ride start time to the top of its hour (minutes, seconds and
# sub-seconds zeroed). `started_on` is already a UTC-aware datetime column, so
# the vectorised floor yields the same UTC timestamps as rebuilding each value
# by hand, without a per-row Python call and without raising on NaT entries.
df['started_on_hour'] = df['started_on'].dt.floor('h')

In [None]:
# Weekday number of each ride (Monday = 0 ... Sunday = 6).
df['day_of_week'] = df['started_on'].map(lambda ts: ts.weekday())
# Binary flag: 1 for Saturday (5) or Sunday (6), otherwise 0.
df['weekend'] = df['day_of_week'].map(lambda dow: int(dow in (5, 6)))

In [None]:
# Build the set of US federal holiday dates covered by the ride data.
# The calendar stamps holidays at midnight, so the range is given as whole
# calendar dates: passing the first ride's exact timestamp as `start` would
# silently exclude a holiday that falls on the very first day of the data.
federal_holidays = {
    holiday.date()
    for holiday in USFederalHolidayCalendar().holidays(
        start=df['started_on'].min().date(),
        end=df['started_on'].max().date(),
    )
}
# Binary flag: 1 when the ride's (UTC) calendar date is a federal holiday.
df['federal_holiday'] = df['started_on'].apply(lambda d: 1 if d.date() in federal_holidays else 0)

In [None]:
def _federal_holiday_dates(timestamps):
    """Return the set of US federal holiday dates spanned by ``timestamps``."""
    # Holidays are stamped at midnight, so the range is passed as whole
    # calendar dates; a mid-day start would drop a holiday on the first day.
    holiday_index = USFederalHolidayCalendar().holidays(
        start=timestamps.min().date(),
        end=timestamps.max().date(),
    )
    return {holiday.date() for holiday in holiday_index}


def _fill_empty_hours(empty_hours, zone_rides, weather_data):
    """Rebuild calendar and weather features for hours that had no rides."""
    empty_hours = empty_hours.copy()  # never write into a slice of the caller's frame
    hour_stamps = empty_hours['started_on_hour']

    # Calendar features are derived directly from the hour timestamp.
    empty_hours['day_of_week'] = hour_stamps.dt.weekday
    empty_hours['weekend'] = hour_stamps.dt.weekday.isin([5, 6]).astype(int)
    holiday_dates = _federal_holiday_dates(hour_stamps)
    empty_hours['federal_holiday'] = hour_stamps.map(
        lambda ts: int(ts.date() in holiday_dates)
    )

    # Weather for an empty hour is looked up under the zone's most frequent
    # zip code, truncated to its first five characters. `Series.mode` returns
    # its values sorted, so ties resolve to the smallest one.
    zip_modes = zone_rides['zipcode'].mode()
    common_zipcode = str(zip_modes.iloc[0])[:5] if len(zip_modes) else ''

    # One weather row per (zip, hour); duplicates would otherwise multiply
    # the hourly rows in the left join below.
    weather_lookup = weather_data[[
        'weather_zip',
        'date_time',
        'HeatIndexC',
        'precipMM',
        'humidity'
    ]].drop_duplicates(subset=['weather_zip', 'date_time'])

    return (
        empty_hours
        .drop(columns=['HeatIndexC', 'precipMM', 'humidity'])
        .assign(common_zipcode=common_zipcode)
        .merge(
            weather_lookup,
            how='left',
            left_on=['common_zipcode', 'started_on_hour'],
            right_on=['weather_zip', 'date_time'],
        )
        .drop(columns=['weather_zip', 'date_time', 'common_zipcode'])
    )


def get_hourly_data_for_zone(df, weather_data, zone_label):
    """Aggregate one zone's rides into a gap-free hourly feature table.

    Parameters
    ----------
    df : pandas.DataFrame
        Ride-level records. Must contain ``zone_label``, ``started_on_hour``
        (datetime), ``RIDE_ID``, ``day_of_week``, ``weekend``, ``HeatIndexC``,
        ``precipMM``, ``humidity``, ``federal_holiday`` and ``zipcode``.
    weather_data : pandas.DataFrame
        Hourly weather with ``weather_zip``, ``date_time``, ``HeatIndexC``,
        ``precipMM`` and ``humidity``. ``date_time`` must be merge-compatible
        with ``started_on_hour`` and ``weather_zip`` must be a string.
    zone_label
        Value of ``zone_label`` selecting the zone to aggregate.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``started_on_hour`` with one row per hour between the
        zone's first and last ride. ``num_rides`` is the ride count (0 for
        hours without rides); weather columns are hourly means where rides
        exist and come from ``weather_data`` otherwise (NaN when no weather
        row matches).
    """
    zone_rides = df[df['zone_label'] == zone_label]

    # Resample to an hourly grid: count rides, average the weather readings
    # and carry the calendar flags. Hours without rides come out with a count
    # of 0 and NaN everywhere else.
    hourly = zone_rides[[
        'started_on_hour',
        'RIDE_ID',
        'day_of_week',
        'weekend',
        'HeatIndexC',
        'precipMM',
        'humidity',
        'federal_holiday'
    ]].set_index('started_on_hour').resample('h').agg({
        'RIDE_ID': 'count',
        'day_of_week': 'max',
        'weekend': 'max',
        'HeatIndexC': 'mean',
        'precipMM': 'mean',
        'humidity': 'mean',
        'federal_holiday': 'max'
    }).rename(
        columns={'RIDE_ID': 'num_rides'}
    ).reset_index('started_on_hour')

    # A NaN weekday marks an hour that had no rides at all.
    is_empty_hour = hourly['day_of_week'].isna()
    frames = [hourly[~is_empty_hour]]
    if is_empty_hour.any():
        # Only enrich when there is something to fill; an empty selection has
        # no date range to build a holiday calendar from.
        frames.append(_fill_empty_hours(hourly[is_empty_hour], zone_rides, weather_data))

    zone_data = pd.concat(frames).sort_values('started_on_hour').reset_index(drop=True)

    return zone_data.set_index('started_on_hour')

In [None]:
# Hourly feature table for zone 4.
hourly_count = get_hourly_data_for_zone(df, master_weather_data, 4)

# Swap the weekday column for one-hot indicator columns appended at the end.
hourly_count = pd.concat(
    [
        hourly_count.drop(columns='day_of_week'),
        pd.get_dummies(hourly_count['day_of_week'], prefix='day_of_week'),
    ],
    axis=1,
)

# Keep only hours from 2016-07-01 00:00 UTC up to and including 2017-02-07 00:00 UTC.
hourly_count = hourly_count[
    (hourly_count.index >= pd.to_datetime('2016-07-01', utc=True))
    & (hourly_count.index <= pd.to_datetime('2017-02-07', utc=True))
]

# Impute missing weather readings with each column's mean over the kept window.
hourly_count = hourly_count.fillna(
    hourly_count[['HeatIndexC', 'precipMM', 'humidity']].mean()
)

In [None]:
# Calendar features read straight off the hourly index: hour of day and day of month.
hourly_count = hourly_count.assign(
    hour=hourly_count.index.hour,
    day_of_month=hourly_count.index.day,
)

____

In [None]:
# Target: hourly ride count. Features: every remaining column.
y = hourly_count['num_rides']
X = hourly_count.drop(columns=['num_rides'])

In [None]:
# Chronological hold-out: with shuffle disabled the last 15% of the hourly
# rows become the test set, so the model is evaluated on the most recent hours.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    shuffle=False,
    random_state=RANDOM_SEED,
)

In [None]:
# Gradient-boosted regressor with up to 1000 boosting rounds.
reg = xgb.XGBRegressor(n_estimators=1000)
# Both splits are monitored during training; the held-out split is the last
# entry of `eval_set`, so it doubles as the early-stopping validation set.
# NOTE(review): newer xgboost releases reportedly expect `early_stopping_rounds`
# on the estimator constructor instead of `fit` — confirm against the
# installed version.
reg.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    early_stopping_rounds=50,
    verbose=False,
)

In [None]:
# Bar chart of per-feature importance for the fitted model; the returned
# value is bound to `_` so the cell shows only the figure.
_ = plot_importance(booster=reg, height=0.9)

In [None]:
# Predicted hourly ride counts for the held-out feature rows.
y_pred = reg.predict(X_test)

In [None]:
# Per-hour comparison table for the hold-out period: the rows after the
# training rows, with the actual count, the prediction and their absolute gap.
evaluate_df = (
    hourly_count.iloc[len(X_train):][['num_rides']]
    .assign(predicted_demand=y_pred)
    .assign(
        absolute_error=lambda frame: (
            frame['num_rides'] - frame['predicted_demand']
        ).abs()
    )
)

In [None]:
# Training history followed by the hold-out period, where the true counts
# and the model's predictions share the same time-step positions.
plt.figure(figsize=(18, 6))
plt.plot(range(len(y_train)), y_train, color='g', label="history")
plt.plot(
    range(len(y_train), len(y_train) + len(y_test)),
    y_test,
    marker='.',
    label="true",
)
plt.plot(
    range(len(y_train), len(y_train) + len(y_test)),
    y_pred,
    color='r',
    label="prediction",
)
plt.xlabel('Time Step')
plt.ylabel('Value')
plt.legend()
plt.show();

____