In [None]:
import os, sys
import pandas as pd
import numpy as np
from PIL import Image
from matplotlib import pyplot as plt
import datetime
from datetime import date
%matplotlib inline

In [None]:
# Directory (relative to this notebook) that the CSV inputs are read from.
data_dir = '../Data/'

In [None]:
# Load the tab-separated train / test extracts.
# `on_bad_lines='warn'` skips malformed rows and reports them, which is what
# the removed `error_bad_lines=False` flag (with its default warning) used to do.
data_train = pd.read_csv(data_dir + 'citibike_train.csv', sep='\t', on_bad_lines='warn')
data_test = pd.read_csv(data_dir + 'citibike_test.csv', sep='\t', on_bad_lines='warn')

In [None]:
# Drop the columns at positions 2..19 from each frame. Each frame supplies its
# own labels, so the train frame no longer depends on the test frame's layout.
data_train = data_train.drop(columns=data_train.columns[2:20])
data_test = data_test.drop(columns=data_test.columns[2:20])

In [None]:
def clean_citibike(data, holidays=None, shortage_threshold=5):
    """Add calendar, time-of-day and target columns to a Citi Bike frame.

    The frame is modified **in place** and also returned, so the variable the
    caller passed in sees every new column as well.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``date`` (strings in ``%y-%m-%d`` form), ``hour``,
        ``pm``, ``minute`` and ``avail_bikes``.
    holidays : iterable of str, optional
        Holiday dates as ``%Y-%m-%d`` strings. Defaults to the built-in
        2016 list below.
    shortage_threshold : int, optional
        A row is a shortage when ``avail_bikes <= shortage_threshold``.

    Returns
    -------
    pandas.DataFrame
        The same object as ``data`` with these columns added or replaced:
        ``date`` (``datetime.date`` objects, moved to the end of the original
        columns), ``day_of_week``, ``weekday``, one boolean column per day
        name, ``holiday``, ``hour_24``, ``time_x``/``time_y``, ``dayofyear``,
        ``day_x``/``day_y`` and ``shortage``.
    """
    if holidays is None:
        holidays = ['2016-01-01', '2016-01-18', '2016-02-15', '2016-05-30',
                    '2016-07-04', '2016-09-05', '2016-10-10', '2016-11-11',
                    '2016-11-24', '2016-12-25']
    holiday_index = pd.to_datetime(list(holidays), format='%Y-%m-%d')

    # Parse once into a local Series instead of a temporary 'datetime' column.
    stamps = pd.to_datetime(data['date'], format='%y-%m-%d')

    # Replace the raw string column with real date objects.
    data.drop('date', axis=1, inplace=True)
    data['date'] = stamps.dt.date

    # binary feature of each day
    # (`.dt.weekday_name` no longer exists in pandas; `.dt.day_name()` returns
    # the same English day names.)
    data['day_of_week'] = stamps.dt.day_name()
    data['weekday'] = (~data['day_of_week'].isin(['Saturday', 'Sunday'])).astype(int)
    for day in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
                'Saturday', 'Sunday']:
        data[day] = data['day_of_week'] == day

    # holiday
    data['holiday'] = stamps.isin(holiday_index).astype(int)

    # cyclical characteristics of time: project onto the unit circle so that
    # the end of a period sits next to its start.
    # NOTE(review): assumes `hour` runs 0-11 with `pm` as a 0/1 flag; if `hour`
    # runs 1-12, 12 AM / 12 PM come out as 12 / 24 - confirm against the data.
    data['hour_24'] = data['hour'] + data['pm'] * 12
    hour_of_day = data['hour_24'] + data['minute'] / 60
    data['time_x'] = np.cos(hour_of_day * 2 * np.pi / 24)
    data['time_y'] = np.sin(hour_of_day * 2 * np.pi / 24)

    # NOTE(review): the period is fixed at 365 days, while the default holiday
    # list is for 2016 (a leap year) - confirm this is intended.
    data['dayofyear'] = stamps.dt.dayofyear
    data['day_x'] = np.cos(data['dayofyear'] * 2 * np.pi / 365)
    data['day_y'] = np.sin(data['dayofyear'] * 2 * np.pi / 365)

    # Target
    data['shortage'] = (data['avail_bikes'] <= shortage_threshold).astype(int)
    return data




In [None]:
# clean_citibike modifies the frame it is given and returns that same frame,
# so d_train / d_test are the same objects as data_train / data_test.
d_train, d_test = [clean_citibike(frame) for frame in (data_train, data_test)]

In [None]:
# Base rate: share of rows flagged as a shortage in each split.
train_base = d_train['shortage'].mean()
test_base = d_test['shortage'].mean()
print(train_base, test_base)

In [None]:
# Show the columns now present on the training frame (clean_citibike added
# its columns to data_train in place).
data_train.columns

In [None]:
#X: features
# 'station_id' and 'tot_docks' are deliberately left out of the model inputs.
features = ['time_x', 'time_y', '_lat', '_long',
            'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
            'Saturday', 'Sunday',
            'weekday', 'day_x', 'day_y', 'holiday']
# `.as_matrix()` no longer exists in pandas; `.to_numpy()` is its replacement.
# An explicit float dtype keeps the boolean day columns from turning the whole
# matrix into an object array.
X_train = d_train.loc[:, features].to_numpy(dtype=float)
X_test = d_test.loc[:, features].to_numpy(dtype=float)

#Y: target
Y_train = d_train['shortage'].to_numpy()
Y_test = d_test['shortage'].to_numpy()


In [None]:
# Baseline classifier: logistic regression with scikit-learn's default settings.
from sklearn.linear_model import LogisticRegression as lr
model = lr().fit(X_train, Y_train)
model


In [None]:
# Accuracy: fraction of X_test rows whose predicted label equals Y_test.
np.mean((model.predict(X_test) > 0) == Y_test)

# weather

In [None]:
weather_dir = '../Data/'
# Build the path from weather_dir (it was defined but never used).
weather_filedir = weather_dir + 'weather.csv'

weather_data = pd.read_csv(weather_filedir)

# Columns kept from the raw weather file.
# NOTE(review): 'presipi' is the label looked up in the CSV - confirm it
# matches the file's header spelling.
weather_features = ['pickup_datetime',
                    'tempi',    # temperature in Fahrenheit
                    'presipi',  # precipitation
                    'wspdi',    # wind speed in mph
                    'rain',     # rain, boolean
                    'snow',     # snow, boolean
                    'thunder',  # thunder, boolean
                    'visi']     # visibility in miles
# .copy() gives an independent frame, so later in-place edits of `weather`
# do not trigger SettingWithCopyWarning against weather_data.
weather = weather_data.loc[:, weather_features].copy()



In [None]:
# Replace every missing value with 0.
# NOTE(review): this also turns missing temperature / visibility readings
# into 0 - confirm that is acceptable.
weather = weather.fillna(0)

# Let pandas infer the timestamp layout. The previous explicit format string
# contained a stray colon ('%H:%M::%S') and only worked because of the
# `infer_datetime_format` fallback, which current pandas no longer applies.
weather['datetime'] = pd.to_datetime(weather['pickup_datetime'])
weather['date'] = weather['datetime'].dt.date
# NOTE(review): the +1 shifts hours to 1..24; confirm this matches the
# `hour_24` convention of the Citi Bike frame it is later joined to.
weather['hour_24'] = weather['datetime'].dt.hour + 1
weather['minute'] = weather['datetime'].dt.minute
weather = weather.drop(columns=['pickup_datetime', 'datetime'])

In [None]:
# Average each weather reading within every (date, hour_24) bucket.
weather_measures = ['tempi', 'presipi', 'wspdi', 'rain', 'snow', 'thunder', 'visi']
g = weather.groupby(['date', 'hour_24'])[weather_measures]
weather_by_hour = g.mean()

# After averaging, the flag columns are fractions; any positive value means
# the condition occurred at least once in that hour, so map back to 0/1.
for flag in ('rain', 'snow', 'thunder'):
    weather_by_hour[flag] = (weather_by_hour[flag] > 0).astype(int)

# Turn the (date, hour_24) index back into ordinary columns for merging.
weather_by_hour = weather_by_hour.reset_index()

weather_by_hour.head()

In [None]:
# Join the hourly weather table onto the training rows. A left join keeps
# every training row even when no weather bucket matches.
merged = data_train.merge(weather_by_hour, how='left', on=['date', 'hour_24'])

In [None]:
# Preview the joined frame instead of rendering all of it.
merged.head()

In [None]:
#X: features
# 'pickup_datetime' is the raw timestamp and is dropped before the hourly
# aggregation, so it is not a column of the merged frame.
weather_columns = [col for col in weather_features if col != 'pickup_datetime']
features_with_weather = features + weather_columns
feature = features_with_weather  # original name, kept in case later cells use it

# The test split needs the same weather join as the training split; otherwise
# the model would be scored on the rows it was fitted on.
merged_test = pd.merge(data_test, weather_by_hour, on=['date', 'hour_24'], how='left')

# Rows without a matching weather bucket come out of the left join as NaN,
# which LogisticRegression rejects. They are filled with 0, as the raw
# weather table was.
# NOTE(review): confirm 0 is an acceptable stand-in for missing weather.
X_train = merged.loc[:, features_with_weather].fillna(0).to_numpy(dtype=float)
X_test = merged_test.loc[:, features_with_weather].fillna(0).to_numpy(dtype=float)
#Y: target
Y_train = merged['shortage'].to_numpy()
Y_test = merged_test['shortage'].to_numpy()


In [None]:
# Refit logistic regression (default settings) on the current X_train / Y_train.
from sklearn.linear_model import LogisticRegression as lr
model = lr().fit(X_train, Y_train)
model


In [None]:
# Accuracy: fraction of X_test rows whose predicted label equals Y_test.
np.mean((model.predict(X_test) > 0) == Y_test)