In [359]:
import pandas as pd
import numpy as np
import datetime as dt
from sklearn import preprocessing
import matplotlib.pyplot as plt

In [360]:
weather = pd.read_csv('../data/KNYC.txt')

getTimeWeather = lambda x: dt.datetime.strptime(x, '%m/%d/%Y %H:%M:%S')

weather.Date = [getTimeWeather(x) for x in weather.Date ]

weather.Date = [x.replace(hour = y) for x, y in zip(weather.Date, weather.Hour) ]

weather_useful = weather[['Date', 'Temperature', 'Precip']]

weather_useful = weather_useful[weather_useful.Date > dt.datetime(year = 2015, month = 1, day = 1)]

weather_useful.index = weather_useful.Date

In [362]:
weather_simple = weather_useful.drop('Date',  axis = 1)

weather_simple = weather_simple.resample('30min').mean().bfill()

In [363]:
def preprocess_data(data_set_name):
    data = pd.read_csv('../data/data_{}.csv'.format(data_set_name))
    getTime = lambda x: dt.datetime.strptime(x[:-5], '%Y-%m-%d %H:%M:%S')
    data.dropna(axis = 0, inplace=True)
    data['starttime'] = [getTime(x) for x in data['starttime']]
    data['stoptime'] = [getTime(x) for x in data['stoptime']]
    data['downsample'] = data['starttime']
    data['downsample'] = [x.replace(minute = 0, second = 0) for x in data['downsample']]
    data = data.merge(weather_useful, how = 'left', left_on = 'downsample', right_on = 'Date')
    data.drop(columns=['downsample', 'Date'], axis = 1, inplace= True)
    return data
    

In [364]:
data_train = preprocess_data('train')
data_train = data_train[data_train['tripduration'] < 50000]
data_train.to_pickle('../data/train.pkl')

Defaulting to column, but this will raise an ambiguity error in a future version
  if __name__ == '__main__':


In [365]:
data_val = preprocess_data('val')
data_val = data_val[data_val['tripduration'] < 50000]
data_val.to_pickle('../data/val.pkl')

Defaulting to column, but this will raise an ambiguity error in a future version
  if __name__ == '__main__':


In [366]:
data_test = preprocess_data('test')

data_test = data_test[data_test['tripduration'] < 50000]
data_test.to_pickle('../data/test.pkl')

Defaulting to column, but this will raise an ambiguity error in a future version
  if __name__ == '__main__':


In [371]:
# feature: area_id
# list areas = []
def transform(dataset, window = 30):
    nets = []
    for i in range(1, 32):
        inflow = dataset[dataset['end region']==i].stoptime
        outflow = dataset[dataset['end region']==i].starttime
        inflow.index = inflow
        outflow.index = outflow
        inflow = inflow.resample('{}min'.format(window)).count()
        outflow = outflow.resample('{}min'.format(window)).count()
        net = pd.DataFrame((inflow - outflow), columns = ['net_inflow'])
        net['area_id'] = i
        net['AR1'] = net['net_inflow'].shift(1)
        net['AR2'] = net['net_inflow'].shift(2)
        net['AR3'] = net['net_inflow'].shift(3)
        nets.append(net)
    nets = pd.concat(nets, axis = 0)
    nets.fillna(0, inplace = True)
    nets = nets.merge(weather_simple, left_index=True, right_index=True, how = 'left')
    return nets


        

In [372]:
net_train = transform(data_train)

net_val = transform(data_val)

net_test = transform(data_test)

In [373]:
net_train.to_pickle('../data/net_train.pkl')
net_val.to_pickle('../data/net_val.pkl')
net_test.to_pickle('../data/net_test.pkl')