In [None]:
%matplotlib inline
from matplotlib import pyplot as plt

import numpy as np
import pandas as pd
import os

In [None]:
# Relative path to train.csv inside the sibling `data` directory.
DATA_FILE = os.path.join('../data', 'train.csv')

In [None]:
# Read the CSV at DATA_FILE; the two timestamp columns are parsed into
# datetimes at load time so the `.dt` accessor can be used on them later.
data = pd.read_csv(
    DATA_FILE,
    parse_dates=['pickup_datetime', 'dropoff_datetime'],
)

In [None]:
# Show the first five rows as a quick sanity check of the load.
data.head(5)

In [None]:
# List the column names of the loaded frame.
data.columns

In [None]:
# Split the pickup timestamp into separate clock/calendar columns.
# Each pair is (new column name, attribute read from the `.dt` accessor).
pickup_dt = data['pickup_datetime'].dt
pickup_parts = [
    ('pickup_hr', 'hour'),
    ('pickup_min', 'minute'),
    ('pickup_sec', 'second'),
    ('pickup_day', 'dayofweek'),  # day of the week
    ('pickup_date', 'day'),       # day of the month
    ('pickup_mon', 'month'),
    ('pickup_yr', 'year'),
]
for column_name, dt_attribute in pickup_parts:
    data[column_name] = getattr(pickup_dt, dt_attribute)

In [None]:
# Straight-line distance between pickup and dropoff, computed directly on the
# coordinate differences (so the value is in coordinate units, not a ground
# distance).
lon_delta = data['pickup_longitude'] - data['dropoff_longitude']
lat_delta = data['pickup_latitude'] - data['dropoff_latitude']
data = data.assign(st_dist=np.sqrt(lon_delta**2 + lat_delta**2))

In [None]:
# Re-list the columns to confirm the pickup_* and st_dist features are present.
data.columns

In [None]:
# Scatter of raw coordinates: pickups on the top panel (red dots),
# dropoffs on the bottom panel (blue dots).
pickup_ax = plt.subplot(2, 1, 1)
pickup_ax.plot(data['pickup_longitude'], data['pickup_latitude'], 'r.')
pickup_ax.set_ylabel('latitude')
pickup_ax.set_title('Pickup')

dropoff_ax = plt.subplot(2, 1, 2)
dropoff_ax.plot(data['dropoff_longitude'], data['dropoff_latitude'], 'b.')
dropoff_ax.set_ylabel('latitude')
dropoff_ax.set_xlabel('longitude')
dropoff_ax.set_title('Dropoff')

plt.show()

In [None]:
# Min-max scale each coordinate column into [0, 1]; the scaled copy is stored
# under the same name with an `s_` prefix.
coordinate_columns = [
    'pickup_latitude',
    'pickup_longitude',
    'dropoff_latitude',
    'dropoff_longitude',
]
for coord in coordinate_columns:
    lowest = data[coord].min()
    highest = data[coord].max()
    data['s_' + coord] = (data[coord] - lowest) / (highest - lowest)

In [None]:
# Same two-panel scatter as for the raw coordinates, but using the min-max
# scaled `s_*` columns.
pickup_ax = plt.subplot(2, 1, 1)
pickup_ax.plot(data['s_pickup_longitude'], data['s_pickup_latitude'], 'r.')
pickup_ax.set_ylabel('latitude')
pickup_ax.set_title('Pickup')

dropoff_ax = plt.subplot(2, 1, 2)
dropoff_ax.plot(data['s_dropoff_longitude'], data['s_dropoff_latitude'], 'b.')
dropoff_ax.set_ylabel('latitude')
dropoff_ax.set_xlabel('longitude')
dropoff_ax.set_title('Dropoff')

plt.show()

In [None]:
# Encode store_and_fwd_flag as an integer: 1 where the value is 'Y', else 0.
is_store_and_fwd = data['store_and_fwd_flag'].eq('Y')
data['flag'] = is_store_and_fwd.astype(int)

In [None]:
# 0/1 indicators for pickups on the last / first day of a calendar month.
pickup_accessor = data['pickup_datetime'].dt
data['month_end'] = pickup_accessor.is_month_end.astype(int)
data['month_start'] = pickup_accessor.is_month_start.astype(int)

In [None]:
# Histogram of the 0/1 month_start indicator.
data['month_start'].hist()

In [None]:
# Plot trip_duration against each month-boundary indicator for the first
# 1000 rows only: month_end on top (red), month_start below (blue).
first_rows = data.iloc[:1000]

month_end_ax = plt.subplot(2, 1, 1)
month_end_ax.plot(first_rows['trip_duration'], first_rows['month_end'], 'r.')

month_start_ax = plt.subplot(2, 1, 2)
month_start_ax.plot(first_rows['trip_duration'], first_rows['month_start'], 'b.')

plt.show()

In [None]:
# Histogram of the 0/1 store-and-forward indicator.
data['flag'].hist()

In [None]:
# Histogram of the pickup year values.
data['pickup_yr'].hist()

In [None]:
from pandas.tseries.holiday import USFederalHolidayCalendar
from datetime import datetime

In [None]:
# Collect the US federal holidays inside the window below, then flag every
# trip whose pickup calendar day is one of them (1 = holiday, 0 = not).
cal = USFederalHolidayCalendar()
holidays = cal.holidays(
    start=datetime(2015, 12, 31),
    end=datetime(2017, 1, 1),
)

# Drop the time-of-day part so the comparison is made on whole days.
pickup_day_start = data['pickup_datetime'].dt.date.astype('datetime64[ns]')
is_holiday = pickup_day_start.isin(holidays)
data['holiday'] = is_holiday.astype(int)

In [None]:
# Histogram of the 0/1 holiday indicator.
data['holiday'].hist()

In [None]:
from haversine import haversine

def haversine_dist(pos):
    """Return the length of an L-shaped path between two points, divided by 1000.

    The path goes from (pos[0], pos[1]) to the corner (pos[2], pos[1]) and
    then on to (pos[2], pos[3]); each leg is measured with `haversine` and
    the two leg lengths are summed.

    Parameters
    ----------
    pos : indexable of four numbers
        Callers in this notebook pass
        [pickup_lat, pickup_lon, dropoff_lat, dropoff_lon].

    Returns
    -------
    float
        (first leg + second leg) / 1000.

    NOTE(review): the `haversine` package documents kilometres as its default
    unit, in which case this result is in thousands of km rather than km.
    Confirm the intended unit; the later histogram ranges rely on the
    current scale.
    """
    start = (pos[0], pos[1])
    corner = (pos[2], pos[1])
    end = (pos[2], pos[3])
    first_leg = haversine(start, corner)
    second_leg = haversine(corner, end)
    return (first_leg + second_leg) / 1000

In [None]:
# Coordinates in the order haversine_dist indexes them:
# pickup (lat, lon) followed by dropoff (lat, lon).
positions = data[['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']]
# Build a concrete list: on Python 3 `map` returns a lazy iterator, which is
# not a valid column value. `.values` replaces `as_matrix()`, which was
# removed in pandas 1.0; both yield the same row-wise ndarray.
data['h_dist'] = [haversine_dist(p) for p in positions.values]

In [None]:
# Histogram of h_dist using 50 bins.
data['h_dist'].hist(bins=50)

In [None]:
from matplotlib import pyplot as plt

# Overlay three normalised h_dist histograms, each restricted to its own
# value range, so the short, medium and long ends can be compared on one axes.
# `density=True` replaces the `normed=True` keyword, which newer matplotlib
# releases no longer accept; the normalisation is the same.
plt.hist(data['h_dist'], bins=50, density=True, range=(-0.001, 0.04))
plt.hist(data['h_dist'], bins=50, density=True, range=(0.04, 1))
plt.hist(data['h_dist'], bins=50, density=True, range=(1, 1.4))
plt.show()

In [None]:
def travel_direction(positions):
    """Return the angle, in degrees, of the pickup-minus-dropoff offset.

    Parameters
    ----------
    positions : indexable of four numbers
        Used in this notebook as
        [pickup_lat, pickup_lon, dropoff_lat, dropoff_lon].

    Returns
    -------
    float
        degrees(arctan2(positions[1] - positions[3], positions[0] - positions[2]))
        rounded to two decimals. With the layout above, the longitude
        difference is the first arctan2 argument and the latitude difference
        the second, so 0 means a pure +latitude offset and 90 a pure
        +longitude offset. The result lies in [-180, 180].
    """
    y = positions[1] - positions[3]
    x = positions[0] - positions[2]
    deg = np.degrees(np.arctan2(y, x))
    return np.round(deg, 2)

# Quick check on a single trip. The call form of print is valid on both
# Python 2 and Python 3; the statement form is a SyntaxError on Python 3.
print(travel_direction([40.738564, -73.980415, 40.731152, -73.999481]))

In [None]:
# One direction value per row of `positions`. A list is built explicitly
# because `map` is lazy on Python 3, and `.values` stands in for
# `as_matrix()`, which was removed in pandas 1.0.
data['travel_direction'] = [travel_direction(row) for row in positions.values]

In [None]:
# Histogram of travel_direction (degrees) using 25 bins.
data['travel_direction'].hist(bins=25)

In [None]:
# Stack pickup and dropoff (longitude, latitude) pairs into a single
# two-column array so both kinds of endpoint are clustered together.
# `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
df1 = data[['pickup_longitude', 'pickup_latitude']].values
df2 = data[['dropoff_longitude', 'dropoff_latitude']].values
features = np.vstack([df1, df2])

In [None]:
from sklearn.cluster import MiniBatchKMeans

In [None]:
# Mini-batch k-means with 16 clusters over the endpoint coordinates.
# A fixed random_state makes the cluster labels (and every feature derived
# from them) reproducible across kernel restarts; with random_state=None the
# labels change from run to run.
cluster = MiniBatchKMeans(n_clusters=16, max_iter=500, batch_size=100000, verbose=0, compute_labels=True,
                          random_state=42, tol=0.0, max_no_improvement=10, n_init=3, reassignment_ratio=0.005)

In [None]:
# Fit the clusterer on the stacked pickup + dropoff coordinates.
cluster.fit(features)

In [None]:
# Histogram of the labels assigned to the rows of `features` during fit.
plt.hist(cluster.labels_, bins=50)

In [None]:
# Assign each trip's pickup and dropoff point to its nearest cluster.
# The model was fitted on a plain ndarray, so plain ndarrays are passed here
# too; handing DataFrames to predict makes recent scikit-learn versions warn
# about feature names that were not seen during fit.
data['pickup_cluster_label'] = cluster.predict(data[['pickup_longitude', 'pickup_latitude']].values)
data['dropoff_cluster_label'] = cluster.predict(data[['dropoff_longitude', 'dropoff_latitude']].values)

In [None]:
# Look up the centre of the cluster each endpoint belongs to. The model was
# fitted on (longitude, latitude) pairs, so centre column 0 is longitude and
# column 1 is latitude.
# `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
centers = cluster.cluster_centers_

idx = data['pickup_cluster_label'].values
data['pickup_cluster_longitude'] = centers[idx, 0]
data['pickup_cluster_latitude'] = centers[idx, 1]

idx = data['dropoff_cluster_label'].values
data['dropoff_cluster_longitude'] = centers[idx, 0]
data['dropoff_cluster_latitude'] = centers[idx, 1]

In [None]:
# Distance between the pickup and dropoff cluster centres, using the same
# haversine_dist helper as h_dist. Note that this rebinds `positions` to the
# cluster-centre coordinates.
positions = data[['pickup_cluster_latitude', 'pickup_cluster_longitude',
                  'dropoff_cluster_latitude', 'dropoff_cluster_longitude']]
# A list is built explicitly because `map` is lazy on Python 3; `.values`
# replaces `as_matrix()`, which was removed in pandas 1.0.
data['cluster_dist'] = [haversine_dist(p) for p in positions.values]

In [None]:
# Preview the first rows, now including the engineered columns.
data.head()

In [None]:
# Per-trip table used for traffic counts: cluster labels plus the pickup
# hour / month / day-of-month. `.copy()` gives an independent frame, so
# adding a column below neither raises SettingWithCopyWarning nor depends on
# whether pandas returned a view of `data`.
traffic = data[['pickup_cluster_label', 'dropoff_cluster_label', 'pickup_hr',
                'pickup_mon', 'pickup_date']].copy()
# Constant 1.0 per trip.
traffic['val'] = np.ones(traffic.shape[0])

In [None]:
# Count rows per (month, day of month, hour, dropoff cluster). The remaining
# columns (pickup_cluster_label, val) each hold the non-null count for the group.
traffic_group_keys = ['pickup_mon', 'pickup_date', 'pickup_hr', 'dropoff_cluster_label']
grp = traffic.groupby(traffic_group_keys).count()

In [None]:
# Display the grouped counts.
grp

In [None]:
# Check what kind of object the groupby aggregation produced.
type(grp)