In [1]:
import numpy as np
from pandas import read_csv, DataFrame
import matplotlib.pyplot as plt
%matplotlib inline

In [1]:
def distance(lat1, lng1, lat2, lng2):
    #return distance as meter if you want km distance, remove "* 1000"
    radius = 6371

    dLat = (lat2-lat1) * np.pi / 180
    dLng = (lng2-lng1) * np.pi / 180

    lat1 = lat1 * np.pi / 180
    lat2 = lat2 * np.pi / 180

    val = np.sin(dLat/2) * np.sin(dLat/2) + np.sin(dLng/2)\
    * np.sin(dLng/2) * np.cos(lat1) * np.cos(lat2)    
    ang = 2 * np.arctan2(np.sqrt(val), np.sqrt(1-val))
    return radius * ang

In [2]:
def transform_train(train_df):
    
    # Remove missing values
    train_df = train_df.dropna(how = 'any', axis = 'rows')
    
    # Remove key column
    #train_df = train_df.drop(['key'], axis=1)
    
    # Remove absurd passenger_count
    #train_df = train_df[(train_df['passenger_count'] >= 1) &
    #                (train_df['passenger_count'] <= 6)]

    # Remove passenger_count column
    #train_df = train_df.drop(['passenger_count'], axis=1)

    # Remove negative and extreme fare_amount values
    train_df = train_df[(train_df['fare_amount'] >= 2.5) & (train_df['fare_amount'] <= 300)]
    print('Maximum fare_amount: %.1f' % train_df['fare_amount'].max())

    # Remove no displacements
    train_df = train_df[(train_df['pickup_latitude'] != train_df['dropoff_latitude'])]
    train_df = train_df[(train_df['pickup_longitude'] != train_df['dropoff_longitude'])]

    # Remove absurd displacements
    # A fixed value of 0.01 was added to the extreme coordinates in the test set
    # to permit finding neighbors within a radius < 0.01 around
    # pickup and dropoff locations
    train_df = train_df[(train_df['pickup_longitude'] >= lng1min+0.01) &
                        (train_df['pickup_longitude'] <= lng1max+0.01)]
    train_df = train_df[(train_df['dropoff_longitude'] >= lng2min+0.01) &
                        (train_df['dropoff_longitude'] <= lng2max+0.01)]
    train_df = train_df[(train_df['pickup_latitude'] >= lat1min+0.01) &
                        (train_df['pickup_latitude'] <= lat1max+0.01)]
    train_df = train_df[(train_df['dropoff_latitude'] >= lat2min+0.01) &
                        (train_df['dropoff_latitude'] <= lat2max+0.01)]

    # Create new features - dayofweek,hour,month,year
    #train_df['dayofweek'] = train_df['pickup_datetime'].dt.dayofweek.astype('uint8')
    #train_df['weekend'] = np.where(train_df.dayofweek < 5,0,1).astype('uint8')
    train_df['hour'] = train_df['pickup_datetime'].dt.hour.astype('uint8')
    train_df['hour_circ'] = np.where(train_df['hour'] <= 12,
                                (train_df['hour'] - 0)/12,
                                (24 - train_df['hour'])/12).astype('float32')
    #train_df = train_df.drop(['dayofweek'], axis=1)
    #bins = IntervalIndex.from_tuples([(0, 6),
    #                                 (6, 12),
    #                                 (12, 16),
    #                                 (16,24)], closed='left')
    #train_df['h_class'] = cut(train_df['hour'], bins, labels=False)
    #train_df['h_class'] = np.where((train_df['hour'] >= 0) & (train_df['hour'] <= 6),1,0).astype('uint8')
    #train_df['h_class'] = np.where((train_df['hour'] >= 6) & (train_df['hour'] <= 12),2,train_df['h_class'])
    #train_df['h_class'] = np.where((train_df['hour'] >= 12) & (train_df['hour'] <= 16),3,train_df['h_class'])
    #train_df['h_class'] = np.where((train_df['hour'] >= 16) & (train_df['hour'] <= 20),4,train_df['h_class'])
    #train_df['h_class'] = np.where((train_df['hour'] >= 20) & (train_df['hour'] < 24),5,train_df['h_class'])
    #train_df['day'] = train_df['pickup_datetime'].dt.day.astype('uint8')
    #train_df['month'] = train_df['pickup_datetime'].dt.month.astype('uint8')
    #train_df['year'] = train_df['pickup_datetime'].dt.year.astype('uint16')

    # Create features for special rates
    #train_df['eightpmto6am'] = np.where((train_df['hour'] >= 6) &
    #                                (train_df['hour'] < 20),0,1).astype('uint8')
    #train_df['holiday'] = 
    #train_df['rushhour'] = np.where((train_df['hour'] >= 16) &
    #                                (train_df['hour'] < 20) &
    #                                (train_df['weekend'] == 0),1,0).astype('uint8')
    train_df = train_df.drop(['hour'], axis=1)

    # Create dataframes for the two periods
    train_df['period'] = np.where(train_df['pickup_datetime'] < '2012-09-01',0,1).astype('uint8')
    
    # Save memory
    train_df = train_df.drop(['pickup_datetime'], axis=1)
    print(train_df.shape)
    return train_df

In [3]:
# Read the test set
test_df = read_csv('test.csv', parse_dates=['pickup_datetime'], infer_datetime_format=True)
test_df.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44,-73.982524,40.75126,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12,-73.98116,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12,-73.966046,40.789775,-73.988565,40.744427,1


In [4]:
# Calculate the extreme values for longitude and latitude in the test set
# to constrain the training set
lng1min = test_df.pickup_longitude.min()
lng2min = test_df.dropoff_longitude.min()
lat1min = test_df.pickup_latitude.min()
lat2min = test_df.dropoff_latitude.min()
#
lng1max = test_df.pickup_longitude.max()
lng2max = test_df.dropoff_longitude.max()
lat1max = test_df.pickup_latitude.max()
lat2max = test_df.dropoff_latitude.max()

In [5]:
types = {'fare_amount': 'float32',
         'pickup_longitude': 'float32',
         'pickup_latitude': 'float32',
         'dropoff_longitude': 'float32',
         'dropoff_latitude': 'float32'
         #'passenger_count': 'uint8'
        }

cols = ['fare_amount', 'pickup_datetime',
        'pickup_longitude', 'pickup_latitude',
        'dropoff_longitude', 'dropoff_latitude',
        
       #'passenger_count'
       ]

train_df = read_csv('train100K.csv', parse_dates=['pickup_datetime'], infer_datetime_format=True,
                   dtype = types, usecols=cols)

print(train_df.shape)

(100000, 6)


In [6]:
train_df = transform_train(train_df)

Maximum fare_amount: 250.0
(96756, 7)


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96756 entries, 0 to 99999
Data columns (total 7 columns):
fare_amount          96756 non-null float32
pickup_longitude     96756 non-null float32
pickup_latitude      96756 non-null float32
dropoff_longitude    96756 non-null float32
dropoff_latitude     96756 non-null float32
hour_circ            96756 non-null float32
period               96756 non-null uint8
dtypes: float32(6), uint8(1)
memory usage: 3.0 MB


In [8]:
train_df.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,hour_circ,period
0,10.5,-73.978989,40.76162,-73.963959,40.770561,0.916667,1
1,7.5,-73.980347,40.763069,-73.988686,40.753979,0.916667,1
2,4.5,-74.002281,40.726578,-74.009583,40.723598,0.166667,0
3,14.5,-73.9328,40.7589,-73.995003,40.736599,0.083333,0
4,13.0,-73.975418,40.76107,-73.998444,40.724892,1.0,1
