## Taxi Trip Duration Data Cleaning 

In [1]:
import pandas as pd

In [2]:
# Load the raw trip records; 'train.csv' is resolved relative to the
# notebook's working directory.
train = pd.read_csv('train.csv')

In [3]:
# (rows, columns) of the raw data
train.shape

(1458644, 11)

In [4]:
# Column dtypes, non-null counts and memory footprint of the raw data
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
id                    1458644 non-null object
vendor_id             1458644 non-null int64
pickup_datetime       1458644 non-null object
dropoff_datetime      1458644 non-null object
passenger_count       1458644 non-null int64
pickup_longitude      1458644 non-null float64
pickup_latitude       1458644 non-null float64
dropoff_longitude     1458644 non-null float64
dropoff_latitude      1458644 non-null float64
store_and_fwd_flag    1458644 non-null object
trip_duration         1458644 non-null int64
dtypes: float64(4), int64(3), object(4)
memory usage: 122.4+ MB


In [5]:
# Draw a 10% random sample of the rows; the fixed random_state makes the
# draw repeatable across runs.
df = train.sample(frac=0.1, random_state=12345)
df.shape

(145864, 11)

In [6]:
# Express the duration in minutes (trip_duration is recorded in seconds) and
# keep only the trips that lasted less than 2 hours (120 minutes).
df['trip_duration_min'] = df['trip_duration'].div(60)
df = df.loc[df['trip_duration_min'].lt(120)]

In [7]:
# extract weekday and hour info from datetime columns
# Both timestamp columns are loaded as strings (object dtype); parse them to
# datetime64 so the .dt accessor can be used below.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])

# Hour of day (0-23) and English day name ('Monday' ... 'Sunday') for each end
# of the trip. dt.day_name() replaces dt.weekday_name, which was deprecated in
# pandas 0.23 and removed in pandas 1.0.
df['pickup_hour'] = df['pickup_datetime'].dt.hour
df['pickup_weekday'] = df['pickup_datetime'].dt.day_name()
df['dropoff_hour'] = df['dropoff_datetime'].dt.hour
df['dropoff_weekday'] = df['dropoff_datetime'].dt.day_name()

df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,trip_duration_min,pickup_hour,pickup_weekday,dropoff_hour,dropoff_weekday
676091,id3159139,1,2016-05-15 02:27:17,2016-05-15 02:30:13,1,-73.982048,40.727985,-73.977142,40.733807,N,176,2.933333,2,Sunday,2,Sunday
259910,id2239505,2,2016-06-01 12:20:08,2016-06-01 12:31:09,1,-73.974159,40.762939,-73.97654,40.759239,N,661,11.016667,12,Wednesday,12,Wednesday
1953,id3478777,1,2016-04-24 15:52:41,2016-04-24 16:19:18,1,-73.961739,40.81115,-73.973663,40.757576,N,1597,26.616667,15,Sunday,16,Sunday
1220368,id3115554,2,2016-02-09 07:32:23,2016-02-09 07:41:55,6,-73.967339,40.766239,-73.956276,40.784359,N,572,9.533333,7,Tuesday,7,Tuesday
581265,id2075806,2,2016-05-26 02:05:09,2016-05-26 02:09:06,1,-73.999649,40.728306,-73.996376,40.737839,N,237,3.95,2,Thursday,2,Thursday


In [8]:
# Export cleaned data as csv
# `index` is left at its default, so the DataFrame index (the row labels
# carried over from `train` by the sampling step) is written as the first,
# unnamed column of the file.
df.to_csv('taxi_sample.csv')