In [10]:
import pandas as pd
import numpy as np

In [2]:
# Loading 10M rows to create an overview of the data
NUM_ROWS = 10000000

In [3]:
train_df = pd.read_csv('../dataset/train.csv', nrows=NUM_ROWS)

In [5]:
# Fetching the names of the columns that are present in the data
columns = train_df.columns

# Getting the count to see if the correct amout of data was retrieved
# Also getting the dtypes of each column to get an idea of the data
train_df.count(), train_df.dtypes

(key                  10000000
 fare_amount          10000000
 pickup_datetime      10000000
 pickup_longitude     10000000
 pickup_latitude      10000000
 dropoff_longitude     9999931
 dropoff_latitude      9999931
 passenger_count      10000000
 dtype: int64, key                   object
 fare_amount          float64
 pickup_datetime       object
 pickup_longitude     float64
 pickup_latitude      float64
 dropoff_longitude    float64
 dropoff_latitude     float64
 passenger_count        int64
 dtype: object)

In [6]:
# Points to note
#    -> something's off with dropoff_longitude and dropoff_latitide
# scan for null or missing values in both columns
train_df.isnull().any()

key                  False
fare_amount          False
pickup_datetime      False
pickup_longitude     False
pickup_latitude      False
dropoff_longitude     True
dropoff_latitude      True
passenger_count      False
dtype: bool

In [15]:
# for sure there are null values in the dropoff_longitude and dropoff_latitude
# These do not seem to be data in which null values can be substituted.
# Removing these value for now, will come to it later.
null_value_columns = ['dropoff_longitude', 'dropoff_latitude']

#fetching indexes to be dropped
drop_index_longitude = train_df[train_df['dropoff_longitude'].isnull()].index
drop_index_latitude = train_df[train_df['dropoff_latitude'].isnull()].index
drop_indexes = np.array(list(set(drop_index_longitude).union(set(drop_index_latitude))))
drop_indexes

array([4835072, 9060096, 8131337, 4165644, 9496338, 4789267, 8913939,
       2664981,  524834, 8631332, 2747686, 2455848, 2637865, 3310378,
       9028651, 2267436, 4236846, 8862512, 6442547,  340533, 6660408,
       9536062, 6678592, 9354560, 6189379,  794694,  574023, 1882440,
        428108, 9754957, 6571093, 3700567, 6501722, 4854887, 7844202,
       8891498, 1220978, 8190328, 5784187, 3244924, 7191178, 8552586,
       9715861, 4114839, 6358428,  120227, 3952804, 5616035,  895400,
       2455721, 9699243,  471472, 3162290, 4617652, 8160692, 1476796,
       2277566,  245696, 2794177, 3941824, 5591752, 6269652, 9088217,
       1521628, 9609188,  580338, 2087156, 9145845, 9093119])

In [16]:
train_df.drop(drop_indexes, inplace=True)
train_df.count()

key                  9999931
fare_amount          9999931
pickup_datetime      9999931
pickup_longitude     9999931
pickup_latitude      9999931
dropoff_longitude    9999931
dropoff_latitude     9999931
passenger_count      9999931
dtype: int64

In [17]:
# Since most of the column datatype is numerical value, it should be safe to use describe here. 
train_df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,9999931.0,9999931.0,9999931.0,9999931.0,9999931.0,9999931.0
mean,11.33849,-72.50778,39.91936,-72.50897,39.91913,1.684805
std,9.799845,12.99413,9.322519,12.87532,9.23728,1.323421
min,-107.75,-3439.245,-3492.264,-3426.601,-3488.08,0.0
25%,6.0,-73.99207,40.73491,-73.99139,40.73403,1.0
50%,8.5,-73.98181,40.75263,-73.98016,40.75316,1.0
75%,12.5,-73.9671,40.76712,-73.96367,40.7681,2.0
max,1273.31,3457.626,3344.459,3457.622,3351.403,208.0


In [21]:
# I don't like scientific notation here, let's set the format type with pandas before calling describe API
# Credit to Stack Overflow here: 
# https://stackoverflow.com/questions/21137150/format-suppress-scientific-notation-from-python-pandas-aggregation-results/21140339
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [22]:
train_df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,9999931.0,9999931.0,9999931.0,9999931.0,9999931.0,9999931.0
mean,11.338,-72.508,39.919,-72.509,39.919,1.685
std,9.8,12.994,9.323,12.875,9.237,1.323
min,-107.75,-3439.245,-3492.264,-3426.601,-3488.08,0.0
25%,6.0,-73.992,40.735,-73.991,40.734,1.0
50%,8.5,-73.982,40.753,-73.98,40.753,1.0
75%,12.5,-73.967,40.767,-73.964,40.768,2.0
max,1273.31,3457.626,3344.459,3457.622,3351.403,208.0
