In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
# Changing the format in which the 'describe' API returns data information. 
# Scientific notation is not very useful for a quick overview
# Credit to Stack Overflow here: 
# https://stackoverflow.com/questions/21137150/format-suppress-scientific-notation-from-python-pandas-aggregation-results/21140339
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [3]:
# Loading 10M rows to create an overview of the data
NUM_ROWS = 10000000

In [4]:
train_df = pd.read_csv('../dataset/train.csv', nrows=NUM_ROWS)

In [5]:
# Since the value of key is unique and is of no use in the final prediction. Removing it from the data
# processing to use it as key
train_df.set_index('key', drop=True, inplace=True)

In [6]:
# Fetching the names of the columns that are present in the data
columns = train_df.columns

# Getting the count to see if the correct amout of data was retrieved
# Also getting the dtypes of each column to get an idea of the data
train_df.count(), train_df.dtypes

(fare_amount          10000000
 pickup_datetime      10000000
 pickup_longitude     10000000
 pickup_latitude      10000000
 dropoff_longitude     9999931
 dropoff_latitude      9999931
 passenger_count      10000000
 dtype: int64, fare_amount          float64
 pickup_datetime       object
 pickup_longitude     float64
 pickup_latitude      float64
 dropoff_longitude    float64
 dropoff_latitude     float64
 passenger_count        int64
 dtype: object)

In [7]:
# Points to note
#    -> something's off with dropoff_longitude and dropoff_latitide
# scan for null or missing values in both columns
train_df.isnull().any()

fare_amount          False
pickup_datetime      False
pickup_longitude     False
pickup_latitude      False
dropoff_longitude     True
dropoff_latitude      True
passenger_count      False
dtype: bool

In [8]:
# for sure there are null values in the dropoff_longitude and dropoff_latitude
# These do not seem to be data in which null values can be substituted.
# Removing these value for now, will come to it later.
null_value_columns = ['dropoff_longitude', 'dropoff_latitude']

#fetching indexes to be dropped
drop_index_longitude = train_df[train_df['dropoff_longitude'].isnull()].index
drop_index_latitude = train_df[train_df['dropoff_latitude'].isnull()].index
drop_indexes = np.array(list(set(drop_index_longitude).union(set(drop_index_latitude))))
drop_indexes

array(['2011-11-18 20:26:12.0000004', '2012-12-11 12:00:48.0000004',
       '2011-06-20 11:15:44.0000003', '2012-12-11 12:57:00.00000013',
       '2012-07-18 13:59:42.0000003', '2013-09-05 00:02:14.0000003',
       '2012-12-11 12:58:32.0000003', '2013-10-10 07:48:01.0000002',
       '2013-07-04 03:50:29.0000001', '2011-06-20 11:34:44.0000001',
       '2012-12-11 10:14:13.0000004', '2012-12-11 13:25:30.0000001',
       '2012-12-11 12:34:20.0000006', '2012-12-11 11:45:18.0000004',
       '2011-11-29 17:40:35.0000001', '2012-12-11 12:20:25.0000004',
       '2013-11-05 22:08:03.0000007', '2013-06-20 20:55:19.0000002',
       '2011-03-03 11:06:46.0000001', '2013-10-10 08:07:16.0000003',
       '2011-11-06 20:54:08.0000002', '2013-11-05 06:31:08.0000001',
       '2013-03-21 18:07:07.0000001', '2013-11-15 07:35:14.0000002',
       '2012-12-11 13:05:09.0000003', '2012-12-11 13:06:59.0000004',
       '2013-11-04 20:59:15.0000001', '2012-12-11 12:50:52.00000010',
       '2013-03-14 10:22:15.0000

In [9]:
train_df.drop(drop_indexes, inplace=True)
train_df.count()

fare_amount          9999931
pickup_datetime      9999931
pickup_longitude     9999931
pickup_latitude      9999931
dropoff_longitude    9999931
dropoff_latitude     9999931
passenger_count      9999931
dtype: int64

In [10]:
train_df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,9999931.0,9999931.0,9999931.0,9999931.0,9999931.0,9999931.0
mean,11.338,-72.508,39.919,-72.509,39.919,1.685
std,9.8,12.994,9.323,12.875,9.237,1.323
min,-107.75,-3439.245,-3492.264,-3426.601,-3488.08,0.0
25%,6.0,-73.992,40.735,-73.991,40.734,1.0
50%,8.5,-73.982,40.753,-73.98,40.753,1.0
75%,12.5,-73.967,40.767,-73.964,40.768,2.0
max,1273.31,3457.626,3344.459,3457.622,3351.403,208.0


In [11]:
# looking at the passenger_count column, it seems bizzare that a cab had 208 passengers.
# let's see what the distribution over passenger count looks like
train_df.groupby('passenger_count')['passenger_count'].count()

passenger_count
0        35263
1      6917305
2      1476610
3       439133
4       212015
5       707721
6       211867
7            2
9            3
49           1
51           1
129          1
208          9
Name: passenger_count, dtype: int64

In [12]:
# Since multiple cab rides are reporting the exact same number of extreme passenger count, 
# it seems like there might be two possibilities: 
#    1. These are outlires that were generated from the same taxi due to some technical fault
#    2. The data might be referring to some special taxi ( or a bus ) that would be able to seat more than 6 people.
# Either way, these are anomalies and might end up skewing are result. It should
# be safe to ignore these values for now.
drop_indexes = train_df[train_df.passenger_count > 6].index
train_df.drop(drop_indexes, inplace=True)
train_df.groupby('passenger_count')['passenger_count'].count()

passenger_count
0      35263
1    6917305
2    1476610
3     439133
4     212015
5     707721
6     211867
Name: passenger_count, dtype: int64

In [13]:
# Looking at the summary of the dataframe once again: 
train_df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,9999914.0,9999914.0,9999914.0,9999914.0,9999914.0,9999914.0
mean,11.338,-72.508,39.919,-72.509,39.919,1.685
std,9.8,12.994,9.322,12.86,9.237,1.308
min,-107.75,-3439.245,-3492.264,-3426.601,-3488.08,0.0
25%,6.0,-73.992,40.735,-73.991,40.734,1.0
50%,8.5,-73.982,40.753,-73.98,40.753,1.0
75%,12.5,-73.967,40.767,-73.964,40.768,2.0
max,1273.31,3457.626,3344.459,3457.622,3351.403,6.0


In [14]:
# The passenger count also contains the value 0 for about .35 percent of the entries.
# The entry count ( around 35000 ) does not seem to be small enough that it can be 
# ignored without a second thought, and
# it could be encoding a special meaning, like the driver driving around looking for rides
# let's keep it as is for now. We'll revisit this later.

In [15]:
# Another column that looks interesting is fare_amount. From the first look
# it looks like it might contain a negative values, which does not make sense. let's have a
# deeper look into the data to figure out what might be going on here. 

#Let's start with getting the number of entries with a negetive fare_amount
neg_fare_df = train_df[train_df.fare_amount < 0.0]
neg_fare_df.count()

fare_amount          420
pickup_datetime      420
pickup_longitude     420
pickup_latitude      420
dropoff_longitude    420
dropoff_latitude     420
passenger_count      420
dtype: int64

In [16]:
# 420 does not seem to be that significant a number considering the size of the sample set. 
# should be safe to ignore, but let's still explore the data a little further to see if the 
# negative value might hold some special significance or pattern that might be important to us

neg_fare_df.head()

Unnamed: 0_level_0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-03-09 23:37:10.0000005,-2.9,2010-03-09 23:37:10 UTC,-73.789,40.643,-73.789,40.642,1
2015-03-22 05:14:27.0000001,-2.5,2015-03-22 05:14:27 UTC,-74.0,40.721,-74.0,40.721,1
2013-08-30 08:57:10.0000002,-3.0,2013-08-30 08:57:10 UTC,-73.995,40.741,-73.996,40.741,4
2013-08-11 13:39:10.0000001,-2.5,2013-08-11 13:39:10 UTC,-73.785,40.648,0.0,0.0,1
2015-04-30 15:19:45.0000003,-2.5,2015-04-30 15:19:45 UTC,-73.952,40.79,-73.95,40.793,1


In [17]:
neg_fare_df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,420.0,420.0,420.0,420.0,420.0,420.0
mean,-9.288,-69.033,38.031,-67.975,37.448,1.848
std,15.072,18.472,10.177,20.198,11.127,1.433
min,-107.75,-74.23,0.0,-74.23,0.0,0.0
25%,-6.2,-73.99,40.727,-73.989,40.725,1.0
50%,-3.5,-73.976,40.753,-73.974,40.75,1.0
75%,-2.5,-73.951,40.765,-73.943,40.764,2.0
max,-0.01,0.0,40.919,0.0,40.919,6.0


In [18]:
# dropping negative fare amounts because they make no sense. 
drop_indexes = neg_fare_df.index
train_df.drop(drop_indexes, inplace=True)

In [19]:
train_df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,9999494.0,9999494.0,9999494.0,9999494.0,9999494.0,9999494.0
mean,11.339,-72.508,39.919,-72.509,39.919,1.685
std,9.799,12.994,9.322,12.859,9.237,1.308
min,0.0,-3439.245,-3492.264,-3426.601,-3488.08,0.0
25%,6.0,-73.992,40.735,-73.991,40.734,1.0
50%,8.5,-73.982,40.753,-73.98,40.753,1.0
75%,12.5,-73.967,40.767,-73.964,40.768,2.0
max,1273.31,3457.626,3344.459,3457.622,3351.403,6.0


In [24]:
import itertools
columns = [('_').join(x) for x in itertools.product(['pickup', 'dropoff'], ['latitude', 'longitude'])]
columns

['pickup_latitude',
 'pickup_longitude',
 'dropoff_latitude',
 'dropoff_longitude']

In [26]:
outliers = []
for column in columns:
    outliers.extend(train_df[(train_df[column] < -180.0) | (train_df[column] > 180.0)].index)

len(set(outliers))    # total number of data points that are outside the specified range for lat. and long.

472

In [27]:
# The value of latitude (assuming it's in degrees due to the lack of any information) should be in 
# the range -180 to 180. A minimum value of 3439 and maximum value of 3457 is weird. 
# Assumption: something went wrong in the GPS collecting the data. This can be corrected using 
# median value of the data, but since the number of datapoints are so less, it won't make too much
# of a difference here. dropping the points seems like a quicker appraoch.

for column in columns:
    drop_indexes = train_df[(train_df[column] < -180.0) | (train_df[column] > 180.0)].index
    train_df.drop(drop_indexes, inplace=True)

train_df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,9999022.0,9999022.0,9999022.0,9999022.0,9999022.0,9999022.0
mean,11.339,-72.496,39.916,-72.5,39.918,1.685
std,9.799,10.464,6.123,10.448,6.117,1.308
min,0.0,-168.604,-74.824,-173.342,-74.193,0.0
25%,6.0,-73.992,40.735,-73.991,40.734,1.0
50%,8.5,-73.982,40.753,-73.98,40.753,1.0
75%,12.5,-73.967,40.767,-73.964,40.768,2.0
max,1273.31,154.101,91.267,154.101,94.617,6.0


In [29]:
train_df[train_df.passenger_count == 0].count()

fare_amount          35262
pickup_datetime      35262
pickup_longitude     35262
pickup_latitude      35262
dropoff_longitude    35262
dropoff_latitude     35262
passenger_count      35262
dtype: int64

In [30]:
# Haversine algorithm for calculating distance with given latitude and longitude
# copied from stack overflow: 
# https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points

from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 3956 # Radius of earth in miles. Use 6371 for kilometers
    return c * r

In [31]:
train_df['distance'] = train_df[columns].apply(lambda x: haversine(*x), axis=1)

train_df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance
count,9999022.0,9999022.0,9999022.0,9999022.0,9999022.0,9999022.0,9999022.0
mean,11.339,-72.496,39.916,-72.5,39.918,1.685,12.049
std,9.799,10.464,6.123,10.448,6.117,1.308,234.333
min,0.0,-168.604,-74.824,-173.342,-74.193,0.0,0.0
25%,6.0,-73.992,40.735,-73.991,40.734,1.0,0.509
50%,8.5,-73.982,40.753,-73.98,40.753,1.0,0.946
75%,12.5,-73.967,40.767,-73.964,40.768,2.0,1.742
max,1273.31,154.101,91.267,154.101,94.617,6.0,8486.451


In [36]:
train_df.dtypes

fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
distance             float64
dtype: object

In [38]:
# plotting a scatter plot to see how fare varies with distance

sns.scatterplot(train_df['distance'], train_df['fare_amount'])

AttributeError: module 'seaborn' has no attribute 'scatterplot'