# Exploratory Data Analysis (2/2)

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%%time
file_path = '../data/cleaned/taxi_data.csv'
chunksize = 10**6

with pd.read_csv(file_path, chunksize=chunksize) as reader:
    taxi_df = pd.concat([chunk for chunk in reader])

taxi_df.head()

CPU times: total: 10.4 s
Wall time: 16.1 s


Unnamed: 0,booking_id,driver_id,name,date_of_birth,gender,car_model,car_make_year,accuracy,bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,speed,rating,safety_label
0,0,48,Lilia,1974-08-13,Male,Mercedes-Benz,2013-01-01,8.0,143.298294,-1.706207,-9.270792,-1.209448,-0.028965,-0.032652,0.01539,2.0,0.228454,4.0,0
1,0,48,Lilia,1974-08-13,Male,Mercedes-Benz,2013-01-01,8.0,143.298294,-1.416705,-9.548032,-1.860977,-0.022413,0.005049,-0.025753,3.0,0.228454,4.0,0
2,0,48,Lilia,1974-08-13,Male,Mercedes-Benz,2013-01-01,8.0,143.298294,-0.346924,-9.532629,-1.204663,0.014962,-0.050033,0.025118,9.0,0.228454,4.0,0
3,0,48,Lilia,1974-08-13,Male,Mercedes-Benz,2013-01-01,8.0,143.298294,-0.600986,-9.452029,-2.157507,0.004548,-0.011713,-0.004078,11.0,0.228454,4.0,0
4,0,48,Lilia,1974-08-13,Male,Mercedes-Benz,2013-01-01,8.0,143.298294,-0.597546,-9.863403,-1.672711,-0.000401,0.000315,-0.00983,12.0,0.228454,4.0,0


In [3]:
# Inspect the column dtypes and memory footprint of the raw frame.
taxi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7469656 entries, 0 to 7469655
Data columns (total 19 columns):
 #   Column          Dtype  
---  ------          -----  
 0   booking_id      int64  
 1   driver_id       int64  
 2   name            object 
 3   date_of_birth   object 
 4   gender          object 
 5   car_model       object 
 6   car_make_year   object 
 7   accuracy        float64
 8   bearing         float64
 9   acceleration_x  float64
 10  acceleration_y  float64
 11  acceleration_z  float64
 12  gyro_x          float64
 13  gyro_y          float64
 14  gyro_z          float64
 15  second          float64
 16  speed           float64
 17  rating          float64
 18  safety_label    int64  
dtypes: float64(11), int64(3), object(5)
memory usage: 1.1+ GB


## Preprocessing

In [4]:
%%time

cleaned_df = taxi_df.copy()

# set date_of_birth to datetime
cleaned_df['date_of_birth'] = pd.to_datetime(cleaned_df['date_of_birth'])

# extract years from car_make_year
cleaned_df['car_make_year'] = cleaned_df['car_make_year'].apply(lambda x: int(x.split('-')[0]))

# calculate magnitude of acceleration sqrt(x^2 + y^2 + z^2)
a_x = cleaned_df['acceleration_x']
a_y = cleaned_df['acceleration_y']
a_z = cleaned_df['acceleration_z']
cleaned_df['MA (m/s^2)'] = np.sqrt(a_x**2 + a_y**2 + a_z**2)

# magnitude of angular velocity?
g_x = cleaned_df['gyro_x']
g_y = cleaned_df['gyro_y']
g_z = cleaned_df['gyro_z']
cleaned_df['MAV (rad/s)'] = np.sqrt(g_x**2 + g_y**2 + g_z**2)

# convert driver dob to age? (current date - dob)
cleaned_df['driver_age'] = pd.to_datetime('today').year - cleaned_df['date_of_birth'].dt.year

# convert seconds to minutes
cleaned_df['duration (mins)'] = round(cleaned_df['second'] / 60, 2)

# convert speed (meters per second) to kilometers per hour
cleaned_df['speed (km/h)'] = cleaned_df['speed'] * 3.6

# convert bearing to categorical variable 
cleaned_df['direction'] = cleaned_df['bearing'].apply(
    lambda x: 
        'North' if x >= 337.5 or x < 22.5 else
        'North East' if x >= 22.5 and x < 67.5 else
        'East' if x >= 67.5 and x < 112.5 else
        'South East' if x >= 112.5 and x < 157.5 else
        'South' if x >= 157.5 and x < 202.5 else
        'South West' if x >= 202.5 and x < 247.5 else
        'West' if x >= 247.5 and x < 292.5 
        else 'Noth West'
)

# drop unnecessary columns
cols = ['driver_id', 'date_of_birth', 'speed', 'bearing']
cleaned_df.drop(columns=cols, inplace=True)

# reorder columns
cleaned_df = cleaned_df[['booking_id', 'name', 'driver_age', 'gender', 'car_model', 'car_make_year', 'accuracy', 'direction', 
                         'acceleration_x', 'acceleration_y', 'acceleration_z','MA (m/s^2)', 'gyro_x', 'gyro_y', 'gyro_z', 'MAV (rad/s)', 
                        'second', 'duration (mins)', 'speed (km/h)', 'rating', 'safety_label']]

CPU times: total: 6.77 s
Wall time: 13.7 s


In [5]:
# Preview the first rows of the preprocessed frame to sanity-check the derived columns.
cleaned_df.head()

Unnamed: 0,booking_id,name,driver_age,gender,car_model,car_make_year,accuracy,direction,acceleration_x,acceleration_y,...,MA (m/s^2),gyro_x,gyro_y,gyro_z,MAV (rad/s),second,duration (mins),speed (km/h),rating,safety_label
0,0,Lilia,48,Male,Mercedes-Benz,2013,8.0,South East,-1.706207,-9.270792,...,9.503762,-0.028965,-0.032652,0.01539,0.046282,2.0,0.03,0.822436,4.0,0
1,0,Lilia,48,Male,Mercedes-Benz,2013,8.0,South East,-1.416705,-9.548032,...,9.83032,-0.022413,0.005049,-0.025753,0.034511,3.0,0.05,0.822436,4.0,0
2,0,Lilia,48,Male,Mercedes-Benz,2013,8.0,South East,-0.346924,-9.532629,...,9.614707,0.014962,-0.050033,0.025118,0.057949,9.0,0.15,0.822436,4.0,0
3,0,Lilia,48,Male,Mercedes-Benz,2013,8.0,South East,-0.600986,-9.452029,...,9.713747,0.004548,-0.011713,-0.004078,0.01321,11.0,0.18,0.822436,4.0,0
4,0,Lilia,48,Male,Mercedes-Benz,2013,8.0,South East,-0.597546,-9.863403,...,10.022063,-0.000401,0.000315,-0.00983,0.009843,12.0,0.2,0.822436,4.0,0


## Descriptive Analysis

In [17]:
# Report the dimensions of the preprocessed frame.
rows, columns = cleaned_df.shape
print(f'Number of rows: {rows}\nNumber of columns: {columns}')

Number of rows: 7469656
Number of columns: 21


In [18]:
# Show dtypes together with per-column non-null counts, to see where values are missing.
cleaned_df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7469656 entries, 0 to 7469655
Data columns (total 21 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   booking_id       7469656 non-null  int64  
 1   name             7469656 non-null  object 
 2   driver_age       7469656 non-null  int64  
 3   gender           7469656 non-null  object 
 4   car_model        7469656 non-null  object 
 5   car_make_year    7469656 non-null  int64  
 6   accuracy         7351631 non-null  float64
 7   direction        7469656 non-null  object 
 8   acceleration_x   7364539 non-null  float64
 9   acceleration_y   7346553 non-null  float64
 10  acceleration_z   7370520 non-null  float64
 11  MA (m/s^2)       7148013 non-null  float64
 12  gyro_x           7336398 non-null  float64
 13  gyro_y           7329944 non-null  float64
 14  gyro_z           7404168 non-null  float64
 15  MAV (rad/s)      7136620 non-null  float64
 16  second           7

In [19]:
# Summary statistics, transposed so that each row describes one column of cleaned_df.
cleaned_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
booking_id,7469656.0,818337100000.0,495334800000.0,0.0,377957100000.0,807453900000.0,1254130000000.0,1709397000000.0
driver_age,7469656.0,39.29782,6.689755,27.0,33.0,40.0,45.0,50.0
car_make_year,7469656.0,2009.43,2.40212,2004.0,2008.0,2011.0,2011.0,2013.0
accuracy,7351631.0,11.62066,87.27007,0.75,3.9,4.247,8.0,6070.101
acceleration_x,7364539.0,0.06918624,1.424023,-38.983994,-0.5087219,0.06130981,0.6355286,66.87346
acceleration_y,7346553.0,4.467981,8.130443,-57.80019,-2.026514,9.081485,9.710308,72.81
acceleration_z,7370520.0,0.8964518,3.252218,-78.44842,-0.9295166,0.7776642,2.753036,78.05576
MA (m/s^2),7148013.0,9.89323,1.267685,0.005074,9.549204,9.817876,10.13328,95.63782
gyro_x,7336398.0,-0.001738806,0.1445958,-48.45575,-0.0267908,-0.0006561279,0.02329046,38.70809
gyro_y,7329944.0,0.0002046962,0.3371285,-57.34489,-0.0299636,0.000258396,0.03142024,80.31496


In [20]:
# Mean of every numeric column per gender, transposed so the genders become columns.
# numeric_only=True is required: on pandas >= 2.0 mean() raises a TypeError on the
# text columns (name, car_model, direction) instead of silently skipping them.
cleaned_df.groupby('gender').mean(numeric_only=True).T

gender,Female,Male
booking_id,819230600000.0,817803400000.0
driver_age,37.67341,40.26811
car_make_year,2009.606,2009.325
accuracy,11.82954,11.4959
acceleration_x,0.06346306,0.07260524
acceleration_y,4.334797,4.547537
acceleration_z,0.8928568,0.8985988
MA (m/s^2),9.889531,9.895439
gyro_x,-0.001586484,-0.001829774
gyro_y,-0.001642436,0.001308092


In [22]:
# get all categorical columns
cat_cols = cleaned_df.select_dtypes(include='object').columns
cat_cols

Index(['name', 'gender', 'car_model', 'direction'], dtype='object')

## Univariate Analysis