## Data Cleansing and Transformation in Python

In [1]:
# Import modules
import pandas as pd 
import logging

In [2]:
# File names of the two input CSVs; the loading code looks for them under data/
crash_data_file = "traffic_crashes.csv" 
vehicle_crash_data_file = "traffic_crash_vehicle.csv" 

In [3]:
# Read each CSV from the data/ directory into its own DataFrame
df_crashes = pd.read_csv(f"data/{crash_data_file}") 
df_vehicles= pd.read_csv(f"data/{vehicle_crash_data_file}")

### Preliminary Tasks: The Importance of Staging Data  

In [4]:
# Preview the first rows of the crash data
df_crashes.head()

Unnamed: 0,crash_record_id,rd_no,crash_date_est_i,crash_date,posted_speed_limit,traffic_control_device,device_condition,weather_condition,lighting_condition,first_crash_type,...,injuries_non_incapacitating,injuries_reported_not_evident,injuries_no_indication,injuries_unknown,crash_hour,crash_day_of_week,crash_month,latitude,longitude,location
0,530411c8611eb0ccb9b25f16b2955cd21761fa1928dcaa...,JE494048,,2021-12-31T14:00:00.000,35,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,TURNING,...,0.0,0.0,2.0,0.0,14,6,12,41.79485,-87.76728,POINT (-87.767280356289 41.794849958048)
1,305b06235b250aa0029c07313c84f969f4bc13c1cc3715...,JE494008,,2021-12-31T14:00:00.000,30,TRAFFIC SIGNAL,UNKNOWN,CLEAR,DUSK,TURNING,...,0.0,0.0,2.0,0.0,14,6,12,41.881271,-87.686536,POINT (-87.686535940171 41.881270504288)
2,444221c2a9bc82fc4f301062ab22b482d7d661cf88fcdf...,JE494016,Y,2021-12-31T13:56:00.000,10,OTHER,NO CONTROLS,CLEAR,DAYLIGHT,SIDESWIPE SAME DIRECTION,...,0.0,0.0,2.0,0.0,13,6,12,41.722941,-87.662863,POINT (-87.662862871273 41.72294121821)
3,4603435fbb4ef5d45c0d805c3e9aa5558a311a140a737e...,JE494049,,2021-12-31T13:46:00.000,30,NO CONTROLS,NO CONTROLS,RAIN,DAYLIGHT,PEDALCYCLIST,...,1.0,0.0,2.0,0.0,13,6,12,41.766336,-87.57827,POINT (-87.578269718478 41.766335621716)
4,db62bb4534d0dae57112ea3ff8d50193784aaa732ed58d...,JE494000,,2021-12-31T13:45:00.000,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,SIDESWIPE SAME DIRECTION,...,0.0,0.0,2.0,0.0,13,6,12,41.75115,-87.607802,POINT (-87.607802036151 41.7511501753)


In [5]:
# Summarise each column: non-null count and dtype
df_crashes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 49 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   crash_record_id                1000 non-null   object 
 1   rd_no                          1000 non-null   object 
 2   crash_date_est_i               69 non-null     object 
 3   crash_date                     1000 non-null   object 
 4   posted_speed_limit             1000 non-null   int64  
 5   traffic_control_device         1000 non-null   object 
 6   device_condition               1000 non-null   object 
 7   weather_condition              1000 non-null   object 
 8   lighting_condition             1000 non-null   object 
 9   first_crash_type               1000 non-null   object 
 10  trafficway_type                1000 non-null   object 
 11  lane_cnt                       1 non-null      float64
 12  alignment                      1000 non-null   ob

In [6]:
# Count the missing values in each column
df_crashes.isnull().sum()

crash_record_id                     0
rd_no                               0
crash_date_est_i                  931
crash_date                          0
posted_speed_limit                  0
traffic_control_device              0
device_condition                    0
weather_condition                   0
lighting_condition                  0
first_crash_type                    0
trafficway_type                     0
lane_cnt                          999
alignment                           0
roadway_surface_cond                0
road_defect                         0
report_type                        24
crash_type                          0
intersection_related_i            729
private_property_i                955
hit_and_run_i                     680
damage                              0
date_police_notified                0
prim_contributory_cause             0
sec_contributory_cause              0
street_no                           0
street_direction                    0
street_name 

In [7]:
# Drop columns in which every value is missing. Reassign instead of mutating
# in place so the cell is consistent with the row-level dropna and the frame
# is never modified behind an earlier cell's back.
df_crashes = df_crashes.dropna(axis='columns', how='all')

In [8]:
# Keep only the rows that have at least two non-null values (thresh=2)
df_crashes = df_crashes.dropna(axis='index', thresh=2, inplace=False) 

#### Working with Missing Data

In [9]:
# report_type has two real categories; missing entries show up as nan in the result
df_crashes['report_type'].unique()  # ['ON SCENE', 'NOT ON SCENE (DESK REPORT)'] plus nan

array(['ON SCENE', 'NOT ON SCENE (DESK REPORT)', nan], dtype=object)

In [10]:
# Fill the missing report_type values with 'ON SCENE'
df_crashes  = df_crashes.fillna(value={'report_type': 'ON SCENE'}) 

#### Merging Data

In [11]:
# Left-merge vehicles onto crashes by crash_record_id: every crash row is kept,
# and column names present in both frames get the _left / _right suffixes
df = df_crashes.merge(df_vehicles, how = 'left',on='crash_record_id',suffixes=('_left', '_right')) 
# Print the (rows, columns) shape of the merged frame
print(df.shape)

(1510, 119)


In [12]:
# Preview the first rows of the merged frame
df.head()

Unnamed: 0,crash_record_id,rd_no_left,crash_date_est_i,crash_date_left,posted_speed_limit,traffic_control_device,device_condition,weather_condition,lighting_condition,first_crash_type,...,trailer1_length,trailer2_length,total_vehicle_length,axle_cnt,vehicle_config,cargo_body_type,load_type,hazmat_out_of_service_i,mcs_out_of_service_i,hazmat_class
0,530411c8611eb0ccb9b25f16b2955cd21761fa1928dcaa...,JE494048,,2021-12-31T14:00:00.000,35,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,TURNING,...,,,,,,,,,,
1,530411c8611eb0ccb9b25f16b2955cd21761fa1928dcaa...,JE494048,,2021-12-31T14:00:00.000,35,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,TURNING,...,,,,,,,,,,
2,305b06235b250aa0029c07313c84f969f4bc13c1cc3715...,JE494008,,2021-12-31T14:00:00.000,30,TRAFFIC SIGNAL,UNKNOWN,CLEAR,DUSK,TURNING,...,,,,,,,,,,
3,305b06235b250aa0029c07313c84f969f4bc13c1cc3715...,JE494008,,2021-12-31T14:00:00.000,30,TRAFFIC SIGNAL,UNKNOWN,CLEAR,DUSK,TURNING,...,,,,,,,,,,
4,305b06235b250aa0029c07313c84f969f4bc13c1cc3715...,JE494008,,2021-12-31T14:00:00.000,30,TRAFFIC SIGNAL,UNKNOWN,CLEAR,DUSK,TURNING,...,,,,,,,,,,


In [13]:
# Count the non-null crash_record_id values for each vehicle_type
df_agg = df.groupby('vehicle_type').agg({'crash_record_id': 'count'}).reset_index()
# Display the per-type counts
df_agg

Unnamed: 0,vehicle_type,crash_record_id
0,BUS OVER 15 PASS.,5
1,MOPED OR MOTORIZED BICYCLE,1
2,OTHER,20
3,OTHER VEHICLE WITH TRAILER,1
4,PASSENGER,633
5,PICKUP,33
6,SINGLE UNIT TRUCK WITH TRAILER,2
7,SPORT UTILITY VEHICLE (SUV),138
8,TRACTOR W/ SEMI-TRAILER,5
9,TRACTOR W/O SEMI-TRAILER,2


In [14]:
# Select the PASSENGER row and its count in one .loc lookup, then take the scalar
number_of_passenger_cars_involved = (
    df_agg.loc[df_agg['vehicle_type'] == 'PASSENGER', 'crash_record_id'].array[0]
)
number_of_passenger_cars_involved

633

#### Data Mapping with Target Data 

In [15]:
# Map the source column name to the name used in the output data, then apply it
vehicle_mapping = {'vehicle_type':'vehicletypes'}  
df_agg = df_agg.rename(columns=vehicle_mapping)

In [16]:
# Show the aggregated frame after the column rename
df_agg

Unnamed: 0,vehicletypes,crash_record_id
0,BUS OVER 15 PASS.,5
1,MOPED OR MOTORIZED BICYCLE,1
2,OTHER,20
3,OTHER VEHICLE WITH TRAILER,1
4,PASSENGER,633
5,PICKUP,33
6,SINGLE UNIT TRUCK WITH TRAILER,2
7,SPORT UTILITY VEHICLE (SUV),138
8,TRACTOR W/ SEMI-TRAILER,5
9,TRACTOR W/O SEMI-TRAILER,2


### Writing Transformation Functions

In [17]:
def get_transformed_data(crash_file, vehicle_file):
    """Load, clean, merge and aggregate the crash and vehicle CSVs.

    Args:
        crash_file: File name of the crash CSV inside the ``data/`` directory.
        vehicle_file: File name of the vehicle CSV inside the ``data/`` directory.

    Returns:
        DataFrame with one row per vehicle type. ``vehicletypes`` holds the
        type and ``crash_record_id`` holds the count of non-null crash ids
        among the merged rows of that type.
    """
    # import data
    df_crashes = pd.read_csv(f"data/{crash_file}")
    df_vehicles = pd.read_csv(f"data/{vehicle_file}")

    # Keep only crash rows with at least two non-null values, then default the
    # missing report types. The filtered frame is the one merged below, so the
    # under-threshold rows really are excluded from the result.
    df_crashes = (
        df_crashes
        .dropna(axis='index', thresh=2)
        .fillna(value={'report_type': 'ON SCENE'})
    )

    # merge crashes and vehicles
    df = df_crashes.merge(df_vehicles, how='left', on='crash_record_id', suffixes=('_left', '_right'))
    df_agg = df.groupby('vehicle_type').agg({'crash_record_id': 'count'}).reset_index()

    # transform column names for output data
    return df_agg.rename(columns={'vehicle_type': 'vehicletypes'})

In [18]:
# Run the whole transformation on the two input files and display the result
get_transformed_data(crash_data_file,vehicle_crash_data_file) 

Unnamed: 0,vehicletypes,crash_record_id
0,BUS OVER 15 PASS.,5
1,MOPED OR MOTORIZED BICYCLE,1
2,OTHER,20
3,OTHER VEHICLE WITH TRAILER,1
4,PASSENGER,633
5,PICKUP,33
6,SINGLE UNIT TRUCK WITH TRAILER,2
7,SPORT UTILITY VEHICLE (SUV),138
8,TRACTOR W/ SEMI-TRAILER,5
9,TRACTOR W/O SEMI-TRAILER,2


### Running the Workflow

#### The preceding code can be split into reusable functions that are easy to manage

In [19]:
# Read data from data source  
def read_datasources(source_name):
    """Load one CSV file from the ``data/`` directory.

    Args:
        source_name: File name of the CSV, relative to ``data/``.

    Returns:
        The file contents as a DataFrame.
    """
    csv_path = f"data/{source_name}"
    return pd.read_csv(csv_path)

In [20]:
# Drop rows with null values 
def drop_rows_with_null_values(df, thresh=2):
    """Drop the rows that have fewer than ``thresh`` non-null values.

    Args:
        df: DataFrame to clean.
        thresh: Minimum number of non-null values a row needs to be kept.
            Defaults to 2.

    Returns:
        A new DataFrame containing only the rows that meet the threshold;
        ``df`` itself is not modified.
    """
    # Return the rows that survive the threshold, not the ones it removes.
    return df.dropna(axis='index', thresh=thresh)

In [21]:
# Fill missing values 
def fill_missing_values(df):
    """Return a copy of ``df`` with missing ``report_type`` values set to 'ON SCENE'.

    Args:
        df: DataFrame to fill.

    Returns:
        A new DataFrame; missing values in other columns are left as they are.
    """
    column_defaults = {'report_type': 'ON SCENE'}
    return df.fillna(value=column_defaults)

In [22]:
# Merge Dataframes 
def merge_dataframes(df_vehicles,df_crashes):
    """Left-merge the vehicle rows onto the crash rows by ``crash_record_id``.

    Args:
        df_vehicles: Right-hand frame of the merge.
        df_crashes: Left-hand frame; all of its rows are kept.

    Returns:
        The merged DataFrame. Column names present in both inputs get the
        ``_left`` (crashes) and ``_right`` (vehicles) suffixes.
    """
    return pd.merge(
        df_crashes,
        df_vehicles,
        how='left',
        on='crash_record_id',
        suffixes=('_left', '_right'),
    )

In [23]:
# Rename Columns
def rename_columns(df):
    """Rename ``vehicle_type`` to ``vehicletypes`` for the output data.

    Args:
        df: DataFrame to rename.

    Returns:
        A new DataFrame with the renamed column; other columns are unchanged.
    """
    return df.rename(columns={'vehicle_type': 'vehicletypes'})

#### Define the Pipeline Functions to run the Cleansing and Transformation Functions

In [24]:
def read_data_pipeline(crash_file, vehicle_file):
    """Read the crash and vehicle CSVs, tolerating read failures.

    Args:
        crash_file: File name of the crash CSV, passed to ``read_datasources``.
        vehicle_file: File name of the vehicle CSV, passed to ``read_datasources``.

    Returns:
        Tuple ``(df_crash, df_vehicle)``. A frame that could not be read is
        returned as an empty DataFrame.
    """
    # Start from empty frames so both return values exist even if a read fails.
    df_crash = pd.DataFrame()
    df_vehicle = pd.DataFrame()
    try:
        df_crash = read_datasources(crash_file)
        df_vehicle = read_datasources(vehicle_file)
    except Exception:
        # Best-effort stage: keep going, but log at ERROR level with the
        # traceback so the failure is visible under the default logging setup.
        logging.exception("Exception in reading data pipeline")
    # Return outside of `finally` so KeyboardInterrupt/SystemExit still propagate.
    return df_crash, df_vehicle

In [25]:
def drop_rows_with_null_values_pipeline(df_crash, df_vehicle):
    """Apply ``drop_rows_with_null_values`` to both frames, tolerating failures.

    Args:
        df_crash: Crash DataFrame.
        df_vehicle: Vehicle DataFrame.

    Returns:
        Tuple ``(df_crash, df_vehicle)``. A frame whose cleaning step did not
        complete is returned as it was passed in.
    """
    try:
        df_crash = drop_rows_with_null_values(df_crash)
        df_vehicle = drop_rows_with_null_values(df_vehicle)
    except Exception:
        # Best-effort stage: log at ERROR level with the traceback; an INFO
        # message is hidden by the default logging configuration.
        logging.exception("Exception in dropping rows with null value data pipeline")
    # Return outside of `finally` so KeyboardInterrupt/SystemExit still propagate.
    return df_crash, df_vehicle

In [26]:
def fill_missing_values_pipeline(df_crash, df_vehicle):
    """Apply ``fill_missing_values`` to both frames, tolerating failures.

    Args:
        df_crash: Crash DataFrame.
        df_vehicle: Vehicle DataFrame.

    Returns:
        Tuple ``(df_crash, df_vehicle)``. A frame whose fill step did not
        complete is returned as it was passed in.
    """
    try:
        df_crash = fill_missing_values(df_crash)
        # Assign to the name that is returned so the filled vehicle frame is kept.
        df_vehicle = fill_missing_values(df_vehicle)
    except Exception:
        # Best-effort stage: log at ERROR level with the traceback; an INFO
        # message is hidden by the default logging configuration.
        logging.exception("Exception in filling missing value pipeline")
    # Return outside of `finally` so KeyboardInterrupt/SystemExit still propagate.
    return df_crash, df_vehicle

In [27]:
def merge_dataframes_pipeline(df_crash, df_vehicle):
    """Merge the crash and vehicle frames via ``merge_dataframes``, tolerating failures.

    Args:
        df_crash: Crash DataFrame.
        df_vehicle: Vehicle DataFrame.

    Returns:
        The merged DataFrame, or an empty DataFrame if the merge failed.
    """
    # Default result so the return value exists even when the merge raises.
    df_agg = pd.DataFrame()
    try:
        # Merge the frames handed to this function, not notebook-level globals.
        # merge_dataframes takes the vehicle frame first.
        df_agg = merge_dataframes(df_vehicle, df_crash)
    except Exception:
        # Best-effort stage: log at ERROR level with the traceback; an INFO
        # message is hidden by the default logging configuration.
        logging.exception("Exception in merge dataframes pipeline")
    # Return outside of `finally` so KeyboardInterrupt/SystemExit still propagate.
    return df_agg

In [28]:
def format_dataframes_pipeline(df_agg):
    """Rename the output columns via ``rename_columns``, tolerating failures.

    Args:
        df_agg: DataFrame to format.

    Returns:
        The renamed DataFrame, or ``df_agg`` unchanged if the rename failed.
    """
    # Fall back to the input so the return value exists even when renaming raises.
    df_output = df_agg
    try:
        df_output = rename_columns(df_agg)
    except Exception:
        # Best-effort stage: log at ERROR level with the traceback; an INFO
        # message is hidden by the default logging configuration.
        logging.exception("Exception in renaming dataframe columns pipeline")
    # Return outside of `finally` so KeyboardInterrupt/SystemExit still propagate.
    return df_output

#### Use the Chicago Traffic Data and Run the Pipeline Workflow

In [29]:
# Define input data
crash_data_file = "traffic_crashes.csv"
vehicle_crash_data_file = "traffic_crash_vehicle.csv"

# Read Data Pipeline (pass the file names defined above instead of repeating the literals)
df_crash, df_vehicle = read_data_pipeline(crash_data_file, vehicle_crash_data_file)

# Drop Nulls
df_crash, df_vehicle = drop_rows_with_null_values_pipeline(df_crash, df_vehicle)

# Fill in Missing Values
df_crash, df_vehicle = fill_missing_values_pipeline(df_crash, df_vehicle)

# Merge Dataframes
df_agg = merge_dataframes_pipeline(df_crash, df_vehicle)

# Rename Columns for the output data
df_output = format_dataframes_pipeline(df_agg)

In [30]:
# Preview the first rows of the pipeline output
df_output.head()

Unnamed: 0,crash_record_id,rd_no_left,crash_date_est_i,crash_date_left,posted_speed_limit,traffic_control_device,device_condition,weather_condition,lighting_condition,first_crash_type,...,trailer1_length,trailer2_length,total_vehicle_length,axle_cnt,vehicle_config,cargo_body_type,load_type,hazmat_out_of_service_i,mcs_out_of_service_i,hazmat_class
0,530411c8611eb0ccb9b25f16b2955cd21761fa1928dcaa...,JE494048,,2021-12-31T14:00:00.000,35,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,TURNING,...,,,,,,,,,,
1,530411c8611eb0ccb9b25f16b2955cd21761fa1928dcaa...,JE494048,,2021-12-31T14:00:00.000,35,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,TURNING,...,,,,,,,,,,
2,305b06235b250aa0029c07313c84f969f4bc13c1cc3715...,JE494008,,2021-12-31T14:00:00.000,30,TRAFFIC SIGNAL,UNKNOWN,CLEAR,DUSK,TURNING,...,,,,,,,,,,
3,305b06235b250aa0029c07313c84f969f4bc13c1cc3715...,JE494008,,2021-12-31T14:00:00.000,30,TRAFFIC SIGNAL,UNKNOWN,CLEAR,DUSK,TURNING,...,,,,,,,,,,
4,305b06235b250aa0029c07313c84f969f4bc13c1cc3715...,JE494008,,2021-12-31T14:00:00.000,30,TRAFFIC SIGNAL,UNKNOWN,CLEAR,DUSK,TURNING,...,,,,,,,,,,


### Transformation Activities in Python 

In [31]:
# Status flag for each pipeline stage; every flag starts as "<NOT_EXECUTED>"
READING_CRASH_DATA_PIPELINE = "<NOT_EXECUTED>" 
DROPPING_ROW_WITH_NULL_PIPELINE = "<NOT_EXECUTED>" 
FILLING_MISSING_VALUE_PIPELINE = "<NOT_EXECUTED>" 
MERGE_DATAFRAME_PIPELINE = "<NOT_EXECUTED>" 

In [32]:
# Reset the stage flags so that re-running this cell never sees a stale "<OK>".
READING_CRASH_DATA_PIPELINE = "<NOT_EXECUTED>"
DROPPING_ROW_WITH_NULL_PIPELINE = "<NOT_EXECUTED>"
FILLING_MISSING_VALUE_PIPELINE = "<NOT_EXECUTED>"
MERGE_DATAFRAME_PIPELINE = "<NOT_EXECUTED>"

# Stage 1: read both sources. The pipeline functions do not report a status,
# so a stage is marked "<OK>" only when it hands back non-empty frames.
df_crash, df_vehicle = read_data_pipeline("traffic_crashes.csv", "traffic_crash_vehicle.csv")
if not df_crash.empty and not df_vehicle.empty:
    READING_CRASH_DATA_PIPELINE = "<OK>"

# Each later stage is gated on the previous one with its own `if`; an
# if/elif chain would run at most one of them.
if READING_CRASH_DATA_PIPELINE == "<OK>":
    df_crash, df_vehicle = drop_rows_with_null_values_pipeline(df_crash, df_vehicle)
    if not df_crash.empty and not df_vehicle.empty:
        DROPPING_ROW_WITH_NULL_PIPELINE = "<OK>"

if DROPPING_ROW_WITH_NULL_PIPELINE == "<OK>":
    df_crash, df_vehicle = fill_missing_values_pipeline(df_crash, df_vehicle)
    if not df_crash.empty and not df_vehicle.empty:
        FILLING_MISSING_VALUE_PIPELINE = "<OK>"

if FILLING_MISSING_VALUE_PIPELINE == "<OK>":
    # The merge stage returns a single frame and takes the vehicle frame read above.
    df_agg = merge_dataframes_pipeline(df_crash, df_vehicle)
    if not df_agg.empty:
        MERGE_DATAFRAME_PIPELINE = "<OK>"