### DATA CLEANING also known as DATA SCRUBBING

    - Import the data produced in phase 1
    - Wrap it in pandas DataFrames
    - Convert columns to their proper data types
    - Reformat values to numeric so they can be used in our ML training

## 1. Import Data

In [106]:
import os
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

In [107]:
# The data comes from the HLDI web service - it should contain all the vehicle data HLDI provides.
# Read data, convert ID to integer (auto read as float and we don't need float numbers).
# NOTE(review): no integer conversion is performed in this cell - confirm where (or whether) it happens.
# Data - 3 xlsx files from the DB (TestETL folder) reduced to 2 CSV files to import here.
vehicles_file = "raw_converted_data/vehicles.csv"
drivers_file = "raw_converted_data/drivers.csv"

In [108]:
# Retrieve the data that was previously saved as CSV files.
vehicles_df = pd.read_csv(vehicles_file)
drivers_df = pd.read_csv(drivers_file)


In [109]:
# Give the driver columns readable, consistently underscored names.
drivers_df = drivers_df.rename(
    columns={
        'DRIVERID': 'DRIVER_ID',
        'BIRTHDATE': 'BIRTH_DATE',
        'CREDSCORE': 'CREDIT_SCORE',
        'HLDIID': 'VEHICLE_ID',
    }
)
drivers_df.head()

Unnamed: 0,DRIVER_ID,GENDER,BIRTH_DATE,CREDIT_SCORE,VEHICLE_ID
0,104132812,F,1990-07-01,824,7399455
1,104132854,F,1933-11-06,824,6078430
2,104132876,F,1987-12-14,750,6212210
3,104132895,F,1999-10-22,750,7416184
4,104132907,F,1933-11-06,824,6078430


## 2. Drop, Convert Data fields


In [110]:
# Inspect the drivers table (row count, column dtypes) to see what has to be
# dropped or converted before the model can use it.
drivers_df.info()
# Before converting anything, discard every row with a missing value in any column.
drivers_df = drivers_df.dropna(axis=0, how='any')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8964256 entries, 0 to 8964255
Data columns (total 5 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   DRIVER_ID     int64 
 1   GENDER        object
 2   BIRTH_DATE    object
 3   CREDIT_SCORE  object
 4   VEHICLE_ID    int64 
dtypes: int64(2), object(3)
memory usage: 342.0+ MB


In [111]:
# GENDER, BIRTH_DATE and CREDIT_SCORE still have to be converted to a numeric format.
# Start with GENDER: confirm nothing is missing, then list its distinct values.
print(drivers_df.isnull().sum())
drivers_df.GENDER.unique()

DRIVER_ID       0
GENDER          0
BIRTH_DATE      0
CREDIT_SCORE    0
VEHICLE_ID      0
dtype: int64


array(['F', 'M', 'X'], dtype=object)

In [112]:
# Encode GENDER as integers: M -> 1, F -> 2, X -> 3; missing or unrecognised input -> 0.
new_data = {
    'M': 1,
    'F': 2,
    'X': 3
}
# Map first, then fill. Series.map converts every value that is not a key of
# the dict (NaN included) to NaN, so filling with 0 *before* the map - as this
# cell previously did - was undone by the map itself.
# The final cast keeps the column integer even when some values had to be filled.
drivers_df['GENDER'] = drivers_df['GENDER'].map(new_data).fillna(0).astype('int64')


In [113]:
# Convert the BIRTH_DATE field to an age.
# NOTE: an earlier pandas-based attempt failed with
# "OverflowError: Overflow in int64 addition"; the line below is kept for
# reference only and stays disabled.
# drivers_df['BIRTH_DATE'] = pd.to_datetime(drivers_df['BIRTH_DATE'], format='%Y-%m-%d')
drivers_df.head()

Unnamed: 0,DRIVER_ID,GENDER,BIRTH_DATE,CREDIT_SCORE,VEHICLE_ID
0,104132812,2,1990-07-01,824,7399455
1,104132854,2,1933-11-06,824,6078430
2,104132876,2,1987-12-14,750,6212210
3,104132895,2,1999-10-22,750,7416184
4,104132907,2,1933-11-06,824,6078430


In [118]:
from datetime import date, datetime

def age(birth_date, today=None):
    """Return the age in whole years for a birth date given as a string.

    Parameters
    ----------
    birth_date : str
        Birth date formatted as ``YYYY-MM-DD``.
    today : datetime.date, optional
        Reference date the age is measured against. Defaults to the current
        local date; pass a fixed date to make results reproducible.

    Returns
    -------
    int
        Number of full years between ``birth_date`` and ``today``.

    Raises
    ------
    ValueError
        If ``birth_date`` does not match the ``YYYY-MM-DD`` format.
    TypeError
        If ``birth_date`` is not a string.
    """
    if today is None:
        today = date.today()
    born = datetime.strptime(birth_date, "%Y-%m-%d")
    # Subtract one year when this year's birthday has not been reached yet.
    birthday_pending = (today.month, today.day) < (born.month, born.day)
    return today.year - born.year - int(birthday_pending)

In [120]:
# Derive each driver's age from the BIRTH_DATE string, one value at a time.
drivers_df['AGE'] = drivers_df['BIRTH_DATE'].map(age)
drivers_df.head()

Unnamed: 0,DRIVER_ID,GENDER,BIRTH_DATE,CREDIT_SCORE,VEHICLE_ID,AGE
0,104132812,2,1990-07-01,824,7399455,32
1,104132854,2,1933-11-06,824,6078430,88
2,104132876,2,1987-12-14,750,6212210,34
3,104132895,2,1999-10-22,750,7416184,22
4,104132907,2,1933-11-06,824,6078430,88


In [121]:
# AGE has been added to the DataFrame, so the raw BIRTH_DATE column is no longer needed.
# Rebind the name instead of mutating with inplace=True so the step reads as
# "new frame from old frame".
drivers_df = drivers_df.drop(columns=['BIRTH_DATE'])
drivers_df.head()

Unnamed: 0,DRIVER_ID,GENDER,CREDIT_SCORE,VEHICLE_ID,AGE
0,104132812,2,824,7399455,32
1,104132854,2,824,6078430,88
2,104132876,2,750,6212210,34
3,104132895,2,750,7416184,22
4,104132907,2,824,6078430,88


In [122]:
# Clean CREDIT_SCORE: rows carrying one of these non-numeric codes are removed.
credit_na_values = ['NS0', 'NH0', 'TF1', 'FDL', 'AG0']
# Keep only the rows whose score is NOT one of the codes above.
drivers_df = drivers_df[~drivers_df['CREDIT_SCORE'].isin(credit_na_values)]
# NOTE(review): the column is only filtered here, not cast - presumably it is
# still object dtype afterwards; confirm whether a numeric cast is needed.
# Show the remaining score distribution (NaN included) to verify the codes are gone.
drivers_df['CREDIT_SCORE'].value_counts(dropna=False)

847    2165193
710    1105341
811    1053936
824     853079
666     616727
        ...   
896          2
0            2
299          1
301          1
897          1
Name: CREDIT_SCORE, Length: 604, dtype: int64

In [124]:
# Preview the first rows of the drivers table after the credit-score filtering.
drivers_df.head()

Unnamed: 0,DRIVER_ID,GENDER,CREDIT_SCORE,VEHICLE_ID,AGE
0,104132812,2,824,7399455,32
1,104132854,2,824,6078430,88
2,104132876,2,750,6212210,34
3,104132895,2,750,7416184,22
4,104132907,2,824,6078430,88


In [125]:
# Write the cleaned drivers table to disk without the index column.
# Make sure the output folder exists first, so a fresh checkout does not fail here.
os.makedirs("cleaned_data", exist_ok=True)
drivers_df.to_csv("cleaned_data/drivers.csv", index=False)