### DATA CLEANING also known as DATA SCRUBBING

    - Import Data
    - Wrap into Panda's Data Frame
    - Change to proper data type
    - Reformat values to numberic to compute in our ML training

## 1. Import Data

In [52]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [53]:
# Data - 3 tables to import
vehicles_file = "data/vehicles.xlsx"
drivers_file = "data/drvr_sample.xlsx"
mapping_file = "data/drvr_veh_sample.xlsx"

In [54]:
# The data is from HLDI web service - it should present all possible vehicles data HLDI provides
# Read data, conver ID to integer (auto read to float and we don't need float number) 
vehicles_df = pd.read_excel(vehicles_file)
vehicles_df['ID'] = vehicles_df['ID'].apply(np.int64)

In [55]:
vehicles_df = vehicles_df.rename(columns={'ID':'VEHICLE_ID', 'Model Year':'YEAR', 'Manufacturer Name':'MAKE', 'Model':'MODEL'})
vehicles_df.head()

Unnamed: 0,VEHICLE_ID,Model Year,MAKE,MODEL
0,1000030,2017,MERCEDES-BENZ,S CLASS MAYBACH 650
1,2005191,2012,PORSCHE,911 CARRERA CARRERA 4/CARRERA 2
2,2005195,2012,PORSCHE,911 CARRERA 4/CARRERA 2
3,2005211,2012,PORSCHE,911 CARRERA S/CARRERA GTS
4,2005213,2012,PORSCHE,911 CARRERA CARRERA S/CARRERA 4S


In [56]:
# Model Year didn't take the column Name - must have some hidden char embedded - stripping space
vehicles_df.columns = [c.upper().replace(' ', '_') for c in vehicles_df.columns]
vehicles_df = vehicles_df.rename(columns={'MODEL_______________YEAR':'YEAR'})
vehicles_df.head()

Unnamed: 0,VEHICLE_ID,YEAR,MAKE,MODEL
0,1000030,2017,MERCEDES-BENZ,S CLASS MAYBACH 650
1,2005191,2012,PORSCHE,911 CARRERA CARRERA 4/CARRERA 2
2,2005195,2012,PORSCHE,911 CARRERA 4/CARRERA 2
3,2005211,2012,PORSCHE,911 CARRERA S/CARRERA GTS
4,2005213,2012,PORSCHE,911 CARRERA CARRERA S/CARRERA 4S


In [57]:
# Drivers file with Credit Score - To pretect PSI(Private Security Information) only selected the general info from the user
# Read data, again convert PID to integer (auto read to float and we don't need float number) 
drivers_df = pd.read_excel(drivers_file)
drivers_df['PID'] = drivers_df['PID'].apply(np.int64)
drivers_df = drivers_df.rename(columns={'PID':'DRIVER_ID', 'CREDSCORE':'CREDIT_SCORE'})
drivers_df.head()

Unnamed: 0,DRIVER_ID,GENDER,BIRTHDATE,CREDIT_SCORE
0,85854409,F,1994-08-02,824
1,85854409,M,1994-07-03,824
2,85854448,F,1970-07-25,666
3,85854454,F,1987-12-14,666
4,85854553,F,1933-11-06,824


In [58]:
# Drivers and Vehicles mapper file
mapper_df = pd.read_excel(mapping_file)
mapper_df['PID'] = mapper_df['PID'].apply(np.int64)
mapper_df['HLDIID'] = mapper_df['HLDIID'].apply(np.int64)
mapper_df = mapper_df.rename(columns={'PID':'DRIVER_ID', 'HLDIID':'VEHICLE_ID'})
mapper_df.head()

Unnamed: 0,DRIVER_ID,VEHICLE_ID
0,85854409,7420963
1,85854448,7396448
2,85854454,6208988
3,85854553,6078430
4,85854569,6212210


## 2. Drop, Convert Data fields


In [59]:
# drop na first
drivers_df.dropna()
# Change Gender Numeric values
drivers_df.GENDER[drivers_df.GENDER == 'M'] = 1
drivers_df.GENDER[drivers_df.GENDER == 'F'] = 2
drivers_df.head()

Unnamed: 0,DRIVER_ID,GENDER,BIRTHDATE,CREDIT_SCORE
0,85854409,2,1994-08-02,824
1,85854409,1,1994-07-03,824
2,85854448,2,1970-07-25,666
3,85854454,2,1987-12-14,666
4,85854553,2,1933-11-06,824


In [68]:
# Covert Bith Date field to Age
now = pd.Timestamp('now')
drivers_df['BIRTHDATE'] = pd.to_datetime(drivers_df['BIRTHDATE'], format='%Y-%m-%d')    # 1
drivers_df['BIRTHDATE'] = drivers_df['BIRTHDATE'].where(drivers_df['BIRTHDATE'] < now, drivers_df['BIRTHDATE'] -  np.timedelta64(100, 'Y')) 
drivers_df['AGE'] = (now - drivers_df['BIRTHDATE']).astype('<m8[Y]')
drivers_df.AGE.dtype

dtype('float64')

In [72]:
# Once Birth date is merged to the Data Frame, remove the Birth Date column
drivers_df.drop('BIRTHDATE', axis=1, inplace=True)
drivers_df.head()

Unnamed: 0,DRIVER_ID,GENDER,CREDIT_SCORE,AGE
0,85854409,2,824,27.0
1,85854409,1,824,28.0
2,85854448,2,666,51.0
3,85854454,2,666,34.0
4,85854553,2,824,88.0


In [73]:
# Clean Credit Score - NS0,NH0,TF1,FDL,AG0
drivers_df['CREDIT_SCORE'].value_counts(dropna=False)
credit_na_values = ['NS0', 'NH0','TF1', 'FDL', 'AG0']
drivers_df = drivers_df[drivers_df.CREDIT_SCORE.isin(credit_na_values) == False]
drivers_df['CREDIT_SCORE'].value_counts(dropna=False)

750    155
666    130
824    111
847     31
768      9
454      4
395      4
654      3
546      3
672      2
619      2
428      2
478      2
405      2
791      2
440      1
390      1
375      1
334      1
565      1
455      1
321      1
430      1
683      1
467      1
626      1
838      1
613      1
399      1
357      1
342      1
392      1
563      1
535      1
764      1
402      1
641      1
859      1
325      1
875      1
517      1
483      1
374      1
Name: CREDIT_SCORE, dtype: int64

In [74]:
drivers_df.head()

Unnamed: 0,DRIVER_ID,GENDER,CREDIT_SCORE,AGE
0,85854409,2,824,27.0
1,85854409,1,824,28.0
2,85854448,2,666,51.0
3,85854454,2,666,34.0
4,85854553,2,824,88.0


In [75]:
vehicles_df.head()

Unnamed: 0,VEHICLE_ID,YEAR,MAKE,MODEL
0,1000030,2017,MERCEDES-BENZ,S CLASS MAYBACH 650
1,2005191,2012,PORSCHE,911 CARRERA CARRERA 4/CARRERA 2
2,2005195,2012,PORSCHE,911 CARRERA 4/CARRERA 2
3,2005211,2012,PORSCHE,911 CARRERA S/CARRERA GTS
4,2005213,2012,PORSCHE,911 CARRERA CARRERA S/CARRERA 4S


In [76]:
# Columns we are going to use ML : Driver id, gender, age, credit score. vehicle id and that will be manipulated
# After we store in Database
# Before save check null once more on mapper
mapper_df.dropna()

Unnamed: 0,DRIVER_ID,VEHICLE_ID
0,85854409,7420963
1,85854448,7396448
2,85854454,6208988
3,85854553,6078430
4,85854569,6212210
...,...,...
495,85874294,99996
496,85874235,99996
497,85875895,7469287
498,85874628,6224271


In [80]:
drivers_df.to_csv("ready_db/drviers.csv", index=False)
vehicles_df.to_csv("ready_db/vehicles.csv", index=False)
mapper_df.to_csv("ready_db/mapper.csv", index=False)