#### Checking Python version

In [1]:
# Report the version of the interpreter that is actually running this kernel.
# A `!python -V` shell escape instead reports whichever `python` executable is
# first on the shell's PATH, which can differ from the kernel's interpreter
# (e.g. with multiple conda/virtualenv environments installed).
import platform

print(f"Python {platform.python_version()}")

Python 3.8.8


#### Importing libraries

In [2]:
# Standard library
import os
import warnings
from datetime import datetime as dt

# Third-party
import numpy as np
import pandas as pd

# Notebook-wide settings: lift the cap on displayed columns so wide frames are
# shown in full, and silence warnings of category UserWarning.
pd.set_option('display.max_columns', None)
warnings.simplefilter(action="ignore", category=UserWarning)

#### Importing data

In [3]:
# Location of the input CSV: directory and file name are joined when the file is read.
INPUT_FILEPATH = '../inputs'
INPUT_FILENAME = 'master_data.csv'

# Column used as the DataFrame index.
INDEX = 'policy_number'

# Columns parsed as dates on load.
DATE_COLS = [
    'proposal_received_date',
    'policy_issue_date',
    'agent_dob',
    'agent_doj',
]

# Strings treated as missing values on load.
NA_VALUES = [
    '',
    'NA',
    'N/A',
    'NULL',
    'null',
    '?',
    '*',
    '#N/A',
    '#VALUE!',
]

In [14]:
# Load the master CSV, indexed by INDEX, mapping the NA_VALUES strings to NaN
# and parsing DATE_COLS as datetimes.
# NOTE(review): dayfirst=True makes ambiguous dates parse as DD/MM; this assumes
# the raw file stores dates day-first — confirm against the CSV.
input_df = pd.read_csv(
    filepath_or_buffer=os.path.join(INPUT_FILEPATH, INPUT_FILENAME),
    index_col=INDEX,
    na_values=NA_VALUES,
    parse_dates=DATE_COLS,
    dayfirst=True,
)

In [15]:
# Preview the first five rows of the loaded frame.
input_df.head(n=5)

Unnamed: 0_level_0,proposal_received_date,policy_issue_date,owner_age,owner_gender,marital_status,num_nominee,smoker,medical,education,occupation,experience,income,zipcode,county,state,negative_zipcode,family_member,existing_num_policy,has_critical_health_history,policy_term,payment_freq,annual_premium,sum_insured,agent_code,agent_dob,agent_doj,agent_status,agent_education,agent_age,agent_tenure_days,agent_persistency,last_6_month_submissions,average_premium,is_reinstated,prev_persistency,num_complaints,target_completion_perc,has_contacted_in_last_6_months,credit_score,lapse
policy_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
1,2020-09-08,2020-09-08,22,Female,Single,1,No,No,Graduate,Sales,1,50000,19111,Philadelphia County,PA,0,1,0,0,25,Annually,377,580019,60503862,1997-07-03,2020-08-31,Active,Graduate,23,8,0.76,35.0,377,1,0.67,1,0.98,1,762,0
2,2020-08-24,2020-08-26,33,Female,Single,1,No,No,High School,Housewife,0,0,7764,Monmouth County,NJ,0,1,0,0,20,Quaterly,671,829002,60422713,1980-03-28,2018-05-17,Active,High School,40,830,0.87,54.0,671,0,,4,0.9,1,794,0
3,2020-08-24,2020-08-24,29,Female,Married,1,No,No,Lt High School,Other Service,13,100000,26335,Braxton County,WV,0,2,1,0,10,Annually,210,659027,60450939,1978-06-25,2019-02-25,Inactive,Some College,42,546,0.93,,579,0,,8,0.93,1,800,1
4,2020-08-24,2020-08-24,57,Male,Divorced,1,No,Yes,Graduate,Military,36,242286,98241,Skagit County,WA,0,3,0,0,15,Annually,1863,612890,60394377,1993-05-05,2017-07-10,Active,Graduate,27,1141,0.78,29.0,1333,1,0.72,8,0.95,1,850,0
5,2020-08-24,2020-08-25,32,Female,Divorced,1,No,No,Post Graduate,Teacher,8,20434,78160,Wilson County,TX,0,1,1,0,20,Annually,730,879549,60491531,1967-12-05,2020-03-20,Active,Graduate,53,157,0.89,44.0,652,0,,8,0.89,0,745,0


#### Checking data info

In [16]:
# Print a summary of the frame: index range, per-column non-null counts and dtypes.
input_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44948 entries, 1 to 44948
Data columns (total 40 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   proposal_received_date          44948 non-null  datetime64[ns]
 1   policy_issue_date               44948 non-null  datetime64[ns]
 2   owner_age                       44948 non-null  int64         
 3   owner_gender                    44948 non-null  object        
 4   marital_status                  44948 non-null  object        
 5   num_nominee                     44948 non-null  int64         
 6   smoker                          44948 non-null  object        
 7   medical                         44948 non-null  object        
 8   education                       44948 non-null  object        
 9   occupation                      44948 non-null  object        
 10  experience                      44948 non-null  int64         
 11  in

## Creating features

#### time_to_issue

In [17]:
# Whole days elapsed between receiving the proposal and issuing the policy.
input_df['time_to_issue'] = (
    input_df['policy_issue_date']
    .sub(input_df['proposal_received_date'])
    .dt.days
)

# Summary statistics of the new feature, shown as the cell output.
input_df['time_to_issue'].describe()

count    44948.000000
mean         5.640451
std         11.153843
min          0.000000
25%          0.000000
50%          1.000000
75%          7.000000
max        494.000000
Name: time_to_issue, dtype: float64