#### Introduction

In this notebook, we change the input data format from CSV to Parquet, because in many real-life scenarios and production environments the data is stored as Parquet files, which are smaller in size and easier to work with.

In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os

#### Defining the filepaths, column lists etc.

In [6]:
# --- Locations ------------------------------------------------------------
# Source CSV (directory + file name are joined when the file is read).
INPUT_FILEPATH = 'data'
INPUT_FILENAME = 'master_data_final2.csv'

# Destination parquet file.
OUTPUT_FILEPATH = 'data'
OUTPUT_FILENAME = 'input_data.parquet'

# --- CSV parsing options --------------------------------------------------
# Columns that hold dates and must be parsed as such.
DATE_COLS = [
    'proposal_received_date',
    'policy_issue_date',
    'agent_dob',
    'agent_doj',
]

# Raw cell contents that should be interpreted as missing values.
NA_VALUES = [
    '',
    'NA',
    'N/A',
    'NULL',
    'null',
    '?',
    '*',
    '#N/A',
    '#VALUE!',
]

# Identifier-like columns are forced to string so they are not read as numbers.
DTYPE_DICT = {
    'zipcode': 'str',
    'agent_code': 'str',
    'policy_number': 'str',
}

#### Reading the CSV as a pandas DataFrame

In [7]:
def load_data(INPUT_FILEPATH, INPUT_FILENAME) -> pd.DataFrame:
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    INPUT_FILEPATH : str
        Directory that contains the CSV file.
    INPUT_FILENAME : str
        Name of the CSV file inside ``INPUT_FILEPATH``.

    Returns
    -------
    pd.DataFrame
        The parsed contents of the CSV file.

    Notes
    -----
    The parsing options are taken from the module-level names ``NA_VALUES``
    (strings treated as missing), ``DATE_COLS`` (columns parsed as dates,
    day first) and ``DTYPE_DICT`` (explicit column dtypes); they must be
    defined before this function is called.

    NOTE(review): the ``head()`` output in this notebook shows an
    ``Unnamed: 0`` column, which suggests the CSV carries a saved index --
    confirm whether it should be dropped (e.g. via ``index_col=0``).
    """
    csv_path = os.path.join(INPUT_FILEPATH, INPUT_FILENAME)

    # Collected in one place so the reader configuration is easy to scan.
    read_options = {
        'na_values': NA_VALUES,
        'parse_dates': DATE_COLS,
        'dayfirst': True,
        'dtype': DTYPE_DICT,
    }

    return pd.read_csv(csv_path, **read_options)

# Load the raw CSV once; the cells below reuse `input_df`.
input_df = load_data(
    INPUT_FILEPATH=INPUT_FILEPATH,
    INPUT_FILENAME=INPUT_FILENAME,
)

In [8]:
# Preview the first five rows to sanity-check the parsed columns.
input_df.head(n=5)

Unnamed: 0,policy_number,proposal_received_date,policy_issue_date,owner_age,owner_gender,marital_status,num_nominee,smoker,medical,education,...,agent_persistency,last_6_month_submissions,average_premium,is_reinstated,prev_persistency,num_complaints,target_completion_perc,has_contacted_in_last_6_months,credit_score,lapse
0,1,2020-09-08,2020-09-08,22,Female,Single,1,No,No,Graduate,...,0.76,35,377,1,0.67,1,0.98,1,762,0
1,2,2020-08-24,2020-08-26,33,Female,Single,1,No,No,High School,...,0.87,54,671,0,0.0,4,0.9,1,794,0
2,3,2020-08-24,2020-08-24,29,Female,Married,1,No,No,Lt High School,...,0.93,0,579,0,0.0,8,0.93,1,800,0
3,4,2020-08-24,2020-08-24,57,Male,Divorced,1,No,Yes,Graduate,...,0.78,29,1333,1,0.72,8,0.95,1,850,0
4,5,2020-08-24,2020-08-25,32,Female,Divorced,1,No,No,Post Graduate,...,0.89,44,652,0,0.0,8,0.89,0,745,1


#### Writing as parquet file

In [9]:
# Destination of the parquet file, built from the configured directory and name.
parquet_path = os.path.join(OUTPUT_FILEPATH, OUTPUT_FILENAME)

# Convert the DataFrame to an Arrow table, then persist it as parquet.
table = pa.Table.from_pandas(input_df)
pq.write_table(table, parquet_path)