# Data Pipeline Demo

## 0. Load Required Libraries

In [1]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pandas as pd
import numpy as np 
import joblib
import os
import yaml
import src.util as util

## 1. Load Configuration File

In [111]:
# Load the pipeline configuration via the project helper. Later cells read file
# paths, the predictor list and the expected dtypes/ranges from this object.
config_data = util.load_config()

## 2. Data Collection

In [60]:
def read_raw_data(config: dict) -> pd.DataFrame:
    """Load the raw dataset from the Excel workbook named in the configuration.

    Parameters
    ----------
    config : dict
        Pipeline configuration; only ``config["raw_dataset_file"]`` is read here.

    Returns
    -------
    pd.DataFrame
        The frame produced by ``pd.read_excel`` for that file.

    Raises
    ------
    KeyError
        If ``"raw_dataset_file"`` is missing from ``config``.
    """
    # The source is read with the pyxlsb engine (binary Excel workbook), not a CSV.
    return pd.read_excel(config["raw_dataset_file"], engine="pyxlsb")

In [61]:
# Read the raw workbook named in the config into a DataFrame
raw_dataset = read_raw_data(config_data)

In [62]:
# Check our data (pandas abbreviates the rendering of a frame this large)
raw_dataset

Unnamed: 0,Patient Age at Treatment,Date patient started trying to become pregnant OR date of last pregnancy,"Total Number of Previous cycles, Both IVF and DI","Total Number of Previous treatments, Both IVF and DI at clinic",Total Number of Previous IVF cycles,Total Number of Previous DI cycles,"Total number of previous pregnancies, Both IVF and DI",Total number of IVF pregnancies,Total number of DI pregnancies,Total number of live births - conceived through IVF or DI,...,Heart Three Birth Weight,Heart Three Sex,Heart Three Delivery Date,Heart Three Birth Congenital Abnormalities,Heart Four Weeks Gestation,Heart Four Birth Outcome,Heart Four Birth Weight,Heart Four Sex,Heart Four Delivery Date,Heart Four Birth Congenital Abnormalities
0,40-42,,2,2,2,0,0,0,0,0,...,,,,,,,,,,
1,45-50,,3,3,3,0,0,0,0,0,...,,,,,,,,,,
2,35-37,,0,0,0,0,0,0,0,0,...,,,,,,,,,,
3,18 - 34,,2,2,2,0,0,0,0,0,...,,,,,,,,,,
4,18 - 34,,5,5,5,0,1,1,0,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158514,18 - 34,,3,3,0,3,0,0,0,0,...,,,,,,,,,,
158515,38-39,,0,0,0,0,0,0,0,0,...,,,,,,,,,,
158516,38-39,,1,1,0,1,0,0,0,0,...,,,,,,,,,,
158517,18 - 34,,2,0,0,2,0,0,0,0,...,,,,,,,,,,


In [63]:
# Swap the current index for a fresh default integer index; drop=True discards the
# old index instead of inserting it as a column
raw_dataset = raw_dataset.reset_index(drop=True)

In [64]:
# Now check the result of the index reset
raw_dataset

Unnamed: 0,Patient Age at Treatment,Date patient started trying to become pregnant OR date of last pregnancy,"Total Number of Previous cycles, Both IVF and DI","Total Number of Previous treatments, Both IVF and DI at clinic",Total Number of Previous IVF cycles,Total Number of Previous DI cycles,"Total number of previous pregnancies, Both IVF and DI",Total number of IVF pregnancies,Total number of DI pregnancies,Total number of live births - conceived through IVF or DI,...,Heart Three Birth Weight,Heart Three Sex,Heart Three Delivery Date,Heart Three Birth Congenital Abnormalities,Heart Four Weeks Gestation,Heart Four Birth Outcome,Heart Four Birth Weight,Heart Four Sex,Heart Four Delivery Date,Heart Four Birth Congenital Abnormalities
0,40-42,,2,2,2,0,0,0,0,0,...,,,,,,,,,,
1,45-50,,3,3,3,0,0,0,0,0,...,,,,,,,,,,
2,35-37,,0,0,0,0,0,0,0,0,...,,,,,,,,,,
3,18 - 34,,2,2,2,0,0,0,0,0,...,,,,,,,,,,
4,18 - 34,,5,5,5,0,1,1,0,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158514,18 - 34,,3,3,0,3,0,0,0,0,...,,,,,,,,,,
158515,38-39,,0,0,0,0,0,0,0,0,...,,,,,,,,,,
158516,38-39,,1,1,0,1,0,0,0,0,...,,,,,,,,,,
158517,18 - 34,,2,0,0,2,0,0,0,0,...,,,,,,,,,,


In [74]:
# Normalise the column labels to snake_case: lower-case them, strip '-', '_' and ',',
# then turn every run of spaces into a single underscore. `regex` is passed explicitly
# because the default of str.replace differs between pandas versions.
raw_dataset.columns = (
    raw_dataset.columns
    .str.lower()
    .str.replace(r"[-_,]", "", regex=True)
    .str.replace(r" +", "_", regex=True)
)

In [75]:
# List the normalised column labels
raw_dataset.columns

Index(['patient_age_at_treatment',
       'date_patient_started_trying_to_become_pregnant_or_date_of_last_pregnancy',
       'total_number_of_previous_cycles_both_ivf_and_di',
       'total_number_of_previous_treatments_both_ivf_and_di_at_clinic',
       'total_number_of_previous_ivf_cycles',
       'total_number_of_previous_di_cycles',
       'total_number_of_previous_pregnancies_both_ivf_and_di',
       'total_number_of_ivf_pregnancies', 'total_number_of_di_pregnancies',
       'total_number_of_live_births_conceived_through_ivf_or_di',
       'total_number_of_live_births_conceived_through_ivf',
       'total_number_of_live_births_conceived_through_di',
       'type_of_infertility_female_primary',
       'type_of_infertility_female_secondary',
       'type_of_infertility_male_primary',
       'type_of_infertility_male_secondary',
       'type_of_infertility_couple_primary',
       'type_of_infertility_couple_secondary',
       'cause_of_infertility_tubal_disease',
       'cause_of_inf

In [76]:
# Save raw dataset (with the renamed columns) to the path given in the config.
# util.pickle_dump is a project helper; presumably it pickles the object — confirm in src/util.py
util.pickle_dump(raw_dataset, config_data["raw_dataset_path"])

## 2. Data Definition

In [77]:
# Define the data type, the range of values and a short explanation for each variable

In [78]:
# For every column print its name, up to ten distinct values and the distinct-value
# count. unique() lists NaN as a value, whereas nunique() leaves NaN out of the count.
for column_name in raw_dataset.columns:
    column_values = raw_dataset[column_name]
    print(column_name)
    print(column_values.unique()[:10])
    print(column_values.nunique())
    print()

patient_age_at_treatment
['40-42' '45-50' '35-37' '18 - 34' '43-44' '38-39' '999']
7

date_patient_started_trying_to_become_pregnant_or_date_of_last_pregnancy
[nan 13. 11. 10. 15. 16. 17. 14. 19. 12.]
18

total_number_of_previous_cycles_both_ivf_and_di
['2' '3' '0' '5' '>=5' '1' '4']
7

total_number_of_previous_treatments_both_ivf_and_di_at_clinic
['2' '3' '0' '5' '1' '>=5' '4']
7

total_number_of_previous_ivf_cycles
['2' '3' '0' '5' '4' '1' '>=5']
7

total_number_of_previous_di_cycles
['0' '3' '>=5' '1' '2' '4' '5']
7

total_number_of_previous_pregnancies_both_ivf_and_di
['0' '1' '2' '4' '3' '5' '>=5']
7

total_number_of_ivf_pregnancies
['0' '1' '2' '4' '3' '5' '>=5']
7

total_number_of_di_pregnancies
[0 1 3 2 4]
5

total_number_of_live_births_conceived_through_ivf_or_di
[0 1 2 3 5 4]
6

total_number_of_live_births_conceived_through_ivf
[0 1 2 3 5 4]
6

total_number_of_live_births_conceived_through_di
[0 1 2 3]
4

type_of_infertility_female_primary
[0 1]
2

type_of_infertility_female_

## 3. Data Validation

### 3.1. Data Types

In [79]:
# Check the data type and the non-null count of each variable
raw_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158519 entries, 0 to 158518
Data columns (total 95 columns):
 #   Column                                                                    Non-Null Count   Dtype  
---  ------                                                                    --------------   -----  
 0   patient_age_at_treatment                                                  158519 non-null  object 
 1   date_patient_started_trying_to_become_pregnant_or_date_of_last_pregnancy  617 non-null     float64
 2   total_number_of_previous_cycles_both_ivf_and_di                           158519 non-null  object 
 3   total_number_of_previous_treatments_both_ivf_and_di_at_clinic             158519 non-null  object 
 4   total_number_of_previous_ivf_cycles                                       158519 non-null  object 
 5   total_number_of_previous_di_cycles                                        158519 non-null  object 
 6   total_number_of_previous_pregnancies_both_ivf_and_di

### 3.2. Range

In [80]:
# Summary statistics of the numeric columns, one row per column, ordered so that the
# columns with the largest standard deviation come first
raw_dataset.describe().transpose().sort_values("std", ascending=False)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
heart_three_delivery_date,120.0,1668.283333,484.178698,999.0,999.0,2015.0,2016.0,2016.0
heart_one_delivery_date,37403.0,1667.621929,482.478536,999.0,999.0,2016.0,2016.0,2016.0
heart_two_delivery_date,4995.0,1723.221021,460.299141,999.0,999.0,2016.0,2016.0,2016.0
date_of_egg_thawing,1093.0,5.489478,73.84703,0.0,0.0,0.0,0.0,999.0
date_of_egg_mixing,97807.0,2.914986,53.84811,0.0,0.0,0.0,0.0,999.0
date_of_embryo_transfer,116577.0,5.639989,52.725893,0.0,0.0,3.0,5.0,999.0
date_of_embryo_thawing,38901.0,0.623172,24.286163,0.0,0.0,0.0,0.0,999.0
embryos_from_eggs_microinjected,148106.0,2.097201,3.709646,0.0,0.0,0.0,3.0,42.0
date_patient_started_trying_to_become_pregnant_or_date_of_last_pregnancy,617.0,13.247974,3.087748,2.0,11.0,13.0,15.0,20.0
embryos_stored_for_use_by_patient,148106.0,1.00077,2.218,0.0,0.0,0.0,1.0,37.0


In [81]:
# The summary shows that several date columns contain the value 999, which is not a plausible value and looks like a placeholder. TODO: investigate these columns further.

### 3.3. Data Dimensions

In [82]:
# (rows, columns) of the frame; the steps so far only renamed columns and reset the index
raw_dataset.shape

(158519, 95)

### 3.4. Handling Variables Error

#### 3.4.1. Handling Variable 'Total Number of Previous IVF cycles'

In [83]:
# The column holds digit strings plus the open-ended bucket '>=5'; that bucket is
# recoded as 6 (presumably to keep it apart from an exact '5') before casting to int
raw_dataset['total_number_of_previous_ivf_cycles'] = (
    raw_dataset['total_number_of_previous_ivf_cycles']
    .replace({">=5": 6})
    .astype(int)
)

#### 3.4.2. Handling Variable 'Total number of IVF pregnancies'

In [84]:
# Same recoding as for the previous-cycles column: the '>=5' bucket becomes 6,
# then the digit strings are cast to int
raw_dataset['total_number_of_ivf_pregnancies'] = (
    raw_dataset['total_number_of_ivf_pregnancies']
    .replace({">=5": 6})
    .astype(int)
)

#### 3.4.3. Handling Variable 'Fresh Cycle'

In [85]:
# fresh_cycle contains missing values, so a direct astype(int) raises
# IntCastingNaNError. Missing entries are therefore filled with 0 before the cast.
raw_dataset['fresh_cycle'] = raw_dataset['fresh_cycle'].fillna(0).astype(int)

#### 3.4.4. Handling Variable 'Frozen Cycle'

In [87]:
# frozen_cycle contains missing values, so a direct astype(int) raises
# IntCastingNaNError. Missing entries are therefore filled with 0 before the cast.
raw_dataset['frozen_cycle'] = raw_dataset['frozen_cycle'].fillna(0).astype(int)

#### 3.4.5. Handling Variable 'Eggs Source'

In [89]:
# Label missing egg_source entries explicitly instead of leaving them as NaN
raw_dataset['egg_source'] = raw_dataset['egg_source'].fillna('not assigned')

#### 3.4.6. Handling Variable 'Eggs Thawed'

In [90]:
# Missing eggs_thawed counts are treated as 0 so that the column can be cast to int
raw_dataset['eggs_thawed'] = (
    raw_dataset['eggs_thawed']
    .fillna(0)
    .astype(int)
)

#### 3.4.7. Handling Variable 'Fresh Eggs Collected'

In [91]:
# The open-ended "> 50" bucket is recoded as 51 so that the column can become numeric
raw_dataset['fresh_eggs_collected'] = raw_dataset['fresh_eggs_collected'].replace("> 50", 51)

In [92]:
# Missing counts become 0, then the column is cast to int
raw_dataset['fresh_eggs_collected'] = raw_dataset['fresh_eggs_collected'].fillna(0).astype(int)

#### 3.4.8. Handling Variable 'Eggs Mixed With Partner Sperm'

In [93]:
# A direct astype(int) fails on this column: it contains the open-ended bucket "> 50"
# (ValueError: invalid literal for int()). Recode that bucket as 51, treat missing
# counts as 0, and only then cast to int.
raw_dataset['eggs_mixed_with_partner_sperm'] = (
    raw_dataset['eggs_mixed_with_partner_sperm']
    .replace("> 50", 51)
    .fillna(0)
    .astype(int)
)

#### 3.4.9. Handling Variable 'Embryos Transfered'

In [96]:
# Missing embryos_transfered counts become 0, then the column is cast to int
raw_dataset['embryos_transfered'] = raw_dataset['embryos_transfered'].fillna(0).astype(int)

#### 3.4.10. Handling Variable "Live birth occurrence"

In [97]:
# Fill missing target values with 0 (presumably "no live birth recorded" — confirm with the data dictionary)
raw_dataset['live_birth_occurrence'] = raw_dataset['live_birth_occurrence'].fillna(0)

In [98]:
# Cast the target column to int
raw_dataset['live_birth_occurrence'] = raw_dataset['live_birth_occurrence'].astype(int)

In [99]:
# Remove the rows whose date_of_egg_thawing equals 999 (apparently a placeholder value);
# rows where the column is NaN do not match the comparison and are kept
raw_dataset.drop(
    index=raw_dataset.index[raw_dataset["date_of_egg_thawing"] == 999],
    inplace=True,
)

In [100]:
# Remove the rows whose age band is the string '999' (apparently a placeholder value)
raw_dataset.drop(
    index=raw_dataset.index[raw_dataset["patient_age_at_treatment"] == '999'],
    inplace=True,
)

In [101]:
# Re-inspect dtypes and row count after the casts and the row drops above
raw_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156125 entries, 0 to 158518
Data columns (total 95 columns):
 #   Column                                                                    Non-Null Count   Dtype  
---  ------                                                                    --------------   -----  
 0   patient_age_at_treatment                                                  156125 non-null  object 
 1   date_patient_started_trying_to_become_pregnant_or_date_of_last_pregnancy  617 non-null     float64
 2   total_number_of_previous_cycles_both_ivf_and_di                           156125 non-null  object 
 3   total_number_of_previous_treatments_both_ivf_and_di_at_clinic             156125 non-null  object 
 4   total_number_of_previous_ivf_cycles                                       156125 non-null  int64  
 5   total_number_of_previous_di_cycles                                        156125 non-null  object 
 6   total_number_of_previous_pregnancies_both_ivf_and_di

In [102]:
# Save the cleaned frame to the path given in the config (util.pickle_dump is a project helper)
util.pickle_dump(raw_dataset, config_data["cleaned_raw_dataset_path"])

## 4. Data Defense

In [116]:
def check_data(input_data, params):
    """Validate column dtypes and value ranges of a dataset against the config.

    Parameters
    ----------
    input_data : pd.DataFrame
        Frame to validate. It must contain every range-checked column listed in
        ``range_checked_columns`` below.
    params : dict
        Expected schema: ``object_columns`` and ``int32_columns`` (ordered lists of
        column names) plus one ``range_<column>`` entry per range-checked column,
        indexable as ``[lower, upper]`` with both bounds inclusive.

    Raises
    ------
    AssertionError
        If the object/int column lists differ from the expected ones (order
        matters), or if any value of a range-checked column is missing or lies
        outside its configured range.
    KeyError
        If a range-checked column or its ``range_<column>`` entry is absent.

    Notes
    -----
    The checks are ``assert`` statements, so they are skipped entirely when Python
    runs with ``-O``.
    """
    # Columns whose values must lie inside params["range_<column>"], in check order.
    range_checked_columns = (
        'total_number_of_previous_ivf_cycles',
        'total_number_of_ivf_pregnancies',
        'total_number_of_live_births_conceived_through_ivf',
        'type_of_infertility_female_primary',
        'type_of_infertility_female_secondary',
        'type_of_infertility_male_primary',
        'type_of_infertility_male_secondary',
        'type_of_infertility_couple_primary',
        'type_of_infertility_couple_secondary',
        'cause_of_infertility_tubal_disease',
        'cause_of_infertility_ovulatory_disorder',
        'cause_of_infertility_male_factor',
        'cause_of_infertility_patient_unexplained',
        'cause_of_infertility_endometriosis',
        'cause_of_infertility_cervical_factors',
        'cause_of_infertility_female_factors',
        'cause_of_infertility_partner_sperm_concentration',
        'cause_of_infertility_partner_sperm_morphology',
        'causes_of_infertility_partner_sperm_motility',
        'cause_of_infertility_partner_sperm_immunological_factors',
        'stimulation_used',
        'fresh_cycle',
        'frozen_cycle',
        'eggs_thawed',
        'fresh_eggs_collected',
        'eggs_mixed_with_partner_sperm',
        'embryos_transfered',
    )

    # Check data types. The config key is named "int32_columns", but it is compared
    # against whatever select_dtypes("int") returns for the frame.
    assert input_data.select_dtypes("object").columns.to_list() == params["object_columns"], "an error occurs in object column(s)."
    assert input_data.select_dtypes("int").columns.to_list() == params["int32_columns"], "an error occurs in int32 column(s)."

    # Check range of data. between() is inclusive on both ends and yields False for
    # missing values, so .all() fails on any out-of-range or missing entry.
    for column in range_checked_columns:
        bounds = params[f"range_{column}"]
        assert input_data[column].between(bounds[0], bounds[1]).all(), f"an error occurs in {column} range."

In [117]:
# Run the defensive checks on the cleaned frame; an AssertionError is raised on the first violation
check_data(raw_dataset, config_data)

## 5. Data Splitting

In [118]:
# Separate the target/label column from the input features; the feature columns are
# the ones listed under "predictors" in the config. Both are copies of the source frame.
y = raw_dataset['live_birth_occurrence'].copy()
x = raw_dataset[config_data["predictors"]].copy()

In [119]:
# Confirm the predictor frame has the expected columns, dtypes and no missing values
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156125 entries, 0 to 158518
Data columns (total 28 columns):
 #   Column                                                    Non-Null Count   Dtype 
---  ------                                                    --------------   ----- 
 0   patient_age_at_treatment                                  156125 non-null  object
 1   total_number_of_previous_ivf_cycles                       156125 non-null  int64 
 2   total_number_of_ivf_pregnancies                           156125 non-null  int64 
 3   total_number_of_live_births_conceived_through_ivf         156125 non-null  int64 
 4   type_of_infertility_female_primary                        156125 non-null  int64 
 5   type_of_infertility_female_secondary                      156125 non-null  int64 
 6   type_of_infertility_male_primary                          156125 non-null  int64 
 7   type_of_infertility_male_secondary                        156125 non-null  int64 
 8   type_of_infert

In [120]:
# First split, splitting train and test set with ratio 0.7:0.3 and do stratify splitting
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42, stratify = y)

In [121]:
# Second split, splitting test and valid set with ratio 0.5:0.5 and do stratify splitting
x_valid, x_test, y_valid, y_test = train_test_split(x_test, y_test, test_size = 0.5, random_state = 42, stratify = y_test)

In [122]:
# Persist every split. Each "<split>_set_path" config entry is indexed with
# [0] for the features file and [1] for the target file.
for path_key, (features, target) in (
    ("train_set_path", (x_train, y_train)),
    ("valid_set_path", (x_valid, y_valid)),
    ("test_set_path", (x_test, y_test)),
):
    util.pickle_dump(features, config_data[path_key][0])
    util.pickle_dump(target, config_data[path_key][1])