### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Loading Data

In [3]:
# Training split, read from the notebook's working directory. Per the preview
# below it carries the "Genetic Disorder" and "Disorder Subclass" columns.
df_train = pd.read_csv(filepath_or_buffer="train_genetic_disorders.csv")

# Test split, loaded the same way from the same directory.
df_test = pd.read_csv(filepath_or_buffer="test_genetic_disorders.csv")

In [5]:
df_train.head()

Unnamed: 0,Patient Id,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Patient First Name,Family Name,Father's name,...,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass
0,PID0x6418,2.0,Yes,No,Yes,No,4.760603,Richard,,Larre,...,,9.857562,,1.0,1.0,1.0,1.0,1.0,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
1,PID0x25d5,4.0,Yes,Yes,No,No,4.910669,Mike,,Brycen,...,Multiple,5.52256,normal,1.0,,1.0,1.0,0.0,,Cystic fibrosis
2,PID0x4a82,6.0,Yes,No,No,No,4.893297,Kimberly,,Nashon,...,Singular,,normal,0.0,1.0,1.0,1.0,1.0,Multifactorial genetic inheritance disorders,Diabetes
3,PID0x4ac8,12.0,Yes,No,Yes,No,4.70528,Jeffery,Hoelscher,Aayaan,...,Singular,7.919321,inconclusive,0.0,0.0,1.0,0.0,0.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,PID0x1bf7,11.0,Yes,No,,Yes,4.720703,Johanna,Stutzman,Suave,...,Multiple,4.09821,,0.0,0.0,0.0,0.0,,Multifactorial genetic inheritance disorders,Cancer


In [6]:
df_test.head()

Unnamed: 0,Patient Id,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Patient First Name,Family Name,Father's name,...,History of anomalies in previous pregnancies,No. of previous abortion,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5
0,PID0x4175,6.0,No,Yes,No,No,4.981655,Charles,,Kore,...,-99,2.0,Multiple,-99.0,slightly abnormal,True,True,True,True,True
1,PID0x21f5,10.0,Yes,No,,Yes,5.11889,Catherine,,Homero,...,Yes,-99.0,Multiple,8.179584,normal,False,False,False,True,False
2,PID0x49b8,5.0,No,,No,No,4.876204,James,,Danield,...,No,0.0,Singular,-99.0,slightly abnormal,False,False,True,True,False
3,PID0x2d97,13.0,No,Yes,Yes,No,4.687767,Brian,,Orville,...,Yes,-99.0,Singular,6.884071,normal,True,False,True,False,True
4,PID0x58da,5.0,No,,,Yes,5.152362,Gary,,Issiah,...,No,-99.0,Multiple,6.195178,normal,True,True,True,True,False


In [18]:
df_train.shape

(22083, 45)

In [19]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22083 entries, 0 to 22082
Data columns (total 45 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   Patient Id                                        21011 non-null  object 
 1   Patient Age                                       19643 non-null  float64
 2   Genes in mother's side                            21011 non-null  object 
 3   Inherited from father                             20724 non-null  object 
 4   Maternal gene                                     18317 non-null  object 
 5   Paternal gene                                     21011 non-null  object 
 6   Blood cell count (mcL)                            21011 non-null  float64
 7   Patient First Name                                21011 non-null  object 
 8   Family Name                                       11771 non-null  object 
 9   Father's name    

In [20]:
# Inspect the end of the training frame. In the output below the last rows hold
# no value in any displayed column, which suggests the file ends with empty
# records (they are removed further down with dropna(how='all')).
df_train.tail()

Unnamed: 0,Patient Id,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Patient First Name,Family Name,Father's name,...,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass
22078,,,,,,,,,,,...,,,,,,,,,,
22079,,,,,,,,,,,...,,,,,,,,,,
22080,,,,,,,,,,,...,,,,,,,,,,
22081,,,,,,,,,,,...,,,,,,,,,,
22082,,,,,,,,,,,...,,,,,,,,,,


### Data Cleaning

In [21]:
df_train.isnull().sum()

Patient Id                                           1072
Patient Age                                          2440
Genes in mother's side                               1072
Inherited from father                                1359
Maternal gene                                        3766
Paternal gene                                        1072
Blood cell count (mcL)                               1072
Patient First Name                                   1072
Family Name                                         10312
Father's name                                        1072
Mother's age                                         6790
Father's age                                         6761
Institute Name                                       5932
Location of Institute                                1072
Status                                               1072
Respiratory Rate (breaths/min)                       3131
Heart Rate (rates/min                                3097
Test 1        

In [22]:
df_test.isnull().sum()

Patient Id                                           173
Patient Age                                          173
Genes in mother's side                               173
Inherited from father                                717
Maternal gene                                       3829
Paternal gene                                        173
Blood cell count (mcL)                               173
Patient First Name                                   173
Family Name                                         9316
Father's name                                        173
Mother's age                                         173
Father's age                                         173
Institute Name                                      2170
Location of Institute                                173
Status                                               173
Respiratory Rate (breaths/min)                      3013
Heart Rate (rates/min                               3014
Test 1                         

#### Dropping unwanted attributes & rows

In [23]:
# Columns removed from both splits before cleaning.
# Entries keep their original order and are grouped by theme for readability.
columns_to_drop = [
    # identifiers and personal names
    "Patient Id",
    "Patient First Name",
    "Family Name",
    "Father's name",
    # institute details
    "Institute Name",
    "Location of Institute",
    # parental consent and parental ages
    "Parental consent",
    "Mother's age",
    "Father's age",
    # the five "Test N" columns
    "Test 1",
    "Test 2",
    "Test 3",
    "Test 4",
    "Test 5",
]

In [24]:
# Strip every column named in `columns_to_drop` from both splits.
# drop() returns a new frame, so each name is rebound to the trimmed copy.
# Note: with the default errors="raise", re-running this cell on the already
# trimmed frames fails with KeyError because the labels no longer exist.
df_train = df_train.drop(labels=columns_to_drop, axis="columns")
df_test = df_test.drop(labels=columns_to_drop, axis="columns")

In [25]:
df_train.shape

(22083, 31)

In [26]:
df_test.shape

(9463, 29)

In [27]:
# Discard records in which every remaining column is NaN; a row with at least
# one value is kept. Index labels of the surviving rows are not renumbered.
df_train = df_train.dropna(axis="index", how="all")
df_test = df_test.dropna(axis="index", how="all")

In [30]:
df_train.shape

(21011, 31)

In [31]:
df_test.shape

(9290, 29)

#### Renaming attributes

In [38]:
# Mapping from the raw CSV headers to short snake_case names.
# Keys must match the source headers character for character -- including the
# unbalanced parenthesis in "Heart Rate (rates/min", which is how that column
# is spelled in the data (see the isnull() listings above).
new_columns = {
    # patient and inheritance
    "Patient Age": "patient_age",
    "Genes in mother's side": "maternal_gene_defect",
    "Inherited from father": "paternal_gene_defect",
    "Maternal gene": "maternal_gene",
    "Paternal gene": "paternal_gene",
    "Blood cell count (mcL)": "RBC_count",
    # status and vitals
    "Status": "patient_status",
    "Respiratory Rate (breaths/min)": "respiratory_rate",
    "Heart Rate (rates/min": "heart_rate",
    "Follow-up": "follow_up",
    "Gender": "gender",
    # birth and maternal history
    "Birth asphyxia": "birth_asphyxia",
    "Autopsy shows birth defect (if applicable)": "birth_defect_autopsy",
    "Place of birth": "birth_place",
    "Folic acid details (peri-conceptional)": "folic_acid_use",
    "H/O serious maternal illness": "maternal_illness_history",
    "H/O radiation exposure (x-ray)": "radiation_exposure_history",
    "H/O substance abuse": "substance_abuse_history",
    "Assisted conception IVF/ART": "assisted_conception",
    "History of anomalies in previous pregnancies": "previous_pregnancy_anomalies",
    "Birth defects": "birth_defects",
    "Blood test result": "blood_test_result",
    # label columns
    "Genetic Disorder": "genetic_disorder",
    "Disorder Subclass": "disorder_subclass",
    # remaining counts
    "No. of previous abortion": "previous_abortions",
    "White Blood cell count (thousand per microliter)": "WBC_count",
    # "Symptom 1".."Symptom 5" -> "symptom_1".."symptom_5"
    **{f"Symptom {i}": f"symptom_{i}" for i in range(1, 6)},
}

In [39]:
# Apply the header mapping to both splits. rename() ignores mapping keys that a
# frame does not have, so the same dict can be used for train and test.
df_train = df_train.rename(mapper=new_columns, axis="columns")
df_test = df_test.rename(mapper=new_columns, axis="columns")

In [40]:
df_train.head()

Unnamed: 0,patient_age,maternal_gene_defect,paternal_gene_defect,maternal_gene,paternal_gene,RBC_count,patient_status,respiratory_rate,heart_rate,follow_up,...,birth_defects,WBC_count,blood_test_result,symptom_1,symptom_2,symptom_3,symptom_4,symptom_5,genetic_disorder,disorder_subclass
0,2.0,Yes,No,Yes,No,4.760603,Alive,Normal (30-60),Normal,High,...,,9.857562,,1.0,1.0,1.0,1.0,1.0,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
1,4.0,Yes,Yes,No,No,4.910669,Deceased,Tachypnea,Normal,High,...,Multiple,5.52256,normal,1.0,,1.0,1.0,0.0,,Cystic fibrosis
2,6.0,Yes,No,No,No,4.893297,Alive,Normal (30-60),Tachycardia,Low,...,Singular,,normal,0.0,1.0,1.0,1.0,1.0,Multifactorial genetic inheritance disorders,Diabetes
3,12.0,Yes,No,Yes,No,4.70528,Deceased,Tachypnea,Normal,High,...,Singular,7.919321,inconclusive,0.0,0.0,1.0,0.0,0.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,11.0,Yes,No,,Yes,4.720703,Alive,Tachypnea,Tachycardia,Low,...,Multiple,4.09821,,0.0,0.0,0.0,0.0,,Multifactorial genetic inheritance disorders,Cancer


#### Updating categorical values

In [41]:
# Collect the text-like columns (object or category dtype) of the training frame.
# `categorical_cols` is a notebook-level name that a later cell rebinds for the
# test split.
categorical_cols = df_train.select_dtypes(include=["object", "category"]).columns

# Replace NaN in those columns with the literal label "Unknown".
# Columns of any other dtype are not touched and keep their NaNs.
df_train[categorical_cols] = df_train[categorical_cols].fillna(value="Unknown")

In [51]:
df_train.head()

Unnamed: 0,patient_age,maternal_gene_defect,paternal_gene_defect,maternal_gene,paternal_gene,RBC_count,patient_status,respiratory_rate,heart_rate,follow_up,...,birth_defects,WBC_count,blood_test_result,symptom_1,symptom_2,symptom_3,symptom_4,symptom_5,genetic_disorder,disorder_subclass
0,2.0,Yes,No,Yes,No,4.760603,Alive,Normal (30-60),Normal,High,...,Unknown,9.857562,Unknown,1.0,1.0,1.0,1.0,1.0,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
1,4.0,Yes,Yes,No,No,4.910669,Deceased,Tachypnea,Normal,High,...,Multiple,5.52256,Normal,1.0,,1.0,1.0,0.0,Unknown,Cystic fibrosis
2,6.0,Yes,No,No,No,4.893297,Alive,Normal (30-60),Tachycardia,Low,...,Singular,,Normal,0.0,1.0,1.0,1.0,1.0,Multifactorial genetic inheritance disorders,Diabetes
3,12.0,Yes,No,Yes,No,4.70528,Deceased,Tachypnea,Normal,High,...,Singular,7.919321,Inconclusive,0.0,0.0,1.0,0.0,0.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,11.0,Yes,No,Unknown,Yes,4.720703,Alive,Tachypnea,Tachycardia,Low,...,Multiple,4.09821,Unknown,0.0,0.0,0.0,0.0,,Multifactorial genetic inheritance disorders,Cancer


In [52]:
# Same treatment for the test split. This rebinds `categorical_cols` to the
# test frame's object/category columns, so any later cell reading that name
# sees the test column set.
categorical_cols = df_test.select_dtypes(include=["object", "category"]).columns

# Only real NaN is replaced here.
# NOTE(review): the preview below still shows the string "-99" in several text
# columns (e.g. respiratory_rate, heart_rate), which looks like a missing-value
# sentinel specific to this split -- confirm whether it should also be mapped
# to "Unknown".
df_test[categorical_cols] = df_test[categorical_cols].fillna(value="Unknown")

In [54]:
df_test.head()

Unnamed: 0,patient_age,maternal_gene_defect,paternal_gene_defect,maternal_gene,paternal_gene,RBC_count,patient_status,respiratory_rate,heart_rate,follow_up,...,previous_pregnancy_anomalies,previous_abortions,birth_defects,WBC_count,blood_test_result,symptom_1,symptom_2,symptom_3,symptom_4,symptom_5
0,6.0,No,Yes,No,No,4.981655,Alive,Tachypnea,Normal,Low,...,-99,2.0,Multiple,-99.0,Slightly Abnormal,True,True,True,True,True
1,10.0,Yes,No,Unknown,Yes,5.11889,Alive,Unknown,-99,Low,...,Yes,-99.0,Multiple,8.179584,Normal,False,False,False,True,False
2,5.0,No,Unknown,No,No,4.876204,Deceased,Unknown,Normal,Low,...,No,0.0,Singular,-99.0,Slightly Abnormal,False,False,True,True,False
3,13.0,No,Yes,Yes,No,4.687767,Alive,-99,-99,Low,...,Yes,-99.0,Singular,6.884071,Normal,True,False,True,False,True
4,5.0,No,Unknown,Unknown,Yes,5.152362,Deceased,Tachypnea,Unknown,Low,...,No,-99.0,Multiple,6.195178,Normal,True,True,True,True,False


In [56]:
# Normalise surrounding whitespace and capitalisation of the training split's
# text feature columns.
#
# The column list is derived from df_train itself instead of the notebook-level
# `categorical_cols`: that name is rebound by the test-split fill cell, so
# reading it here made the result depend on which cell ran last and could hand
# this loop columns whose dtype in df_train is not text (where .str raises).
#
# The two label columns are left as they are. This cell's output shows them
# unchanged, and str.title() would upper-case the letter after an apostrophe
# ("Leber's" -> "Leber'S").
label_cols = ["genetic_disorder", "disorder_subclass"]
train_categorical_cols = (
    df_train
    .select_dtypes(include=["object", "category"])
    .columns
    .difference(label_cols)  # labels missing from the frame are simply ignored
)

for col in train_categorical_cols:
    df_train[col] = df_train[col].str.strip().str.title()

In [57]:
df_train.head()

Unnamed: 0,patient_age,maternal_gene_defect,paternal_gene_defect,maternal_gene,paternal_gene,RBC_count,patient_status,respiratory_rate,heart_rate,follow_up,...,birth_defects,WBC_count,blood_test_result,symptom_1,symptom_2,symptom_3,symptom_4,symptom_5,genetic_disorder,disorder_subclass
0,2.0,Yes,No,Yes,No,4.760603,Alive,Normal (30-60),Normal,High,...,Unknown,9.857562,Unknown,1.0,1.0,1.0,1.0,1.0,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
1,4.0,Yes,Yes,No,No,4.910669,Deceased,Tachypnea,Normal,High,...,Multiple,5.52256,Normal,1.0,,1.0,1.0,0.0,Unknown,Cystic fibrosis
2,6.0,Yes,No,No,No,4.893297,Alive,Normal (30-60),Tachycardia,Low,...,Singular,,Normal,0.0,1.0,1.0,1.0,1.0,Multifactorial genetic inheritance disorders,Diabetes
3,12.0,Yes,No,Yes,No,4.70528,Deceased,Tachypnea,Normal,High,...,Singular,7.919321,Inconclusive,0.0,0.0,1.0,0.0,0.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,11.0,Yes,No,Unknown,Yes,4.720703,Alive,Tachypnea,Tachycardia,Low,...,Multiple,4.09821,Unknown,0.0,0.0,0.0,0.0,,Multifactorial genetic inheritance disorders,Cancer


In [49]:
# Normalise surrounding whitespace and capitalisation of the test split's text
# columns.
#
# The columns are looked up on df_test here rather than taken from the
# notebook-level `categorical_cols`, so the cell yields the same result no
# matter which fill cell (train or test) last assigned that name. With the
# training column set, the loop would reference label columns the test frame
# does not have.
test_categorical_cols = df_test.select_dtypes(include=["object", "category"]).columns

for col in test_categorical_cols:
    df_test[col] = df_test[col].str.strip().str.title()

In [50]:
df_test.head()

Unnamed: 0,patient_age,maternal_gene_defect,paternal_gene_defect,maternal_gene,paternal_gene,RBC_count,patient_status,respiratory_rate,heart_rate,follow_up,...,previous_pregnancy_anomalies,previous_abortions,birth_defects,WBC_count,blood_test_result,symptom_1,symptom_2,symptom_3,symptom_4,symptom_5
0,6.0,No,Yes,No,No,4.981655,Alive,Tachypnea,Normal,Low,...,-99,2.0,Multiple,-99.0,Slightly Abnormal,True,True,True,True,True
1,10.0,Yes,No,Unknown,Yes,5.11889,Alive,Unknown,-99,Low,...,Yes,-99.0,Multiple,8.179584,Normal,False,False,False,True,False
2,5.0,No,Unknown,No,No,4.876204,Deceased,Unknown,Normal,Low,...,No,0.0,Singular,-99.0,Slightly Abnormal,False,False,True,True,False
3,13.0,No,Yes,Yes,No,4.687767,Alive,-99,-99,Low,...,Yes,-99.0,Singular,6.884071,Normal,True,False,True,False,True
4,5.0,No,Unknown,Unknown,Yes,5.152362,Deceased,Tachypnea,Unknown,Low,...,No,-99.0,Multiple,6.195178,Normal,True,True,True,True,False
