## **Import Libraries**

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

### **Step 1: Data Creation and Loading**

In [2]:
# step-1 >> Creating Synthetic data and loading it
# columns also called features

data = {
        "patient_id" : [1,2,3,4,5,6,7,8],
        "name" :["AB","BC","CD","DE","EF","FG","GH","HI"],
        "age" :[34,np.nan,44,64,34,np.nan,45,55] ,
        "gender":["M","F","F","M","M","M","F","F"],
        "blood_presssure" :["120/80",None,"140/90","abc","130/85","135/80","120/80","130/80"],
        "diagnosis":["Hypertention","Diabetes","None","Diabetes","hypertention","HYPERTENTION",None,"pre-Diabetic"],
        "date_of_visit":["2024-01-18","203/01/18","20230118","2022-01-18","2023-01-18","2023-01-18","2023/01/18","20230118"]
}

### **Step 2: Load into DataFrame**

In [3]:
df = pd.DataFrame(data)
df

Unnamed: 0,patient_id,name,age,gender,blood_presssure,diagnosis,date_of_visit
0,1,AB,34.0,M,120/80,Hypertention,2024-01-18
1,2,BC,,F,,Diabetes,203/01/18
2,3,CD,44.0,F,140/90,,20230118
3,4,DE,64.0,M,abc,Diabetes,2022-01-18
4,5,EF,34.0,M,130/85,hypertention,2023-01-18
5,6,FG,,M,135/80,HYPERTENTION,2023-01-18
6,7,GH,45.0,F,120/80,,2023/01/18
7,8,HI,55.0,F,130/80,pre-Diabetic,20230118


### **Step 3: Cleaning Individual Columns**

 > #### **Normalize Diagnosis**

In [4]:
df['diagnosis'] = df['diagnosis'].str.lower().str.strip()
df

Unnamed: 0,patient_id,name,age,gender,blood_presssure,diagnosis,date_of_visit
0,1,AB,34.0,M,120/80,hypertention,2024-01-18
1,2,BC,,F,,diabetes,203/01/18
2,3,CD,44.0,F,140/90,none,20230118
3,4,DE,64.0,M,abc,diabetes,2022-01-18
4,5,EF,34.0,M,130/85,hypertention,2023-01-18
5,6,FG,,M,135/80,hypertention,2023-01-18
6,7,GH,45.0,F,120/80,,2023/01/18
7,8,HI,55.0,F,130/80,pre-diabetic,20230118


In [5]:
df.replace({'diagnosis': {"none": np.nan, "": np.nan, "null": np.nan}}, inplace=True)
df

Unnamed: 0,patient_id,name,age,gender,blood_presssure,diagnosis,date_of_visit
0,1,AB,34.0,M,120/80,hypertention,2024-01-18
1,2,BC,,F,,diabetes,203/01/18
2,3,CD,44.0,F,140/90,,20230118
3,4,DE,64.0,M,abc,diabetes,2022-01-18
4,5,EF,34.0,M,130/85,hypertention,2023-01-18
5,6,FG,,M,135/80,hypertention,2023-01-18
6,7,GH,45.0,F,120/80,,2023/01/18
7,8,HI,55.0,F,130/80,pre-diabetic,20230118


> #### **Impute Missing Ages**

In [6]:
# filling nan in age feature
df["age"] = df['age'].fillna(df['age'].mean())
df

Unnamed: 0,patient_id,name,age,gender,blood_presssure,diagnosis,date_of_visit
0,1,AB,34.0,M,120/80,hypertention,2024-01-18
1,2,BC,46.0,F,,diabetes,203/01/18
2,3,CD,44.0,F,140/90,,20230118
3,4,DE,64.0,M,abc,diabetes,2022-01-18
4,5,EF,34.0,M,130/85,hypertention,2023-01-18
5,6,FG,46.0,M,135/80,hypertention,2023-01-18
6,7,GH,45.0,F,120/80,,2023/01/18
7,8,HI,55.0,F,130/80,pre-diabetic,20230118


> #### **Clean Blood Pressure**

In [7]:
df["blood_presssure"].where(df["blood_presssure"].str.contains(r'^\d+/\d+$'))

0    120/80
1       NaN
2    140/90
3       NaN
4    130/85
5    135/80
6    120/80
7    130/80
Name: blood_presssure, dtype: object

> #### **Parse Dates**

In [8]:
df['date_of_visit'] = pd.to_datetime(df['date_of_visit'],errors='coerce')
df

Unnamed: 0,patient_id,name,age,gender,blood_presssure,diagnosis,date_of_visit
0,1,AB,34.0,M,120/80,hypertention,2024-01-18
1,2,BC,46.0,F,,diabetes,NaT
2,3,CD,44.0,F,140/90,,NaT
3,4,DE,64.0,M,abc,diabetes,2022-01-18
4,5,EF,34.0,M,130/85,hypertention,2023-01-18
5,6,FG,46.0,M,135/80,hypertention,2023-01-18
6,7,GH,45.0,F,120/80,,NaT
7,8,HI,55.0,F,130/80,pre-diabetic,NaT


> #### **Exact Duplicate Termination**

In [9]:
df.drop_duplicates(inplace=True)
df

Unnamed: 0,patient_id,name,age,gender,blood_presssure,diagnosis,date_of_visit
0,1,AB,34.0,M,120/80,hypertention,2024-01-18
1,2,BC,46.0,F,,diabetes,NaT
2,3,CD,44.0,F,140/90,,NaT
3,4,DE,64.0,M,abc,diabetes,2022-01-18
4,5,EF,34.0,M,130/85,hypertention,2023-01-18
5,6,FG,46.0,M,135/80,hypertention,2023-01-18
6,7,GH,45.0,F,120/80,,NaT
7,8,HI,55.0,F,130/80,pre-diabetic,NaT


> #### **Deleting NaN**

In [10]:
df.dropna(subset=['blood_presssure','date_of_visit'],inplace=True)
df

Unnamed: 0,patient_id,name,age,gender,blood_presssure,diagnosis,date_of_visit
0,1,AB,34.0,M,120/80,hypertention,2024-01-18
3,4,DE,64.0,M,abc,diabetes,2022-01-18
4,5,EF,34.0,M,130/85,hypertention,2023-01-18
5,6,FG,46.0,M,135/80,hypertention,2023-01-18
