In [3]:
import numpy as np 
import pandas as pd
import os
import seaborn as sns
import plotly.express as px
# Short keys for the datasets; they are paired positionally with `csv_files`
# below, so the order of both lists matters.
keys = ['A', 'D', 'S']
# os.listdir() returns entries in arbitrary order, so sort the names to make
# the key <-> file pairing deterministic across machines and re-runs.
csv_files = sorted(f for f in os.listdir() if f.endswith('.csv'))
csv_files

['asthma_dataset.csv', 'diabetes_data.csv', 'stroke_data.csv']

In [4]:
# Load every CSV into a DataFrame stored under its short key.
# zip() silently drops unmatched entries, so fail loudly when the number of
# CSV files found does not match the number of keys.
if len(keys) != len(csv_files):
    raise ValueError(
        f"Expected {len(keys)} CSV files for keys {keys} but found {len(csv_files)}: {csv_files}"
    )
data_dict = {}
for k, csv_path in zip(keys, csv_files):
    data_dict[k] = pd.read_csv(csv_path)

In [5]:
# Preview the first rows of the frame stored under key 'A'.
data_dict['A'].head()

Unnamed: 0,VAX_TYPE,AGE_YRS,SEX,SLEEPING_PROB,CHEST_TIGHTNESS,BREATH,COUGH,ALLERGY,WHEEZING,ASTHMA
0,HEPA,1.5,M,False,False,False,False,False,False,False
1,RV5,28.0,F,False,True,True,True,False,True,True
2,FLU3,53.0,F,False,False,False,False,False,False,False
3,HEP,69.0,M,False,False,False,False,False,False,False
4,HEPA,24.0,F,False,False,True,True,False,True,True


In [6]:
# Preview the first rows of the frame stored under key 'D'.
data_dict['D'].head()

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes
0,4.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0
1,12.0,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0
2,13.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0
3,11.0,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0
4,8.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Preview the first rows of the frame stored under key 'S'.
data_dict['S'].head()

Unnamed: 0,sex,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1.0,63.0,0,1,1,4,1,228.69,36.6,1,1
1,1.0,42.0,0,1,1,4,0,105.92,32.5,0,1
2,0.0,61.0,0,0,1,4,1,171.23,34.4,1,1
3,1.0,41.0,1,0,1,3,0,174.12,24.0,0,1
4,1.0,85.0,0,0,1,4,1,186.21,29.0,1,1


In [8]:
# To identify the unique values of each column
def show_unique_values(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise the distinct values found in every column of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect.

    Returns
    -------
    pd.DataFrame
        One row per column of ``df`` with three columns: "Columns Name",
        "Num of unique value" and "Unique Value" (the distinct values joined
        with ", ").

    Notes
    -----
    Sets the global pandas option ``display.max_colwidth`` to 50 as a side
    effect, which affects how every frame is displayed afterwards.
    """
    rows = []
    for col in df.columns:
        try:
            # np.unique returns the distinct values in sorted order.
            unique_values = np.unique(df[col]).tolist()
        except TypeError:
            # Columns mixing incomparable types (e.g. strings and missing
            # values) cannot be sorted; fall back to first-seen order.
            unique_values = df[col].unique().tolist()
        rows.append([col, len(unique_values), ", ".join(str(value) for value in unique_values)])
    df_temp = pd.DataFrame(rows, columns=["Columns Name", "Num of unique value", "Unique Value"])
    # Global display setting: long "Unique Value" strings are truncated on display.
    pd.options.display.max_colwidth = 50
    return df_temp

def check_null(data: pd.DataFrame, solver: str=None):
    """Report missing values in ``data`` and optionally resolve them.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to inspect.
    solver : {'remove', 'replace', None}, optional
        ``None``      - only print a report.
        ``'remove'``  - return a new frame without the rows containing nulls.
        ``'replace'`` - fill the nulls of each affected column with that
        column's mean (mean imputation only makes sense for numeric columns).
        The fill is written into ``data`` itself, which is then returned.

    Returns
    -------
    pd.DataFrame or None
        ``None`` when ``solver`` is ``None``; otherwise the resulting frame.
        When a solver is given but there is nothing to fix, ``data`` is
        returned unchanged so ``df = check_null(df, solver=...)`` is always safe.

    Raises
    ------
    ValueError
        If ``solver`` is not one of the accepted values.
    """
    if solver not in ['remove', 'replace', None]:
        raise ValueError(f"The argument only takes 'remove' or 'replace' but found '{solver}'.")

    # Count the nulls once per column and keep only the affected columns.
    null_counts = data.isnull().sum()
    col_miss = [(col, int(count)) for col, count in null_counts.items() if count > 0]

    if not col_miss:
        print("The dataset does not contain any null values.")
        # Hand the frame back when a solver was requested; returning None here
        # would make `df = check_null(df, solver=...)` discard the data.
        return data if solver else None

    print("Columns with missing values and the number of missing value: ", col_miss)
    if solver == 'remove':
        return data.dropna()
    if solver == 'replace':
        for col, _ in col_miss:
            # Assign the filled column back explicitly: an in-place fillna on
            # the result of `.loc[:, col]` may act on a temporary object and
            # leave `data` untouched.
            data[col] = data[col].fillna(data[col].mean())
        return data
    return None
    
    

In [13]:
# List the distinct values of every column in the frame stored under key 'A'.
show_unique_values(data_dict['A'])

Unnamed: 0,Columns Name,Num of unique value,Unique Value
0,VAX_TYPE,86,"6VAX-F, ADEN_4_7, ANTH, BCG, CHOL, DPP, DT, DT..."
1,AGE_YRS,344,"0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07,..."
2,SEX,3,"F, M, U"
3,SLEEPING_PROB,2,"False, True"
4,CHEST_TIGHTNESS,2,"False, True"
5,BREATH,2,"False, True"
6,COUGH,2,"False, True"
7,ALLERGY,2,"False, True"
8,WHEEZING,2,"False, True"
9,ASTHMA,2,"False, True"


In [14]:
# List the distinct values of every column in the frame stored under key 'D'.
show_unique_values(data_dict['D'])

Unnamed: 0,Columns Name,Num of unique value,Unique Value
0,Age,13,"1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 1..."
1,Sex,2,"0.0, 1.0"
2,HighChol,2,"0.0, 1.0"
3,CholCheck,2,"0.0, 1.0"
4,BMI,80,"12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0..."
5,Smoker,2,"0.0, 1.0"
6,HeartDiseaseorAttack,2,"0.0, 1.0"
7,PhysActivity,2,"0.0, 1.0"
8,Fruits,2,"0.0, 1.0"
9,Veggies,2,"0.0, 1.0"


In [15]:
# List the distinct values of every column in the frame stored under key 'S'.
# Rows containing missing values are dropped first (on a temporary copy only;
# data_dict['S'] itself is not modified) - presumably so NaN does not show up
# among the unique values; TODO confirm.
show_unique_values(data_dict['S'].dropna())

Unnamed: 0,Columns Name,Num of unique value,Unique Value
0,sex,2,"0.0, 1.0"
1,age,111,"-9.0, -6.0, -5.0, -4.0, -3.0, -2.0, -1.0, 0.0,..."
2,hypertension,2,"0, 1"
3,heart_disease,2,"0, 1"
4,ever_married,2,"0, 1"
5,work_type,5,"0, 1, 2, 3, 4"
6,Residence_type,2,"0, 1"
7,avg_glucose_level,2903,"55.12, 55.22, 55.25, 55.27, 55.32, 55.34, 55.4..."
8,bmi,370,"11.5, 14.1, 15.0, 15.3, 15.4, 15.7, 16.0, 16.1..."
9,smoking_status,2,"0, 1"


In [16]:
# Show dtypes and non-null counts per column for the frame under key 'S';
# a non-null count below the row count reveals a column with missing values.
data_dict['S'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40910 entries, 0 to 40909
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sex                40907 non-null  float64
 1   age                40910 non-null  float64
 2   hypertension       40910 non-null  int64  
 3   heart_disease      40910 non-null  int64  
 4   ever_married       40910 non-null  int64  
 5   work_type          40910 non-null  int64  
 6   Residence_type     40910 non-null  int64  
 7   avg_glucose_level  40910 non-null  float64
 8   bmi                40910 non-null  float64
 9   smoking_status     40910 non-null  int64  
 10  stroke             40910 non-null  int64  
dtypes: float64(4), int64(7)
memory usage: 3.4 MB


In [103]:
# Count the missing values once per column, then keep only the columns that
# actually contain any, as (column name, number of missing values) pairs.
null_counts = data_dict['S'].isna().sum()
col_miss = [(col, count) for col, count in null_counts.items() if count > 0]
col_miss

[('sex', 3)]