# Data
Source: [UCI Machine Learning Repository — Heart Disease dataset (id 45)](https://archive.ics.uci.edu/dataset/45/heart+disease)

In [84]:
import pandas as pd
from ucimlrepo import fetch_ucirepo

# Fetch the dataset (id=45 matches the dataset URL given at the top of the notebook).
# NOTE(review): fetch_ucirepo presumably contacts the UCI repository on every run —
# confirm, and consider caching if the notebook must re-run offline.
heart_disease = fetch_ucirepo(id=45)
X = heart_disease.data.features  # feature table; later cells read its .columns
y = heart_disease.data.targets  # target table; later cells read its .columns

In [85]:
# Show the feature and target column labels as a (features, targets) tuple.
X.columns, y.columns

(Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
        'exang', 'oldpeak', 'slope', 'ca', 'thal'],
       dtype='object'),
 Index(['num'], dtype='object'))

In [86]:
# Build working copies of the features and the target with an explicit schema.
# Selecting with .loc raises KeyError when an expected column is absent, whereas
# pd.DataFrame(X, columns=[...]) would silently add it as an all-NaN column.
# .copy() keeps later changes to X_df / y_df from reaching the fetched X / y.
X_df = X.loc[:, [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
    'exang', 'oldpeak', 'slope', 'ca', 'thal'
]].copy()
y_df = y.loc[:, ['num']].copy()

In [87]:
# Display the feature frame (the rendered output elides the middle rows).
X_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45,1,1,110,264,0,0,132,0,1.2,2,0.0,7.0
299,68,1,4,144,193,1,0,141,0,3.4,2,2.0,7.0
300,57,1,4,130,131,0,0,115,1,1.2,2,1.0,7.0
301,57,0,2,130,236,0,2,174,0,0.0,2,1.0,3.0


In [110]:
# Display the target frame; its only column is 'num'.
y_df

Unnamed: 0,num
0,0
1,2
2,1
3,0
4,0
...,...
298,1
299,2
300,3
301,1


# Data Validation

In [100]:
# Syntactic Validation
def syntactic_validation(df):
    """Check that every expected feature column exists and has the expected dtype.

    One warning is printed per missing or mistyped column. The confirmation
    message is printed only when no warning was printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to check against the expected column/dtype mapping below.

    Returns
    -------
    None
        Findings are reported on stdout only.
    """
    expected_types = {
        'age': 'int64',
        'sex': 'int64',
        'cp': 'int64',
        'trestbps': 'int64',
        'chol': 'int64',
        'fbs': 'int64',
        'restecg': 'int64',
        'thalach': 'int64',
        'exang': 'int64',
        'oldpeak': 'float64',
        'slope': 'int64',
        'ca': 'float64',
        'thal': 'float64'
    }

    all_ok = True
    for col, expected_type in expected_types.items():
        # Report an absent column instead of failing with KeyError on df[col].
        if col not in df.columns:
            print(f"Warning: Column '{col}' is missing.")
            all_ok = False
            continue

        actual_type = df[col].dtype
        if actual_type != expected_type:
            print(f"Warning: Column '{col}' should be of type {expected_type}, but found {actual_type}.")
            all_ok = False

    # An explicit flag is used rather than for/else: a loop's else clause runs
    # whenever the loop finishes without `break`, so it would print the success
    # message even after warnings.
    if all_ok:
        print("All Columns type is expected type.")

  

In [101]:
# Compare each feature column's dtype with the expected schema.
syntactic_validation(X_df)

All Columns type is expected type.


In [102]:
def semantic_validation(df):
    """Print a warning for each feature column holding out-of-domain values.

    The range rules are evaluated one column at a time, in the order listed,
    and each prints a single message when at least one value violates it.
    NaN never satisfies a comparison, so missing values do not trigger the
    range rules. The 'thal' rule is a membership test instead: every row whose
    value is not 3, 6 or 7 (NaN included) is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns age, trestbps, chol, thalach, oldpeak, ca,
        slope and thal.

    Returns
    -------
    None
        Findings are reported on stdout only.
    """
    # (column, predicate marking violating values, message)
    range_rules = (
        ('age', lambda s: s < 0,
         "Warning: Age cannot be negative."),
        ('trestbps', lambda s: s <= 0,
         "Warning: Resting blood pressure (trestbps) should be positive."),
        ('chol', lambda s: s <= 0,
         "Warning: Cholesterol level (chol) should be positive."),
        ('thalach', lambda s: s <= 0,
         "Warning: Maximum heart rate (thalach) should be positive."),
        ('oldpeak', lambda s: s < 0,
         "Warning: Oldpeak (depression of ST segment) should not be negative."),
        ('ca', lambda s: (s < 0) | (s > 3),
         "Warning: CA should be between 0 and 3."),
        ('slope', lambda s: (s < 1) | (s > 3),
         "Warning: Slope should be between 1 and 3."),
    )
    for column, violates, message in range_rules:
        if violates(df[column]).any():
            print(message)

    thal_is_allowed = df['thal'].isin([3, 6, 7])
    thal_offenders = df[~thal_is_allowed]
    if thal_offenders.empty:
        return
    print("Warning: Thal should be either 3, 6, or 7. Invalid rows:")
    print(thal_offenders)


In [103]:
# Run the range and allowed-value checks on the features.
# Rows with a missing 'thal' are listed because NaN is not one of 3, 6, 7.
semantic_validation(X_df)

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
87    53    0   3       128   216    0        2      115      0      0.0   
266   52    1   4       128   204    1        0      156      1      1.0   

     slope   ca  thal  
87       1  0.0   NaN  
266      2  0.0   NaN  


In [104]:
def cross_field_validation_with_details(df):
    """Print rows whose combination of values across two columns looks implausible.

    Two rules are applied in order; for each rule that matches at least one
    row, a warning line is printed followed by the matching rows:

    1. fbs == 1 together with chol < 120
    2. thalach < 100 together with age > 60

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns fbs, chol, thalach and age.

    Returns
    -------
    None
        Findings are reported on stdout only.
    """
    # (function building the boolean row mask, warning printed above the rows)
    rules = (
        (
            lambda frame: (frame['fbs'] == 1) & (frame['chol'] < 120),
            "Warning: Rows where fasting blood sugar (fbs = 1) corresponds to cholesterol levels < 120:",
        ),
        (
            lambda frame: (frame['thalach'] < 100) & (frame['age'] > 60),
            "Warning: Rows where maximum heart rate (thalach < 100) is unexpectedly low for patients older than 60:",
        ),
    )
    for build_mask, warning in rules:
        flagged_rows = df[build_mask(df)]
        if flagged_rows.empty:
            continue
        print(warning)
        print(flagged_rows)


In [105]:
# Run the cross-field checks; the rows in the output have thalach < 100 with age > 60.
cross_field_validation_with_details(X_df)

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
72    62    1   4       120   267    0        0       99      1      1.8   
114   62    0   3       130   263    0        0       97      0      1.2   
154   64    1   4       120   246    0        2       96      1      2.2   
245   67    1   4       120   237    0        0       71      0      1.0   

     slope   ca  thal  
72       2  2.0   7.0  
114      2  1.0   7.0  
154      3  1.0   3.0  
245      2  0.0   3.0  


In [106]:
def data_consistency(df):
    """Report missing values: a per-column count and the rows that contain them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan for nulls.

    Returns
    -------
    None
        Findings are reported on stdout only.
    """
    null_mask = df.isnull()
    nulls_per_column = null_mask.sum()

    # Nothing missing anywhere: confirm and stop.
    if not nulls_per_column.any():
        print("No missing values found.")
        return

    print("Warning: Missing values found in the following columns:")
    print(nulls_per_column[nulls_per_column > 0])
    print("\nRows with missing values:")
    print(df[null_mask.any(axis=1)])


In [107]:
# Report the missing-value count per column and the rows that contain them.
data_consistency(X_df)

ca      4
thal    2
dtype: int64

Rows with missing values:
     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
87    53    0   3       128   216    0        2      115      0      0.0   
166   52    1   3       138   223    0        0      169      0      0.0   
192   43    1   4       132   247    1        2      143      1      0.1   
266   52    1   4       128   204    1        0      156      1      1.0   
287   58    1   2       125   220    0        0      144      0      0.4   
302   38    1   3       138   175    0        0      173      0      0.0   

     slope   ca  thal  
87       1  0.0   NaN  
166      1  NaN   3.0  
192      2  NaN   7.0  
266      2  0.0   NaN  
287      2  NaN   7.0  
302      1  NaN   3.0  


In [108]:
def data_compatibility(df):
    """Run three compatibility checks on a frame and print the outcome of each.

    1. Column names: every label must be a string that is a valid Python
       identifier (no spaces or special characters). Non-string labels are
       reported as problematic rather than raising AttributeError.
    2. Dtypes: columns of dtype object or category are reported as unsupported.
    3. Mixed value types: a column is reported when its elements have more
       than one Python type. The confirmation message for this check is
       printed only when no column was reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to check.

    Returns
    -------
    None
        Findings are reported on stdout only.
    """
    # Check for column names with spaces or special characters
    problematic_columns = [
        col for col in df.columns
        if not (isinstance(col, str) and col.isidentifier())
    ]
    if problematic_columns:
        print("Warning: Some column names may not be compatible with certain systems (e.g., spaces or special characters in column names).")
        print(f"Problematic column names: {problematic_columns}")
    else:
        print("All column names is compatible.")

    # Check for unsupported data types
    unsupported_types = df.select_dtypes(include=['object', 'category']).columns
    if len(unsupported_types) > 0:
        print("Warning: Some columns have unsupported data types for compatibility with certain systems.")
        print(f"Columns with unsupported types: {list(unsupported_types)}")
    else:
        print("All columns have supported data types for compatibility with systems.")

    # Check for mixed data types in columns. An explicit flag is used rather
    # than for/else: a loop's else clause runs whenever the loop finishes
    # without `break`, so it would print the confirmation even after warnings.
    mixed_found = False
    for col in df.columns:
        value_types = df[col].map(type)  # computed once, reused for the report
        if value_types.nunique() > 1:
            print(f"Warning: Column '{col}' contains mixed data types.")
            print(value_types.value_counts())
            mixed_found = True
    if not mixed_found:
        print("Columns doesn't contains mixed data types.")


In [109]:
# Check column names, dtypes and per-column value types of the features.
data_compatibility(X_df)

All column names is compatible.
All columns have supported data types for compatibility with systems.
Columns doesn't contains mixed data types.
