In [3]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [4]:
dataframe = pd.read_csv('heart.csv')
df = dataframe.copy()
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [5]:
categorical_features = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
numeric_features = ['Age', 'RestingBP', 'FastingBS', 'Cholesterol', 'MaxHR', 'Oldpeak']

In [6]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

In [7]:
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']
X_processed = preprocessor.fit_transform(X)

In [8]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_processed, y)

In [9]:
additional_rows = 2000

if additional_rows > 0:
    smote = SMOTE(sampling_strategy={1: y.value_counts()[0] + additional_rows}, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_processed, y)

In [10]:
import numpy as np
def inverse_transform(preprocessor, X_transformed, original_df):
    # Get feature names after transformation
    num_features = numeric_features
    cat_features = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
    all_features = np.concatenate([num_features, cat_features])
    
    # Create DataFrame with transformed data
    df_transformed = pd.DataFrame(X_transformed, columns=all_features)
    
    # Inverse transform numerical features
    num_scaler = preprocessor.named_transformers_['num']
    df_transformed[num_features] = num_scaler.inverse_transform(df_transformed[num_features])
    
    # Inverse transform categorical features
    encoder = preprocessor.named_transformers_['cat']
    cat_columns = encoder.get_feature_names_out(categorical_features)
    cat_values = encoder.inverse_transform(df_transformed[cat_columns])
    
    # Create final DataFrame
    df_final = pd.DataFrame({
        'Age': df_transformed['Age'].round().astype(int),
        'Sex': cat_values[:, 0],
        'ChestPainType': cat_values[:, 1],
        'RestingBP': df_transformed['RestingBP'].round().astype(int),
        'Cholesterol': df_transformed['Cholesterol'].round().astype(int),
        'FastingBS': df_transformed['FastingBS'].round().astype(int).clip(0, 1),  # Ensure binary
        'RestingECG': cat_values[:, 2],
        'MaxHR': df_transformed['MaxHR'].round().astype(int),
        'ExerciseAngina': cat_values[:, 3],
        'Oldpeak': df_transformed['Oldpeak'].round(1),
        'ST_Slope': cat_values[:, 4],
        'HeartDisease': y_resampled
    })
    
    return df_final

# Get the expanded data in original format
expanded_df = inverse_transform(preprocessor, X_resampled, df)

In [11]:
expanded_df.shape

(2820, 12)

In [12]:
expanded_df.to_csv('heart_expanded.csv', index=False)

## -----------

In [13]:
from random import choice, randint, random

def corrupt_data(df):
    
    corrupted_df = df.copy()

    def random_age_format(age):
        formats = [
            lambda x: f"{x} years",
            lambda x: f"{x}",
            lambda x: str(x * 10),
            lambda x: f"{x} " + choice(["year", "yr", "y", "years"])
        ]
        return choice(formats)(age)

    sample_indices = corrupted_df.sample(frac=0.1, random_state=42).index
    corrupted_df.loc[sample_indices, 'Age'] = corrupted_df.loc[sample_indices, 'Age'].apply(random_age_format)
    
    # change data types
    corrupted_df['Age'] = corrupted_df['Age'].astype(str)
    corrupted_df['RestingBP'] = corrupted_df['RestingBP'].astype(str)
    
    duplicated_rows = corrupted_df.sample(n=50)
    corrupted_df = pd.concat([corrupted_df, duplicated_rows], ignore_index=True)
    
    sample_indices = corrupted_df.sample(frac=0.4, random_state=42).index
    corrupted_df.loc[sample_indices, 'Sex'] = corrupted_df.loc[sample_indices, 'Sex'].map({'M': 'Male', 'F': 'Female'})
    corrupted_df.loc[sample_indices, 'ExerciseAngina'] = corrupted_df.loc[sample_indices, 'ExerciseAngina'].map({'Y': 'Yes', 'N': 'No'})
    corrupted_df['ExerciseAngina'] = corrupted_df['ExerciseAngina'].map({'Y': 'Yes', 'N': 'No'})

    missing_values = ["null", "undefined", "##", "??", np.nan, "", "NaN"]
    
    for col in ['RestingBP', 'Cholesterol', 'MaxHR', 'Sex']:
        sample_indices = corrupted_df.sample(frac=0.05).index
        corrupted_df.loc[sample_indices, col] = [choice(missing_values) for _ in range(len(sample_indices))]
    
    return corrupted_df

corrupted_data = corrupt_data(expanded_df)


 '53 year' '60 year' '51 years' '51 years' '53' '62' '66 years' '37 years'
 '600' '60 years' '54 years' '48' '57 years' '54 years' '410' '61'
 '55 years' '490' '56 years' '59' '64 years' '54 years' '60 years' '660'
 '56 y' '59' '66 years' '58' '53 years' '57 years' '61' '58' '62 years'
 '38 years' '51' '69 years' '52 years' '50 years' '630' '62 years'
 '60 years' '53 y' '59 years' '68' '47' '62 years' '59 y' '54' '55 years'
 '470' '70 years' '57' '65 years' '470' '560' '590' '59' '50 y' '56 year'
 '73' '590' '58 years' '59' '60 y' '58 years' '710' '630' '60 yr' '50 y'
 '73 years' '580' '620' '56' '67' '51' '54 years' '61' '32' '69' '52'
 '55 years' '580' '42 y' '54' '380' '750' '560' '61' '59 years' '50' '640'
 '400' '62' '61 yr' '53' '55 years' '55 years' '61' '71 years' '560'
 '56 yr' '64 years' '50 years' '60 yr' '65 years' '630' '57 years'
 '44 years' '570' '66 y' '56' '53 years' '670' '58 years' '600' '550'
 '480' '610' '57 years' '52' '610' '50' '520' '510' '38' '60 years' '62'
 

In [14]:
corrupted_data.to_csv('heart_corrupted.csv', index=False)