# Data preprocessing

This notebook preprocesses the four data tables we have chosen. To check data quality and missing values, we used Sweetviz; no missing values were reported :) We split each dataset in a 3:1 ratio (train:test) and then use the `preprocess_dataframes` function to min-max scale the numeric features and one-hot encode the categorical ones, fitting both transformations on the training split only.

In [1]:
import numpy as np
import pandas as pd

#import sweetviz as sv #- only to check data quality
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
def preprocess_dataframes(train_df, test_df, target_column):
    """Min-max scale numeric features and one-hot encode categorical features.

    Both transformers are fitted on ``train_df`` only and then applied to
    ``train_df`` and ``test_df``, so nothing from the test split is used for
    fitting.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training split, including ``target_column``.
    test_df : pandas.DataFrame
        Test split; must contain the same feature columns as ``train_df``.
    target_column : str
        Name of the label column. It is copied to the output unchanged.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_processed, test_processed)``. Columns are ordered as: scaled
        numeric columns, one-hot columns, ``target_column``. Each frame keeps
        the row index of its input split.

    Notes
    -----
    Only ``int64``/``float64`` columns are scaled and only
    ``object``/``category`` columns are encoded; a feature column of any other
    dtype (e.g. ``bool``) is not part of the output.
    Categories seen only in ``test_df`` are encoded as all zeros
    (``handle_unknown='ignore'``).
    """
    # Feature columns by dtype, with the label column excluded from both groups.
    numeric_cols = [
        col
        for col in train_df.select_dtypes(include=['int64', 'float64']).columns
        if col != target_column
    ]
    categorical_cols = [
        col
        for col in train_df.select_dtypes(include=['object', 'category']).columns
        if col != target_column
    ]

    train_parts, test_parts = [], []

    # MinMaxScaler refuses to fit a frame with zero columns, so only scale
    # when there is at least one numeric feature.
    if numeric_cols:
        scaler = MinMaxScaler()
        scaler.fit(train_df[numeric_cols])
        for parts, df in ((train_parts, train_df), (test_parts, test_df)):
            parts.append(
                pd.DataFrame(
                    scaler.transform(df[numeric_cols]),
                    columns=numeric_cols,
                    index=df.index,
                )
            )

    # Same guard for the encoder: all-numeric datasets skip it entirely instead
    # of relying on the encoder accepting a zero-column frame.
    if categorical_cols:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        encoder.fit(train_df[categorical_cols])
        encoded_cols = encoder.get_feature_names_out(categorical_cols)
        for parts, df in ((train_parts, train_df), (test_parts, test_df)):
            parts.append(
                pd.DataFrame(
                    encoder.transform(df[categorical_cols]),
                    columns=encoded_cols,
                    index=df.index,
                )
            )

    # The label column goes last and is passed through untouched.
    train_parts.append(train_df[[target_column]])
    test_parts.append(test_df[[target_column]])

    train_processed = pd.concat(train_parts, axis=1)
    test_processed = pd.concat(test_parts, axis=1)

    return train_processed, test_processed

## Heart Attack dataframe

Target variable -> class (encoded below: negative -> 0, positive -> 1)

In [3]:
# Load the raw heart-attack table and preview its first five rows.
heart_df = pd.read_csv('Data/Heart-Attack.csv')
heart_df.head()

Unnamed: 0,age,gender,impluse,pressurehight,pressurelow,glucose,kcm,troponin,class
0,64,1,66,160,83,160.0,1.8,0.012,negative
1,21,1,94,98,46,296.0,6.75,1.06,positive
2,55,1,64,160,77,270.0,1.99,0.003,negative
3,64,1,70,120,55,270.0,13.87,0.122,positive
4,55,1,64,112,65,300.0,1.08,0.003,negative


In [4]:
# Encode the label as an integer: negative -> 0, positive -> 1.
# Series.map turns every value that is missing from the dict into NaN (this
# includes already-encoded 0/1 labels when the cell is run a second time), so
# the result is validated before it replaces the column.
heart_labels = heart_df['class'].map({'negative': 0, 'positive': 1})
assert heart_labels.notna().all(), (
    "heart_df['class'] holds values other than 'negative'/'positive' "
    "(was this cell already run?)"
)
heart_df['class'] = heart_labels

In [5]:
# Hold out 25% of the rows as the test split. Stratifying on the label keeps
# the class proportions about the same in both splits; the fixed seed makes
# the split reproducible.
heart_train_df, heart_test_df = train_test_split(
    heart_df,
    stratify=heart_df['class'],
    test_size=0.25,
    random_state=42,
)

In [6]:
# Remove the troponin feature from both splits.
heart_train_df = heart_train_df.drop(columns=["troponin"])
heart_test_df = heart_test_df.drop(columns=["troponin"])

In [7]:
# Scale/encode the features; the transformers are fitted on the training split.
heart_train_df, heart_test_df = preprocess_dataframes(
    train_df=heart_train_df,
    test_df=heart_test_df,
    target_column='class',
)

In [9]:
# Save the processed splits as CSV; index=False keeps the row index out of the files.
heart_train_df.to_csv('Data/heart_train.csv', index=False)
heart_test_df.to_csv('Data/heart_test.csv', index=False)

In [9]:
# Number of rows in the full (unsplit) heart table.
print(heart_df.shape[0])

1319


## Diabetes dataframe

Target variable -> class (the `Outcome` column, renamed below)

In [10]:
# Load the raw Pima diabetes table and preview its first five rows.
# NOTE(review): the preview shows 0 for SkinThickness and Insulin in several
# rows — these may be placeholders for missing measurements rather than real
# values; confirm before relying on the "no missing values" conclusion.
diabetes_df = pd.read_csv('Data/pima.csv')
diabetes_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [11]:
#raport_2 = sv.analyze(diabetes_df)
#raport_2.show_html("raport_2_sweetviz.html")

In [12]:
# Give the label column the common name 'class' used for every dataset.
# Reassigning instead of inplace=True leaves the previously loaded frame
# object untouched; re-running the cell is a no-op.
diabetes_df = diabetes_df.rename(columns={'Outcome': 'class'})

In [13]:
# Hold out 25% of the rows as the test split, stratified on the label and
# reproducible through the fixed seed.
diabetes_train_df, diabetes_test_df = train_test_split(
    diabetes_df,
    stratify=diabetes_df['class'],
    test_size=0.25,
    random_state=42,
)

In [14]:
# Scale/encode the features; the transformers are fitted on the training split.
diabetes_train_df, diabetes_test_df = preprocess_dataframes(
    train_df=diabetes_train_df,
    test_df=diabetes_test_df,
    target_column='class',
)

In [15]:
# Save the processed splits as CSV; index=False keeps the row index out of the files.
diabetes_train_df.to_csv('Data/diabetes_train.csv', index=False)
diabetes_test_df.to_csv('Data/diabetes_test.csv', index=False)

In [15]:
# Number of rows in the full (unsplit) diabetes table.
print(diabetes_df.shape[0])

768


## Cancer dataframe

Target variable -> class (the `Status` column, renamed and encoded below: Dead -> 0, Alive -> 1)

In [16]:
# Load the raw breast-cancer table and preview its first five rows.
cancer_df = pd.read_csv('Data/Breast_Cancer.csv')
cancer_df.head()

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18,Positive,Positive,2,1,84,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41,Positive,Positive,3,1,50,Alive


In [17]:
# raport_3 = sv.analyze(cancer_df)
# raport_3.show_html("raport_3_sweetviz.html")

In [18]:
# Give the label column the common name 'class' (reassigned rather than
# renamed in place), then encode it as an integer: Dead -> 0, Alive -> 1.
cancer_df = cancer_df.rename(columns={'Status': 'class'})
# Series.map turns every value that is missing from the dict into NaN (this
# includes already-encoded 0/1 labels when the cell is run a second time), so
# the result is validated before it replaces the column.
cancer_labels = cancer_df['class'].map({'Dead': 0, 'Alive': 1})
assert cancer_labels.notna().all(), (
    "cancer_df['class'] holds values other than 'Dead'/'Alive' "
    "(was this cell already run?)"
)
cancer_df['class'] = cancer_labels

In [19]:
# Hold out 25% of the rows as the test split, stratified on the label and
# reproducible through the fixed seed.
cancer_train_df, cancer_test_df = train_test_split(
    cancer_df,
    stratify=cancer_df['class'],
    test_size=0.25,
    random_state=42,
)

In [20]:
# Scale/encode the features; the transformers are fitted on the training split.
cancer_train_df, cancer_test_df = preprocess_dataframes(
    train_df=cancer_train_df,
    test_df=cancer_test_df,
    target_column='class',
)

In [21]:
# Save the processed splits as CSV; index=False keeps the row index out of the files.
cancer_train_df.to_csv('Data/cancer_train.csv', index=False)
cancer_test_df.to_csv('Data/cancer_test.csv', index=False)

In [21]:
# Number of rows in the full (unsplit) cancer table.
print(cancer_df.shape[0])

4024


## Alzheimer dataframe

Target variable -> class (the `Diagnosis` column, renamed below)

In [22]:
# Load the raw Alzheimer's table and preview its first five rows.
alzheimer_df = pd.read_csv('Data/alzheimers_disease_data.csv')
alzheimer_df.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid


In [23]:
#raport_4 = sv.analyze(alzheimer_df)
#raport_4.show_html("raport_4_sweetviz.html")

In [24]:
# Give the label column the common name 'class' used for every dataset.
# Reassigning instead of inplace=True leaves the previously loaded frame
# object untouched; re-running the cell is a no-op.
alzheimer_df = alzheimer_df.rename(columns={'Diagnosis': 'class'})

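#Drop FunctionalAssessment because the results with it were too good to be realistic (possible target leakage). PatientID and DoctorInCharge are dropped as well; they look like identifier/placeholder columns.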

In [25]:
# Remove the three columns that are not used as features.
alzheimer_df = alzheimer_df.drop(
    columns=["FunctionalAssessment", "PatientID", "DoctorInCharge"]
)

In [26]:
# Hold out 25% of the rows as the test split, stratified on the label and
# reproducible through the fixed seed.
alzheimer_train_df, alzheimer_test_df = train_test_split(
    alzheimer_df,
    stratify=alzheimer_df['class'],
    test_size=0.25,
    random_state=42,
)

In [27]:
# Scale/encode the features; the transformers are fitted on the training split.
alzheimer_train_df, alzheimer_test_df = preprocess_dataframes(
    train_df=alzheimer_train_df,
    test_df=alzheimer_test_df,
    target_column='class',
)

In [28]:
# Save the processed splits as CSV; index=False keeps the row index out of the files.
alzheimer_train_df.to_csv('Data/alzheimer_train.csv', index=False)
alzheimer_test_df.to_csv('Data/alzheimer_test.csv', index=False)

In [29]:
# Number of rows in the full (unsplit) Alzheimer's table.
print(alzheimer_df.shape[0])

2149
