### Imports

In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Section 1: Data Exploration and Column Consolidation

In [56]:
cancer_og = pd.read_csv(r"data/original-files/cancer-dataset.csv")
cancer_og.describe()

# clean the data, save to a new csv
cancer_og.columns = cancer_og.columns.str.lower()
cancer = cancer_og[sorted(cancer_og.columns)]

scaler = MinMaxScaler()
cancer["alcoholintake"] = scaler.fit_transform(cancer[["alcoholintake"]])
cancer["physicalactivity"] = scaler.fit_transform(cancer[["physicalactivity"]])
cancer["geneticrisk"] = (cancer["geneticrisk"] != 0).astype(int)

cancer.to_csv(r"data/cleaned-files/cleaned-cancer-dataset.csv", index=False)


In [20]:
diabetes_og = pd.read_csv(r"data/original-files/diabetes-dataset.csv")

diabetes_og.rename(columns={diabetes_og.columns[4]: 'smoking'}, inplace=True)
diabetes_og['smoking'] = diabetes_og['smoking'].replace({
    'never': 0,
    'No Info': 0,
    'current': 1
})

diabetes_og.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,0,25.19,6.6,140,0
1,Female,54.0,0,0,0,27.32,6.6,80,0
2,Male,28.0,0,0,0,27.32,5.7,158,0
3,Female,36.0,0,0,1,23.45,5.0,155,0
4,Male,76.0,1,1,1,20.14,4.8,155,0


In [None]:
heart_og = pd.read_csv(r"data/original-files/heart-dataset.csv")


heart_og.drop(heart_og.columns[['Chol']], axis=1, inplace=True)



heart_og.head()

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,Low HDL Cholesterol,High LDL Cholesterol,Alcohol Consumption,Stress Level,Sleep Hours,Sugar Consumption,Homocysteine Level,Heart Disease Status
0,56.0,Male,153.0,155.0,High,Yes,Yes,No,24.991591,Yes,Yes,No,High,Medium,7.633228,Medium,12.38725,No
1,69.0,Female,146.0,286.0,High,No,Yes,Yes,25.221799,No,Yes,No,Medium,High,8.744034,Medium,19.298875,No
2,46.0,Male,126.0,216.0,Low,No,No,No,29.855447,No,Yes,Yes,Low,Low,4.44044,Low,11.230926,No
3,32.0,Female,122.0,293.0,High,Yes,Yes,No,24.130477,Yes,No,Yes,Low,High,5.249405,High,5.961958,No
4,60.0,Male,166.0,242.0,Low,Yes,Yes,Yes,20.486289,Yes,No,No,Low,High,7.030971,High,8.153887,No


In [57]:
stroke_og = pd.read_csv(r"data/original-files/stroke-dataset.csv")

stroke_og.drop(['id', ], axis=1, inplace=True)


stroke_og.describe()

#stroke_og.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,0.08,0.0,0.0,55.12,10.3,0.0
25%,25.0,0.0,0.0,77.245,23.5,0.0
50%,45.0,0.0,0.0,91.885,28.1,0.0
75%,61.0,0.0,0.0,114.09,33.1,0.0
max,82.0,1.0,1.0,271.74,97.6,1.0


In [None]:
liver_og = pd.read_csv(r"data/original-files/liver-dataset.csv")

# Normalize alcohol consumption, physical activity, genetic risk from 0 to 1
liver_og['alcohol'] = liver_og['alcohol'] / liver_og['alcohol'].max()
liver_og['activity'] = liver_og['activity'] / (liver_og['activity'].max()-liver_og['activity'].min)
liver_og['risk'] = liver_og['risk'] / liver_og['risk'].max()

liver_og.drop('liver_function_test', axis=1, inplace=True)

liver_og.describe()

liver_og.head()

Unnamed: 0,age,gender,bmi,alcohol,smoking,risk,activity,diabetes,hypertension,diagnosis
count,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0
mean,50.394118,0.504118,27.699801,0.492787,0.291765,0.260882,0.500351,0.142353,0.154706,0.550588
std,17.641915,0.50013,7.2104,0.28856,0.454708,0.333131,0.284851,0.349515,0.36173,0.497581
min,20.0,0.0,15.00471,0.000187,0.0,0.0,0.000185,0.0,0.0,0.0
25%,35.0,0.0,21.455414,0.242667,0.0,0.0,0.262344,0.0,0.0,0.0
50%,51.0,1.0,27.925367,0.492581,0.0,0.0,0.502541,0.0,0.0,1.0
75%,66.0,1.0,33.957668,0.745355,1.0,0.5,0.740537,0.0,0.0,1.0
max,80.0,1.0,39.992845,1.0,1.0,1.0,1.0,1.0,1.0,1.0
