### Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Section 1: Data Exploration and Column Consolidation

In [57]:
cancer_og = pd.read_csv(r"data/original-files/cancer-dataset.csv")
cancer_og.describe()

# clean the data, save to a new csv
cancer_og.columns = cancer_og.columns.str.lower()
cancer = cancer_og[sorted(cancer_og.columns)]

scaler = MinMaxScaler()
cancer["alcoholintake"] = scaler.fit_transform(cancer[["alcoholintake"]])
cancer["physicalactivity"] = scaler.fit_transform(cancer[["physicalactivity"]])
cancer["geneticrisk"] = (cancer["geneticrisk"] != 0).astype(int)

cancer.to_csv(r"data/cleaned-files/cleaned-cancer-dataset.csv", index=False)

cancer.head()

Unnamed: 0,age,alcoholintake,bmi,cancerhistory,diagnosis,gender,geneticrisk,physicalactivity,smoking
0,58,0.831746,16.085313,1,1,1,1,0.81502,0
1,71,0.705684,30.828784,0,0,0,1,0.936653,0
2,48,0.948104,38.785084,0,1,1,1,0.513678,0
3,34,0.40984,30.040296,0,0,0,0,0.95078,0
4,62,0.663598,35.479721,0,1,1,0,0.535866,0


In [63]:
diabetes_og = pd.read_csv(r"data/original-files/diabetes-dataset.csv")

diabetes_og.drop(['HbA1c_level', 'blood_glucose_level'], axis=1, inplace=True)

diabetes_og.rename(columns={diabetes_og.columns[4]: 'smoking'}, inplace=True)
diabetes_og['smoking'] = diabetes_og['smoking'].replace({
    'never': 0,
    'No Info': 0,
    'current': 1
})

diabetes = diabetes_og[sorted(diabetes_og.columns)]
diabetes.to_csv(r"data/cleaned-files/cleaned-diabetes-dataset.csv", index=False)

diabetes.head()

Unnamed: 0,age,bmi,diabetes,gender,heart_disease,hypertension,smoking
0,80.0,25.19,0,Female,1,0,0
1,54.0,27.32,0,Female,0,0,0
2,28.0,27.32,0,Male,0,0,0
3,36.0,23.45,0,Female,0,0,1
4,76.0,20.14,0,Male,1,1,1


In [92]:
heart_og = pd.read_csv(r"data/original-files/heart-dataset.csv")
drop_cols = ['cholesterol', 'Low HDL Cholesterol', 'High LDL Cholesterol','Stress Level','Sleep Hours', 'Sugar Consumption', 'Triglyceride Level',
       'Fasting Blood Sugar', 'CRP Level', 'Homocysteine Level', 'blood_pressure']
heart_og.drop(drop_cols, axis=1, inplace=True)
heart_og.dropna(inplace=True)
heart_og['activity'] = heart_og['activity'].replace({
    'High': 1,
    'Low': 0,
    'Medium': 0.5
})
heart_og['smoking'] = heart_og['smoking'].replace({
    'No': 0,
    'Yes': 1
})
heart_og['risk'] = heart_og['risk'].replace({
    'No': 0,
    'Yes': 1
})
heart_og['diabetes'] = heart_og['diabetes'].replace({
    'No': 0,
    'Yes': 1
})
heart_og['high_blood_pressure'] = heart_og['high_blood_pressure'].replace({
    'No': 0,
    'Yes': 1
})
heart_og['Alcohol Consumption'] = heart_og['Alcohol Consumption'].replace({
    'High': 1,
    'Low': 0,
    'Medium': 0.5
})
heart_og['Heart Disease Status'] = heart_og['Heart Disease Status'].replace({
    'No': 0,
    'Yes': 1
})


heart_og.columns = heart_og.columns.str.lower()
heart = heart_og[sorted(heart_og.columns)]
heart.head()
heart.to_csv(r"data/cleaned-files/cleaned-heart-dataset.csv", index=False)

In [57]:
stroke_og = pd.read_csv(r"data/original-files/stroke-dataset.csv")

stroke_og.drop(['id', ], axis=1, inplace=True)


stroke_og.describe()

#stroke_og.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,0.08,0.0,0.0,55.12,10.3,0.0
25%,25.0,0.0,0.0,77.245,23.5,0.0
50%,45.0,0.0,0.0,91.885,28.1,0.0
75%,61.0,0.0,0.0,114.09,33.1,0.0
max,82.0,1.0,1.0,271.74,97.6,1.0


In [None]:
liver_og = pd.read_csv(r"data/original-files/liver-dataset.csv")

# Normalize alcohol consumption, physical activity, genetic risk from 0 to 1
liver_og['alcohol'] = liver_og['alcohol'] / liver_og['alcohol'].max()
liver_og['activity'] = liver_og['activity'] / (liver_og['activity'].max()-liver_og['activity'].min)
liver_og['risk'] = liver_og['risk'] / liver_og['risk'].max()

liver_og.drop('liver_function_test', axis=1, inplace=True)

liver_og.describe()

liver_og.head()

Unnamed: 0,age,gender,bmi,alcohol,smoking,risk,activity,diabetes,hypertension,diagnosis
count,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0
mean,50.394118,0.504118,27.699801,0.492787,0.291765,0.260882,0.500351,0.142353,0.154706,0.550588
std,17.641915,0.50013,7.2104,0.28856,0.454708,0.333131,0.284851,0.349515,0.36173,0.497581
min,20.0,0.0,15.00471,0.000187,0.0,0.0,0.000185,0.0,0.0,0.0
25%,35.0,0.0,21.455414,0.242667,0.0,0.0,0.262344,0.0,0.0,0.0
50%,51.0,1.0,27.925367,0.492581,0.0,0.0,0.502541,0.0,0.0,1.0
75%,66.0,1.0,33.957668,0.745355,1.0,0.5,0.740537,0.0,0.0,1.0
max,80.0,1.0,39.992845,1.0,1.0,1.0,1.0,1.0,1.0,1.0
