### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Section 1: Data Exploration and Column Consolidation

In [2]:
cancer_og = pd.read_csv(r"data/original-files/cancer-dataset.csv")
cancer_og.describe()

# clean the data, save to a new csv
cancer_og.columns = cancer_og.columns.str.lower()
cancer = cancer_og[sorted(cancer_og.columns)]

scaler = MinMaxScaler()
cancer["alcoholintake"] = scaler.fit_transform(cancer[["alcoholintake"]])
cancer["physicalactivity"] = scaler.fit_transform(cancer[["physicalactivity"]])
cancer["geneticrisk"] = (cancer["geneticrisk"] != 0).astype(int)

cancer.to_csv(r"data/cleaned-files/cleaned-cancer-dataset.csv", index=False)

cancer.head()

Unnamed: 0,age,alcoholintake,bmi,cancerhistory,diagnosis,gender,geneticrisk,physicalactivity,smoking
0,58,0.831746,16.085313,1,1,1,1,0.81502,0
1,71,0.705684,30.828784,0,0,0,1,0.936653,0
2,48,0.948104,38.785084,0,1,1,1,0.513678,0
3,34,0.40984,30.040296,0,0,0,0,0.95078,0
4,62,0.663598,35.479721,0,1,1,0,0.535866,0


In [3]:
diabetes_og = pd.read_csv(r"data/original-files/diabetes-dataset.csv")

diabetes_og.drop(['HbA1c_level', 'blood_glucose_level'], axis=1, inplace=True)

diabetes_og.rename(columns={diabetes_og.columns[4]: 'smoking'}, inplace=True)
diabetes_og['smoking'] = diabetes_og['smoking'].replace({
    'never': 0,
    'No Info': 0,
    'current': 1
})

diabetes = diabetes_og[sorted(diabetes_og.columns)]
diabetes.to_csv(r"data/cleaned-files/cleaned-diabetes-dataset.csv", index=False)

diabetes.head()

Unnamed: 0,age,bmi,diabetes,gender,heart_disease,hypertension,smoking
0,80.0,25.19,0,Female,1,0,0
1,54.0,27.32,0,Female,0,0,0
2,28.0,27.32,0,Male,0,0,0
3,36.0,23.45,0,Female,0,0,1
4,76.0,20.14,0,Male,1,1,1


In [None]:
heart_og = pd.read_csv(r"data/original-files/heart-dataset.csv")
drop_cols = ['Cholesterol', 'Low HDL Cholesterol', 'High LDL Cholesterol','Stress Level','Sleep Hours', 'Sugar Consumption', 'Triglyceride Level',
       'Fasting Blood Sugar', 'CRP Level', 'Homocysteine Level', 'Blood Pressure']
heart_og.drop(drop_cols, axis=1, inplace=True)
heart_og.dropna(inplace=True)
heart_og['activity'] = heart_og['activity'].replace({
    'High': 1,
    'Low': 0,
    'Medium': 0.5
})
heart_og['smoking'] = heart_og['smoking'].replace({
    'No': 0,
    'Yes': 1
})
heart_og['risk'] = heart_og['risk'].replace({
    'No': 0,
    'Yes': 1
})
heart_og['diabetes'] = heart_og['diabetes'].replace({
    'No': 0,
    'Yes': 1
})
heart_og['high_blood_pressure'] = heart_og['high_blood_pressure'].replace({
    'No': 0,
    'Yes': 1
})
heart_og['Alcohol Consumption'] = heart_og['Alcohol Consumption'].replace({
    'High': 1,
    'Low': 0,
    'Medium': 0.5
})
heart_og['Heart Disease Status'] = heart_og['Heart Disease Status'].replace({
    'No': 0,
    'Yes': 1
})


heart_og.columns = heart_og.columns.str.lower()
heart = heart_og[sorted(heart_og.columns)]

heart.to_csv(r"data/cleaned-files/cleaned-heart-dataset.csv", index=False)

heart.head()

KeyError: "['cholesterol', 'blood_pressure'] not found in axis"

In [5]:
stroke_og = pd.read_csv(r"data/original-files/stroke-dataset.csv")

# Save clean data to a new csv file
stroke_og.columns = stroke_og.columns.str.lower()
stroke = stroke_og[sorted(stroke_og.columns)]

# Drop irrelevant columns
stroke.drop(['id', 'ever_married', 'work_type', 'residence_type', 'avg_glucose_level'], axis=1, inplace=True)

# Normalize smoking column
stroke['smoking'] = stroke_og['smoking'].replace({
    'never smoked': 0,
    'formerly smoked': 1,
    'smokes': 1
})

# Drop rows with missing BMI values
stroke.dropna(inplace=True)

stroke.to_csv(r"data/cleaned-files/cleaned-stroke-dataset.csv", index=False)

stroke.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stroke.drop(['id', 'ever_married', 'work_type', 'residence_type', 'avg_glucose_level'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stroke['smoking'] = stroke_og['smoking'].replace({
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stroke.dropna(inplace=True)


Unnamed: 0,age,bmi,diagnosis,gender,heart_disease,hypertension,smoking
0,67.0,36.6,1,Male,1,0,1
2,80.0,32.5,1,Male,1,0,0
3,49.0,34.4,1,Female,0,0,1
4,79.0,24.0,1,Female,0,1,0
5,81.0,29.0,1,Male,0,0,1


In [6]:
liver_og = pd.read_csv(r"data/original-files/liver-dataset.csv")

# Save clean data to a new csv file
liver_og.columns = liver_og.columns.str.lower()
liver = liver_og[sorted(liver_og.columns)]

# Drop liver function test column
liver.drop('liver_function_test', axis=1, inplace=True)

# Normalize alcohol consumption, physical activity, and genetic risk from 0 to 1 using minmax scaler
scaler = MinMaxScaler()
liver["alcohol"] = scaler.fit_transform(liver[["alcohol"]])
liver["activity"] = scaler.fit_transform(liver[["activity"]])
liver["risk"] = scaler.fit_transform(liver[["risk"]])

liver.to_csv(r"data/cleaned-files/cleaned-liver-dataset.csv", index=False)

liver.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  liver.drop('liver_function_test', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  liver["alcohol"] = scaler.fit_transform(liver[["alcohol"]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  liver["activity"] = scaler.fit_transform(liver[["activity"]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[ro

Unnamed: 0,activity,age,alcohol,bmi,diabetes,diagnosis,gender,hypertension,risk,smoking
0,0.065754,58,0.865674,35.857584,0,1,0,0,0.5,0
1,0.166986,71,0.110159,30.73247,1,1,1,0,0.5,0
2,0.99333,48,0.927238,19.971407,0,0,0,0,0.0,0
3,0.563216,34,0.63308,16.615417,0,1,1,0,0.0,0
4,0.356682,62,0.054344,16.06583,1,1,1,0,0.5,0
