### All of our necessary imports go here, with optional aliases.

In [76]:
import pandas as pd

### Local variables can be defined here.

In [77]:
path_to_data: str = '../data/raw/diabetes.csv'

### Functions defined here.

In [78]:
def get_data(path_to_data):
    """Docstring"""
    
    df = pd.read_csv(path_to_data)
    return df

In [79]:
df = get_data(path_to_data)

At this point, we can begin cleaning, EDA, and feature engineering. Let's look for:
### Cleaning 
1. Missing values, zeroes, and NaNs.
2. Outliers or values outside of 3 sigma.
3. Type errors, encoding errors, etc.
4. Duplicates, if any.

### EDA
1. Correlational matrix (which features best predict the label?).
2. Is there skew or kurtosis in the feature distribution?
3. Understanding features holistically (what does 'skin thickness' or 'pedigree' mean?).

### Feature Engineering
1. Imputation of missing values, zeroes, NaNs.   
    a. Median method.  
    b. KNN cluster analysis method.
2. Creation of new, derivative features.
    a. Branch the dataset at this point so that one can be used for an ANN and the other for ensemble.
3. Scaling, capping, etc.

In [80]:
def zeroes_exist(dataframe) -> bool:
    # Check if zeroes in either flt or int form exist. Prints type of zero and returns boolean.
    if 0 in dataframe.values or 0.0 in dataframe.values:
        if 0 in dataframe.values:
            print("Int-type 0 found in dataset")
            return True
        else:
            print("Float-type 0.0 found in dataset")
            return True
    else:
        return False

In [81]:
def NaNs_exist(dataframe) -> bool:
    return dataframe.isnull().values.any()

In [82]:
def count_zeroes(dataframe, column_header) -> int:
    # Takes in a dataframe and a column name and returns number of zeroes (float and int)
    if dataframe[column_header].isnull().values.any(): # If it has no zeroes
        return 0
    else:
        count = dataframe[column_header].value_counts()[0]
        return count
        
        

In [83]:
def summarize_zeroes(dataframe) -> dict:
    # Takes in a dataframe and returns a dict that maps feature names to number of missing values.
    zeroes_dict = {}
    for header in dataframe.columns:
        num_zeroes = count_zeroes(dataframe, header)
        zeroes_dict[header] = num_zeroes
    return zeroes_dict

In [84]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [85]:
count_zeroes(df,'Pregnancies')

111

In [86]:
count_zeroes(df,'Glucose')

5

In [87]:
count_zeroes(df,'BloodPressure')

35

In [88]:
count_zeroes(df, 'SkinThickness')

227

In [89]:
count_zeroes(df,'Insulin')

374

In [90]:
count_zeroes(df,'BMI')

11

In [91]:
count_zeroes(df, 'DiabetesPedigreeFunction')

KeyError: 0

In [93]:
count_zeroes(df, 'Age')

KeyError: 0

In [92]:
count_zeroes(df, 'Outcome')

500