In [None]:
import os
import pandas as pd
import numpy as np
import hds
from plt_rcs import *

In [None]:
# Read the CSV behind the shortened URL into a DataFrame.
df = pd.read_csv(filepath_or_buffer='https://bit.ly/PimaIndiansDiabetes')

In [None]:
# Preview the first five rows of the raw frame.
df.head(n=5)

In [None]:
# Print the frame summary (column dtypes and non-null counts).
df.info()

In [None]:
# Number of NaN entries in each column.
df.isna().sum(axis=0)

In [None]:
# Descriptive statistics, rounded to three decimals for display.
df.describe().round(decimals=3)

In [None]:
# Show the column labels as loaded.
df.columns

In [None]:
# Columns that are checked for zero values and later converted to NaN.
cols = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]

In [None]:
# Per-column count of entries equal to 0 among the selected columns.
df[cols].eq(0).sum()

## 데이터 전처리

In [None]:
# Turn every 0 in the selected columns into NaN.
# NOTE(review): presumably a 0 in these columns means "not measured" — confirm.
df[cols] = df[cols].replace(to_replace=0, value=np.nan)

# Per-column NaN counts after the replacement.
df[cols].isna().sum(axis=0)

In [None]:
# Show the column labels before renaming.
df.columns

In [None]:
# Shorten the long 'DiabetesPedigreeFunction' column label to 'Pedigree'.
df = df.rename(
    mapper={'DiabetesPedigreeFunction': 'Pedigree'},
    axis='columns',
)

In [None]:
# Preview the first five rows after the NaN replacement and the rename.
df.head(n=5)

## 회귀 대체

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Create the iterative imputer with its default settings (no arguments passed).
imputer = IterativeImputer()

In [None]:
# Fit the imputer on the whole frame and wrap the returned array back into a
# DataFrame with the original column labels.
# The original index is carried over explicitly so that boolean masks built
# from `df` (e.g. df['Insulin'].isna()) keep aligning with `df_imp` even if
# `df` does not have a default 0..n-1 index.
df_imp = pd.DataFrame(
    data=imputer.fit_transform(X=df),
    columns=df.columns,
    index=df.index,
)

In [None]:
# Preview the first five rows of the imputed frame.
df_imp.head(n=5)

In [None]:
# Per-column NaN counts in the imputed frame.
df_imp.isna().sum(axis=0)

In [None]:
# Round the selected columns of the imputed frame to one decimal place.
df_imp[cols] = df_imp[cols].round(decimals=1)

In [None]:
# Preview the imputed frame again after rounding.
df_imp.head(n=5)

In [None]:
# Descriptive statistics of the imputed frame, rounded to three decimals.
df_imp.describe().round(decimals=3)

In [None]:
# Insulin values in `df_imp` on the rows where `df` has NaN for Insulin,
# listed in ascending order.
(
    df_imp
    .loc[df['Insulin'].isna(), 'Insulin']
    .sort_values()
)

In [None]:
# Raise every Insulin value below 16 up to 16.
# NOTE(review): the floor value 16 is not explained in this notebook — document
# where it comes from.
df_imp['Insulin'] = df_imp['Insulin'].clip(lower=16)

In [None]:
# Descriptive statistics again, now that Insulin has been clipped.
df_imp.describe().round(decimals=3)

In [None]:
# Set the figure size used by the plots that follow to 4 x 4.
plt.rc('figure', figsize=(4, 4))

In [None]:
# Histogram of Insulin taken from `df`, using bins of width 50 over 0-850.
sns.histplot(
    data=df,
    x='Insulin',
    binwidth=50,
    binrange=(0, 850),
    fc='0.8',
)
plt.show()

In [None]:
# Filled density curve of Insulin taken from `df`.
sns.kdeplot(data=df, x='Insulin', color='0.8', fill=True)

# Vertical reference lines: mean (default style) and median (red, dashed).
# NOTE(review): the curve uses `df` while both lines use `df_imp` — confirm
# that mixing the two frames is intended.
plt.axvline(x=df_imp['Insulin'].mean())
plt.axvline(
    x=df_imp['Insulin'].median(),
    color='red',
    linestyle='--',
)
plt.show()

In [None]:
# Draw the correlation heatmap for `df` via the hds plotting helper.
# NOTE(review): this passes `df`, not the imputed `df_imp` — confirm intended.
hds.plot.corr_heatmap(data=df)

In [None]:
# All column labels of `df` except the last one, as a plain list.
var_names = df.columns[:-1].to_list()

In [None]:
# One regression-line plot of Insulin against each remaining variable;
# Insulin itself is skipped.
for var in var_names:
    if var != 'Insulin':
        hds.plot.regline(data=df_imp, x=var, y='Insulin')
        plt.show()

In [None]:
# Box plot of Insulin grouped by Outcome, drawn with the hds plotting helper.
hds.plot.box_group(
    data=df_imp,
    x='Outcome',
    y='Insulin',
    palette=['skyblue', 'orange'],
)

In [None]:
# Show the column labels of the imputed frame.
df_imp.columns

In [None]:
# Keep only rows with SkinThickness < 80 and BMI < 60.
cond1 = df_imp['SkinThickness'].lt(80)
cond2 = df_imp['BMI'].lt(60)

# Remove the Outcome column from the filtered rows.
# errors='ignore' keeps this cell re-runnable: on a second execution 'Outcome'
# has already been removed and a plain drop would raise KeyError.
df_imp = df_imp.loc[cond1 & cond2, :].drop(columns='Outcome', errors='ignore')

# Renumber the remaining rows from 0 and show the resulting shape.
df_imp = df_imp.reset_index(drop=True)
df_imp.shape

In [None]:
# Use Insulin as the target; every other remaining column is a feature.
yvar = 'Insulin'
y = df_imp[yvar].copy()
X = df_imp.drop(columns=yvar)

# Show the feature frame followed by the target series.
display(X, y)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for validation; the seed is fixed at 0.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Shape of the training feature set.
X_train.shape

In [None]:
# Shape of the validation feature set.
X_valid.shape

In [None]:
# Mean of the training target, for comparison with the validation target.
y_train.mean()

In [None]:
# Mean of the validation target, for comparison with the training target.
y_valid.mean()

In [None]:
# Show the current working directory.
os.getcwd()

In [None]:
# Switch the working directory to the data folder two levels up.
# The starting directory is recorded the first time this cell runs, so
# re-running the cell resolves '../../data' from the same place instead of
# climbing two more levels on every execution.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, '../../data'))

In [None]:
# Bundle the four split objects under their own names for saving.
objs = dict(
    X_train=X_train,
    X_valid=X_valid,
    y_train=y_train,
    y_valid=y_valid,
)

In [None]:
# Write the bundled splits to 'Diabetes.pkl' in the current working directory.
pd.to_pickle(objs, 'Diabetes.pkl')