#**Requirement**

In [None]:
# !pip3 install pandas_profiling --upgrade
# !pip3 install --upgrade pandas
# !pip install category_encoders
# !pip install eli5
# !pip install shap

In [None]:
#from google.colab import files
#uploaded = files.upload()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from scipy import stats
from scipy.stats import zscore, randint, uniform

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score, classification_report

from category_encoders import TargetEncoder

# **EDA**

Feature Details
Attribute Information:

Id(numerical) - id - Patient Id

Age(numerical) - age in years

Blood Pressure(numerical) - bp in mm/Hg

Specific Gravity(nominal) - sg - (1.005,1.010,1.015,1.020,1.025)

Albumin(nominal) - al - (0,1,2,3,4,5)

Sugar(nominal) - su - (0,1,2,3,4,5)

Red Blood Cells(nominal) - rbc - (normal,abnormal)

Pus Cell (nominal) - pc - (normal,abnormal)

Pus Cell clumps(nominal) - pcc - (present,notpresent)

Bacteria(nominal) - ba - (present,notpresent)

Blood Glucose Random(numerical) - bgr in mgs/dl

Blood Urea(numerical) - bu in mgs/dl

Serum Creatinine(numerical) - sc in mgs/dl

Sodium(numerical) - sod in mEq/L

Potassium(numerical) - pot in mEq/L

Hemoglobin(numerical) - hemo in gms

Packed Cell Volume(numerical) - pcv

White Blood Cell Count(numerical) - wc in cells/cumm

Red Blood Cell Count(numerical) - rc in millions/cmm

Hypertension(nominal) - htn - (yes,no)

Diabetes Mellitus(nominal) - dm - (yes,no)

Coronary Artery Disease(nominal) - cad - (yes,no)

Appetite(nominal) - appet - (good,poor)

Pedal Edema(nominal) - pe - (yes,no)

Anemia(nominal) - ane - (yes,no)

Class(nominal) - classification - (ckd,notckd)

In [None]:
# Name of the label column.
target = 'classification'

# Read the training and test CSV files from the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('kidney_disease_train.csv', 'kidney_disease_test.csv')
)

In [None]:
# Columns handled as categorical (nominal) features.
categoric_var = [
    'sg', 'al', 'su',
    'rbc', 'pc', 'pcc', 'ba',
    'htn', 'dm', 'cad',
    'appet', 'pe', 'ane',
]

# Columns handled as numeric features (includes the `id` column).
numeric_var = [
    'id', 'age', 'bp',
    'bgr', 'bu', 'sc',
    'sod', 'pot', 'hemo',
    'pcv', 'wc', 'rc',
]

In [None]:
# Preprocessing of the training / test data.
# 'wc' and 'rc' belong to numeric_var, so convert them to a numeric dtype;
# entries that cannot be parsed become NaN (errors='coerce').
# NOTE(review): only `train` is converted here - confirm whether `test`
# needs the same treatment.
for count_col in ('wc', 'rc'):
    train[count_col] = pd.to_numeric(train[count_col], errors='coerce')

In [None]:
# Remove stray '\t' (tab) characters from the yes/no flag columns.
# The previous cleanup patched three exact literals ("\tno" / "\tyes" in
# `dm`, "\tno" in `cad`); any other whitespace-polluted variant (e.g.
# "\tyes" in `cad`, or a leading/trailing space) would slip through and
# later fail to match the yes/no encoding dictionary. Stripping
# surrounding whitespace covers all of those cases at once. Missing
# values (NaN) are left untouched by `.str.strip()`.
for flag_col in ('dm', 'cad'):
    train[flag_col] = train[flag_col].str.strip()

In [None]:
# Handle malformed `pcv` values.
# Some cells carry a tab prefix (e.g. '\t43') or a '\t?' placeholder.
# The earlier version substituted the quoted text 'np.nan' for '\t?' and
# relied on errors='coerce' to turn that text into NaN, and it only knew
# about two specific bad literals. Here every value is converted to text,
# surrounding whitespace is stripped, and anything that still is not a
# number (including '?' and the text form of missing values) becomes NaN.
train['pcv'] = pd.to_numeric(
    train['pcv'].astype(str).str.strip(),
    errors='coerce',
)

In [None]:
# Missing-value imputation: numeric columns get their median,
# categorical columns get their most frequent value (first mode).
fill_values = {name: train[name].median() for name in numeric_var}
fill_values.update({name: train[name].mode()[0] for name in categoric_var})

# Each column is filled only with its own statistic, so computing the
# statistics up front gives the same result as computing them one by one.
for col, replacement in fill_values.items():
    train[col] = train[col].fillna(replacement)

In [None]:
# Lookup table used to encode the nominal variables as 0/1.
# Outer key: column name. Inner dict: raw label -> numeric code.
cat_nom_dict = {
    # normal / abnormal findings
    **{name: {"normal": 1, "abnormal": 0} for name in ("rbc", "pc")},
    # present / notpresent findings
    **{name: {"present": 1, "notpresent": 0} for name in ("pcc", "ba")},
    # yes / no flags
    **{name: {"yes": 1, "no": 0} for name in ("htn", "dm", "cad", "pe", "ane")},
    "appet": {"good": 1, "poor": 0},
    # label column
    "classification": {"ckd": 1, "notckd": 0},
}

In [None]:
# Outlier handling for the numeric columns: any value whose absolute
# z-score is 3 or more is overwritten with that column's median.
# (Only the numeric columns are processed in this cell.)
# The z-scores are computed on a copy taken before any replacement.
train_num_df = train[numeric_var].copy()
for col in numeric_var:
    # The built-in abs() works element-wise on the z-score result, so this
    # cell no longer depends on `np`, which the visible import cell does
    # not provide (the previous `np.abs` call raised NameError).
    is_outlier = abs(stats.zscore(train_num_df[col])) >= 3
    train.loc[is_outlier, col] = train[col].median()

In [None]:
# Assign numeric codes to the nominal variables using the per-column
# mapping in cat_nom_dict.
train = train.replace(cat_nom_dict)

In [None]:
# Drop `id` together with the features judged to be correlated with
# other features.
train = train.drop(columns=['id', 'hemo', 'pcv', 'htn'])

In [None]:
# Check whether any missing values remain: one row per column, with the
# counts of non-missing (False) and missing (True) entries.
train.isna().apply(lambda flags: flags.value_counts()).T

In [None]:
# Hold out 20% of the training data as a validation set, stratified on
# the label column; random_state is fixed so the split is reproducible.
train, val = train_test_split(
    train,
    test_size=0.2,
    stratify=train[target],
    random_state=2,
)

# Display the resulting frame sizes.
(train.shape, val.shape, test.shape)

In [None]:
# Every remaining column except the label is used as a model input.
features = train.columns.drop(target)

# Separate each frame into its feature matrix and label vector.
X_train, y_train = train[features], train[target]
X_val, y_val = val[features], val[target]

# Only the feature columns are selected from the test frame.
# NOTE(review): the cells visible here clean and encode only `train`;
# confirm that `test` receives equivalent preprocessing before it is
# used for prediction.
X_test = test[features]

In [None]:
# Show the relative frequency of each label value in the training labels.
y_train.value_counts(normalize=True)