# Modeling

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from pycaret.classification import *

# Single random seed for the whole notebook; it is also passed to PyCaret's setup() as session_id.
seed = 42
# Seed NumPy's global random generator so np.random-based steps are repeatable across runs.
np.random.seed(seed)

In [2]:
# Load the CSV dataset; the location is held in one named variable so it is easy to spot and change.
data_path = "/home/jovyan/work/_dataset/data_csv.csv"
df = pd.read_csv(data_path)

# Preview the first five rows as the cell output to sanity-check the load.
df.head(n=5)

Unnamed: 0,CASE_NO_PATIENT'S,A1,A2,A3,A4,A5,A6,A7,A8,A9,...,Global developmental delay/intellectual disability,Social/Behavioural Issues,Childhood Autism Rating Scale,Anxiety_disorder,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who_completed_the_test,ASD_traits
0,1,0,0,0,0,0,0,1,1,0,...,Yes,Yes,1,Yes,F,middle eastern,Yes,No,Family Member,No
1,2,1,1,0,0,0,1,1,0,0,...,Yes,Yes,2,Yes,M,White European,Yes,No,Family Member,Yes
2,3,1,0,0,0,0,0,1,1,0,...,Yes,Yes,4,Yes,M,Middle Eastern,Yes,No,Family Member,Yes
3,4,1,1,1,1,1,1,1,1,1,...,Yes,Yes,2,Yes,M,Hispanic,No,No,Family Member,Yes
4,5,1,1,0,1,1,1,1,1,1,...,Yes,Yes,1,Yes,F,White European,No,No,Family Member,Yes


### ID 컬럼 및 결측치 제거

In [3]:
# Remove the patient ID column: it is an identifier, not a feature for the model.
# The frame is rebound (no inplace=True) and errors="ignore" is used so this cell can be
# re-run safely; the in-place version raised KeyError on a second execution because the
# column had already been removed.
df = df.drop(columns=["CASE_NO_PATIENT'S"], errors="ignore")

In [4]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis="index", how="any")

In [5]:
# Per-column count of remaining missing values; every entry should be 0 after the row drop above.
df.isna().sum()

A1                                                    0
A2                                                    0
A3                                                    0
A4                                                    0
A5                                                    0
A6                                                    0
A7                                                    0
A8                                                    0
A9                                                    0
A10_Autism_Spectrum_Quotient                          0
Social_Responsiveness_Scale                           0
Age_Years                                             0
Qchat_10_Score                                        0
Speech Delay/Language Disorder                        0
Learning disorder                                     0
Genetic_Disorders                                     0
Depression                                            0
Global developmental delay/intellectual disabili

### 범주형 변수 수치형으로 변환

Ethnicity, Who_completed_the_test 변수 범주 통합

In [6]:
# Build the modelling frame as a copy so the cleaned `df` itself is not modified by later steps.
dataset = df.copy()

# Ethnicity labels are split by letter case in places (e.g. "Asian" vs "asian");
# upper-casing every label merges those variants into a single category.
new_ethnicity = [str.upper(label) for label in df['Ethnicity']]

# Who_completed_the_test is split the same way, so it is normalised the same way.
new_who_completed = [str.upper(label) for label in df['Who_completed_the_test']]

Ethnicity, Who_completed_the_test 변수 수치형으로 변환

In [7]:
# Convert the two case-normalised categorical columns into ordinal codes.
# Each column gets its own encoder: refitting a single shared encoder for the second column
# overwrote the fitted Ethnicity categories, so those codes could no longer be decoded
# (inverse_transform) or applied consistently to new data.
# NOTE: new_ethnicity / new_who_completed are rebound from label lists to code arrays here,
# so re-run the normalisation cell before re-running this one to refit on the labels.
ethnicity_encoder = OrdinalEncoder()
who_completed_encoder = OrdinalEncoder()

# Encode Ethnicity; the labels are reshaped to a single-column 2-D array for the encoder.
new_ethnicity = ethnicity_encoder.fit_transform(np.array(new_ethnicity).reshape(-1, 1))
dataset['Ethnicity'] = new_ethnicity

# Encode Who_completed_the_test the same way.
new_who_completed = who_completed_encoder.fit_transform(np.array(new_who_completed).reshape(-1, 1))
dataset['Who_completed_the_test'] = new_who_completed

# Backward compatibility: `encoder` used to end up fitted on Who_completed_the_test.
encoder = who_completed_encoder

Yes, No 변수 수치형으로 변환 & 성별 수치형으로 변환

In [8]:
# Columns treated as Yes/No answers (this list includes the ASD_traits target).
# NOTE(review): confirm Social_Responsiveness_Scale really holds Yes/No values; if it is
# already numeric, the replacement below leaves it unchanged.
yes_no_columns = [
    'Social_Responsiveness_Scale',
    'Speech Delay/Language Disorder',
    'Learning disorder',
    'Genetic_Disorders',
    'Depression',
    'Global developmental delay/intellectual disability',
    'Social/Behavioural Issues',
    'Anxiety_disorder',
    'Jaundice',
    'Family_mem_with_ASD',
    'ASD_traits',
]

# Yes -> 1.0, No -> 0.0.
# infer_objects() makes the object -> float conversion explicit instead of relying on
# replace() silently downcasting, which newer pandas versions deprecate; on older
# versions it is a no-op, so the resulting columns are the same.
yes_no_codes = {'Yes': 1.0, 'No': 0.0}
for column in yes_no_columns:
    dataset[column] = dataset[column].replace(yes_no_codes).infer_objects()

# Encode sex numerically: M -> 0.0, F -> 1.0.
dataset['Sex'] = dataset['Sex'].replace({'M': 0.0, 'F': 1.0}).infer_objects()

### 최종 데이터셋

In [9]:
# Preview the first five rows of the fully numeric modelling frame.
dataset.head(n=5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10_Autism_Spectrum_Quotient,...,Global developmental delay/intellectual disability,Social/Behavioural Issues,Childhood Autism Rating Scale,Anxiety_disorder,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who_completed_the_test,ASD_traits
0,0,0,0,0,0,0,1,1,0,1,...,1.0,1.0,1,1.0,1.0,4.0,1.0,0.0,0.0,0.0
1,1,1,0,0,0,1,1,0,0,0,...,1.0,1.0,2,1.0,0.0,10.0,1.0,0.0,0.0,1.0
2,1,0,0,0,0,0,1,1,0,1,...,1.0,1.0,4,1.0,0.0,4.0,1.0,0.0,0.0,1.0
3,1,1,1,1,1,1,1,1,1,1,...,1.0,1.0,2,1.0,0.0,2.0,0.0,0.0,0.0,1.0
4,1,1,0,1,1,1,1,1,1,1,...,1.0,1.0,1,1.0,1.0,10.0,0.0,0.0,0.0,1.0


### 모델링

In [10]:
# Initialise the PyCaret classification experiment on the encoded frame.
# Ethnicity and Who_completed_the_test are passed as ordinal codes, so they are handled
# as numeric features here rather than as categoricals.
clf = setup(
    data=dataset,
    target='ASD_traits',   # binary label encoded above as 1.0 / 0.0
    train_size=0.7,        # 70 % of the rows for training, the rest held out
    session_id=seed,       # reuse the notebook-wide seed for reproducibility
)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,ASD_traits
2,Target type,Binary
3,Original data shape,"(1923, 27)"
4,Transformed data shape,"(1923, 27)"
5,Transformed train set shape,"(1346, 27)"
6,Transformed test set shape,"(577, 27)"
7,Numeric features,26
8,Preprocess,True
9,Imputation type,simple


In [12]:
# Cross-validate PyCaret's candidate classifiers with 10 folds, rank them by accuracy
# and keep the three best models.
# NOTE(review): the top models score ~1.0 on every metric; scores this high often mean a
# feature is (nearly) a direct function of the target -- verify that the score/scale
# columns do not leak ASD_traits before trusting these results.
best3models = compare_models(
    n_select=3,
    sort='Accuracy',
    fold=10,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.044
gbc,Gradient Boosting Classifier,0.9993,0.9999,1.0,0.9986,0.9993,0.9985,0.9985,0.031
lightgbm,Light Gradient Boosting Machine,0.9993,1.0,0.9986,1.0,0.9993,0.9985,0.9985,0.585
xgboost,Extreme Gradient Boosting,0.9985,1.0,0.9986,0.9986,0.9986,0.997,0.997,0.011
dt,Decision Tree Classifier,0.9941,0.9941,0.9944,0.9944,0.9944,0.9881,0.9881,0.005
et,Extra Trees Classifier,0.9918,0.9999,0.9944,0.9903,0.9923,0.9836,0.9837,0.038
ada,Ada Boost Classifier,0.9881,0.9974,0.9874,0.9903,0.9888,0.9762,0.9763,0.019
lr,Logistic Regression,0.9666,0.9819,0.9635,0.9737,0.9683,0.9329,0.9336,0.011
knn,K Neighbors Classifier,0.9472,0.9841,0.9481,0.9527,0.95,0.8942,0.895,0.008
ridge,Ridge Classifier,0.9079,0.0,0.8726,0.95,0.9091,0.8163,0.8201,0.006
