# SW중심대학 공동 AI 경진대회

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier

from xgboost import XGBRegressor

from sklearn import metrics

# 데이터 불러오기

In [3]:
pd.set_option('display.max_columns', 500) # 모든 컬럼 생략없이 출력
pd.set_option('display.max_row', 500) # row도 생략없이 다 출력|

In [4]:
train = pd.read_csv("/home/nyh/Dacon_SW_competition/SW_competition/train.csv")
train = train.drop(['index', 'country'], axis=1) # index 컬럼 삭제 무의미한 데이터

test = pd.read_csv("/home/nyh/Dacon_SW_competition/SW_competition/test.csv")
test = test.drop(['index', 'country'], axis=1)

submission = pd.read_csv("/home/nyh/Dacon_SW_competition/SW_competition/sample_submission.csv")
submission.head()

Unnamed: 0,index,nerdiness
0,0,-1
1,1,-1
2,2,-1
3,3,-1
4,4,-1


## 이상치 처리

In [8]:
def outliers_iqr(data):
    q1, q3 = np.percentile(data, [25, 75])
    # 넘파이의 값을 퍼센트로 표시해주는 함수

    iqr = q3 - q1
    lower_bound = q1 - (iqr * 1.5)
    upper_bound = q3 + (iqr * 1.5)
    
    return np.where((data > upper_bound) | (data < lower_bound))

introelapse_index_data = outliers_iqr(train['introelapse'])[0]
testelapse_index_data = outliers_iqr(train['testelapse'])[0]
surveyelapse_index_data = outliers_iqr(train['surveyelapse'])[0]
age_index_data = outliers_iqr(train['age'])[0]

train.loc[train['familysize'] > 5, 'familysize'] = np.nan  # 6명 이상의 가족 수 == 이상치 처리,
train.loc[introelapse_index_data, 'introelapse'] = np.nan
train.loc[testelapse_index_data, 'testelapse'] = np.nan
train.loc[surveyelapse_index_data, 'surveyelapse'] = np.nan
train.loc[age_index_data, 'age'] = np.nan

In [12]:
introelapse_index_data = outliers_iqr(test['introelapse'])[0]
testelapse_index_data = outliers_iqr(test['testelapse'])[0]
surveyelapse_index_data = outliers_iqr(test['surveyelapse'])[0]
age_index_data = outliers_iqr(test['age'])[0]
test.loc[test['familysize'] > 5, 'familysize'] = np.nan # 6명 이상의 가족 수 == 이상치 처리,
test['familysize'].value_counts()

test.loc[introelapse_index_data, 'introelapse'] = np.nan
test.loc[testelapse_index_data, 'testelapse'] = np.nan
test.loc[surveyelapse_index_data, 'surveyelapse'] = np.nan
test.loc[age_index_data, 'age'] = np.nan

In [15]:
train = train.round()
test = test.round()
train = train.astype('float')
test = test.astype('float')

In [23]:
import random
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import StratifiedKFold
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from tqdm import tqdm

random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
    
device

'cuda'

In [405]:
from sklearn.model_selection import StratifiedKFold


def kfold(model, train, scale = False):
    cv_accuracy = []
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    n_iter = 0
    
    for t, v in cv.split(train, train['nerdiness']):
        
        train_cv = train.iloc[t] # 훈련용
        val_cv = train.iloc[v] # 검증용 분리

        train_X = train_cv.drop('nerdiness', axis=1)
        train_y = train_cv['nerdiness']

        val_X = val_cv.drop('nerdiness', axis=1)
        val_y = val_cv['nerdiness']
            
        model.fit(train_X, train_y)
        fpr, tpr, thresholds = metrics.roc_curve(val_y, model.predict_proba(val_X)[:, 1])

        score = metrics.auc(fpr, tpr)
        
        cv_accuracy.append(score)
        n_iter += 1
    return np.mean(cv_accuracy)

## KFold를 이용한 모델 튜닝

In [60]:
for iterations in range(100, 10000, 100):

    models = [
        RandomForestClassifier(n_estimators = iterations),
    ]

    for model in models:
        print(f'{iterations}th {type(model).__name__} score: {kfold(model, train)}')


100th RandomForestClassifier score: 0.8731901526492051
200th RandomForestClassifier score: 0.8759534681149239
300th RandomForestClassifier score: 0.8767455295496752
400th RandomForestClassifier score: 0.8771522602707892
500th RandomForestClassifier score: 0.8774613764147032
600th RandomForestClassifier score: 0.8777604509960554
700th RandomForestClassifier score: 0.8777884492014041
800th RandomForestClassifier score: 0.8774201547800395
900th RandomForestClassifier score: 0.8778871331740813
1000th RandomForestClassifier score: 0.878479421713242
1100th RandomForestClassifier score: 0.8777875582709829
1200th RandomForestClassifier score: 0.8781904320957914
1300th RandomForestClassifier score: 0.878286111663687
1400th RandomForestClassifier score: 0.8780406788568161
1500th RandomForestClassifier score: 0.8780411953226667
1600th RandomForestClassifier score: 0.8778909452026546
1700th RandomForestClassifier score: 0.8781424532946408
1800th RandomForestClassifier score: 0.8780589536404406
190

In [None]:
for iterations in range(100, 10000, 100):

    models = [
        ExtraTreesClassifier(n_estimators = iterations),
    ]

    for model in models:
        print(f'{iterations}th {type(model).__name__} score: {kfold(model, train)}')


100th ExtraTreesClassifier score: 0.8770176092245621
200th ExtraTreesClassifier score: 0.8791914445604091
300th ExtraTreesClassifier score: 0.8795352825618444
400th ExtraTreesClassifier score: 0.8807328164367026
500th ExtraTreesClassifier score: 0.8801476061415127
600th ExtraTreesClassifier score: 0.8800288827187999
700th ExtraTreesClassifier score: 0.8806792149762845
800th ExtraTreesClassifier score: 0.8805409841364457
900th ExtraTreesClassifier score: 0.8803806828223626
1000th ExtraTreesClassifier score: 0.8806698972870418
1100th ExtraTreesClassifier score: 0.8808349470091859
1200th ExtraTreesClassifier score: 0.8807085213264905
1300th ExtraTreesClassifier score: 0.8812408389294358
1400th ExtraTreesClassifier score: 0.8806587651908181
1500th ExtraTreesClassifier score: 0.880842344184354
1600th ExtraTreesClassifier score: 0.8812712249424977
1700th ExtraTreesClassifier score: 0.8809304311680168
1800th ExtraTreesClassifier score: 0.8808614616463167
1900th ExtraTreesClassifier score: 0.8

In [None]:
for iterations in range(100, 10000, 100):

    models = [
        LGBMClassifier(n_estimators = iterations),
    ]

    for model in models:
        print(f'{iterations}th {type(model).__name__} score: {kfold(model, train)}')

100th LGBMClassifier score: 0.8255820006780216
200th LGBMClassifier score: 0.8381316356113793
300th LGBMClassifier score: 0.8465804120526688
400th LGBMClassifier score: 0.8515850415067995
500th LGBMClassifier score: 0.8548278277268487
600th LGBMClassifier score: 0.8574293081039226
700th LGBMClassifier score: 0.8594564608552654
800th LGBMClassifier score: 0.8604473461627299
900th LGBMClassifier score: 0.8613798815368305
1000th LGBMClassifier score: 0.8623171225782432
1100th LGBMClassifier score: 0.8629896333220468
1200th LGBMClassifier score: 0.8634247538594643
1300th LGBMClassifier score: 0.8640806220280736
1400th LGBMClassifier score: 0.864592733275743
1500th LGBMClassifier score: 0.8647799469656501
1600th LGBMClassifier score: 0.86494863394062
1700th LGBMClassifier score: 0.8655097270910419
1800th LGBMClassifier score: 0.8655405807793285
1900th LGBMClassifier score: 0.8656471129594869
2000th LGBMClassifier score: 0.8660646940867862
2100th LGBMClassifier score: 0.8662987485473883
2200

In [58]:
for iterations in range(100, 10000, 100):

    models = [
        XGBClassifier(n_estimators = iterations),
    ]

    for model in models:
        print(f'{i}th {type(model).__name__} score: {kfold(model, train)}')
        i+=100

8100th XGBClassifier score: 0.8412829661768854
8200th XGBClassifier score: 0.8520802013181743
8300th XGBClassifier score: 0.8558139668193533
8400th XGBClassifier score: 0.8578135006859171
8500th XGBClassifier score: 0.8580822711009324
8600th XGBClassifier score: 0.8584466686596774
8700th XGBClassifier score: 0.8587726985938151
8800th XGBClassifier score: 0.8589437862171694
8900th XGBClassifier score: 0.8592457302057044
9000th XGBClassifier score: 0.8592387112065083
9100th XGBClassifier score: 0.8591679333120996
9200th XGBClassifier score: 0.8592228645434272
9300th XGBClassifier score: 0.8591863195142426
9400th XGBClassifier score: 0.8590938233119356
9500th XGBClassifier score: 0.8588111675707049
9600th XGBClassifier score: 0.8587966386710131
9700th XGBClassifier score: 0.8587544136596043
9800th XGBClassifier score: 0.8588546208657162
9900th XGBClassifier score: 0.858811373637689
10000th XGBClassifier score: 0.8586527137067247
10100th XGBClassifier score: 0.8586551928823679
10200th XGBC

In [76]:
models = [
    RandomForestClassifier(n_estimators=1000),
    XGBClassifier(n_estimators = 1000),
    LGBMClassifier(n_estimators = 4400),
    # CatBoostClassifier(silent=True),
    ExtraTreesClassifier(n_estimators=1000)
]

for model in models:
    print(f'{type(model).__name__} score: {kfold(model, train)}')

RandomForestClassifier score: 0.875041601518457
XGBClassifier score: 0.8568863321310882
LGBMClassifier score: 0.8691905366660286
ExtraTreesClassifier score: 0.8794715619631056


## AutoML을 이용한 모델 튜닝

In [31]:
!pip install --pre pycaret

[K     |████████████████████████████████| 79.9 MB 1.3 MB/s 
[?25hCollecting pyod>=0.9.8
  Downloading pyod-1.0.4.tar.gz (134 kB)
[K     |████████████████████████████████| 134 kB 59.4 MB/s 
[?25hCollecting lightgbm>=3.0.0
  Downloading lightgbm-3.3.2-py3-none-manylinux1_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 38.7 MB/s 
Collecting numba~=0.55.0
  Downloading numba-0.55.2-cp37-cp37m-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 46.6 MB/s 
[?25hCollecting psutil>=5.9.0
  Downloading psutil-5.9.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (281 kB)
[K     |████████████████████████████████| 281 kB 56.2 MB/s 
[?25hCollecting requests>=2.27.1
  Downloading requests-2.28.1-py3-none-any.whl (62 kB)
[K     |████████████████████████████████| 62 kB 1.7 MB/s 
Collecting plotly-resampler>=0.7.2.2
  Downloading plotly-resampler-0.8.0rc12.tar.gz (44

In [32]:
from pycaret.classification import *
import jinja2
# pip install markupsafe==2.0.1 # 진자 오류시 다음과 같은 설치
setup_ = setup(data = train,             # 학습 데이터 
               target = 'nerdiness',  # 학습 데이터 중 예측해야 하는 값. (Train_test_spli을 하지 않아도 됨)
               # ignore_features = ["index", "country"],
               session_id=42,            # random_state 값 고정 88888
               n_jobs = -1,              # 성능을 최대로 내기 위해 -1 사용.
               train_size = 0.9)        # Train - test 비율을 1:1로 나눔)
              #  numerical_features = ["contract_until"])
               # silent =True)             # 이거 True로 두면 Enter 안누르고 진행 가능

# setup에서 feature의 type을 잘못 적었을 때 : numerical_features[변수명] 혹은 categorical_feautres[변수명]

'cuml' is a soft dependency and not included in the pycaret installation. Please run: `pip install cuml` to install.
'cuml' is a soft dependency and not included in the pycaret installation. Please run: `pip install cuml` to install.
'cuml' is a soft dependency and not included in the pycaret installation. Please run: `pip install cuml` to install.
'cuml' is a soft dependency and not included in the pycaret installation. Please run: `pip install cuml` to install.
INFO:logs:PyCaret ClassificationExperiment
INFO:logs:Logging name: clf-default-name
INFO:logs:ML Usecase: MLUsecase.CLASSIFICATION
INFO:logs:version 3.0.0.rc3
INFO:logs:Initializing setup()
INFO:logs:self.USI: 2031
INFO:logs:self.variable_keys: {'seed', '_all_metrics', '_is_multiclass', 'X', 'n_jobs_param', 'exp_id', '_available_plots', '_all_models_internal', 'X_train', 'memory', 'fold_shuffle_param', '_all_models', 'fold_groups_param', '_ml_usecase', 'X_test', 'master_model_container', 'display_container', 'variable_keys', '

Unnamed: 0,Description,Value
0,Session id,42
1,Target,nerdiness
2,Target type,Binary
3,Original data shape,"(15000, 68)"
4,Transformed data shape,"(15000, 68)"
5,Transformed train set shape,"(13500, 68)"
6,Transformed test set shape,"(1500, 68)"
7,Numeric features,67
8,Preprocess,True
9,Imputation type,simple


INFO:logs:Soft dependency imported: xgboost: 0.90
INFO:logs:Soft dependency imported: catboost: 1.0.6
INFO:logs:Soft dependency imported: xgboost: 0.90
INFO:logs:Soft dependency imported: catboost: 1.0.6
INFO:logs:setup() successfully completed in 3.0s...............


In [33]:
top3 = compare_models(sort = "AUC", n_select=2)

INFO:logs:create_model() successfully completed......................................


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7859,0.8773,0.8377,0.7888,0.8124,0.5635,0.5651,1.827
rf,Random Forest Classifier,0.7871,0.8733,0.8418,0.7882,0.814,0.5657,0.5677,2.466
catboost,CatBoost Classifier,0.7654,0.8371,0.8246,0.7687,0.7955,0.5212,0.5232,12.103
gbc,Gradient Boosting Classifier,0.735,0.8061,0.7979,0.7427,0.7692,0.4589,0.4608,3.133
lda,Linear Discriminant Analysis,0.7258,0.7961,0.8046,0.7285,0.7646,0.4381,0.4415,0.217
lr,Logistic Regression,0.7243,0.7939,0.7942,0.731,0.7612,0.4364,0.4387,3.515
ada,Ada Boost Classifier,0.7258,0.7932,0.78,0.7392,0.7589,0.4415,0.4426,0.843
nb,Naive Bayes,0.685,0.7445,0.761,0.6974,0.7278,0.3556,0.3577,0.093
qda,Quadratic Discriminant Analysis,0.6691,0.7271,0.7591,0.6803,0.7175,0.3211,0.3239,0.15
dt,Decision Tree Classifier,0.719,0.715,0.7518,0.7435,0.7476,0.4306,0.4307,0.244


INFO:logs:master_model_container: 16
INFO:logs:display_container: 2
INFO:logs:[ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     n_estimators=100, n_jobs=-1, oob_score=False,
                     random_state=42, verbose=0, warm_start=False), RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=-1, oob_score=False,
            

In [34]:
blender_top3 = stack_models(estimator_list=top3,optimize='AUC')

INFO:logs:Calculating mean and std
INFO:logs:Creating metrics dataframe


INFO:logs:Finalizing model
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



INFO:logs:Uploading results into container
INFO:logs:Uploading model into container now
INFO:logs:master_model_container: 17
INFO:logs:display_container: 3
INFO:logs:StackingClassifier(cv=5,
                   estimators=[('Extra Trees Classifier',
                                ExtraTreesClassifier(bootstrap=False,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_samples_leaf=1,
                             

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7904,0.8861,0.8249,0.8023,0.8134,0.5743,0.5746
1,0.7881,0.8873,0.8289,0.7969,0.8126,0.5692,0.5698
2,0.7822,0.879,0.8195,0.794,0.8066,0.5576,0.5579
3,0.7948,0.8836,0.8514,0.793,0.8212,0.5812,0.5832
4,0.7807,0.8714,0.842,0.7794,0.8095,0.5521,0.5544
5,0.7733,0.8657,0.8099,0.7867,0.7982,0.5398,0.5401
6,0.7926,0.8776,0.8286,0.8029,0.8155,0.5788,0.5792
7,0.7985,0.8915,0.8246,0.8137,0.8191,0.5917,0.5918
8,0.7904,0.8828,0.8407,0.7929,0.8161,0.5728,0.5742
9,0.7963,0.8836,0.8461,0.798,0.8213,0.5849,0.5863


INFO:logs:master_model_container: 17
INFO:logs:display_container: 3
INFO:logs:StackingClassifier(cv=5,
                   estimators=[('Extra Trees Classifier',
                                ExtraTreesClassifier(bootstrap=False,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                           

In [35]:
final_model = finalize_model(blender_top3)

INFO:logs:Initializing finalize_model()
INFO:logs:finalize_model(self=<pycaret.classification.oop.ClassificationExperiment object at 0x7efcf1ea44d0>, estimator=StackingClassifier(cv=5,
                   estimators=[('Extra Trees Classifier',
                                ExtraTreesClassifier(bootstrap=False,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_samples_leaf=1,
                                   

In [36]:
prediction = final_model.predict_proba(test)

In [37]:
prediction = prediction[:, 1]


In [38]:
prediction

array([0.01469996, 0.89084912, 0.93380572, ..., 0.96262788, 0.01178973,
       0.75862174])

In [39]:
submission = pd.read_csv('/content/drive/MyDrive/dacon/SW_competition/sample_submission.csv')
submission['nerdiness'] = prediction
submission

Unnamed: 0,index,nerdiness
0,0,0.014700
1,1,0.890849
2,2,0.933806
3,3,0.530616
4,4,0.964933
...,...,...
35447,35447,0.968593
35448,35448,0.782305
35449,35449,0.962628
35450,35450,0.011790


In [40]:
submission.to_csv("submission_08_03.csv", index = False)