In [None]:
!pip install pycaret

In [2]:
from pycaret.classification import *
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings(action='ignore')

def make_datetime(x):
    # string 타입의 Time column을 datetime 타입으로 변경
    x     = str(x)
    year  = int(x[:4])
    month = int(x[4:6])
    day   = int(x[6:8])
    hour  = int(x[8:10])
    #mim  = int(x[10:12])
    #sec  = int(x[12:])
    return dt.datetime(year, month, day, hour)

def string2num(x):
    # (,)( )과 같은 불필요한 데이터 정제
    x = re.sub(r"[^0-9]+", '', str(x))
    if x =='':
        return 0
    else:
        return int(x)

PATH = '/content/drive/MyDrive/dacon/lg/'


In [14]:
train_err  = pd.read_csv(PATH+'train_err_data.csv')
id_error = train_err[['user_id','errtype']].values
error = np.zeros((15000,42))
for person_idx, err in tqdm(id_error):
    # person_idx - 10000 위치에 person_idx, errtype에 해당하는 error값을 +1
    error[person_idx - 10000,err - 1] += 1

train_prob = pd.read_csv(PATH+'train_problem_data.csv')
problem = np.zeros(15000)
problem[train_prob.user_id.unique()-10000] = 1 

train = pd.DataFrame(data=error)
train['problem'] = problem
del error, problem

clf = setup(data = train, target = 'problem') 

100%|██████████| 16554663/16554663 [00:39<00:00, 423414.43it/s]


In [21]:
best_5 = compare_models(sort = 'AUC', n_select = 5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.7828,0.8059,0.4912,0.7757,0.6013,0.4623,0.4852,7.867
lightgbm,Light Gradient Boosting Machine,0.7801,0.8016,0.5037,0.7554,0.6042,0.4603,0.4785,0.275
et,Extra Trees Classifier,0.7772,0.8014,0.4849,0.7603,0.592,0.449,0.4705,1.357
rf,Random Forest Classifier,0.775,0.8004,0.4734,0.7617,0.5837,0.441,0.4645,1.58
gbc,Gradient Boosting Classifier,0.7789,0.8002,0.4609,0.7886,0.5815,0.4451,0.4749,1.719
xgboost,Extreme Gradient Boosting,0.7694,0.7877,0.5191,0.7115,0.5999,0.4436,0.4546,2.753
ada,Ada Boost Classifier,0.7714,0.7871,0.4615,0.7586,0.5736,0.4299,0.4548,0.467
lda,Linear Discriminant Analysis,0.7474,0.7414,0.3195,0.807,0.4572,0.3308,0.3913,0.079
qda,Quadratic Discriminant Analysis,0.7477,0.7389,0.3618,0.7539,0.488,0.3474,0.3889,0.044
nb,Naive Bayes,0.7092,0.7295,0.2064,0.7233,0.32,0.2032,0.2679,0.031


CatBoost, Lightgbm, gbc, et,rf 가 골고루 AUC 0.8대를 보여주기 때문에 5개를 골라 블렌딩 해보겠습니다

In [22]:
blended = blend_models(estimator_list = best_5, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7724,0.7989,0.4665,0.7587,0.5777,0.4338,0.4579
1,0.7829,0.8142,0.4807,0.7855,0.5965,0.4595,0.4854
2,0.7748,0.7859,0.46,0.7722,0.5765,0.4362,0.4634
3,0.7886,0.8156,0.4857,0.8019,0.605,0.4723,0.4999
4,0.7828,0.8226,0.4814,0.7837,0.5965,0.4592,0.4847
Mean,0.7803,0.8074,0.4749,0.7804,0.5904,0.4522,0.4783
SD,0.0059,0.0133,0.0099,0.0144,0.0113,0.0148,0.0155


In [23]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.78,0.8069,0.472,0.7804,0.5882,0.4503,0.4768


In [24]:
final_model = finalize_model(blended)

In [27]:
test_err  = pd.read_csv(PATH+'test_err_data.csv')
id_error = test_err[['user_id','errtype']].values
test_x = np.zeros((14999,42))
for person_idx, err in tqdm(id_error):
    test_x[person_idx - 30000,err - 1] += 1
test_x = test_x.reshape(test_x.shape[0],-1)
test = pd.DataFrame(data=test_x)

100%|██████████| 16532648/16532648 [00:40<00:00, 413245.19it/s]


In [28]:
predictions = predict_model(final_model, data = test)

In [40]:
#pycaret에서는 score이 label을 맞출 확률이기때문에 output을 제출 양식에 맞게 바꿔줍니다
x = []
for i in range(len(predictions['Score'])):
  if predictions['Label'][i] =='1.0':
    x.append(predictions['Score'][i])
  else:
    x.append(1-predictions['Score'][i])

In [42]:
sample_submssion = pd.read_csv(PATH+'sample_submission.csv')
sample_submssion['problem'] = x
sample_submssion.to_csv("AutoML.csv", index = False)