In [1]:
from pycaret.classification import *
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings(action='ignore') 

In [2]:
PATH = 'D:/ykio/dacon_LG/data/'

In [3]:
train_err  = pd.read_csv(PATH+'train_err_data.csv')
id_error = train_err[['user_id','errtype']].values
error = np.zeros((15000,42))
for person_idx, err in tqdm(id_error):
    # person_idx - 10000 위치에 person_idx, errtype에 해당하는 error값을 +1
    error[person_idx - 10000,err - 1] += 1

train_prob = pd.read_csv(PATH+'train_problem_data.csv')
problem = np.zeros(15000)
problem[train_prob.user_id.unique()-10000] = 1 

train = pd.DataFrame(data=error)
train['problem'] = problem
del error, problem

clf = setup(data = train, target = 'problem') 

Unnamed: 0,Description,Value
0,session_id,2370
1,Target,problem
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(15000, 43)"
5,Missing Values,False
6,Numeric Features,42
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [4]:
best_5 = compare_models(sort = 'Accuracy', n_select = 5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.7825,0.8077,0.4929,0.7644,0.599,0.4592,0.4802,4.884
lightgbm,Light Gradient Boosting Machine,0.7818,0.8034,0.5019,0.7548,0.6026,0.4605,0.4789,0.127
et,Extra Trees Classifier,0.7806,0.8013,0.4863,0.7633,0.5938,0.4535,0.4753,0.265
rf,Random Forest Classifier,0.7783,0.801,0.4719,0.7665,0.5837,0.4441,0.4686,0.279
gbc,Gradient Boosting Classifier,0.7767,0.8036,0.4568,0.7743,0.5744,0.4364,0.4645,0.275
ada,Ada Boost Classifier,0.7708,0.7927,0.4672,0.7433,0.5733,0.4277,0.4495,0.082
xgboost,Extreme Gradient Boosting,0.7687,0.7881,0.5189,0.7024,0.5966,0.4397,0.4497,0.404
lr,Logistic Regression,0.7509,0.7347,0.3523,0.7668,0.4821,0.3468,0.3925,0.884
ridge,Ridge Classifier,0.7473,0.0,0.3119,0.8004,0.4482,0.3234,0.3841,0.021
lda,Linear Discriminant Analysis,0.7471,0.7378,0.3174,0.7913,0.4523,0.3253,0.3829,0.017


In [5]:
blended = blend_models(estimator_list = best_5, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7738,0.7955,0.4653,0.7541,0.5755,0.4329,0.4563
1,0.7824,0.8206,0.4964,0.7611,0.6009,0.4602,0.4801
2,0.7976,0.8207,0.5007,0.8146,0.6202,0.4927,0.5198
3,0.791,0.8001,0.4733,0.8159,0.5991,0.4709,0.5028
4,0.7856,0.815,0.4812,0.7854,0.5968,0.462,0.4877
Mean,0.7861,0.8104,0.4834,0.7862,0.5985,0.4638,0.4894
SD,0.008,0.0106,0.0134,0.0259,0.0142,0.0193,0.0214


In [6]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7774,0.8026,0.4626,0.8016,0.5866,0.4489,0.4807


In [7]:
final_model = finalize_model(blended)

In [8]:
test_err  = pd.read_csv(PATH+'test_err_data.csv')
id_error = test_err[['user_id','errtype']].values
test_x = np.zeros((14999,42))
for person_idx, err in tqdm(id_error):
    test_x[person_idx - 30000,err - 1] += 1
test_x = test_x.reshape(test_x.shape[0],-1)
test = pd.DataFrame(data=test_x)

100%|██████████████████████████████████████████████████████████████████| 16532648/16532648 [00:46<00:00, 352753.36it/s]


In [9]:
predictions = predict_model(final_model, data = test)

In [10]:
#pycaret에서는 score이 label을 맞출 확률이기때문에 output을 제출 양식에 맞게 바꿔줍니다
x = []
for i in range(len(predictions['Score'])):
  if predictions['Label'][i] =='1.0':
    x.append(predictions['Score'][i])
  else:
    x.append(1-predictions['Score'][i])

In [11]:
sample_submssion = pd.read_csv(PATH+'sample_submission.csv')
sample_submssion['problem'] = x
sample_submssion.to_csv("AutoML.csv", index = False)