# 시스템 품질 변화로 인한 사용자 불편 예지 AI 경진대회

https://dacon.io/competitions/official/235687/overview/description

## 1. Library Import & Data 불러오기

In [None]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 77 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.6


In [None]:
!pip install lightgbm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import catboost as cb
import lightgbm as lgb
from tqdm import tqdm

In [None]:
# Directory that holds the competition CSV files.
PATH = '/content/drive/MyDrive/project/data/system_quality_customer_complain_data/'

# Read both input tables up front: the error log and the complaint (problem) records.
train_err = pd.read_csv(PATH + 'train_err_data_.csv')
train_prob = pd.read_csv(PATH + 'train_problem_data_.csv')  # used later to build the target

# (user_id, errtype) pairs of the error log as a plain NumPy array.
id_error = train_err[['user_id', 'errtype']].to_numpy()

array([10000,    15])

## 2. 학습 데이터 생성

### 2_1 train_df 생성

In [None]:
# Per-user error-count matrix, initialised to zero:
# 15000 user_ids (rows) x 42 columns (41 errtypes, largest value 42).
error = np.zeros((15000, 42))

# Row = user_id - 10000, column = errtype - 1; add 1 for every (user_id, errtype)
# pair in id_error. np.add.at accumulates repeated index pairs, whereas
# `error[rows, cols] += 1` would count each distinct pair only once. This single
# call replaces a Python-level loop over every row of id_error.
np.add.at(error, (id_error[:, 0] - 10000, id_error[:, 1] - 1), 1)
error

100%|██████████| 16554663/16554663 [00:45<00:00, 361337.77it/s]


array([[  0.,   0.,   8., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ..., 113.,  56.,   1.],
       [  0.,   0.,   2., ...,   0.,   0.,   0.],
       ...,
       [  0.,   0.,   0., ...,  58.,   8.,   5.],
       [  0.,   0.,   0., ...,   6.,   0.,   0.],
       [  0.,   0.,   4., ...,   0.,   0.,   0.]])

In [None]:
# Wrap the error-count matrix in a DataFrame (default integer column labels) and display it.
train_df = pd.DataFrame(error)
train_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,0.0,0.0,8.0,104.0,0.0,1.0,1.0,0.0,0.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,53.0,1.0,1.0,0.0,0.0,0.0,...,10.0,18.0,0.0,1.0,1.0,0.0,0.0,113.0,56.0,1.0
2,0.0,0.0,2.0,132.0,1.0,2.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,...,8.0,0.0,0.0,1.0,1.0,2.0,0.0,17.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,3.0,4.0,0.0,0.0,0.0,...,16.0,0.0,0.0,1.0,1.0,0.0,0.0,4.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,0.0,0.0,0.0,0.0,2.0,5.0,5.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,7.0,4.0
14996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14997,0.0,0.0,0.0,1.0,8.0,1.0,1.0,0.0,0.0,0.0,...,16.0,17.0,0.0,1.0,1.0,0.0,0.0,58.0,8.0,5.0
14998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,15.0,12.0,0.0,1.0,1.0,0.0,0.0,6.0,0.0,0.0


### 2_2 target 생성

In [None]:
# Row positions (user_id - 10000) of every distinct user in the complaint data.
user_ids = train_prob['user_id'].unique() - 10000

# Binary target vector: all zeros, then 1 at the positions of complaining users.
problem = np.zeros(15000)
problem[user_ids] = 1

# Show the target vector, the flagged positions and the number of positive labels.
display(problem, user_ids, problem.sum())

array([0., 1., 0., ..., 1., 1., 0.])

array([ 9224, 13664,  5166, ...,  9114, 11505,  8822])

5000.0

In [None]:
# Single-column DataFrame holding the 0/1 target built above; displayed for inspection.
target = pd.DataFrame(problem)
target

Unnamed: 0,0
0,0.0
1,1.0
2,0.0
3,0.0
4,1.0
...,...
14995,0.0
14996,0.0
14997,1.0
14998,1.0


## 3. CatBoost 훈련


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score ,f1_score, roc_auc_score, classification_report

In [None]:
def get_eval(y_test, y_pred, y_score=None):
    """Print classification metrics for one set of predictions.

    Prints the confusion matrix, accuracy, precision, recall, F1, ROC AUC and
    scikit-learn's classification report.

    Args:
        y_test: True labels.
        y_pred: Predicted class labels.
        y_score: Optional scores or probabilities for the positive class.
            When given, ROC AUC is computed from them; when omitted, ROC AUC
            is computed from ``y_pred`` exactly as before.

    Returns:
        None. All results are written to stdout.
    """
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 yields the same value as the default setting (0) but
    # without the undefined-metric warning raised when a class is never predicted.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    F1 = f1_score(y_test, y_pred, zero_division=0)
    # AUC from hard labels depends on the decision threshold; prefer scores when available.
    AUC = roc_auc_score(y_test, y_pred if y_score is None else y_score)
    print('오차행렬:\n', confusion)
    print(f'\n정확도:: {accuracy:.4f}')
    print(f'정밀도: {precision:.4f}')
    print(f'재현율: {recall:.4f}')
    print(f'F1: {F1:.4f}')
    print(f'AUC: {AUC:.4f}')
    print(classification_report(y_test, y_pred, zero_division=0))



In [None]:
# Split features and target into train / validation parts:
# 20% of the rows go to validation, with a fixed random_state of 42.
X_train, X_val, y_train, y_val = train_test_split(train_df,target,test_size=0.2, random_state=42)
# Display the shapes of the four resulting pieces.
X_train.shape,X_val.shape,y_train.shape,y_val.shape

((12000, 42), (3000, 42), (12000, 1), (3000, 1))

In [None]:
# Display the training labels.
y_train

Unnamed: 0,0
9839,0.0
9680,0.0
7093,0.0
11293,0.0
820,0.0
...,...
5191,1.0
13418,1.0
5390,1.0
860,1.0


#### 3.1 첫번째 기본훈련

In [None]:
# First baseline: CatBoost classifier limited to 2 boosting iterations.
# NOTE(review): fit() below is given no evaluation set, so early_stopping_rounds
# presumably has nothing to act on — confirm against the CatBoost docs.
model = cb.CatBoostClassifier(iterations=2,
                           learning_rate=0.01,
                           max_depth=10,
                           early_stopping_rounds=3
                           )
# Train on the training split; the fit result is kept under a separate name.
cb_model = model.fit(X_train,y_train)

0:	learn: 0.6886158	total: 117ms	remaining: 117ms
1:	learn: 0.6840796	total: 245ms	remaining: 0us


In [None]:
# Predict on the validation set with the first model.
pred = cb_model.predict(X_val)

In [None]:
# Print the validation metrics of the first model.
get_eval(y_val,pred)

오차행렬:
 [[1564   69]
 [ 865  502]]

정확도:: 0.6887
정밀도: 0.8792
재현율: 0.3672
F1: 0.5181
AUC: 0.6625
              precision    recall  f1-score   support

         0.0       0.64      0.96      0.77      1633
         1.0       0.88      0.37      0.52      1367

    accuracy                           0.69      3000
   macro avg       0.76      0.66      0.64      3000
weighted avg       0.75      0.69      0.66      3000



#### 3.2 두번째 훈련세트

In [None]:
# Second run: same settings as the first baseline except for 30 iterations.
# NOTE(review): fit() below is given no evaluation set, so early_stopping_rounds
# presumably has nothing to act on — confirm against the CatBoost docs.
model2 = cb.CatBoostClassifier(iterations=30,
                           learning_rate=0.01,
                           max_depth=10,
                           early_stopping_rounds=3
                           )
cb_model2 = model2.fit(X_train,y_train)
# Predict on the validation set.
pred2 = cb_model2.predict(X_val)


In [None]:
# Print the validation metrics of the second model.
get_eval(y_val,pred2)

오차행렬:
 [[1579   54]
 [ 888  479]]

정확도:: 0.6860
정밀도: 0.8987
재현율: 0.3504
F1: 0.5042
AUC: 0.6587
              precision    recall  f1-score   support

         0.0       0.64      0.97      0.77      1633
         1.0       0.90      0.35      0.50      1367

    accuracy                           0.69      3000
   macro avg       0.77      0.66      0.64      3000
weighted avg       0.76      0.69      0.65      3000



#### 3.3 세번째 훈련세트

In [None]:
# Third run: 20 iterations with a higher learning rate (0.1).
model3 = cb.CatBoostClassifier(iterations=20,
                           learning_rate=0.1,
                           max_depth=10,
                           early_stopping_rounds=3
                           )
cb_model3 = model3.fit(X_train,y_train)
# Predict class labels on the validation set. The previous call,
# predict_proba(X_val, y_val), passed the labels as a second positional argument
# and raised a TypeError; get_eval(y_val, pred5) needs hard labels, not
# probabilities, so predict() is the matching call.
pred5 = cb_model3.predict(X_val)

0:	learn: 0.6505000	total: 148ms	remaining: 2.8s
1:	learn: 0.6153801	total: 293ms	remaining: 2.64s
2:	learn: 0.5881039	total: 442ms	remaining: 2.5s
3:	learn: 0.5660749	total: 598ms	remaining: 2.39s
4:	learn: 0.5457978	total: 752ms	remaining: 2.25s
5:	learn: 0.5331155	total: 879ms	remaining: 2.05s
6:	learn: 0.5206175	total: 976ms	remaining: 1.81s
7:	learn: 0.5095166	total: 1.1s	remaining: 1.65s
8:	learn: 0.5012893	total: 1.26s	remaining: 1.54s
9:	learn: 0.4935748	total: 1.43s	remaining: 1.43s
10:	learn: 0.4869934	total: 1.55s	remaining: 1.26s
11:	learn: 0.4814076	total: 1.7s	remaining: 1.13s
12:	learn: 0.4750337	total: 1.84s	remaining: 992ms
13:	learn: 0.4704678	total: 2s	remaining: 859ms
14:	learn: 0.4659291	total: 2.15s	remaining: 715ms
15:	learn: 0.4623204	total: 2.3s	remaining: 576ms
16:	learn: 0.4590786	total: 2.46s	remaining: 435ms
17:	learn: 0.4565556	total: 2.6s	remaining: 289ms
18:	learn: 0.4551197	total: 2.75s	remaining: 145ms
19:	learn: 0.4524473	total: 2.9s	remaining: 0us


TypeError: ignored

In [None]:
# Print the validation metrics of the third model.
get_eval(y_val,pred5)

오차행렬:
 [[1632    1]
 [1307   60]]

정확도:: 0.5640
정밀도: 0.9836
재현율: 0.0439
F1: 0.0840
AUC: 0.5216
              precision    recall  f1-score   support

         0.0       0.56      1.00      0.71      1633
         1.0       0.98      0.04      0.08      1367

    accuracy                           0.56      3000
   macro avg       0.77      0.52      0.40      3000
weighted avg       0.75      0.56      0.43      3000



#### 3.4 네번째 훈련세트

코드 방식 변화

cb.Pool 객체와 파라미터 딕셔너리(params)를 사용하는 방식으로 훈련 데이터 입력


In [None]:
# Wrap the train / validation splits in CatBoost Pool objects (features + labels).
cb_dtrain = cb.Pool(data=X_train, label = y_train)
cb_deval = cb.Pool(data=X_val, label = y_val)
# NOTE(review): cb_param is not referenced anywhere else in this cell; the
# classifier below is configured through explicit keyword arguments instead.
cb_param = {
    'max_depth':10, # tree depth
    'learning_rate': 0.01, # learning rate
    'n_estimators': 100, # number of trees
    'eval_metric' : 'Accuracy', # evaluation metric
    'loss_function': 'MultiClass' # loss function
}
# NOTE(review): this rebinds cb_model3 and model3, which section 3.3 already
# used, with the roles swapped (there model3 was the estimator and cb_model3
# the fit result).
cb_model3 = cb.CatBoostClassifier(iterations=30,
                           learning_rate=1,
                           depth=2,
                           loss_function='MultiClass')

# Train on the training Pool only; cb_deval is not passed to fit().
model3 = cb_model3.fit(cb_dtrain)
# Class probabilities for the validation Pool.
pred_proba3 = model3.predict_proba(cb_deval)

0:	learn: 0.5251073	total: 13.5ms	remaining: 391ms
1:	learn: 0.5034400	total: 28.8ms	remaining: 403ms
2:	learn: 0.4964993	total: 46.2ms	remaining: 416ms
3:	learn: 0.4904056	total: 64.7ms	remaining: 420ms
4:	learn: 0.4890739	total: 83.5ms	remaining: 417ms
5:	learn: 0.4814115	total: 100ms	remaining: 401ms
6:	learn: 0.4803820	total: 118ms	remaining: 386ms
7:	learn: 0.4797385	total: 137ms	remaining: 378ms
8:	learn: 0.4774377	total: 156ms	remaining: 364ms
9:	learn: 0.4757923	total: 172ms	remaining: 344ms
10:	learn: 0.4746199	total: 192ms	remaining: 331ms
11:	learn: 0.4741125	total: 214ms	remaining: 320ms
12:	learn: 0.4729151	total: 233ms	remaining: 305ms
13:	learn: 0.4725505	total: 250ms	remaining: 286ms
14:	learn: 0.4722845	total: 263ms	remaining: 263ms
15:	learn: 0.4717416	total: 269ms	remaining: 235ms
16:	learn: 0.4702004	total: 277ms	remaining: 212ms
17:	learn: 0.4694432	total: 283ms	remaining: 189ms
18:	learn: 0.4692262	total: 289ms	remaining: 167ms
19:	learn: 0.4686717	total: 297ms	re

In [None]:
# Display the predicted class probabilities for the validation set.
pred_proba3

array([[0.57912439, 0.42087561],
       [0.81987101, 0.18012899],
       [0.27897879, 0.72102121],
       ...,
       [0.69191134, 0.30808866],
       [0.92416394, 0.07583606],
       [0.91411438, 0.08588562]])

In [None]:
# Raw per-class scores for the validation set, one column per class.
# prediction_type is given explicitly: predict() returns class labels by
# default, and an argmax over axis 1 of a label column would not select a class.
model_pred3 = cb_model3.predict(X_val, prediction_type='RawFormulaVal')
# Hard label = index of the column holding the largest raw score.
pred3 = np.argmax(model_pred3, axis=1)

In [None]:
# Display the predict() output next to the labels derived from it via argmax.
display(model_pred3, pred3)

array([[ 0.17720876, -0.17720876],
       [ 0.43551616, -0.43551616],
       [-0.0535972 ,  0.0535972 ],
       ...,
       [ 0.32245047, -0.32245047],
       [ 0.4916471 , -0.4916471 ],
       [ 0.52719757, -0.52719757]])

array([0, 0, 1, ..., 0, 0, 0])

In [None]:
# Print the validation metrics for the argmax-derived labels.
get_eval(y_val,pred3)

오차행렬:
 [[1580   53]
 [ 872  495]]

정확도:: 0.6917
정밀도: 0.9033
재현율: 0.3621
F1: 0.5170
AUC: 0.6648
              precision    recall  f1-score   support

         0.0       0.64      0.97      0.77      1633
         1.0       0.90      0.36      0.52      1367

    accuracy                           0.69      3000
   macro avg       0.77      0.66      0.65      3000
weighted avg       0.76      0.69      0.66      3000



In [None]:
# Binarise model_pred3 element-wise at the threshold, then take the argmax over
# axis 1 as the label.
# NOTE(review): model_pred3 comes from predict(), not predict_proba(), so 0.4 is
# presumably not a probability cut-off here — compare with the pred_proba3-based
# thresholding further down.
threshold = 0.4
pred4 = np.argmax(np.where(model_pred3 >= threshold , 1, 0),axis = 1)
pred4

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Print the validation metrics for the thresholded labels.
get_eval(y_val,pred4)

오차행렬:
 [[1613   20]
 [1134  233]]

정확도:: 0.6153
정밀도: 0.9209
재현율: 0.1704
F1: 0.2877
AUC: 0.5791
              precision    recall  f1-score   support

         0.0       0.59      0.99      0.74      1633
         1.0       0.92      0.17      0.29      1367

    accuracy                           0.62      3000
   macro avg       0.75      0.58      0.51      3000
weighted avg       0.74      0.62      0.53      3000



In [None]:
# Columns from index 1 onward of the probability matrix (the slice keeps it 2-D).
pred_proba3[:,1:]

array([[0.42087561],
       [0.18012899],
       [0.72102121],
       ...,
       [0.30808866],
       [0.07583606],
       [0.08588562]])

In [None]:
# Label 1 where the value in probability column 1 is at least 0.4, otherwise 0.
np.where(pred_proba3[:,1] >= 0.4 , 1, 0)

array([1, 0, 1, ..., 0, 0, 0])

In [None]:
# Candidate cut-offs applied to probability column 1.
thresholds = [0, 0.01,0.05, 0.08, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
for i in thresholds:
    # Print the cut-off, turn the probabilities into 0/1 labels at that cut-off,
    # then print the validation metrics for those labels.
    print(i)
    temp_pred = np.where(pred_proba3[:,1] >= i , 1, 0)
    get_eval(y_val,temp_pred)

0
오차행렬:
 [[   0 1633]
 [   0 1367]]

정확도:: 0.4557
정밀도: 0.4557
재현율: 1.0000
F1: 0.6261
AUC: 0.5000
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      1633
         1.0       0.46      1.00      0.63      1367

    accuracy                           0.46      3000
   macro avg       0.23      0.50      0.31      3000
weighted avg       0.21      0.46      0.29      3000

0.01
오차행렬:
 [[   0 1633]
 [   0 1367]]

정확도:: 0.4557
정밀도: 0.4557
재현율: 1.0000
F1: 0.6261
AUC: 0.5000
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      1633
         1.0       0.46      1.00      0.63      1367

    accuracy                           0.46      3000
   macro avg       0.23      0.50      0.31      3000
weighted avg       0.21      0.46      0.29      3000

0.05
오차행렬:
 [[  89 1544]
 [   9 1358]]

정확도:: 0.4823
정밀도: 0.4680
재현율: 0.9934
F1: 0.6362
AUC: 0.5240
              precision    recall  f1-score   sup

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


오차행렬:
 [[ 329 1304]
 [  62 1305]]

정확도:: 0.5447
정밀도: 0.5002
재현율: 0.9546
F1: 0.6564
AUC: 0.5781
              precision    recall  f1-score   support

         0.0       0.84      0.20      0.33      1633
         1.0       0.50      0.95      0.66      1367

    accuracy                           0.54      3000
   macro avg       0.67      0.58      0.49      3000
weighted avg       0.69      0.54      0.48      3000

0.2
오차행렬:
 [[ 975  658]
 [ 291 1076]]

정확도:: 0.6837
정밀도: 0.6205
재현율: 0.7871
F1: 0.6940
AUC: 0.6921
              precision    recall  f1-score   support

         0.0       0.77      0.60      0.67      1633
         1.0       0.62      0.79      0.69      1367

    accuracy                           0.68      3000
   macro avg       0.70      0.69      0.68      3000
weighted avg       0.70      0.68      0.68      3000

0.3
오차행렬:
 [[1317  316]
 [ 495  872]]

정확도:: 0.7297
정밀도: 0.7340
재현율: 0.6379
F1: 0.6826
AUC: 0.7222
              precision    recall  f1-score   support