In [1]:
# Mount Google Drive at /content/drive so the data files referenced below are readable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
import random
# Suppress every Python warning from this point on.
# NOTE(review): this also hides any deprecation/performance warnings that later cells may raise.
warnings.filterwarnings("ignore")


In [3]:
# Load the data files
def dataframe_from_csv(target):
  """Read a single CSV and return it with whitespace-trimmed column names.

  Args:
    target: Anything ``pd.read_csv`` accepts (path string, ``Path``, file-like object).

  Returns:
    A DataFrame whose column labels have leading/trailing whitespace removed.
  """
  raw_frame = pd.read_csv(target)
  return raw_frame.rename(columns=str.strip)

def dataframe_from_csvs(targets):
  """Load several CSV files and stack them row-wise into one DataFrame.

  Every file is read with ``dataframe_from_csv``. The per-file row indices
  are kept as they are (the result is not re-indexed), so index labels can
  repeat across files.

  Args:
    targets: Iterable of CSV locations.

  Returns:
    The concatenation of all loaded frames, in iteration order.
  """
  frames = []
  for csv_path in targets:
    frames.append(dataframe_from_csv(csv_path))
  return pd.concat(frames)

# Root folder of the dataset on the mounted Drive.
DATA_DIR = Path('/content/drive/MyDrive/AIFactory/Unid/data')

# Sort the matches so the concatenation order does not depend on filesystem listing order.
train_files = sorted((DATA_DIR / 'train').glob('*.csv'))
val_files = sorted((DATA_DIR / 'val').glob('*.csv'))

# Fail early with a readable message instead of pd.concat's "No objects to concatenate".
if not train_files or not val_files:
  raise FileNotFoundError(f'No CSV files found under {DATA_DIR}/train or {DATA_DIR}/val')

train = dataframe_from_csvs(train_files)
val = dataframe_from_csvs(val_files)
# Read the test set through the same helper so its column names are stripped like train/val.
test = dataframe_from_csv(DATA_DIR / 'test.csv')
print(f'train: {len(train)}')
print(f'validation: {len(val)}')
print(f'test: {len(test)}')

train: 62564
validation: 7820
test: 7820


In [4]:
df = train.copy()
# Encode the target as integers so its correlation with each feature can be computed.
# The result is assigned back explicitly: an in-place replace on a column selection
# is not guaranteed to modify `df` under pandas copy-on-write.
LEAKTYPE_CODES = {'out': 0, 'in': 1, 'noise': 2, 'other': 3, 'normal': 4}
df['leaktype'] = pd.to_numeric(df['leaktype'].replace(LEAKTYPE_CODES))
# Correlation of every numeric column with the target, computed once.
# Non-numeric columns (e.g. 'site', 'sid') are excluded explicitly because newer pandas
# raises on them instead of silently dropping them.
target_corr = df.select_dtypes(include='number').corr()['leaktype']
# Features whose absolute correlation with the target is below 0.05.
# NOTE(review): `[1:]` discards the first matching column, so that feature is NOT dropped later.
# Kept as-is to preserve the existing feature set; confirm this is intended.
under005 = target_corr[target_corr.abs() < 0.05].index[1:]
under005

Index(['C12', 'C13', 'C15', 'C16', 'C17', 'C18', 'C25'], dtype='object')

In [5]:
# Columns to delete: the features with |correlation| < 0.05, plus 'site' and 'sid'
drops = [*under005.to_list(), 'site', 'sid']
drops

['C12', 'C13', 'C15', 'C16', 'C17', 'C18', 'C25', 'site', 'sid']

In [6]:
# Disregard columns with high difference from the others (C02, C26) and calculate the variance amongst the remaining columns

# Pick the columns by name instead of by position: the positional pops silently select
# different columns if the column order changes or if this cell is re-run after "var" exists.
# NOTE(review): assumes the three leading non-feature columns are 'site', 'sid', 'leaktype' - confirm.
excluded = {'site', 'sid', 'leaktype', 'C02', 'C26', 'var'}
k = [col for col in train.columns if col not in excluded]

# Row-wise sample variance; axis=1 avoids transposing the whole frame.
train["var"] = train[k].var(axis=1)
val["var"] = val[k].var(axis=1)
test["var"] = test[k].var(axis=1)

In [7]:
# One-Hot Encode Sites

# Categories are taken from the training set only (at most the 1000 most frequent sites),
# so train, val and test all receive the same indicator columns.
sites = list(train["site"].value_counts().index)[:1000]
for site in sites:
  # Vectorised comparison instead of a per-row Python lambda; produces the same 0/1 integers.
  train[site] = (train["site"] == site).astype("int64")
  val[site] = (val["site"] == site).astype("int64")
  test[site] = (test["site"] == site).astype("int64")

In [8]:
# Row counts of the 500 most frequent sensor ids in the training set
train["sid"].value_counts().head(500)

S-0359369081383008    131
S-0359369084269204    118
S-0359369084047519     99
S-0359369084035613     99
S-0359369085126635     95
                     ... 
S-0359369085183693     46
S-0359369083971081     46
S-0359369085125785     46
S-0359369084109822     46
S-0359369084051016     46
Name: sid, Length: 500, dtype: int64

In [9]:
# One-Hot Encode SIDs

from tqdm import tqdm

# Categories are every distinct sid seen in the training set, most frequent first.
sid = list(train["sid"].value_counts().index)
for si in tqdm(sid):
  # Vectorised comparison instead of a per-row Python lambda; produces the same 0/1 integers.
  train[si] = (train["sid"] == si).astype("int64")
  val[si] = (val["sid"] == si).astype("int64")
  test[si] = (test["sid"] == si).astype("int64")

100%|██████████| 2671/2671 [01:27<00:00, 30.36it/s]


In [10]:
# Drop the pre-selected columns, then integer-encode the target
LEAKTYPE_CODES = {'out': 0, 'in': 1, 'noise': 2, 'other': 3, 'normal': 4}

train = train.drop(drops, axis=1)
val = val.drop(drops, axis=1)
test = test.drop(drops, axis=1)
# Assign the encoded labels back explicitly: an in-place replace on a column selection
# is not guaranteed to modify the parent frame under pandas copy-on-write.
# pd.to_numeric guarantees a numeric dtype and raises if an unknown label string remains.
train['leaktype'] = pd.to_numeric(train['leaktype'].replace(LEAKTYPE_CODES))
val['leaktype'] = pd.to_numeric(val['leaktype'].replace(LEAKTYPE_CODES))
# The test frame gets a blank placeholder target so all three frames are split the same way.
test['leaktype']=""

# Split each dataset into features (x) and target (y).
train_x=train.drop(['leaktype'], axis=1)
train_y=train['leaktype']
val_x=val.drop(['leaktype'], axis=1)
val_y=val['leaktype']
test_x=test.drop(['leaktype'], axis=1)
test_y=test['leaktype']


In [11]:
# Standardise the datasets
from sklearn.preprocessing import StandardScaler, RobustScaler

# Only the numeric columns of the training features are scaled
cols = train_x.select_dtypes(np.number).columns

# Fit the scaler on the training data only
scaler = StandardScaler()
scaler.fit(train_x[cols])

# Apply the fitted transform to the train, validation and test features in turn
for frame in (train_x, val_x, test_x):
  frame[cols] = scaler.transform(frame[cols])

9968
n_estimators = 500
max_depth = 18
learning_rate = 0.1
subsample = 0.7
min_child_weight = 0.2
min_split_loss = 3
reg_lambda = 0.25

In [12]:
# Train with XGBoost
import xgboost as xgb
import random

# Seeds Python's `random` module.
# NOTE(review): this seed is not passed to XGBClassifier below - confirm whether it should be.
random.seed(1004)

n_estimators = 500      # number of weak learners to build
max_depth = 18          # tree depth limit
learning_rate = 0.1     # learning rate
subsample = 0.7         # row sampling ratio (controls overfitting)
min_child_weight = 0.2  # minimum sum of instance weights
min_split_loss = 3      # larger values reduce overfitting
reg_lambda = 0.25       # L2 regularisation

model1 = xgb.XGBClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    subsample=subsample,
    tree_method="gpu_hist",
    min_child_weight=min_child_weight,
    min_split_loss=min_split_loss,
    reg_lambda=reg_lambda,
)
model1.verbosity = 2

# Fit on the training features and labels
train_model = model1.fit(train_x, train_y)

In [13]:
from sklearn.metrics import classification_report

# Predict on the validation set
pred1 = train_model.predict(val_x)

# Compare the predictions with the validation labels
val_report = classification_report(val_y, pred1)
print(val_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2192
           1       0.99      1.00      1.00      1659
           2       1.00      1.00      1.00       629
           3       1.00      0.99      0.99       878
           4       1.00      1.00      1.00      2462

    accuracy                           1.00      7820
   macro avg       1.00      1.00      1.00      7820
weighted avg       1.00      1.00      1.00      7820



In [14]:
from sklearn.metrics import f1_score

# Macro-averaged F1 of the validation predictions
macro_f1 = f1_score(val_y, pred1, average='macro')
macro_f1

0.9968368969340915

In [15]:
# Predict on the test set
test_pred = train_model.predict(test_x)

In [16]:
# Build the submission: load the sample file and fill its 'leaktype' column with the test predictions
sample_path = '/content/drive/MyDrive/AIFactory/Unid/data/sample_submission.csv'
submission = pd.read_csv(sample_path).assign(leaktype=test_pred)
submission

Unnamed: 0,site,sid,leaktype
0,S-4784025026,S-0359369085186035,1
1,S-4521010100,S-0359369084102843,4
2,S-2911010107,S-0359369084039755,4
3,S-2711010100,S-0359369084044425,0
4,S-4677025033,S-0359369083955449,0
...,...,...,...
7815,S-4677025029,S-0359369081312577,1
7816,S-4772025022,S-0359369084106224,3
7817,S-4673025027,S-0359369085149850,0
7818,S-4571025029,S-0359369084043617,4


In [17]:
# Write the submission to a CSV in the current working directory, without the DataFrame index
submission.to_csv('submission9968.csv', index=False)