## 0. 라이브러리 불러오기

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Input, Dense, Dropout
from keras.callbacks import EarlyStopping

## 1. 데이터 불러오기

In [None]:
# Load the training table from train.csv.
train = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/dacon/oil_abnormal/train.csv',
)

## 2. 데이터 전처리
- 결측치 제거
- 샘플링
- Train / Validation 분리

In [None]:
# Check for missing values.
#
# A bare expression is only echoed by the notebook when it is the last
# statement of a cell, so the per-column NaN counts are printed explicitly;
# otherwise they are computed and thrown away.
print(train.isna().sum())
train.info()

In [None]:
# Object-typed (string) columns, and the features that are used at test time.
object_columns = ['ID', 'COMPONENT_ARBITRARY']
oil_test_cols = [
    'COMPONENT_ARBITRARY',
    'ANONYMOUS_1',
    'YEAR',
    'ANONYMOUS_2',
    'AG',
    'CO',
    'CR',
    'CU',
    'FE',
    'H2O',
    'MN',
    'MO',
    'NI',
    'PQINDEX',
    'TI',
    'V',
    'V40',
    'ZN',
]

In [None]:
# Keep only the features that are available at test time, plus the target.
#
# .copy() detaches the subset from `train`, so later in-place edits of this
# frame neither touch `train` nor trigger pandas' SettingWithCopyWarning.
train_oil_test_cols = train[oil_test_cols + ['Y_LABEL']].copy()

In [None]:
# Encode the object-typed feature.
#
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name. Try the new keyword first and
# fall back for older releases; either way the encoder returns a dense array,
# which the pd.DataFrame(data=...) calls in this notebook rely on.
try:
  ohe = OneHotEncoder(sparse_output=False)
except TypeError:
  ohe = OneHotEncoder(sparse=False)

def get_ohe_component():
  """Return the training frame with COMPONENT_ARBITRARY one-hot encoded.

  Fits the module-level ``ohe`` on ``train_oil_test_cols[['COMPONENT_ARBITRARY']]``
  and replaces that single column with one indicator column per category,
  named by ``ohe.get_feature_names_out()``.

  Unlike the previous version this does not drop the column from
  ``train_oil_test_cols`` in place: mutating a module-level frame from inside
  a helper made the result depend on how often the cell had been run, and
  dropping in place from a column subset of ``train`` raises pandas'
  SettingWithCopyWarning.

  Returns:
    A new DataFrame: the remaining columns of ``train_oil_test_cols`` (index
    reset to 0..n-1) followed by the encoded indicator columns.
  """
  encoded = pd.DataFrame(
      data=ohe.fit_transform(train_oil_test_cols[['COMPONENT_ARBITRARY']]),
      columns=ohe.get_feature_names_out(),
  )
  remaining = train_oil_test_cols.drop(columns='COMPONENT_ARBITRARY')
  # Both frames carry a fresh 0..n-1 index, so concat aligns them row by row.
  return pd.concat([remaining.reset_index(drop=True), encoded], axis=1)

# Guard kept from the original: encode only while the raw column is present.
if 'COMPONENT_ARBITRARY' in train_oil_test_cols.columns:
  train_data = get_ohe_component()

In [None]:
# Separate the encoded frame into the target vector and the feature matrix.
train_Y = train_data['Y_LABEL']
train_X = train_data.drop(columns=['Y_LABEL'])

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE (fixed seed) and rebind train_X / train_Y to the
# resampled data.
# NOTE(review): oversampling before a train/validation split lets synthetic
# rows reach the validation fold - confirm that any split taken from
# train_X / train_Y accounts for this.
smote = SMOTE(random_state=22)
train_X, train_Y = smote.fit_resample(X=train_X, y=train_Y)

In [None]:
# 데이터 정규화

# def get_values(value):
#   return value.values.reshape(-1, 1)

# for col in train_X.columns:
#   scaler = StandardScaler()
#   train_X[col] = scaler.fit_transform(get_values(train_X[col]))

In [None]:
# Train / validation split.
#
# train_X / train_Y have already been oversampled with SMOTE at this point.
# Splitting them would put synthetic rows (interpolated from rows that end up
# in the training fold) into the validation fold and inflate every validation
# score. The split is therefore taken from the original encoded rows in
# train_data, and only the training fold is oversampled afterwards, so the
# validation fold contains real samples at the real class ratio.
raw_X = train_data.drop('Y_LABEL', axis=1)
raw_Y = train_data['Y_LABEL']
train_x, val_x, train_y, val_y = train_test_split(raw_X, raw_Y, random_state=22, test_size=0.1, stratify=raw_Y)
train_x, train_y = smote.fit_resample(train_x, train_y)
train_x.shape, val_x.shape

## 3. 모델 만들기 (ML)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest, trained and validated without the
# COMPONENT_ARBITRARY_COMPONENT4 indicator column.
train_tmp = train_x.drop(columns='COMPONENT_ARBITRARY_COMPONENT4')
val_tmp = val_x.drop(columns='COMPONENT_ARBITRARY_COMPONENT4')

rf = RandomForestClassifier(max_depth=31, random_state=22)
rf.fit(X=train_tmp, y=train_y)

In [None]:
# Macro-averaged F1 of the random forest on the validation split.
rf_pred = rf.predict(val_tmp)
rf_f1_score = f1_score(y_true=val_y, y_pred=rf_pred, average='macro')
print(rf_f1_score)

In [None]:
# print(list(train_x.columns))
# print(list(val_x.columns))
# Feature ablation: drop one column at a time, refit the same random-forest
# configuration and record the validation macro F1.
#
# The loop uses its own names (ablation_*) on purpose. Reusing rf / train_tmp /
# val_tmp / rf_pred / rf_f1_score here would leave them bound to the LAST
# ablation run, and later cells that read `rf` (feature importances, the
# submission prediction) would silently use a model fitted without whichever
# column happens to come last in train_x.
dropped_result = []
for col in train_x.columns:
  ablation_train = train_x.drop(col, axis=1)
  ablation_val = val_x.drop(col, axis=1)
  ablation_rf = RandomForestClassifier(random_state=22, max_depth=31)
  ablation_rf.fit(ablation_train, train_y)
  ablation_f1 = f1_score(val_y, ablation_rf.predict(ablation_val), average='macro')
  dropped_result.append([f'drop column: {col}', ablation_f1])

print(dropped_result)

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

def score(model, x, y):
  """Return the macro-averaged F1 of ``model.predict(x)`` measured against ``y``."""
  return f1_score(y, model.predict(x), average='macro')

# 10-fold stratified cross-validation of the random-forest configuration.
#
# Scored with the `score` helper defined above (macro F1): it already has the
# (estimator, X, y) signature cross_val_score expects from a callable scorer,
# and macro F1 is the metric used for every other evaluation in this notebook,
# whereas this call previously reported accuracy.
# The per-fold scores and their mean are printed because a plain assignment
# produces no cell output.
Skfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=22)
cvs = cross_val_score(rf, train_x, train_y, cv=Skfold, scoring=score)
print(cvs, cvs.mean())

In [None]:
# Rank the features of the fitted random forest by importance.
#
# Feature names are read from the model itself (feature_names_in_) rather than
# from train_x: rf is fitted on a frame with one column dropped, so zipping
# train_x's columns against feature_importances_ pairs sequences of different
# length - zip truncates silently and the pairing is only right when the
# dropped column was the last one.
df = pd.DataFrame({'feature': rf.feature_names_in_, 'importance': rf.feature_importances_}).sort_values('importance', ascending=False)
df = df.reset_index(drop=True)
df.head(15)

In [None]:
from xgboost import XGBClassifier

# XGBoost classifier fitted on the same reduced feature set (without the
# COMPONENT_ARBITRARY_COMPONENT4 indicator column).
train_tmp = train_x.drop(columns='COMPONENT_ARBITRARY_COMPONENT4')
val_tmp = val_x.drop(columns='COMPONENT_ARBITRARY_COMPONENT4')

xg = XGBClassifier(max_depth=35, random_state=22)
xg.fit(train_tmp, train_y)

In [None]:
# Macro-averaged F1 of the XGBoost model on the validation split.
xg_pred = xg.predict(val_tmp)
xg_f1_score = f1_score(y_true=val_y, y_pred=xg_pred, average='macro')
print(xg_f1_score)

## 4. 제출 파일 만들기

In [None]:
# Load the sample submission file; its Y_LABEL column is filled in further down.
submission = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/dacon/oil_abnormal/sample_submission.csv',
)

In [None]:
# Load the test table from test.csv.
test = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/dacon/oil_abnormal/test.csv',
)

In [None]:
# Restrict the test table to the model features.
# .copy() makes the subset an independent frame, so later in-place edits of
# `test` do not trigger pandas' SettingWithCopyWarning.
test = test[oil_test_cols].copy()

In [None]:
def get_ohe_component():
  """Return the test frame with COMPONENT_ARBITRARY one-hot encoded.

  Applies the module-level ``ohe`` with ``transform`` only (it is not refitted
  here), so the indicator columns are the ones named by
  ``ohe.get_feature_names_out()``.

  This definition replaces the same-named training helper defined earlier in
  the notebook.

  The column is no longer dropped from ``test`` in place. With the in-place
  drop, running this cell a second time raised KeyError because
  'COMPONENT_ARBITRARY' was already gone from ``test``.

  Returns:
    A new DataFrame: the remaining columns of ``test`` (index reset to
    0..n-1) followed by the encoded indicator columns.
  """
  encoded = pd.DataFrame(
      data=ohe.transform(test[['COMPONENT_ARBITRARY']]),
      columns=ohe.get_feature_names_out(),
  )
  remaining = test.drop(columns='COMPONENT_ARBITRARY')
  # Both frames carry a fresh 0..n-1 index, so concat aligns them row by row.
  return pd.concat([remaining.reset_index(drop=True), encoded], axis=1)

test_data = get_ohe_component()

In [None]:
# dl_result = model.predict(test_data)
# print(dl_result.flatten())
# dl_result = np.where(dl_result > 0.5, 1, 0)

In [None]:
# Give the model exactly the columns it was fitted on, in the fitted order.
#
# Selecting by rf.feature_names_in_ replaces the hard-coded drop of
# 'COMPONENT_ARBITRARY_COMPONENT4': that drop raised KeyError when the cell was
# run a second time (test_data is rebound here), and it only matched `rf` as
# long as the model had been fitted without exactly that column.
test_data = test_data[list(rf.feature_names_in_)]
rf_result = rf.predict(test_data)

In [None]:
# Store the random-forest predictions in the submission frame's Y_LABEL column.
submission['Y_LABEL'] = rf_result

In [None]:
# Write the submission to CSV without the index column.
submission.to_csv(
    path_or_buf='/content/drive/MyDrive/dacon/oil_abnormal/rf_submission4.csv',
    index=False,
)