# 1. 디렉토리, 라이브러리 불러오기
# 1. Set directory, get related libraries

In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
import lightgbm as lgb
import joblib 
import pickle

# 2. 데이터 불러오기 및 전처리
# 2. Load data and preprocess the test set

### a. 데이터 불러오기
### a. Get data

In [3]:
# Load the raw test features; the cells below preprocess this frame step by step.
test = pd.read_csv("test_feature.csv")

# Load the training table, used further down to fit the classifier.
# NOTE(review): presumably already preprocessed (per the file name) -- confirm.
train = pd.read_csv("train_preprocessed.csv")

### b. test set 전처리
### b. Preprocessing of the test set

#### (1) NA 비율이 높은 변수 제거
#### (1) Remove columns with high proportions of NA values

In [4]:
# Load the pickled list of column names flagged as having a high proportion of
# missing values, then remove those columns from the test set.
# NOTE(review): presumably this list was saved while preprocessing the train
# set -- confirm against the notebook that writes "na_drop_list".
with open("na_drop_list", "rb") as fh:
    na_drop_list = pickle.load(fh)

test = test.drop(columns=na_drop_list)

In [5]:
# Shape check after the NA-based drop: 8 columns removed (170 before).
test.shape 

(8000, 162)

#### (2) 상관관계 분석을 통해 변수 제거
#### (2) Remove columns using correlation analysis

In [6]:
# Load the pickled list of column names flagged as highly correlated, then
# remove those columns from the test set.
# NOTE(review): presumably this list was saved during the correlation analysis
# on the train set -- confirm against the notebook that writes "col_drop_list".
with open("col_drop_list", "rb") as fh:
    col_drop_list = pickle.load(fh)

test = test.drop(columns=col_drop_list)

In [7]:
# Shape check after the correlation-based drop: 39 columns removed (162 before).
test.shape 

(8000, 123)

#### (3) MICE imputation

In [8]:
# Fill the remaining missing values with an iterative (MICE-style) imputer.
# NOTE(review): the imputer is fitted on the test set itself, whereas the later
# steps (variance filter, scaler, PCA) reuse objects loaded from disk that were
# applied to the train set -- confirm this difference is intended.
imputer = IterativeImputer(random_state=26)
imputed_values = imputer.fit_transform(test)

# Wrapping the imputed values in a new DataFrame gives default integer column labels.
test = pd.DataFrame(imputed_values)



In [9]:
# Confirm that no missing values remain after imputation (expected: False).
test.isnull().values.any() 

False

#### (4) 분산이 0인 변수 제거
#### (4) Remove zero-variance columns

In [10]:
# Load the variance filter that was applied to the train set.
# NOTE(review): presumably a fitted sklearn VarianceThreshold -- confirm.
var_drop = joblib.load('var_drop.pkl')

# Apply the same filter to the test set; this removes the single
# zero-variance column.
filtered_values = var_drop.transform(test)
test = pd.DataFrame(filtered_values)

In [11]:
# Shape check: one column fewer than before the variance filter.
test.shape

(8000, 122)

#### (5) 표준화 및 PCA
#### (5) Standardization and PCA

In [12]:
# Load the scaler and the PCA object that were applied to the train set.
standard = joblib.load('standard.pkl')
pca = joblib.load('pca.pkl')

# Standardize first, then project the scaled features with the PCA object.
test = pca.transform(standard.transform(test))

# Report how many features remain after PCA (122 before this step).
n_features_after_pca = test.shape[1]
print("줄어든 특성 개수:", n_features_after_pca)

줄어든 특성 개수: 54


# 3. Test Prediction

In [13]:
# Load the saved LightGBM classifier.
# NOTE(review): it is fitted again on the train data in a later cell, so the
# pickle appears to serve mainly as the stored configuration -- confirm.
model = joblib.load('LGBM.pkl')

In [14]:
# Separate the target column ('class') from the predictor columns.
train_y = train['class']
train_X = train.drop(columns='class')

In [15]:
# Fit the classifier on the full training set.
# The scikit-learn style estimator accepts the feature table and the labels
# directly, so no separate lgb.Dataset has to be built for this call.
model.fit(train_X, train_y)





LGBMClassifier(bagging_fraction=1.0, bagging_freq=5, boosting='gbdt',
               feature_fraction=0.9, is_training_metric=True, learning_rate=0.3,
               metric='auc', num_iterations=600, objective='binary', seed=26)

In [16]:
# Predict class labels for the preprocessed test set.
prediction = model.predict(test)

# 4. 최종 제출 파일 저장
# 4. Save final submission file

In [17]:
# Build the submission frame with 1-based ids, one row per prediction.
# The id range is derived from the number of predictions instead of a fixed
# count, so it stays correct if the size of the test set changes.
submission = pd.DataFrame(columns=['id', 'class'])
submission['id'] = range(1, len(prediction) + 1)

In [18]:
# Convert the numeric predictions (0/1) into the label names 'neg'/'pos'.
# A single map() builds the string column directly; overwriting parts of a
# numeric column with strings through .loc is deprecated upcasting in recent
# pandas releases. Any value other than 0 or 1 becomes NaN.
label_names = {0: 'neg', 1: 'pos'}
submission['class'] = pd.Series(prediction, index=submission.index).map(label_names)

# Show how many rows fall into each class.
submission['class'].value_counts()

neg    7798
pos     202
Name: class, dtype: int64

In [19]:
# Write the submission to CSV with a header row and without the index column.
submission.to_csv("submission.csv", header = True, index = False)