In [1]:
import sklearn
import pandas as pd
import random
import os
import numpy as np
import cv2
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import warnings

# hyperparameter
import optuna
from optuna.samplers import TPESampler
warnings.filterwarnings("ignore")
from sklearn.impute import KNNImputer

#Modeling
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, VotingClassifier,RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostRegressor

from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error, mean_squared_log_error, f1_score
from sklearn.model_selection import KFold, StratifiedKFold

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
SEED = 25
seed_everything(SEED) 

In [3]:
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

train.drop(columns=["PRODUCT_ID", "TIMESTAMP", 'LINE', 'PRODUCT_CODE'], inplace=True)
test.drop(columns=["PRODUCT_ID", "TIMESTAMP", 'LINE', 'PRODUCT_CODE'], inplace=True)
y_class = train['Y_Class']
y = train['Y_Quality']

num_features = test.select_dtypes(exclude=['object']).columns.to_list()


#for col in num_features:
#    train[col] = train[col].fillna(train[col].median())

scaler = StandardScaler()
#scaler = QuantileTransformer()
train[num_features] = scaler.fit_transform(train[num_features])
test[num_features] = scaler.transform(test[num_features])

X = train.drop(columns=['Y_Class', 'Y_Quality'])
X_test = test

#from math import *
corr = pd.read_csv('correlation/correlation.csv')
# Y_Quality 제거
corr = corr.iloc[:-1,:]
important = list(corr[abs(corr['correlation'])>=0.1]['feature'])
#important
X = X[important]
X_test = X_test[important]

dup = ~X.T.duplicated()
X = X.loc[:, dup]
X_test = X_test.loc[:, dup]

#num_features = X_test.select_dtypes(exclude=['object']).columns.to_list()
#scaler = StandardScaler()
#X[num_features] = scaler.fit_transform(X[num_features])
#X_test[num_features] = scaler.transform(X_test[num_features])
imputer = KNNImputer()
X = imputer.fit_transform(X)
X_test = imputer.transform(X_test)

In [5]:
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle
import lightgbm
import joblib
import time
import warnings
import os
warnings.filterwarnings(action='ignore', category=UserWarning)

skf = KFold(n_splits = 10, random_state = SEED, shuffle = True) #총 6번의 fold 진행
n = 0 #x번째 fold인지 기록

fold_target_pred = []
fold_score = []

#파일 디렉토리 생성
model_dir = f'./model'
os.makedirs(model_dir, exist_ok=True)

for train_index, valid_index in skf.split(X): #label 기준으로 stratified k fold 진행
    n += 1
    
    val_pred_name = [] #validation pred model 이름 저장
    val_pred = []      #validation set pred 결과 저장
    target_pred = []   #test set pred 결과 저장
    
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y.values[train_index], y.values[valid_index]
    
    ### Create Model ###
    #CAT model
    start_time_cat = time.time()
    model_cat = CatBoostRegressor(verbose = 200,
                            learning_rate = 0.02,
                            random_seed = 42,
                            iterations = 5000,
                            loss_function='MultiRMSE',
                            #ignored_features = [8, 9, 31, 32, 33, 34, 45, 50, 51, 53, 54, 55],
                            od_wait = 200,
                            depth = 9)
    
    model_cat.fit(X_train, y_train, eval_set=(X_valid, y_valid))
    end_time_cat = time.time()

    #model cat 저장
    cat_path = './model/cat_{}'.format(n)
    model_cat.save_model(cat_path)
    
    #model cat 불러오기
    #model_cat.load_model(cat_path)
    
    val_pred_name.append("CatBoostRegressor")  # 모델 이름 저장
    val_pred.append(model_cat.predict(X_valid))   # validation set pred 결과 저장
    target_pred.append(model_cat.predict(X_test)) # test set pred 결과 저장
    
    ### LGBM model
    start_time_lgb = time.time()
    model_lgbm = LGBMRegressor(n_estimators = 2000, 
                                               learning_rate = 0.01,
                                               max_depth = 16,
                                               min_child_samples = 56,
                                               subsample = 0.4,
                                               num_leaves = 160,
                                               random_state = 42,
                                               verbose=-1,
                                               n_jobs = 8)

    fit_params = dict(
        eval_set=[(X_valid, y_valid)],
        eval_metric = "rmse",
        )

    model_lgbm.fit(X_train, y_train, **fit_params)
    end_time_lgb = time.time()
    val_pred_name.append("LGBMRegressor")  # 모델 이름 저장
    val_pred.append(model_lgbm.predict(X_valid))   # validation set pred 결과 저장
    target_pred.append(model_lgbm.predict(X_test)) # test set pred 결과 저장
    
    #model lgbm 저장
    lgbm_path = './model/lgbm_{}.pkl'.format(n)
    
    # save model
    joblib.dump(model_lgbm, lgbm_path)
    
    #model lgbm 불러오기
    #model_lgbm.load(lgbm_path)

    ### XGB model
    start_time_xgb = time.time()
    model_xgb = XGBRegressor(objective = "reg:squarederror",
                                                  n_estimators = 3000,
                                                  random_state = 42,
                                                  eval_metric = "rmse", 
                                                  learning_rate=0.006,
                                                  subsample=0.75, 
                                                  colsample_bytree = 0.86,
                                                  max_depth=9,
                                                  tree_method='gpu_hist', 
                                                  gpu_id = 0)
    
    fit_params = dict(
        eval_set=[(X_valid, y_valid)],
        )
    
    model_xgb.fit(X_train, y_train, verbose=0, **fit_params)
    end_time_xgb = time.time()
    val_pred_name.append("XGBRegressor")  # 모델 이름 저장
    val_pred.append(model_xgb.predict(X_valid))   # validation set pred 결과 저장
    target_pred.append(model_xgb.predict(X_test)) # test set pred 결과 저장
    
    
    #model xgb 저장
    xgb_path = './model/xgb_{}.pkl'.format(n)
    joblib.dump(model_xgb, xgb_path)
    #model_xgb.save(xgb_path)
    
    #model xgb 불러오기
    #model_xgb.load(xgb_path)
  
    ### average validation pred ###
    preds = np.array(val_pred[0])
    for i in range(1, len(val_pred)):
        preds += val_pred[i]
    preds = preds/len(val_pred)

    ### average target pred ###
    target_preds = target_pred[0]
    for i in range(1, len(target_pred)):
        target_preds += target_pred[i]
    target_preds = target_preds/len(target_pred)
    fold_target_pred.append(target_preds) # append final target pred
    
    print("========== fold %d ==========" %(n))
    for i in range(len(val_pred)):
        print("%s model nrmse : %0.4f" %(val_pred_name[i], mean_squared_error(y_valid, val_pred[i]) ** 0.5))
        
    print('CAT 코드 실행 시간: %10ds' % (end_time_cat - start_time_cat))
    print('LGB 코드 실행 시간: %10ds' % (end_time_lgb - start_time_lgb))
    print('XGB 코드 실행 시간: %10ds' % (end_time_xgb - start_time_xgb))
    print("average model nrmse : %0.4f" %(mean_squared_error(y_valid, preds) ** 0.5))
    fold_score.append(mean_squared_error(y_valid, preds) ** 0.5)

total_score = fold_score[0]
for i in range(1, len(fold_score)):
    total_score += fold_score[i]
    
total_score = total_score/len(fold_score)

print("==============================")
print("Model Sum Average nrmse %0.4f" %(total_score))

0:	learn: 0.0073863	test: 0.0071955	best: 0.0071955 (0)	total: 87.5ms	remaining: 7m 17s
200:	learn: 0.0043105	test: 0.0048082	best: 0.0048082 (200)	total: 6.29s	remaining: 2m 30s
400:	learn: 0.0030564	test: 0.0041107	best: 0.0041107 (400)	total: 12.7s	remaining: 2m 25s
600:	learn: 0.0020714	test: 0.0038218	best: 0.0038218 (600)	total: 19s	remaining: 2m 19s
800:	learn: 0.0014223	test: 0.0037439	best: 0.0037433 (799)	total: 25.4s	remaining: 2m 13s
1000:	learn: 0.0009844	test: 0.0037055	best: 0.0037041 (997)	total: 31.7s	remaining: 2m 6s
1200:	learn: 0.0006877	test: 0.0036883	best: 0.0036880 (1198)	total: 38.1s	remaining: 2m
1400:	learn: 0.0004791	test: 0.0036795	best: 0.0036779 (1300)	total: 44.5s	remaining: 1m 54s
1600:	learn: 0.0003343	test: 0.0036797	best: 0.0036778 (1411)	total: 50.9s	remaining: 1m 47s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.003677791226
bestIteration = 1411

Shrink model to first 1412 iterations.
CatBoostRegressor model nrmse : 0.0037
LG

In [None]:
from sklearn.neighbors import KNeighborsClassifier

def euclidean_distance(row1, row2):
	distance = 0.0
	distance += (row1 - row2[0])**2
	return sqrt(distance)

def get_neighbors(train, test_row, num_neighbors):
	distances = list()
	for train_row in train:
		dist = euclidean_distance(test_row, train_row)
		distances.append((train_row, dist))
	distances.sort(key=lambda tup: tup[1])
	neighbors = list()
	for i in range(num_neighbors):
		neighbors.append(distances[i][0])
	return neighbors

def predict_classification(train, test_row, num_neighbors):
	neighbors = get_neighbors(train, test_row, num_neighbors)
	output_values = [row[-1] for row in neighbors]
	prediction = max(set(output_values), key=output_values.count)
	return prediction
dataset = pd.concat([y, y_class], axis=1)
dataset = dataset.values

knnclassifier = KNeighborsClassifier(1)
knnclassifier.fit(y.to_numpy().reshape(-1,1), y_class.to_numpy().reshape(-1,1))

In [None]:

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle
import lightgbm
import joblib
import time
import warnings
import os
warnings.filterwarnings(action='ignore', category=UserWarning)

skf = StratifiedKFold(n_splits = 6, random_state = SEED, shuffle = True) #총 6번의 fold 진행
n = 0 #x번째 fold인지 기록
KNN = 1
fold_target_pred = []
fold_score = []
fold_f1 = []

#파일 디렉토리 생성
model_dir = f'./model'
os.makedirs(model_dir, exist_ok=True)

for train_index, valid_index in skf.split(X, y_class): #label 기준으로 stratified k fold 진행
    n += 1
    
    val_pred_name = [] #validation pred model 이름 저장
    val_pred = []      #validation set pred 결과 저장
    target_pred = []   #test set pred 결과 저장
    class_pred = []
    
    X_train, X_valid = X.values[train_index], X.values[valid_index]
    y_train, y_valid = y.values[train_index], y.values[valid_index]
    
    y_class_valid = y_class.values[valid_index]
    
    ### Create Model ###
    #CAT model
    model_cat = CatBoostRegressor(verbose = 200,
                            learning_rate = 0.02,
                            random_seed = 42,
                            iterations = 5000,
                            loss_function='MultiRMSE',
                            #ignored_features = [8, 9, 31, 32, 33, 34, 45, 50, 51, 53, 54, 55],
                            od_wait = 200,
                            depth = 9)

    #model cat 저장
    cat_path = './model/cat_{}'.format(n)
    model_cat.load_model(cat_path)
    val_pred_name.append("CatBoostRegressor")  # 모델 이름 저장
    cat_output = model_cat.predict(X_valid)
    val_pred.append(cat_output)   # validation set pred 결과 저장
    target_pred.append(model_cat.predict(X_test)) # test set pred 결과 저장
    
    p = knnclassifier.predict(cat_output.reshape(-1, 1))
    
    class_pred.append(p)
    lgbm_path = './model/lgbm_{}.pkl'.format(n)
    
    ### LGBM model
    model_lgbm = joblib.load(lgbm_path)
    lgbm_output = model_lgbm.predict(X_valid)
    val_pred_name.append("LGBMRegressor")  # 모델 이름 저장
    val_pred.append(lgbm_output)   # validation set pred 결과 저장
    target_pred.append(model_lgbm.predict(X_test)) # test set pred 결과 저장
    
    p = knnclassifier.predict(lgbm_output.reshape(-1, 1))
    class_pred.append(p)

    #model lgbm 불러오기
    #model_lgbm.load(lgbm_path)

    ### XGB model
    xgb_path = './model/xgb_{}.pkl'.format(n)
    
    model_xgb = joblib.load(xgb_path)
    xgb_output = model_xgb.predict(X_valid)
    val_pred_name.append("XGBRegressor")  # 모델 이름 저장
    val_pred.append(xgb_output)   # validation set pred 결과 저장
    target_pred.append(model_xgb.predict(X_test)) # test set pred 결과 저장
    
    p = knnclassifier.predict(xgb_output.reshape(-1, 1))
    class_pred.append(p)
    
    ### average validation pred ###
    preds = np.array(val_pred[0])
    for i in range(1, len(val_pred)):
        preds += val_pred[i]
    preds = preds/len(val_pred)
    
    total_class_output = knnclassifier.predict(preds.reshape(-1, 1))

    ### average target pred ###
    target_preds = target_pred[0]
    for i in range(1, len(target_pred)):
        target_preds += target_pred[i]
    target_preds = target_preds/len(target_pred)
    fold_target_pred.append(target_preds) # append final target pred
    
    print("========== fold %d ==========" %(n))
    for i in range(len(val_pred)):
        print("%s model nrmse : %0.4f" %(val_pred_name[i], mean_squared_error(y_valid, val_pred[i]) ** 0.5))
        print("%s model F1-Macro : %0.4f" %(val_pred_name[i], f1_score(y_class_valid, class_pred[i], average='macro')))
        
    print("average model nrmse : %0.4f" %(mean_squared_error(y_valid, preds) ** 0.5))
    print("average model F1-Macro : %0.4f" %(f1_score(y_class_valid, total_class_output, average='macro')))
    fold_score.append(mean_squared_error(y_valid, preds) ** 0.5)
    fold_f1.append(f1_score(y_class_valid, total_class_output, average='macro'))

total_score = fold_score[0]
total_f1_score = fold_f1[0]
for i in range(1, len(fold_score)):
    total_score += fold_score[i]
    total_f1_score += fold_f1[i]
    
total_score = total_score/len(fold_score)
total_f1_score /= len(fold_score)

print("==============================")
print("Model Sum Average nrmse %0.4f" %(total_score))
print("Model Sum Average F1 %0.4f" %(total_f1_score))

In [None]:
final_pred = np.array(fold_target_pred[0])

for i in range(1, 6):
    final_pred += fold_target_pred[i]

final_pred = final_pred/6

class_final_pred = knnclassifier.predict(final_pred.reshape(-1, 1))

In [None]:
#Submission file 준비
submit = pd.read_csv('sample_submission.csv')
submit['Y_Class'] = class_final_pred

In [None]:
class_final_pred

In [None]:
submit.to_csv('submission.csv',index=False)