In [17]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder, StandardScaler, QuantileTransformer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold, StratifiedKFold
from sklearn.metrics import f1_score
import random, os
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')
from optuna.samplers import TPESampler
from optuna.pruners import SuccessiveHalvingPruner
from xgboost import XGBClassifier
import optuna
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler, LabelEncoder, StandardScaler
from fancyimpute import IterativeImputer, IterativeSVD
from sklearn.metrics import log_loss



In [18]:
def seed_everything(seed):
    """Seed the RNG sources this notebook touches directly.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both the stdlib ``random`` module and NumPy's legacy global RNG.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


# Single seed shared by the seeding call here and by the CV / model cells.
SEED = 25
seed_everything(SEED)

In [19]:
# --- Load -----------------------------------------------------------------
# PRODUCT_ID and TIMESTAMP are dropped right after loading; they are not
# used as features anywhere in this notebook.
train = pd.read_csv('dataset/train.csv').drop(columns=["PRODUCT_ID", "TIMESTAMP"])
test = pd.read_csv('dataset/test.csv').drop(columns=["PRODUCT_ID", "TIMESTAMP"])
y = train['Y_Class']

# --- Encode categorical columns --------------------------------------------
qual_col = ['LINE', 'PRODUCT_CODE']

for col in qual_col:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])

    # Labels that only occur in test are appended to the fitted classes so
    # that transform() does not raise on them; they receive new codes after
    # the ones learned from train.
    # NOTE(review): appending leaves classes_ unsorted — this presumably relies
    # on the columns holding strings; confirm against the sklearn version used.
    unseen = [label for label in np.unique(test[col]) if label not in le.classes_]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)
    test[col] = le.transform(test[col])

# --- Drop constant columns ---------------------------------------------------
# Decide the full list first instead of dropping while iterating over the
# frame. errors='ignore' on test covers the case where a flagged column is a
# target column, which test does not have.
constant_cols = [col for col in train.columns if train[col].nunique() < 2]
train = train.drop(columns=constant_cols)
test = test.drop(columns=constant_cols, errors='ignore')

# --- Drop duplicated columns -------------------------------------------------
# `dup` is True for the first occurrence of each distinct column of train.
dup = ~train.T.duplicated()
# .copy() so the imputed values below are written into independent frames
# rather than into slices of train / test.
X = train.loc[:, dup].copy()
# The mask is indexed by train's columns (targets included), so it is
# explicitly restricted to test's columns before being applied.
X_test = test.loc[:, dup.loc[test.columns]].copy()

# --- Impute missing values ---------------------------------------------------
# Feature list is taken from X_test, which has no target columns, so the
# targets never enter the imputer.
num_features = X_test.select_dtypes(exclude=['object']).columns.to_list()

imputer = KNNImputer()
X[num_features] = imputer.fit_transform(X[num_features])
X_test[num_features] = imputer.transform(X_test[num_features])

# From here on X holds features only; the class target lives in `y`.
X = X.drop(columns=['Y_Class', 'Y_Quality'])

In [None]:
# Stratified 80/20 hold-out split used by the Optuna search.
# X had 'Y_Class' / 'Y_Quality' dropped at the end of the preprocessing cell,
# so the target is split alongside X instead of being read back out of the
# feature frames (which would raise KeyError).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, stratify=y, train_size=0.8, random_state=12
)

In [None]:
from optuna.pruners import SuccessiveHalvingPruner

def objective(trial):
  """Optuna objective: validation log-loss of one XGBoost configuration.

  Samples hyper-parameters from ``trial``, fits an ``XGBClassifier`` on the
  global ``X_train`` / ``y_train`` with early stopping on the global
  ``X_valid`` / ``y_valid``, and returns the log-loss on that validation split.

  Args:
    trial: the ``optuna`` trial supplying the hyper-parameter suggestions.

  Returns:
    Log-loss of the predicted class probabilities on the validation split
    (lower is better).
  """
  param = {'verbosity': 1,
           # softprob so the model is trained to output per-class probabilities,
           # which is what log_loss needs below.
           'objective': 'multi:softprob',
           'max_depth': trial.suggest_int('max_depth', 3, 30),
           'learning_rate': trial.suggest_float('learning_rate', 1e-8, 1e-2, log=True),
           'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
           'subsample': trial.suggest_float('subsample', 0.7, 1, log=True),
           'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
           'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
           'random_state': 42}

  model = XGBClassifier(tree_method='gpu_hist', **param)
  model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=100, verbose=False)

  # log_loss is defined on probabilities, not on hard labels, so score
  # predict_proba. X_valid is passed as-is (not converted to an array) so the
  # input has the same form as the frame the model was fitted on.
  proba = model.predict_proba(X_valid)
  # labels= keeps the column/class mapping explicit even if the validation
  # split happens to miss a class.
  log_score = log_loss(y_valid, proba, labels=model.classes_)

  return log_score

# Minimise the objective with a seeded TPE sampler over 1000 trials.
sampler = TPESampler(seed=42)
pruner = SuccessiveHalvingPruner()
study = optuna.create_study(direction='minimize', sampler=sampler, pruner=pruner)
study.optimize(objective, n_trials=1000)

In [None]:
# Display the best trial of the study (created with direction='minimize').
study.best_trial

In [20]:
# 10-fold stratified cross-validation: one XGBoost model per fold, each
# early-stopped and scored (macro F1) on its own validation fold. The fitted
# models are kept in `xgb_clfs`.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

# Fixed hyper-parameters shared by every fold.
xgb_params = {
    'max_depth': 14,
    'min_child_weight': 2,
    'learning_rate': 0.046,
    'subsample': 0.99,
    'colsample_bytree': 0.79,
}

xgb_clfs, scores = [], []
for fit_idx, val_idx in kf.split(X, y):
    X_train, y_train = X.iloc[fit_idx], y.values[fit_idx]
    X_valid, y_valid = X.iloc[val_idx], y.values[val_idx]

    model = XGBClassifier(
        n_estimators=1000,
        tree_method='gpu_hist',
        random_state=SEED,
        **xgb_params,
    )
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=50,
        verbose=False,
    )
    xgb_clfs.append(model)

    pred = model.predict(X_valid)
    f1 = f1_score(y_valid, pred, average='macro')
    print(f1)
    scores.append(f1)

print('Mean F1:', np.mean(scores))
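# NOTE(review): "Mean F1: 0.6877400367735194" was recorded here, but this cell's saved output reports 0.6680846220078291 — presumably from an earlier run; confirm or remove.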

0.6078431372549019
0.7246522540640189
0.7605466428995841
0.6214790040876997
0.6283166109253066
0.631983130330784
0.6839476969577502
0.6314060667001843
0.689085424379542
0.7015862524785194
Mean F1: 0.6680846220078291


In [23]:
# Soft-voting ensemble: add up the class-probability matrices of every fold
# model and take the most probable class per row. Iterating over `xgb_clfs`
# itself (instead of a hard-coded range(10)) keeps this cell in step with
# however many fold models were actually trained.
if not xgb_clfs:
    raise ValueError("xgb_clfs is empty; run the cross-validation cell first")

preds = sum(model.predict_proba(X_test) for model in xgb_clfs)
# argmax yields a column index, which is written out directly as the class.
# NOTE(review): this assumes the class labels are 0..k-1 in column order — confirm.
final_pred = np.argmax(preds, axis=1)

submit = pd.read_csv('sample_submission.csv')
submit['Y_Class'] = final_pred
submit.to_csv('submission.csv', index=False)
final_pred

array([1, 2, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 2, 0, 2, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 0, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 2, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 0, 0,
       2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1,
       1, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1,
       2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,