In [17]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder, StandardScaler, QuantileTransformer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold, StratifiedKFold
from sklearn.metrics import f1_score
import random, os
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')
from optuna.samplers import TPESampler
from optuna.pruners import SuccessiveHalvingPruner
import optuna
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

In [18]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
SEED = 2023
seed_everything(SEED) 

In [19]:
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

#train = train.drop(columns=['TIMESTAMP', 'PRODUCT_ID'])
#test = test.drop(columns=['TIMESTAMP', 'PRODUCT_ID'])
cat_features = ['LINE', 'PRODUCT_CODE', "Y_Quality"]
num_features = [i for i in test.columns if i not in cat_features]

y = train['Y_Class']

dup = ~train.T.duplicated()
train = train.loc[:, dup]
test = test.loc[:, dup]

use_cat = True
if use_cat: 
    X = train.drop(columns=['Y_Class'])
    X_test = test
else: 
    X = train[num_features]
    X_test = test[num_features]


In [24]:
def median_by_LINE(df : pd.DataFrame, test_df : pd.DataFrame):
    LINE = ["T010305", "T010306", "T050304", "T050307", "T100304", "T100306"]
    output_train, output_test = [], []
    for i in range(0, 6):
        idx_train = (df['LINE'] == LINE[i])
        idx_test  = (test_df['LINE'] == LINE[i])
        tmp_train = df[idx_train]
        tmp_test = test_df[idx_test]
        
        tmp_train.dropna(how='all',inplace=True, axis=1)
        tmp_test = tmp_test[list(tmp_train.columns[0:1]) + list(tmp_train.columns[3:])]
        
        num_features = tmp_test.select_dtypes(exclude=['object']).columns.to_list()
        mice_imputer = IterativeImputer(estimator = LinearRegression(), 
                       tol= 1e-10, 
                       max_iter=30, 
                       verbose=2, 
                       imputation_order='roman')

        tmp_train[num_features] = mice_imputer.fit_transform(tmp_train[num_features])
        tmp_test[num_features] = mice_imputer.transform(tmp_test[num_features])
        
        output_train.append(tmp_train)
        output_test.append(tmp_test)
    return output_train, output_test

tmp_X, tmp_test_X = median_by_LINE(train, test)

[IterativeImputer] Completing matrix with shape (59, 622)
[IterativeImputer] Ending imputation round 1/30, elapsed time 8.83
[IterativeImputer] Change: 426713.0653668804, scaled tolerance: 0.00037007500000000004 
[IterativeImputer] Ending imputation round 2/30, elapsed time 17.47
[IterativeImputer] Change: 70155.64330452855, scaled tolerance: 0.00037007500000000004 
[IterativeImputer] Ending imputation round 3/30, elapsed time 26.13
[IterativeImputer] Change: 14130.495685400605, scaled tolerance: 0.00037007500000000004 
[IterativeImputer] Ending imputation round 4/30, elapsed time 34.67
[IterativeImputer] Change: 2497.097350096774, scaled tolerance: 0.00037007500000000004 
[IterativeImputer] Ending imputation round 5/30, elapsed time 43.72
[IterativeImputer] Change: 418.02944907111305, scaled tolerance: 0.00037007500000000004 
[IterativeImputer] Ending imputation round 6/30, elapsed time 52.39
[IterativeImputer] Change: 70.18294993151358, scaled tolerance: 0.00037007500000000004 
[Iter

In [43]:
save_X = pd.concat(tmp_X, axis=0).sort_values("PRODUCT_ID")
save_test_X = pd.concat(tmp_test_X, axis=0).sort_values("PRODUCT_ID")

In [None]:
save_X = save_X[train.columns]
save_test_X = save_test_X[test.columns]

In [45]:
save_X.to_csv("dataset/train_mice.csv", index=False)
save_test_X.to_csv("dataset/test_mice.csv", index=False)

In [None]:
num_features = [i for i in test.columns if i not in cat_features]

# calling the  MICE class
mice_imputer = IterativeImputer()
#knn_imputer = KNN()
train[num_features] = mice_imputer.fit_transform(train[num_features])
test[num_features] = mice_imputer.transform(test[num_features])

In [None]:
def objective(trial):
  params = {
    'num_leaves': trial.suggest_int('num_leaves', 8, 20, step=1, log=True), 
    'max_depth': trial.suggest_int('max_depth', 3, 20, step=1, log=False), 
    'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.15, log=True), 
    'n_estimators': trial.suggest_int('n_estimators', 50, 150, step=1, log=True), 
    "metric": "multiclass",
    'class_weight': trial.suggest_categorical('class_weight', ['balanced']),
    'min_child_samples': trial.suggest_int('min_child_samples', 10, 50, step=1, log=False), 
    'subsample': trial.suggest_uniform('subsample', 0.7, 1.0), 
    'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
    'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 1),
    'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 1),
    'random_state': SEED
  }

  kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
  scores = []
  for train_index, test_index in kf.split(X, y):
    X_train, X_valid = X.values[train_index], X.values[test_index]
    y_train, y_valid = y.values[train_index], y.values[test_index]
    model = LGBMClassifier(verbose=-1,**params)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
    pred = model.predict(X_valid)
    f1 = f1_score(y_valid, pred, average='macro')
    scores.append(f1)

  return np.mean(scores)

study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=SEED), pruner=SuccessiveHalvingPruner())
study.optimize(objective, n_trials=10000)

In [14]:
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

clfs = []
scores = []
for train_index, test_index in kf.split(X, y):
    #print(train_index)
    X_train, X_valid = X.values[train_index], X.values[test_index]
    y_train, y_valid = y.values[train_index], y.values[test_index]
    
    params = study.best_params
    model = LGBMClassifier(random_state=SEED, verbose=-1, **params)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],)
    pred = model.predict(X_valid)
    f1 = f1_score(y_valid, pred, average='macro')
    scores.append(f1)
    clfs.append(model)
print(scores)
print('Mean F1:', np.mean(scores))

NameError: name 'study' is not defined

In [77]:
preds = []
for i, model in enumerate(clfs):
    pred = model.predict_proba(X_test)
    if i == 0:
        preds = pred
    else:
        preds += pred
final_pred = np.argmax(preds, axis=1)

submit = pd.read_csv('sample_submission.csv')
submit['Y_Class'] = final_pred
submit.to_csv('submission.csv', index=False)
final_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 2, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 0, 1,
       0, 0, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1,
       2, 1, 1, 2, 1, 2, 1, 0, 1, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 0, 1,
       1, 1, 2, 2, 1, 2, 2, 1, 1, 0, 0, 1, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1,
       1, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1,
       2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1,
       2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,