In [1]:
!pip install -q mlflow===2.6.0
!pip install -q lightgbm===4.0.0
!pip install -q scikit-learn===1.3.1

In [2]:
import mlflow
import pandas as pd
# import Model
import importlib
import shutil
import numpy as np

import catboost as cat
import lightgbm as lgb
from sklearn import model_selection
from sklearn import metrics

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')


In [3]:
# Metadata and utilities.
# Root folder of the competition input files.
_INPUT_ROOT = "/kaggle/input/humyn-scan-to-element"

# Per-split file locations: the scan table and the targets/template table.
data = {
    split: {
        "scan_10cm": f"{_INPUT_ROOT}/{split}/{split}/{split}_scan.csv",
        "targets": f"{_INPUT_ROOT}/{split}/{split}/{split}_{targets_kind}.csv",
    }
    for split, targets_kind in (
        ("train", "assay"),
        ("public", "template"),
        ("private", "template"),
    )
}
# Only the training split ships a preprocessed file.
data["train"]["preprocessed"] = f"{_INPUT_ROOT}/data-preprocessed/train_preprocessed.csv"

# Column roles: prediction targets, columns excluded from features, row key.
metadata = {
    "targets": [f"Element {number}" for number in range(1, 12)],
    "forbidden_features": ["from", "HolNum", "to", "ID"],
    "y_identifier": "ID",
}


def gather_10cm_data(scan_10cm, holNum, fromDepth, toDepth):
    """Select the scan rows of one hole that fall inside a depth interval.

    Args:
        scan_10cm: DataFrame with at least the columns "HolNum", "from", "to".
        holNum: value that "HolNum" must equal.
        fromDepth: inclusive lower bound applied to the "from" column.
        toDepth: inclusive upper bound applied to the "to" column.

    Returns:
        The matching rows, in their original order and with their original index.
    """
    rows = scan_10cm.loc[scan_10cm["HolNum"] == holNum]
    rows = rows.loc[rows["from"] >= fromDepth]
    return rows.loc[rows["to"] <= toDepth]

In [4]:
# Read the training targets table and the training scan table (paths from `data`).
train_targets, train_scan_10 = (
    pd.read_csv(data["train"][key]) for key in ("targets", "scan_10cm")
)

(train_targets.shape, train_scan_10.shape)

((12055, 15), (83520, 52))

In [5]:
# Read the public-split template table and scan table (paths from `data`).
public_targets, public_scan_10 = (
    pd.read_csv(data["public"][key]) for key in ("targets", "scan_10cm")
)

(public_targets.shape, public_scan_10.shape)

((2845, 15), (34572, 52))

In [6]:
# Read the private-split template table and scan table (paths from `data`).
private_targets, private_scan_10 = (
    pd.read_csv(data["private"][key]) for key in ("targets", "scan_10cm")
)

(private_targets.shape, private_scan_10.shape)

((2879, 15), (32231, 52))

In [7]:
# NOTE(review): this re-defines the `gather_10cm_data` helper that an earlier
# cell already defines with the same logic; one of the two can be removed.
def gather_10cm_data(scan_10cm, holNum, fromDepth, toDepth):
    """Return the rows of `scan_10cm` for hole `holNum` within a depth window.

    A row is kept when its "HolNum" equals `holNum`, its "from" value is at
    least `fromDepth`, and its "to" value is at most `toDepth`. The filters
    are applied one after another, so row order and index are preserved.
    """
    selected = scan_10cm[scan_10cm["HolNum"] == holNum]
    selected = selected[selected["from"] >= fromDepth]
    selected = selected[selected["to"] <= toDepth]
    return selected

In [8]:
# Read the three row-level tables (one CSV per split) from the input folder.
train_df, public_df, private_df = (
    pd.read_csv(f"/kaggle/input/humyn-scan-to-element/{split}.csv")
    for split in ("train", "public", "private")
)

(train_df.shape, public_df.shape, private_df.shape)

((73561, 64), (29536, 64), (27448, 64))

In [9]:
train_df.isna().sum().describe()

count       64.000000
mean      3051.968750
std      10436.094114
min          0.000000
25%          0.000000
50%         76.000000
75%        367.500000
max      59862.000000
dtype: float64

In [10]:
(train_df.isna().sum() / train_df.shape[0]).describe()

count    64.000000
mean      0.041489
std       0.141870
min       0.000000
25%       0.000000
50%       0.001033
75%       0.004996
max       0.813774
dtype: float64

In [11]:
train_df

Unnamed: 0,HolNum,from,to,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,...,Element 3,Element 4,Element 5,Element 6,Element 7,Element 8,Element 9,Element 10,Element 11,ID
0,2.0,183.0,183.1,168.0,85.0,2.76,61.0,0.014930,0.0,0.0,...,60.0,1.0,19900.0,,28800.0,,17100.0,,61400.0,1.0
1,2.0,183.1,183.2,170.0,93.0,2.75,62.0,0.000956,0.0,0.0,...,60.0,1.0,19900.0,,28800.0,,17100.0,,61400.0,1.0
2,2.0,183.2,183.3,152.0,97.0,2.77,55.0,0.000457,0.0,0.0,...,60.0,1.0,19900.0,,28800.0,,17100.0,,61400.0,1.0
3,2.0,183.3,183.4,164.0,85.0,2.74,60.0,0.000759,0.0,0.0,...,60.0,1.0,19900.0,,28800.0,,17100.0,,61400.0,1.0
4,2.0,183.4,183.5,178.0,93.0,2.88,62.0,0.008605,0.0,0.0,...,60.0,1.0,19900.0,,28800.0,,17100.0,,61400.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73556,117.0,161.3,161.4,374.0,87.0,2.71,138.0,0.000076,0.0,0.0,...,,,,,,,,,,20642.0
73557,117.0,161.4,161.5,657.0,98.0,2.72,242.0,0.000109,0.0,0.0,...,,,,,,,,,,20642.0
73558,117.0,161.5,161.6,583.0,90.0,2.75,212.0,0.000439,0.0,0.0,...,,,,,,,,,,20642.0
73559,117.0,161.6,161.7,295.0,90.0,2.73,108.0,0.000387,0.0,0.0,...,,,,,,,,,,20642.0


In [12]:
train_df

Unnamed: 0,HolNum,from,to,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,...,Element 3,Element 4,Element 5,Element 6,Element 7,Element 8,Element 9,Element 10,Element 11,ID
0,2.0,183.0,183.1,168.0,85.0,2.76,61.0,0.014930,0.0,0.0,...,60.0,1.0,19900.0,,28800.0,,17100.0,,61400.0,1.0
1,2.0,183.1,183.2,170.0,93.0,2.75,62.0,0.000956,0.0,0.0,...,60.0,1.0,19900.0,,28800.0,,17100.0,,61400.0,1.0
2,2.0,183.2,183.3,152.0,97.0,2.77,55.0,0.000457,0.0,0.0,...,60.0,1.0,19900.0,,28800.0,,17100.0,,61400.0,1.0
3,2.0,183.3,183.4,164.0,85.0,2.74,60.0,0.000759,0.0,0.0,...,60.0,1.0,19900.0,,28800.0,,17100.0,,61400.0,1.0
4,2.0,183.4,183.5,178.0,93.0,2.88,62.0,0.008605,0.0,0.0,...,60.0,1.0,19900.0,,28800.0,,17100.0,,61400.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73556,117.0,161.3,161.4,374.0,87.0,2.71,138.0,0.000076,0.0,0.0,...,,,,,,,,,,20642.0
73557,117.0,161.4,161.5,657.0,98.0,2.72,242.0,0.000109,0.0,0.0,...,,,,,,,,,,20642.0
73558,117.0,161.5,161.6,583.0,90.0,2.75,212.0,0.000439,0.0,0.0,...,,,,,,,,,,20642.0
73559,117.0,161.6,161.7,295.0,90.0,2.73,108.0,0.000387,0.0,0.0,...,,,,,,,,,,20642.0


In [13]:
def create_features(df):
    """Aggregate the per-row scan columns into one feature row per "ID".

    The columns "Feature 1" .. "Feature 49" plus "from" and "to" are grouped
    by "ID" and summarised with ten statistics each. The resulting columns are
    named "<column>_<statistic>" and "ID" is returned as a regular column.

    Args:
        df: DataFrame containing "ID", "from", "to" and "Feature 1".."Feature 49".

    Returns:
        DataFrame with one row per distinct "ID" and 1 + 51 * 10 columns.
    """
    statistics = ['mean', 'std', 'min', 'max', 'last', 'first', 'sem', 'median', 'sum', 'nunique']

    scan_columns = ['Feature ' + str(number) for number in range(1, 50)]
    scan_columns.extend(['from', 'to'])

    per_id = df.groupby(["ID"])[scan_columns].agg(statistics)
    # Flatten the (column, statistic) MultiIndex into single-level names.
    per_id.columns = [f"{column}_{statistic}" for column, statistic in per_id.columns]

    return per_id.reset_index()

In [14]:
# Build the per-ID aggregated feature table for each split.
train_fe_df, public_fe_df, private_fe_df = (
    create_features(frame) for frame in (train_df, public_df, private_df)
)

In [15]:
# Attach the targets/template columns to each feature table by "ID".
# The left join keeps every feature row.
train_fe_df, public_fe_df, private_fe_df = (
    features.merge(targets, on='ID', how='left')
    for features, targets in (
        (train_fe_df, train_targets),
        (public_fe_df, public_targets),
        (private_fe_df, private_targets),
    )
)

In [16]:
train_fe_df.shape, public_fe_df.shape, private_fe_df.shape

((7494, 525), (2845, 525), (2879, 525))

In [17]:
# Prediction targets: "Element 1" .. "Element 11".
target_cols = ['Element ' + str(number) for number in range(1, 12)]

# Identifier / position columns that must not be used as model inputs.
drop_cols = ['HolNum', 'from', 'to', 'ID']

# Model inputs: every remaining column of the merged training table.
train_cols = [
    col for col in train_fe_df.columns
    if not (col in target_cols or col in drop_cols)
]

len(train_cols), len(target_cols)

(510, 11)

In [18]:
# train_fe_df = train_fe_df[~train_fe_df[target_cols].isna().any(axis=1)]
# train_fe_df[target_cols].isna().any(axis=1).sum()

In [19]:
scaler_params_df = pd.read_csv("/kaggle/input/humyn-scan-to-element/scaler_params.csv")

In [20]:
scaler_params_df

Unnamed: 0,Element,Min,Max
0,Element 1,0.005,77.0
1,Element 2,1.0,176500.0
2,Element 3,0.5,3110.0
3,Element 4,0.01,19300.0
4,Element 5,0.01,372000.0
5,Element 6,40.0,103000.0
6,Element 7,50.0,343000.0
7,Element 8,50.0,100000.0
8,Element 9,30.0,54400.0
9,Element 10,400.0,234000.0


In [21]:
train_fe_df.shape, public_fe_df.shape, private_fe_df.shape

((7494, 525), (2845, 525), (2879, 525))

In [22]:
def comp_score(y_true, y_pred, scaler_params, target_col):
    """Return the RMSE between `y_true` and `y_pred` after min-max scaling.

    Both arrays are scaled with ``(value - Min) / (Max - Min)`` using the
    ``Min`` / ``Max`` entries stored for `target_col` in `scaler_params`.

    Args:
        y_true: array-like of reference values for `target_col`.
        y_pred: array-like of predicted values, same shape as `y_true`.
        scaler_params: DataFrame holding ``Min`` and ``Max`` columns, either
            with an ``Element`` column naming each target or already indexed
            by element name. It is not modified.
        target_col: element name to look up in `scaler_params`.

    Returns:
        The root-mean-squared error of the scaled values (NumPy float).

    Raises:
        KeyError: if `target_col` has no entry in `scaler_params`.
        ValueError: if the inputs are empty, differ in shape, or the scaled
            values are not finite (e.g. NaN inputs or ``Max == Min``).
    """
    # Re-index a derived frame rather than using `set_index(..., inplace=True)`:
    # the in-place form dropped the "Element" column from the caller's frame,
    # so a second call with the same object raised KeyError.
    if 'Element' in scaler_params.columns:
        bounds = scaler_params.set_index('Element')
    else:
        bounds = scaler_params

    lower = bounds.loc[target_col, 'Min']
    upper = bounds.loc[target_col, 'Max']
    span = upper - lower

    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"y_true and y_pred have different shapes: {true_arr.shape} vs {pred_arr.shape}"
        )
    if true_arr.size == 0:
        raise ValueError("y_true and y_pred must not be empty")

    with np.errstate(divide='ignore', invalid='ignore'):
        scaled_true = (true_arr - lower) / span
        scaled_pred = (pred_arr - lower) / span

    # Fail loudly instead of returning a NaN/inf score.
    if not (np.isfinite(scaled_true).all() and np.isfinite(scaled_pred).all()):
        raise ValueError("scaled values contain NaN or infinity")

    return np.sqrt(np.mean((scaled_true - scaled_pred) ** 2))

In [23]:
target_cols

['Element 1',
 'Element 2',
 'Element 3',
 'Element 4',
 'Element 5',
 'Element 6',
 'Element 7',
 'Element 8',
 'Element 9',
 'Element 10',
 'Element 11']

In [24]:
# Per-target LightGBM overrides; they are merged over the base parameters
# wherever `hyper_params_dict[target]` is unpacked.
hyper_params_dict = {
    'Element 1': dict(
        num_leaves=69,
        reg_alpha=0.5287326621758871,
        reg_lambda=2.4729982913019057,
        bagging_fraction=0.8606119033646991,
        bagging_freq=10,
    ),
    'Element 2': dict(
        num_leaves=64,
        reg_alpha=0.04121827943757443,
        reg_lambda=0.248266586732281,
        bagging_fraction=0.774498078538542,
        bagging_freq=7,
    ),
    'Element 3': dict(
        num_leaves=92,
        reg_alpha=0.01563166917010462,
        reg_lambda=6.617117863940754,
        bagging_fraction=0.5034515857061538,
        bagging_freq=3,
    ),
    'Element 4': dict(
        num_leaves=77,
        reg_alpha=0.0014510334322778821,
        reg_lambda=0.03232247214991263,
        bagging_fraction=0.5304841706453828,
        bagging_freq=5,
    ),
    'Element 5': dict(
        num_leaves=66,
        reg_alpha=1.2368066832029059,
        reg_lambda=3.488251372248249,
        bagging_fraction=0.5248555379874488,
        bagging_freq=1,
    ),
    'Element 6': dict(
        num_leaves=8,
        reg_alpha=0.002620056927942123,
        reg_lambda=0.002163365824995371,
        bagging_fraction=0.7502017385841376,
        bagging_freq=1,
    ),
    'Element 7': dict(
        num_leaves=7,
        reg_alpha=5.659702092324108,
        reg_lambda=0.012311638334853584,
        bagging_fraction=0.8977217792006633,
        bagging_freq=2,
    ),
    'Element 8': dict(
        num_leaves=39,
        reg_alpha=0.45736490403833924,
        reg_lambda=0.00788363242637418,
        bagging_fraction=0.5787475570663563,
        bagging_freq=1,
    ),
    'Element 9': dict(
        num_leaves=17,
        reg_alpha=0.034196228587034076,
        reg_lambda=0.2808064724275206,
        bagging_fraction=0.7518959006853106,
        bagging_freq=2,
    ),
    'Element 10': dict(
        num_leaves=2,
        reg_alpha=0.031283332000286046,
        reg_lambda=0.0014702587011116376,
        bagging_fraction=0.9308968911409915,
        bagging_freq=1,
    ),
    'Element 11': dict(
        num_leaves=18,
        reg_alpha=0.003878736962863597,
        reg_lambda=2.4259170124215625,
        bagging_fraction=0.9184485638720784,
        bagging_freq=8,
    ),
}

In [25]:
# hyper_params_dict = {
#     'Element 1' : {'max_depth': 100, 'num_leaves': 181, 'reg_alpha': 0.0010125083661711855, 'reg_lambda': 0.07000948049773605, 'colsample_bytree': 0.9868421990248336, 'subsample': 0.990784113193073, 'reg_sqrt': 'true'},
#     'Element 2' : {'max_depth': 5, 'num_leaves': 64, 'reg_alpha': 0.0027203654174694394, 'reg_lambda': 0.06768164227437246, 'colsample_bytree': 0.8515578116249436, 'subsample': 0.896043235981948, 'reg_sqrt': 'true'},
#     'Element 3' : {'max_depth': 5, 'num_leaves': 137, 'reg_alpha': 0.16558553363376735, 'reg_lambda': 0.01937736968112384, 'colsample_bytree': 0.7837859140452501, 'subsample': 0.7528080174976094, 'reg_sqrt': 'true'},
#     'Element 4' : {'max_depth': 100, 'num_leaves': 71, 'reg_alpha': 0.039056624615821364, 'reg_lambda': 0.9905682905196831, 'colsample_bytree': 0.7083762898708019, 'subsample': 0.8847121819346591, 'reg_sqrt': 'false'},
#     'Element 5' : {'max_depth': 5, 'num_leaves': 101, 'reg_alpha': 0.10080465067944423, 'reg_lambda': 0.07358105506052906, 'colsample_bytree': 0.8542500710866981, 'subsample': 0.959226315881138, 'reg_sqrt': 'true'},
#     'Element 6' : {'max_depth': 10, 'num_leaves': 29, 'reg_alpha': 0.37744520766727185, 'reg_lambda': 0.014108866716250603, 'colsample_bytree': 0.7094926248555823, 'subsample': 0.9057654926553005, 'reg_sqrt': 'true'},
#     'Element 7' : {'max_depth': 5, 'num_leaves': 171, 'reg_alpha': 0.04250126687710346, 'reg_lambda': 0.0018067899238887342, 'colsample_bytree': 0.8759639723192044, 'subsample': 0.8715658188414552, 'reg_sqrt': 'false'},
#     'Element 8' : {'max_depth': -1, 'num_leaves': 30, 'reg_alpha': 0.02966208181398974, 'reg_lambda': 0.2764354632651332, 'colsample_bytree': 0.9557818418237125, 'subsample': 0.7872535612446406, 'reg_sqrt': 'false'},
#     'Element 9' : {'max_depth': 10, 'num_leaves': 37, 'reg_alpha': 0.0057185822248889216, 'reg_lambda': 0.014100838085982422, 'colsample_bytree': 0.7743053119173764, 'subsample': 0.8816537044447643, 'reg_sqrt': 'true'},
#     'Element 10' : {'max_depth': 5, 'num_leaves': 85, 'reg_alpha': 0.9279522223373954, 'reg_lambda': 0.015589526518143656, 'colsample_bytree': 0.9014972432804638, 'subsample': 0.9594828072290378, 'reg_sqrt': 'false'},
#     'Element 11' : {'max_depth': -1, 'num_leaves': 51, 'reg_alpha': 0.20440033038416047, 'reg_lambda': 0.051211408938815706, 'colsample_bytree': 0.7642156480340841, 'subsample': 0.9111271773489782, 'reg_sqrt': 'true'},
# }

In [26]:
# params = {

# 'learning_rate': 0.1,

# 'feature_fraction': 0.9,

# "max_depth": 19,

# "num_leaves": 93,

# "max_bin": 255,

# 'min_data_in_leaf':5,

# }

In [27]:
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

X_public = public_fe_df[train_cols]
X_private = private_fe_df[train_cols]

public_predict_dict = {}
private_predict_dict = {}

comp_scores_list = []

# LightGBM settings shared by both training passes.
BASE_LGB_PARAMS = {
    "objective": "poisson",
    "metric": "rmse",
    "n_estimators": 10000,
    "boosting_type": "gbdt",
    "seed": 42,
}


def _cross_validate(cv_df, target, params, predict_frames):
    """Run the 5-fold stratified CV for one target and average extra predictions.

    The continuous target is cut into equal-width bins so that
    `StratifiedKFold` can stratify on it. One `LGBMRegressor` is fitted per
    fold with early stopping on that fold's validation part.

    Args:
        cv_df: training rows; must contain `train_cols` and `target`, with a
            0..n-1 index (fold indices are positional).
        target: name of the target column.
        params: keyword arguments for `lgb.LGBMRegressor`.
        predict_frames: dict name -> feature frame to predict with every fold
            model; may be empty.

    Returns:
        (oof_preds, averaged): the out-of-fold predictions for `cv_df`, and a
        dict name -> mean over folds of the predictions for each extra frame.
    """
    oof_preds = np.zeros(cv_df.shape[0])
    fold_predictions = {name: [] for name in predict_frames}

    num_bins = int(np.floor(1 + 3.3 * np.log2(len(cv_df))))
    bins = pd.cut(cv_df[target], bins=num_bins, labels=False)

    for fold, (train_idx, valid_idx) in enumerate(skf.split(cv_df, bins)):
        train_part = cv_df.iloc[train_idx]
        valid_part = cv_df.iloc[valid_idx]
        X_train, y_train = train_part[train_cols], train_part[target]
        X_valid, y_valid = valid_part[train_cols], valid_part[target]

        print("Fold : ", fold)
        print("Trian :", X_train.shape, y_train.shape)
        print("Valid :", X_valid.shape, y_valid.shape)
        print("Params : ", params)

        model = lgb.LGBMRegressor(**params)
        early_stopping_callback = lgb.early_stopping(100, first_metric_only=True, verbose=False)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_valid, y_valid)],
            callbacks=[early_stopping_callback],
        )

        valid_predict = model.predict(X_valid)
        oof_preds[valid_idx] = valid_predict
        for name, frame in predict_frames.items():
            fold_predictions[name].append(model.predict(frame))

        score = metrics.mean_squared_error(y_valid, valid_predict)
        print("Score : ", score)

    averaged = {name: np.mean(preds, axis=0) for name, preds in fold_predictions.items()}
    return oof_preds, averaged


for target in target_cols:

    print("Target : ", target)

    # Split the training rows into labelled and unlabelled for this target.
    # Both names stay at notebook level because later cells inspect them.
    tmp_train_fe_df = train_fe_df[train_fe_df[target].notna()].reset_index(drop=True)
    nan_train_fe_df = train_fe_df[train_fe_df[target].isna()].reset_index(drop=True)

    print("train_df : ", tmp_train_fe_df.shape)
    print("nan train_df : ", nan_train_fe_df.shape)

    # ---- Pass 1: tuned model on labelled rows, used to impute missing labels.
    # Public/private predictions are not computed here: the original computed
    # them in this pass and then discarded them.
    tuned_params = {**BASE_LGB_PARAMS, **hyper_params_dict[target]}
    # Guard: when no label is missing there is nothing to impute, and the
    # fold models must not be asked to predict on an empty frame.
    impute_frames = {"nan": nan_train_fe_df[train_cols]} if len(nan_train_fe_df) else {}

    oof_valid_preds, imputed = _cross_validate(
        tmp_train_fe_df, target, tuned_params, impute_frames
    )

    # `.copy()` keeps the shared scaler table safe from in-place modification.
    oof_score = comp_score(tmp_train_fe_df[target].values, oof_valid_preds, scaler_params_df.copy(), target_col=target)
    # The pass-1 score is only reported; it is not added to `comp_scores_list`.
    print("########### Comp oof_score : ", oof_score)

    # NOTE(review): these print the literal text "===---*10"; a repeated
    # separator ("===---" * 10) was probably intended. Output kept as is.
    print("===---*10")
    print("===---*10")
    print("===---*10")

    if "nan" in imputed:
        nan_train_fe_df[target] = imputed["nan"]

    # ---- Pass 2: base-parameter model on labelled + imputed rows.
    tmp_train_fe_df = pd.concat([tmp_train_fe_df, nan_train_fe_df]).reset_index(drop=True)

    oof_valid_preds, test_preds = _cross_validate(
        tmp_train_fe_df,
        target,
        dict(BASE_LGB_PARAMS),
        {"public": X_public, "private": X_private},
    )

    oof_score = comp_score(tmp_train_fe_df[target].values, oof_valid_preds, scaler_params_df.copy(), target_col=target)

    comp_scores_list.append(oof_score)

    print("########### Comp oof_score : ", oof_score)

    public_predict_dict[target] = test_preds["public"]
    private_predict_dict[target] = test_preds["private"]


# Overall score: the first four targets are weighted 7/4, the rest 1, over 14.
score_a = sum(comp_scores_list[:4])
score_b = sum(comp_scores_list[4:])
public_total_score = (1 / 14) * (((7 / 4) * score_a) + (score_b))

print("==--**" * 30)
print(comp_scores_list)
print("Score a: ", score_a)
# Fixed: this line used to print `score_a` under the "Score b" label.
print("Score b: ", score_b)
print("Final Score : ", public_total_score)

Target :  Element 1
train_df :  (6864, 525)
nan train_df :  (630, 525)
Fold :  0
Trian : (5491, 510) (5491,)
Valid : (1373, 510) (1373,)
Params :  {'objective': 'poisson', 'metric': 'rmse', 'n_estimators': 10000, 'boosting_type': 'gbdt', 'seed': 42, 'num_leaves': 69, 'reg_alpha': 0.5287326621758871, 'reg_lambda': 2.4729982913019057, 'bagging_fraction': 0.8606119033646991, 'bagging_freq': 10}
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97600
[LightGBM] [Info] Number of data points in the train set: 5491, number of used features: 509
[LightGBM] [Info] Start training from score -1.862346
Score :  0.08295927897728178
Fold :  1
Trian : (5491, 510) (5491,)
Valid : (1373, 510) (1373,)
Params :  {'objective': 'poisson', 'metric': 'rmse', 'n_estimators': 10000, 'boosting_type': 'gbdt', 'seed': 42, 'num_leaves': 69, 'reg_alpha': 0.5287326621758871, 'reg_lambda': 2.4729982913019057, 'bagging_fraction': 0.8606119033646991, 'bagging_freq': 10}
You can set 

In [28]:
tmp_train_fe_df

Unnamed: 0,ID,Feature 1_mean,Feature 1_std,Feature 1_min,Feature 1_max,Feature 1_last,Feature 1_first,Feature 1_sem,Feature 1_median,Feature 1_sum,...,Element 5,Element 6,Element 7,Element 8,Element 9,Element 10,Element 11,from,HolNum,to
0,1.0,176.600000,15.500538,152.0,201.0,187.0,168.0,4.901700,174.0,1766.0,...,19900.0,,28800.0,,17100.0,,61400.000000,183.0,2,184.0
1,2.0,167.200000,21.811312,137.0,196.0,144.0,196.0,6.897342,165.0,1672.0,...,13900.0,,19300.0,,16800.0,,69700.000000,184.0,2,185.0
2,3.0,375.800000,42.868792,283.0,437.0,377.0,354.0,13.556302,374.5,3758.0,...,13500.0,,41000.0,,20500.0,,77200.000000,185.0,2,186.0
3,4.0,269.142857,109.968827,45.0,373.0,280.0,315.0,41.564310,309.0,1884.0,...,4500.0,,80700.0,,12900.0,,44300.000000,186.0,2,187.0
4,5.0,364.400000,57.777735,228.0,421.0,303.0,386.0,18.270924,378.0,3644.0,...,1900.0,,33900.0,,8600.0,,41900.000000,187.0,2,188.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7489,20637.0,719.000000,71.342679,531.0,801.0,734.0,754.0,22.560536,734.5,7190.0,...,,,,,,,22230.706403,157.0,117,158.0
7490,20638.0,681.333333,108.328513,482.0,798.0,653.0,798.0,44.224930,711.0,4088.0,...,,,,,,,19186.554391,158.0,117,158.6
7491,20640.0,813.538462,227.847396,438.0,1479.0,815.0,798.0,63.193498,789.0,10576.0,...,,,,,,,51565.495908,158.7,117,160.0
7492,20641.0,734.600000,87.083868,505.0,813.0,794.0,755.0,27.538337,749.5,7346.0,...,,,,,,,36861.738583,160.0,117,161.0


In [29]:
tmp_df = pd.concat([tmp_train_fe_df, nan_train_fe_df]).reset_index(drop=True)

In [30]:
tmp_df.shape

(7784, 525)

In [31]:
tmp_train_fe_df.shape

(7494, 525)

In [32]:
nan_train_fe_df.shape

(290, 525)

In [33]:
# One column per target, one row per test ID, for each test split.
public_preds_df, private_preds_df = (
    pd.DataFrame(predictions)
    for predictions in (public_predict_dict, private_predict_dict)
)

(public_preds_df.shape, private_preds_df.shape)

((2845, 11), (2879, 11))

In [34]:
# Attach the IDs positionally: predictions were produced in feature-table row order.
public_preds_df = public_preds_df.assign(ID=public_fe_df['ID'].values)
private_preds_df = private_preds_df.assign(ID=private_fe_df['ID'].values)

In [35]:
public_preds_df.isna().sum()

Element 1     0
Element 2     0
Element 3     0
Element 4     0
Element 5     0
Element 6     0
Element 7     0
Element 8     0
Element 9     0
Element 10    0
Element 11    0
ID            0
dtype: int64

In [36]:
private_preds_df.isna().sum()

Element 1     0
Element 2     0
Element 3     0
Element 4     0
Element 5     0
Element 6     0
Element 7     0
Element 8     0
Element 9     0
Element 10    0
Element 11    0
ID            0
dtype: int64

In [37]:
public_df

Unnamed: 0,HolNum,from,to,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,...,Element 3,Element 4,Element 5,Element 6,Element 7,Element 8,Element 9,Element 10,Element 11,ID
0,5.0,109.3,109.4,86.0,45.0,2.64,33.0,0.000440,0.00000,0.000054,...,,,,,,,,,,652.0
1,5.0,109.4,109.5,321.0,97.0,2.67,120.0,0.000584,0.00082,0.000157,...,,,,,,,,,,652.0
2,5.0,109.5,109.6,256.0,89.0,2.65,97.0,0.000530,0.00000,0.000000,...,,,,,,,,,,652.0
3,5.0,109.6,109.7,237.0,97.0,2.65,89.0,0.000592,0.00000,0.000000,...,,,,,,,,,,652.0
4,5.0,109.7,109.8,272.0,89.0,2.65,103.0,0.000763,0.00000,0.000026,...,,,,,,,,,,652.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29531,116.0,393.9,394.0,727.0,88.0,2.67,272.0,0.000461,0.00000,0.000072,...,,,,,,,,,,20495.0
29532,116.0,394.0,394.1,733.0,95.0,2.69,273.0,0.000486,0.00000,0.000735,...,,,,,,,,,,20496.0
29533,116.0,394.1,394.2,341.0,91.0,2.70,126.0,0.000467,0.00000,0.000000,...,,,,,,,,,,20496.0
29534,116.0,394.2,394.3,498.0,95.0,2.69,185.0,0.000392,0.00000,0.000000,...,,,,,,,,,,20496.0


In [38]:
# public_df[target_cols] = public_preds_df[target_cols]
# private_df[target_cols] = private_preds_df[target_cols]

In [39]:
# Average the target predictions per ID and return "ID" as a regular column.
public_predictions_df, private_predictions_df = (
    preds.groupby("ID")[target_cols].mean().reset_index()
    for preds in (public_preds_df, private_preds_df)
)

In [40]:
# Bring back the hole / depth columns from the template tables, joined on "ID".
public_predictions_df, private_predictions_df = (
    preds.merge(template[['ID', 'from', 'HolNum', 'to']], how='left', on='ID')
    for preds, template in (
        (public_predictions_df, public_targets),
        (private_predictions_df, private_targets),
    )
)

(public_predictions_df.shape, private_predictions_df.shape)

((2845, 15), (2879, 15))

In [41]:
public_predictions_df.isna().sum()

ID            0
Element 1     0
Element 2     0
Element 3     0
Element 4     0
Element 5     0
Element 6     0
Element 7     0
Element 8     0
Element 9     0
Element 10    0
Element 11    0
from          0
HolNum        0
to            0
dtype: int64

In [42]:
private_predictions_df.isna().sum()

ID            0
Element 1     0
Element 2     0
Element 3     0
Element 4     0
Element 5     0
Element 6     0
Element 7     0
Element 8     0
Element 9     0
Element 10    0
Element 11    0
from          0
HolNum        0
to            0
dtype: int64

In [43]:
# Stack public and private predictions, cast everything to float64, put the
# columns in the output order, and write the result to "predictions.csv".
all_predictions = (
    pd.concat([public_predictions_df, private_predictions_df], axis=0, ignore_index=True)
    .astype('float64')
    .loc[:, [
        'Element 1', 'Element 2', 'Element 3', 'Element 4', 'Element 5',
        'Element 6', 'Element 7', 'Element 8', 'Element 9', 'Element 10',
        'Element 11', 'from', 'HolNum', 'to', 'ID',
    ]]
)

all_predictions.to_csv("predictions.csv", index=False)

In [44]:
all_predictions.describe()

Unnamed: 0,Element 1,Element 2,Element 3,Element 4,Element 5,Element 6,Element 7,Element 8,Element 9,Element 10,Element 11,from,HolNum,to,ID
count,5724.0,5724.0,5724.0,5724.0,5724.0,5724.0,5724.0,5724.0,5724.0,5724.0,5724.0,5724.0,5724.0,5724.0,5724.0
mean,0.159834,2547.910222,68.384212,41.957207,9735.467701,18325.029673,17574.487121,40546.256167,6684.108789,70194.390167,45915.71837,235.389434,64.311495,236.418688,9794.99703
std,0.169127,3686.238532,65.791792,100.359874,11422.647333,15587.247326,20681.019453,15693.820454,3829.607711,16095.896192,21148.326109,95.522194,33.714302,95.599225,5827.190831
min,0.0218,55.979712,9.305387,19.772241,247.232391,372.740611,997.066669,728.894205,386.372694,11121.133286,10738.394358,71.7,4.0,73.0,67.0
25%,0.04286,395.194629,25.528279,20.062091,1652.827679,4051.491944,4661.62008,30314.239951,3546.298808,65857.475707,32241.867078,171.075,33.0,172.0,4575.75
50%,0.097363,1241.501719,46.156304,23.118855,4826.624924,12646.065392,10363.955901,40862.400752,6494.441933,73807.621395,42559.927679,215.0,70.0,216.0,9529.5
75%,0.210079,3286.934285,86.457454,27.195795,14543.368513,31623.046724,22513.115957,53071.703157,9347.660772,79539.243527,54417.957144,271.0,96.0,272.0,15350.25
max,1.330021,59866.423842,807.271232,2460.374716,103081.762232,58265.830857,172705.514942,77775.161111,35510.748708,120545.302096,233803.176359,632.0,116.0,633.4,20496.0
