In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool, cv
from utils_testing import optuna_logging, train_xgb_model, train_cat_model, train_lgb_model

In [None]:
# Read the interim train and test frames from their pickle files.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
train_df, test_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in ("../data/train_df_interim.pickle",
                        "../data/test_df_interim.pickle")
)

# Display (rows, columns) of both frames as a quick sanity check.
(train_df.shape, test_df.shape)

In [None]:
# Column used as the label by every model in this notebook.
target = "PCT_DESAT_TO_ORIG"

# XGB

### local

In [None]:
# Feature columns fed to the XGBoost models (order is preserved as-is).
xgb_indep = [
    # --- un-prefixed input columns ---
    'AVG_ORIG_OIL_SAT',
    'DIP',
    'FT_DIST_PAT_1',
    'FT_DIST_PAT_2',
    'Lin_Dist_Inj_Factor',
    'Lin_Dist_Prod_Factor',
    'ORIG_OIL_H',
    'SAND',
    'SGMT_CUM_STM_INJ_1',
    'SGMT_CUM_STM_INJ_2',
    'SGMT_CUM_STM_INJ_3',
    'TOTAL_GNTL_INJ',
    'TOTAL_INJ',
    'TOTAL_PROD',
    # --- fe_-prefixed columns (presumably engineered features — confirm upstream) ---
    'fe_SAND_AGE',
    'fe_SAND_DIP_cum_mean',
    'fe_SAND_last_active',
    'fe_Sand_usage_count',
    'fe_WELL_AGE',
    'fe_WELL_SAND_AGE',
    'fe_Well_DIP_cum_mean',
    'fe_Well_Sand_last_active',
    'fe_Well_Sand_usage_count',
    'fe_Well_last_active',
    'fe_avg_well_distance',
    'fe_avg_well_injection',
    'fe_injection_difference',
    'fe_injection_difference_PROD_percentage',
    'fe_injection_difference_percentage',
    'fe_prod_inj_percentage',
    'fe_sand_reservoir_percentage',
    'fe_total_injected_percentage_1',
    'fe_total_injected_percentage_2',
    'fe_total_injected_percentage_3',
    'fe_total_wells',
    'fe_unique_well_count',
    # --- remaining columns ---
    'CMPL_FAC_ID',
    'fe_pipe_avg_AVG_ORIG_OIL_SAT_cum_mean',
]

In [None]:
# XGBoost settings reused by the per-fold run here and by the final fit later on.
xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'max_depth': 5,
    'eta': 0.01,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'min_child_weight': 20,
    'gamma': 1,
}

# train_xgb_model is imported from utils_testing and hands back four values.
# Presumably: per-fold iteration counts, per-fold scores, the fitted fold models
# and the per-fold predictions — confirm against utils_testing.
(xgb_fold_iterations,
 xgb_fold_results,
 xgb_models_fold,
 xgb_fold_prediction) = train_xgb_model(
    train_df=train_df,
    indep=xgb_indep,
    target=target,
    xgb_params=xgb_params,
)

# The truncated mean of the fold iteration counts sizes the final model.
xgb_avg_iteration = int(np.mean(xgb_fold_iterations))

print("Fold iterations:", xgb_fold_iterations)
print("Average iteration:", xgb_avg_iteration)
print("Fold results:", xgb_fold_results)
print("Avg.Fold results:", np.mean(xgb_fold_results))

### Prod

In [None]:
# Final XGBoost fit: every training row goes in; the test matrix carries no label.
dtrain_prod = xgb.DMatrix(data=train_df[xgb_indep], label=train_df[target])
dtest_prod = xgb.DMatrix(data=test_df[xgb_indep])

# No evaluation set is passed below, so the number of rounds is fixed up front
# to the average of the fold iteration counts.
train_prod_iter = xgb_avg_iteration
print(f"Training for {train_prod_iter} iterations")

# NOTE(review): this seeds NumPy's global RNG only; XGBoost presumably draws from
# its own `seed` parameter — confirm whether that should be set in xgb_params.
np.random.seed(100)
xgb_model_prod = xgb.train(xgb_params, dtrain_prod, num_boost_round=train_prod_iter)

# Clamp the raw predictions into the [0, 1] interval.
xgb_prod_prediction = np.clip(xgb_model_prod.predict(dtest_prod), 0, 1)
xgb_prod_prediction

# LGB

### local

In [None]:
# Feature columns fed to the LightGBM models (order is preserved as-is).
lgb_indep = [
    # --- un-prefixed input columns ---
    'AVG_ORIG_OIL_SAT',
    'FT_DIST_PAT_1',
    'FT_DIST_PAT_2',
    'Lin_Dist_Inj_Factor',
    'Lin_Dist_Prod_Factor',
    'ORIG_OIL_H',
    'SAND',
    'SGMT_CUM_STM_INJ_1',
    'SGMT_CUM_STM_INJ_2',
    'SGMT_CUM_STM_INJ_3',
    'TOTAL_GNTL_INJ',
    'TOTAL_PROD',
    # --- fe_-prefixed columns (presumably engineered features — confirm upstream) ---
    'fe_SAND_AGE',
    'fe_SAND_DIP_cum_mean',
    'fe_SAND_last_active',
    'fe_Sand_usage_count',
    'fe_WELL_AGE',
    'fe_WELL_SAND_AGE',
    'fe_Well_DIP_cum_mean',
    'fe_Well_Sand_last_active',
    'fe_Well_Sand_usage_count',
    'fe_Well_last_active',
    'fe_avg_well_injection',
    'fe_injection_difference',
    'fe_injection_difference_PROD_percentage',
    'fe_injection_difference_percentage',
    'fe_prod_inj_percentage',
    'fe_sand_reservoir_percentage',
    'fe_total_injected_percentage_1',
    'fe_total_injected_percentage_2',
    'fe_total_injected_percentage_3',
    'fe_total_wells',
    'fe_unique_well_count',
    # --- remaining columns ---
    'CMPL_FAC_ID',
    'fe_pipe_avg_AVG_ORIG_OIL_SAT_cum_mean',
]

In [None]:
# LightGBM settings reused by the per-fold run here and by the final fit later on.
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 8,
    'learning_rate': 0.04,
    'feature_fraction': 1,
    'bagging_fraction': 0.9,
    'bagging_freq': 1,
    'min_child_samples': 20,
    'verbose': -1,
}

# train_lgb_model is imported from utils_testing and hands back four values.
# Presumably: per-fold iteration counts, per-fold scores, the fitted fold models
# and the per-fold predictions — confirm against utils_testing.
# Note the first two names carry no lgb_ prefix, unlike the XGB/CatBoost cells.
(fold_iterations,
 fold_results,
 lgb_models_fold,
 lgb_fold_prediction) = train_lgb_model(
    train_df=train_df,
    indep=lgb_indep,
    target=target,
    lgb_params=lgb_params,
)

# The truncated mean of the fold iteration counts sizes the final model.
lgb_avg_iteration = int(np.mean(fold_iterations))

print("Fold iterations:", fold_iterations)
print("Average iteration:", lgb_avg_iteration)
print("Fold results:", fold_results)
print("Avg.Fold results:", np.mean(fold_results))

### Prod

In [None]:
# Final LightGBM fit on every training row.
lgb_train_prod = lgb.Dataset(
    train_df[lgb_indep],
    label=train_df[target],
    free_raw_data=False,
)

# No validation set is passed below, so the number of rounds is fixed up front
# to the average of the fold iteration counts.
lgb_best_iteration = lgb_avg_iteration
print(f"Training for {lgb_best_iteration}")

# NOTE(review): this seeds NumPy's global RNG only; LightGBM presumably relies on
# its own seed parameters — confirm whether one should be added to lgb_params.
np.random.seed(100)
lgb_model_prod = lgb.train(lgb_params, lgb_train_prod, num_boost_round=lgb_best_iteration)

# Clamp the raw predictions into the [0, 1] interval.
lgb_prod_prediction = np.clip(lgb_model_prod.predict(test_df[lgb_indep]), 0, 1)
lgb_prod_prediction

# CatBoost

### local

In [None]:
# Feature columns fed to the CatBoost models (order is preserved as-is).
cat_indep = [
    # --- un-prefixed input columns ---
    'AVG_ORIG_OIL_SAT',
    'FT_DIST_PAT_1',
    'FT_DIST_PAT_2',
    'FT_DIST_PAT_3',
    'Lin_Dist_Inj_Factor',
    'Lin_Dist_Prod_Factor',
    'ORIG_OIL_H',
    'SAND',
    'SGMT_CUM_STM_INJ_3',
    'TOTAL_GNTL_INJ',
    'TOTAL_INJ',
    'TOTAL_PROD',
    # --- fe_-prefixed columns (presumably engineered features — confirm upstream) ---
    'fe_SAND_AGE',
    'fe_SAND_DIP_cum_mean',
    'fe_SAND_last_active',
    'fe_Sand_usage_count',
    'fe_WELL_AGE',
    'fe_WELL_SAND_AGE',
    'fe_Well_DIP_cum_mean',
    'fe_Well_Sand_last_active',
    'fe_Well_Sand_usage_count',
    'fe_Well_last_active',
    'fe_Well_usage_count',
    'fe_avg_well_distance',
    'fe_prod_inj_percentage',
    'fe_sand_reservoir_percentage',
    'fe_total_injected_percentage_1',
    'fe_total_injected_percentage_2',
    'fe_total_injected_percentage_3',
    'fe_total_wells',
    'fe_unique_well_count',
    # --- remaining columns ---
    'CMPL_FAC_ID',
    'fe_pipe_avg_AVG_ORIG_OIL_SAT_cum_mean',
    'fe_pipe_TOTAL_PROD_cum_mean',
]

In [None]:
# CatBoost settings reused by the per-fold run here and by the final fit later on.
cat_params = {
    'learning_rate': 0.04,
    'depth': 5,
    'subsample': 0.9,
    'colsample_bylevel': 0.9,
    'task_type': "CPU",
    'eval_metric': 'RMSE',
}

# train_cat_model is imported from utils_testing and hands back four values.
# Presumably: per-fold iteration counts, per-fold scores, the fitted fold models
# and the per-fold predictions — confirm against utils_testing.
(cat_fold_iterations,
 cat_fold_results,
 cat_models_fold,
 cat_fold_prediction) = train_cat_model(
    train_df=train_df,
    indep=cat_indep,
    target=target,
    cat_params=cat_params,
)

# The truncated mean of the fold iteration counts sizes the final model.
cat_avg_iteration = int(np.mean(cat_fold_iterations))

print("Fold iterations:", cat_fold_iterations)
print("Average iteration:", cat_avg_iteration)
print("Fold results:", cat_fold_results)
print("Avg.Fold results:", np.mean(cat_fold_results))

### Prod

In [None]:
# Final CatBoost fit on every training row, sized by the average of the
# fold iteration counts.
cat_best_iteration = cat_avg_iteration
print(f"Training for {cat_best_iteration}")

np.random.seed(100)
# NOTE(review): fit() below is given no eval_set, so early_stopping_rounds
# presumably has nothing to monitor — confirm and consider dropping it.
cat_prod_model = CatBoostRegressor(
    **cat_params,
    iterations=cat_best_iteration,
    early_stopping_rounds=50,
)
cat_prod_model.fit(train_df[cat_indep], train_df[target])

# Clamp the raw predictions into the [0, 1] interval.
cat_prod_prediction = np.clip(cat_prod_model.predict(test_df[cat_indep]), 0, 1)
cat_prod_prediction

In [None]:
# Display the clamped CatBoost test-set predictions once more.
cat_prod_prediction

# Ensemble

### local

In [None]:
# Score an equal-weight blend of the three models' fold predictions,
# one fold at a time (five folds are hard-coded here).
ens_fold_results = []

for fold_i in range(5):
    # Element-wise average across the three models for this fold.
    ens = np.asarray([
        xgb_fold_prediction[fold_i],
        lgb_fold_prediction[fold_i],
        cat_fold_prediction[fold_i],
    ]).mean(axis=0)

    # Compare against the target values of the rows assigned to this fold.
    fold_rmse = np.sqrt(
        mean_squared_error(train_df.loc[train_df['fold'] == fold_i, target], ens)
    )
    ens_fold_results.append(np.round(fold_rmse, 5))

    print(f"Current fold: {fold_i}, RMSE {fold_rmse}")

print("Fold results:", ens_fold_results)
print("Avg.Fold results:", np.mean(ens_fold_results))


# Fold models

In [None]:
def fold_models_ensemble(all_models):
    """Average the clamped test-set predictions of every supplied fold model.

    Reads the notebook-level ``test_df`` together with ``xgb_indep``,
    ``lgb_indep`` and ``cat_indep`` to build each family's feature input.

    Parameters
    ----------
    all_models : dict
        Maps a model family name to that family's fold models. ``'xgb'``
        models are fed an ``xgb.DMatrix``; ``'lgb'`` models are fed the
        ``lgb_indep`` columns; any other key is handled as CatBoost and fed
        the ``cat_indep`` columns. Each value may be a dict (its values are
        used as the models) or any plain iterable of fitted models.

    Returns
    -------
    numpy.ndarray
        Element-wise mean over all models of their predictions, each one
        clamped to the [0, 1] interval first.

    Raises
    ------
    ValueError
        If ``all_models`` contains no model at all.
    """
    all_predictions = []

    for model_name, fold_models in all_models.items():
        print(model_name)

        # Build the feature input once per family instead of once per fold.
        if model_name == 'xgb':
            test_features = xgb.DMatrix(data=test_df[xgb_indep])
        elif model_name == 'lgb':
            test_features = test_df[lgb_indep]
        else:
            test_features = test_df[cat_indep]

        # Accept both {fold: model} mappings and plain sequences of models.
        if isinstance(fold_models, dict):
            fitted_models = fold_models.values()
        else:
            fitted_models = fold_models

        for fold_model in fitted_models:
            print(fold_model)
            # Predict with this fold's own model and clamp into [0, 1].
            fold_prediction = np.clip(fold_model.predict(test_features), 0, 1)
            all_predictions.append(fold_prediction)

    if not all_predictions:
        raise ValueError("fold_models_ensemble received no fitted models to average")

    # Average only after every family has contributed its fold predictions.
    return np.mean(all_predictions, axis=0)

# Fold-model collections from the three train_*_model calls, keyed by family name.
all_model_dict = dict(
    xgb=xgb_models_fold,
    lgb=lgb_models_fold,
    cat=cat_models_fold,
)

# Fold-model ensembling is left switched off:
# ensemble_prod_prediction = fold_models_ensemble(all_models=all_model_dict)

### Prod

In [None]:
# Equal-weight blend of the three final models' clamped test predictions.
ensemble_prod_prediction = np.asarray([
    xgb_prod_prediction,
    lgb_prod_prediction,
    cat_prod_prediction,
]).mean(axis=0)
ensemble_prod_prediction

In [None]:
# One-column submission frame holding the blended predictions,
# written to disk without the index column.
ens_submission = pd.DataFrame(ensemble_prod_prediction, columns=['PCT_DESAT_TO_ORIG'])
ens_submission.to_csv("../sub/ens_sub_26.csv", index=False)
ens_submission