In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LinearRegression

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.model_selection import KFold, GroupKFold
import xgboost as xgb
from sklearn.svm import SVR

from datetime import datetime
import gc
import optuna
from utils import optuna_logging
import pytz
# Time-zone handles built with pytz. Neither name is referenced by the cells
# visible in this part of the notebook.
UTC = pytz.utc  
# pytz zone object for the 'Asia/Kolkata' zone.
timeZ_Kl = pytz.timezone('Asia/Kolkata')

In [None]:
# Load the interim train / test frames that were saved as pickles.
# NOTE(review): unpickling can execute arbitrary code -- only load these files
# when they come from a trusted source.
train_df = pd.read_pickle("../data/train_df_interim.pickle")
test_df = pd.read_pickle("../data/test_df_interim.pickle")

# Cell output: the shapes of both frames.
train_df.shape, test_df.shape

In [None]:
# Columns that are excluded from the predictor set.
# Currently disabled candidates (kept here for reference, NOT dropped):
#   'SAND_AGE', 'WELL_AGE', 'WELL_SAND_AGE',
#   'Well_last_active', 'SAND_last_active', 'Well_Sand_last_active',
#   'total_injected'
drop = [
    'SURV_DTE',
    'sand_target_avg',
    'CMPL_FAC_ID',
    'fold',
]

# Name of the label column.
target = 'PCT_DESAT_TO_ORIG'

# Predictors = every column of train_df that is neither dropped nor the target.
indep = train_df.columns.difference([*drop, target])
indep

# Linear Regression

In [None]:
def train_lm_model(train_df):
    """Cross-validate a ``LinearRegression`` over the folds in ``train_df.fold``.

    For every fold id in ``0 .. train_df.fold.max()`` a model is fitted on the
    rows of all other folds and scored on the held-out fold. Predictions are
    clipped to the interval ``[0, 1]`` before the RMSE is computed.

    The feature columns and the label column are read from the notebook-level
    names ``indep`` and ``target``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame containing a ``fold`` column plus the ``indep`` and ``target``
        columns.

    Returns
    -------
    fold_results : list
        Validation RMSE of each fold, rounded to 5 decimals, in fold order.
    lm_models_fold : dict
        Maps fold id -> the model fitted with that fold held out.
    """
    fold_results = []
    lm_models_fold = {}

    print("")
    for fold_i in range(train_df.fold.max() + 1):

        # The slices are only read from, so no defensive copies are needed.
        train_fold = train_df[train_df.fold != fold_i]
        valid_fold = train_df[train_df.fold == fold_i]

        # Retained so the global NumPy RNG state after this call is unchanged.
        np.random.seed(100)
        lm_model_local = LinearRegression()
        lm_model_local.fit(train_fold[indep], train_fold[target])

        # Clamp predictions to [0, 1] before scoring.
        lm_local_prediction = np.clip(lm_model_local.predict(valid_fold[indep]), 0, 1)

        fold_rmse = np.sqrt(mean_squared_error(valid_fold[target], lm_local_prediction))

        fold_results.append(np.round(fold_rmse, 5))
        lm_models_fold[fold_i] = lm_model_local

        print(f"Current fold: {fold_i}, RMSE {fold_rmse}")

    return fold_results, lm_models_fold

In [None]:
# Build a separate frame for the linear model: every -999 value is replaced by
# 0 (presumably -999 is a missing-value sentinel -- confirm), then the
# predictor columns are min-max scaled.
lin_train_df = train_df.copy().replace({-999:0})

# NOTE(review): the scaler is fitted on ALL rows, including the rows that
# train_lm_model later holds out as validation folds -- confirm this leakage
# is acceptable for the reported CV score.
scaler = MinMaxScaler()
lin_train_df[indep] = scaler.fit(lin_train_df[indep]).transform(lin_train_df[indep])

# NOTE(review): despite its name, `lgb_models_fold` receives the per-fold
# linear-regression models here.
fold_results, lgb_models_fold = train_lm_model(train_df=lin_train_df)

print("Fold results:", fold_results)
print("Avg.Fold results:", np.mean(fold_results))

# SVR

In [None]:
def train_svr_model(train_df):
    """Cross-validate a default ``SVR`` over the folds in ``train_df.fold``.

    For every fold id in ``0 .. train_df.fold.max()`` a model is fitted on the
    rows of all other folds and scored on the held-out fold. Predictions are
    clipped to the interval ``[0, 1]`` before the RMSE is computed.

    The feature columns and the label column are read from the notebook-level
    names ``indep`` and ``target``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame containing a ``fold`` column plus the ``indep`` and ``target``
        columns.

    Returns
    -------
    fold_results : list
        Validation RMSE of each fold, rounded to 5 decimals, in fold order.
    svr_models_fold : dict
        Maps fold id -> the model fitted with that fold held out.
    """
    fold_results = []
    svr_models_fold = {}

    print("")
    for fold_i in range(train_df.fold.max() + 1):

        # The slices are only read from, so no defensive copies are needed.
        train_fold = train_df[train_df.fold != fold_i]
        valid_fold = train_df[train_df.fold == fold_i]

        # Retained so the global NumPy RNG state after this call is unchanged.
        np.random.seed(100)
        svr_model_local = SVR()
        svr_model_local.fit(train_fold[indep], train_fold[target])

        # Clamp predictions to [0, 1] before scoring.
        svr_local_prediction = np.clip(svr_model_local.predict(valid_fold[indep]), 0, 1)

        fold_rmse = np.sqrt(mean_squared_error(valid_fold[target], svr_local_prediction))

        fold_results.append(np.round(fold_rmse, 5))
        svr_models_fold[fold_i] = svr_model_local

        print(f"Current fold: {fold_i}, RMSE {fold_rmse}")

    return fold_results, svr_models_fold

In [None]:
# Rebuild the scaled frame for the SVR run: every -999 value is replaced by 0
# (presumably -999 is a missing-value sentinel -- confirm), then the predictor
# columns are min-max scaled.
lin_train_df = train_df.copy().replace({-999:0})

# NOTE(review): the scaler is fitted on ALL rows, including the rows that
# train_svr_model later holds out as validation folds -- confirm this leakage
# is acceptable for the reported CV score.
scaler = MinMaxScaler()
lin_train_df[indep] = scaler.fit(lin_train_df[indep]).transform(lin_train_df[indep])

# NOTE(review): despite its name, `lgb_models_fold` receives the per-fold SVR
# models here.
fold_results, lgb_models_fold = train_svr_model(train_df=lin_train_df)

print("Fold results:", fold_results)
print("Avg.Fold results:", np.mean(fold_results))

# Extra Trees Regressor

In [None]:
def ET_model_training(train, valid, n_estimators, params, patience):
    """Grow an ``ExtraTreesRegressor`` one estimator at a time with early stopping.

    The model is built from ``params`` and fitted once; afterwards
    ``n_estimators`` is incremented by one per round and the model is refitted
    until the validation RMSE has failed to improve for ``patience``
    consecutive rounds or the round counter reaches ``n_estimators``.

    The feature columns and the label column are read from the notebook-level
    names ``indep`` and ``target``.

    Parameters
    ----------
    train, valid : pandas.DataFrame
        Training and validation rows; both must contain ``indep`` and ``target``.
    n_estimators : int
        Upper bound (inclusive) for the round counter.
    params : dict
        Keyword arguments forwarded to ``ExtraTreesRegressor``. The round
        counter only equals the real number of trees when this starts at
        ``n_estimators=1``; the callers in this notebook also pass
        ``warm_start=True``.
    patience : int
        Number of consecutive non-improving rounds tolerated before stopping.

    Returns
    -------
    best_err : float
        Lowest validation RMSE seen, including the initial fit.
    i : int
        Round counter at which training stopped (``1`` if no extra round ran).
        This is the stopping round, not necessarily the best one.
    ET : ExtraTreesRegressor
        The estimator in its final state, i.e. including any trees added
        after the best round.
    """
    # Column selection does not change between rounds, so do it once.
    X_train, y_train = train[indep], train[target]
    X_valid, y_valid = valid[indep], valid[target]

    def _valid_rmse(model):
        """Return the RMSE of ``model`` on the validation rows."""
        return np.sqrt(mean_squared_error(y_valid, model.predict(X_valid)))

    np.random.seed(150)
    ET = ExtraTreesRegressor(**params)
    ET.fit(X_train, y_train)

    # The initial fit is a genuine candidate: use its error as the baseline
    # instead of discarding it.
    best_err = _valid_rmse(ET)

    # Pre-set the counter so the return value is defined even when the loop
    # body never runs (n_estimators < 2).
    i = 1
    counter = 0
    for i in range(2, n_estimators + 1):
        np.random.seed(150)
        ET.n_estimators += 1
        ET.fit(X_train, y_train)

        err = _valid_rmse(ET)

        if err < best_err:
            best_err = err
            counter = 0
        else:
            counter += 1
            # ">=" so that a non-positive patience still terminates.
            if counter >= patience:
                break

    return best_err, i, ET

In [None]:
# Sweep max_depth for ExtraTrees: for each depth, run the early-stopped
# training on every fold and print the per-fold and average results.
# ET_models_fold is re-created for every depth, so once the sweep finishes it
# only holds the models of the last depth tried.
for depth in range(1, 15):
    print(f"Depth: {depth}")

    n_rounds = 1000
    # 'min_samples_leaf':10 and 'min_samples_split':10 are currently disabled.
    ET_params = {
        'n_estimators':1,
        'max_depth':depth,
        'warm_start':True,
    }

    fold_iterations, fold_results, ET_models_fold = [], [], {}

    for fold_i in range(train_df.fold.max() + 1):

        # Hold out fold_i for validation, train on everything else.
        train_fold = train_df[train_df.fold != fold_i].copy()
        valid_fold = train_df[train_df.fold == fold_i].copy()

        fold_err, fold_iter, fold_model_ET = ET_model_training(
            train=train_fold,
            valid=valid_fold,
            n_estimators=n_rounds,
            params=ET_params,
            patience=5,
        )
        print(f"Current fold: {fold_i}, iteration {fold_iter}, RMSE {fold_err}")

        ET_models_fold[fold_i] = fold_model_ET
        fold_iterations.append(fold_iter)
        fold_results.append(np.round(fold_err, 5))

    avg_iteration = int(np.mean(fold_iterations))
    print("Fold iterations:", fold_iterations)
    print("Average iteration:", avg_iteration)
    print("Fold results:", fold_results)
    print("Avg.Fold results:", np.mean(fold_results))
    print("")

In [None]:
# Feature-importance table taken from the ExtraTrees model stored under fold
# key 3, sorted from most to least important.
ET_imp = (
    pd.DataFrame({'indep': indep, 'imp': ET_models_fold[3].feature_importances_})
    .sort_values(['imp'], ascending=False)
    .reset_index(drop=True)
)
ET_imp

# Random Forest Regressor

In [None]:
def RF_model_training(train, valid, n_estimators, params, patience):
    """Grow a ``RandomForestRegressor`` one estimator at a time with early stopping.

    The model is built from ``params`` and fitted once; afterwards
    ``n_estimators`` is incremented by one per round and the model is refitted
    until the validation RMSE has failed to improve for ``patience``
    consecutive rounds or the round counter reaches ``n_estimators``.

    The feature columns and the label column are read from the notebook-level
    names ``indep`` and ``target``.

    Parameters
    ----------
    train, valid : pandas.DataFrame
        Training and validation rows; both must contain ``indep`` and ``target``.
    n_estimators : int
        Upper bound (inclusive) for the round counter.
    params : dict
        Keyword arguments forwarded to ``RandomForestRegressor``. The round
        counter only equals the real number of trees when this starts at
        ``n_estimators=1``; the callers in this notebook also pass
        ``warm_start=True``.
    patience : int
        Number of consecutive non-improving rounds tolerated before stopping.

    Returns
    -------
    best_err : float
        Lowest validation RMSE seen, including the initial fit.
    i : int
        Round counter at which training stopped (``1`` if no extra round ran).
        This is the stopping round, not necessarily the best one.
    RF : RandomForestRegressor
        The estimator in its final state, i.e. including any trees added
        after the best round.
    """
    # Column selection does not change between rounds, so do it once.
    X_train, y_train = train[indep], train[target]
    X_valid, y_valid = valid[indep], valid[target]

    def _valid_rmse(model):
        """Return the RMSE of ``model`` on the validation rows."""
        return np.sqrt(mean_squared_error(y_valid, model.predict(X_valid)))

    np.random.seed(150)
    RF = RandomForestRegressor(**params)
    RF.fit(X_train, y_train)

    # The initial fit is a genuine candidate: use its error as the baseline
    # instead of discarding it.
    best_err = _valid_rmse(RF)

    # Pre-set the counter so the return value is defined even when the loop
    # body never runs (n_estimators < 2).
    i = 1
    counter = 0
    for i in range(2, n_estimators + 1):
        np.random.seed(150)
        RF.n_estimators += 1
        RF.fit(X_train, y_train)

        err = _valid_rmse(RF)

        if err < best_err:
            best_err = err
            counter = 0
        else:
            counter += 1
            # ">=" so that a non-positive patience still terminates.
            if counter >= patience:
                break

    return best_err, i, RF

In [None]:
# Sweep max_depth for RandomForest: for each depth, run the early-stopped
# training on every fold and print the per-fold and average results.
# RF_models_fold is re-created for every depth, so once the sweep finishes it
# only holds the models of the last depth tried.
for depth in range(3, 15):
    print(f"Depth: {depth}")

    n_rounds = 1000
    # 'min_samples_leaf':10 and 'min_samples_split':10 are currently disabled.
    RF_params = {
        'n_estimators':1,
        'max_depth':depth,
        'warm_start':True,
    }

    fold_iterations, fold_results, RF_models_fold = [], [], {}

    for fold_i in range(train_df.fold.max() + 1):

        # Hold out fold_i for validation, train on everything else.
        train_fold = train_df[train_df.fold != fold_i].copy()
        valid_fold = train_df[train_df.fold == fold_i].copy()

        fold_err, fold_iter, fold_model_RF = RF_model_training(
            train=train_fold,
            valid=valid_fold,
            n_estimators=n_rounds,
            params=RF_params,
            patience=5,
        )
        print(f"Current fold: {fold_i}, iteration {fold_iter}, RMSE {fold_err}")

        RF_models_fold[fold_i] = fold_model_RF
        fold_iterations.append(fold_iter)
        fold_results.append(np.round(fold_err, 5))

    avg_iteration = int(np.mean(fold_iterations))
    print("Fold iterations:", fold_iterations)
    print("Average iteration:", avg_iteration)
    print("Fold results:", fold_results)
    print("Avg.Fold results:", np.mean(fold_results))
    print("")

In [None]:
# Feature-importance table taken from the RandomForest model stored under fold
# key 3, sorted from most to least important.
RF_imp = (
    pd.DataFrame({'indep': indep, 'imp': RF_models_fold[3].feature_importances_})
    .sort_values(['imp'], ascending=False)
    .reset_index(drop=True)
)
RF_imp

# GBM

In [None]:
def GBM_model_training(train, valid, n_estimators, params, patience):
    """Grow a ``GradientBoostingRegressor`` one stage at a time with early stopping.

    The model is built from ``params`` and fitted once; afterwards
    ``n_estimators`` is incremented by one per round and the model is refitted
    until the validation RMSE has failed to improve for ``patience``
    consecutive rounds or the round counter reaches ``n_estimators``.

    The feature columns and the label column are read from the notebook-level
    names ``indep`` and ``target``.

    Parameters
    ----------
    train, valid : pandas.DataFrame
        Training and validation rows; both must contain ``indep`` and ``target``.
    n_estimators : int
        Upper bound (inclusive) for the round counter.
    params : dict
        Keyword arguments forwarded to ``GradientBoostingRegressor``. The
        round counter only equals the real number of boosting stages when this
        starts at ``n_estimators=1``; the callers in this notebook also pass
        ``warm_start=True``.
    patience : int
        Number of consecutive non-improving rounds tolerated before stopping.

    Returns
    -------
    best_err : float
        Lowest validation RMSE seen, including the initial fit.
    i : int
        Round counter at which training stopped (``1`` if no extra round ran).
        This is the stopping round, not necessarily the best one.
    GBM : GradientBoostingRegressor
        The estimator in its final state, i.e. including any stages added
        after the best round.
    """
    # Column selection does not change between rounds, so do it once.
    X_train, y_train = train[indep], train[target]
    X_valid, y_valid = valid[indep], valid[target]

    def _valid_rmse(model):
        """Return the RMSE of ``model`` on the validation rows."""
        return np.sqrt(mean_squared_error(y_valid, model.predict(X_valid)))

    np.random.seed(150)
    GBM = GradientBoostingRegressor(**params)
    GBM.fit(X_train, y_train)

    # The initial fit is a genuine candidate: use its error as the baseline
    # instead of discarding it.
    best_err = _valid_rmse(GBM)

    # Pre-set the counter so the return value is defined even when the loop
    # body never runs (n_estimators < 2).
    i = 1
    counter = 0
    for i in range(2, n_estimators + 1):
        np.random.seed(150)
        GBM.n_estimators += 1
        GBM.fit(X_train, y_train)

        err = _valid_rmse(GBM)

        if err < best_err:
            best_err = err
            counter = 0
        else:
            counter += 1
            # ">=" so that a non-positive patience still terminates.
            if counter >= patience:
                break

    return best_err, i, GBM

In [None]:
# Sweep max_depth for gradient boosting: for each depth, run the early-stopped
# training on every fold and print the per-fold and average results.
# GBM_models_fold is re-created for every depth, so once the sweep finishes it
# only holds the models of the last depth tried.
for depth in range(1, 10):
    print(f"\nDepth: {depth}")

    n_rounds = 1000
    # 'min_samples_split':10 is currently disabled.
    GBM_params = {
        'n_estimators':1,
        'max_depth':depth,
        'min_samples_leaf':10,
        'learning_rate':0.05,
        'warm_start':True,
    }

    fold_iterations, fold_results, GBM_models_fold = [], [], {}

    for fold_i in range(train_df.fold.max() + 1):

        # Hold out fold_i for validation, train on everything else.
        train_fold = train_df[train_df.fold != fold_i].copy()
        valid_fold = train_df[train_df.fold == fold_i].copy()

        fold_err, fold_iter, fold_model_GBM = GBM_model_training(
            train=train_fold,
            valid=valid_fold,
            n_estimators=n_rounds,
            params=GBM_params,
            patience=5,
        )
        print(f"Current fold: {fold_i}, iteration {fold_iter}, RMSE {fold_err}")

        GBM_models_fold[fold_i] = fold_model_GBM
        fold_iterations.append(fold_iter)
        fold_results.append(np.round(fold_err, 5))

    avg_iteration = int(np.mean(fold_iterations))
    print("Fold iterations:", fold_iterations)
    print("Average iteration:", avg_iteration)
    print("Fold results:", fold_results)
    print("Avg.Fold results:", np.mean(fold_results))

In [None]:
# Feature-importance table taken from the gradient-boosting model stored under
# fold key 3, sorted from most to least important.
GBM_imp = (
    pd.DataFrame({'indep': indep, 'imp': GBM_models_fold[3].feature_importances_})
    .sort_values(['imp'], ascending=False)
    .reset_index(drop=True)
)
GBM_imp