# Project Geminae MidPoint Model
## Gradient Boosted Regression Model for 3 and 6 month projections

Tom Gregg

2024-02-25

## Setting Up The Model

In [1]:
# Import Basic Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
from datetime import datetime

In [2]:
# Importing Libraries and Packages to perform Boosted Tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
from xgboost import XGBRegressor

In [3]:
# Max Display 
pd.options.display.max_columns = None
pd.options.display.max_rows = None

## Importing and Preparing Data

In [4]:
# Creating our file path for the CSV
file_path = 'https://raw.githubusercontent.com/tbgregg000/Capstone/main/GenericWellDataPrepped.csv'
df = pd.read_csv(file_path).copy()

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16707 entries, 0 to 16706
Data columns (total 89 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Well Index                               16707 non-null  int64  
 1   TrueVerticalDepth_FT                     16707 non-null  float64
 2   MeasuredDepth_FT                         16707 non-null  float64
 3   UpperPerforation_FT                      16707 non-null  float64
 4   LowerPerforation_FT                      16707 non-null  float64
 5   PerforationInterval_FT                   16707 non-null  float64
 6   LateralLength_FT                         16707 non-null  float64
 7   ProppantLoad_LBSPerGAL                   16707 non-null  float64
 8   ProppantIntensity_LBSPerFT               16707 non-null  float64
 9   TotalProppant_LBS                        16707 non-null  float64
 10  TotalWaterPumped_GAL                     16707

In [6]:
# Dropping 2020 since the data in this year is thrown off
df = df[df['YearOfDrilling'] != 2020]
# df = df[df['YearOfDrilling'] >= 2017]
# Drop any null 12 month values


In [7]:
df.dropna(subset=['First12MonthGas_MCFPer1000FT'], inplace=True)

In [8]:
# df['YearOfDrilling'].value_counts()

In [9]:
df_cleaned = df.copy()

### Partitioning the Data into Target Variables

In [10]:
# Splitting data into Water, Gas, and Oil 
# Splitting data into 3 month and 6 month
y_w_3 = df_cleaned['First3MonthWater_BBL']
y_g_3 = df_cleaned['First3MonthGas_MCF']
y_o_3 = df_cleaned['First3MonthOil_BBL']
y_w_6 = df_cleaned['First6MonthWater_BBL']
y_g_6 = df_cleaned['First6MonthGas_MCF']
y_o_6 = df_cleaned['First6MonthOil_BBL']
y_w_9 = df_cleaned['First9MonthWater_BBL']
y_g_9 = df_cleaned['First9MonthGas_MCF']
y_o_9 = df_cleaned['First9MonthOil_BBL']
y_w_12 = df_cleaned['First12MonthWater_BBL']
y_g_12 = df_cleaned['First12MonthGas_MCF']
y_o_12 = df_cleaned['First12MonthOil_BBL']
# y_w_36 = df_cleaned['First36MonthWater_BBL']
# y_g_36 = df_cleaned['First36MonthGas_MCFPer1000FT']
# y_o_36 = df_cleaned['First36MonthOil_BBLPer1000FT']
y_w_peak = df_cleaned['PeakWater_BBL']
y_g_peak = df_cleaned['PeakGas_MCF']
y_o_peak = df_cleaned['PeakOil_BBL']
y_w_cum = df_cleaned['CumWater_BBL']
y_g_cum = df_cleaned['CumGas_MCF']
y_o_cum = df_cleaned['CumOil_BBL']

In [11]:
# Creating X using just the non-production columns
X = df_cleaned.iloc[:, :26]
X = X.drop("Well Index", axis=1)

# Date Cleanup
columns_to_change = ['InitialProductionDate','DrillingStartDate','DrillingCompletionDate']
for col in columns_to_change:
    X[col] = pd.to_datetime(X[col])

# Loop through specific columns and rename
for col in columns_to_change:
    new_name = col + 'Num'
    X.rename(columns={col: new_name}, inplace=True)
    X[new_name] = X[new_name].astype('int64') / 10**9


# Dropping a few unnecessary columns
# X = X.drop('InitialProductionMonth', axis = 1)
X = X.drop('DrillingCompletionDateNum', axis = 1)
X = X.drop('DrillingDuration_DAYS', axis = 1)
# X = X.drop('ProductionMonthsCount', axis = 1)
X = X.drop('YearOfDrilling', axis = 1)
X = X.drop('InitialProductionYear', axis = 1)


# # Dummy Variables for OilTest_Method
# # Use pd.get_dummies to create dummy variables
# dummy_vars = pd.get_dummies(X['OilTest_Method'], prefix='OilTest_Method', drop_first=True)

# # Add the dummy variables as new columns to your DataFrame
# X = pd.concat([X.drop("OilTest_Method", axis=1), dummy_vars], axis=1)

# Converting Objects to Ints
# for col in X.columns:
#     if pd.api.types.is_object_dtype(X[col]):
#         X[col] = X[col].str.replace(',', '')
#         X[col] = X[col].str.replace(' ', '')
#         X[col] = X[col].astype(float)

In [12]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13940 entries, 0 to 16140
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   TrueVerticalDepth_FT        13940 non-null  float64
 1   MeasuredDepth_FT            13940 non-null  float64
 2   UpperPerforation_FT         13940 non-null  float64
 3   LowerPerforation_FT         13940 non-null  float64
 4   PerforationInterval_FT      13940 non-null  float64
 5   LateralLength_FT            13940 non-null  float64
 6   ProppantLoad_LBSPerGAL      13940 non-null  float64
 7   ProppantIntensity_LBSPerFT  13940 non-null  float64
 8   TotalProppant_LBS           13940 non-null  float64
 9   TotalWaterPumped_GAL        13940 non-null  float64
 10  WaterIntensity_GALPerFT     13940 non-null  float64
 11  TotalFluidPumped_BBL        13940 non-null  float64
 12  FluidIntensity_BBLPerFT     13940 non-null  float64
 13  AcidVolume_BBL              139

### Creating our Train, Test, Calibration, and New sets

In [13]:
# Creating the test and train split using seed 99
# Quite nice how we can just use the exact same X set

# X_train, X_test, y_train_w_3, y_test_w_3 = train_test_split(X, y_w_3, test_size=0.2, random_state=965)
X_train, X_rest1, y_train_w_3, y_rest_w_3_1 = train_test_split(X, y_w_3, test_size=2000, random_state=965)
X_test, X_rest2, y_test_w_3, y_rest_w_3_2 = train_test_split(X_rest1, y_rest_w_3_1, test_size = 1500, random_state=965)
X_calib, X_new, y_calib_w_3, y_new_w_3 = train_test_split(X_rest2, y_rest_w_3_2, test_size = 500, random_state=965)

# X_train, X_test, y_train_g_3, y_test_g_3 = train_test_split(X, y_g_3, test_size=0.2, random_state=965)
X_train, X_rest1, y_train_g_3, y_rest_g_3_1 = train_test_split(X,y_g_3, test_size=2000, random_state=965)
X_test, X_rest2, y_test_g_3, y_rest_g_3_2 = train_test_split(X_rest1, y_rest_g_3_1, test_size = 1500, random_state=965)
X_calib, X_new, y_calib_g_3, y_new_g_3 = train_test_split(X_rest2, y_rest_g_3_2, test_size = 500, random_state=965)

# X_train, X_test, y_train_o_3, y_test_o_3 = train_test_split(X, y_o_3, test_size=0.2, random_state=965)
X_train, X_rest1, y_train_o_3, y_rest_o_3_1 = train_test_split(X,y_o_3, test_size=2000, random_state=965)
X_test, X_rest2, y_test_o_3, y_rest_o_3_2 = train_test_split(X_rest1, y_rest_o_3_1, test_size = 1500, random_state=965)
X_calib, X_new, y_calib_o_3, y_new_o_3 = train_test_split(X_rest2, y_rest_o_3_2, test_size = 500, random_state=965)

# X_train, X_test, y_train_w_6, y_test_w_6 = train_test_split(X, y_w_6, test_size=0.2, random_state=965)
X_train, X_rest1, y_train_w_6, y_rest_w_6_1 = train_test_split(X,y_w_6, test_size=2000, random_state=965)
X_test, X_rest2, y_test_w_6, y_rest_w_6_2 = train_test_split(X_rest1, y_rest_w_6_1, test_size = 1500, random_state=965)
X_calib, X_new, y_calib_w_6, y_new_w_6 = train_test_split(X_rest2, y_rest_w_6_2, test_size = 500, random_state=965)

# X_train, X_test, y_train_g_6, y_test_g_6 = train_test_split(X, y_g_6, test_size=0.2, random_state=965)
X_train, X_rest1, y_train_g_6, y_rest_g_6_1 = train_test_split(X,y_g_6, test_size=2000, random_state=965)
X_test, X_rest2, y_test_g_6, y_rest_g_6_2 = train_test_split(X_rest1, y_rest_g_6_1, test_size = 1500, random_state=965)
X_calib, X_new, y_calib_g_6, y_new_g_6 = train_test_split(X_rest2, y_rest_g_6_2, test_size = 500, random_state=965)

# X_train, X_test, y_train_o_6, y_test_o_6 = train_test_split(X, y_o_6, test_size=0.2, random_state=965)
X_train, X_rest1, y_train_o_6, y_rest_o_6_1 = train_test_split(X,y_o_6, test_size=2000, random_state=965)
X_test, X_rest2, y_test_o_6, y_rest_o_6_2 = train_test_split(X_rest1, y_rest_o_6_1, test_size = 1500, random_state=965)
X_calib, X_new, y_calib_o_6, y_new_o_6 = train_test_split(X_rest2, y_rest_o_6_2, test_size = 500, random_state=965)

# X_train, X_test, y_train_w_9, y_test_w_9 = train_test_split(X, y_w_9, test_size=0.2, random_state=965)
X_train, X_rest1, y_train_w_9, y_rest_w_9_1 = train_test_split(X,y_w_9, test_size=2000, random_state=965)
X_test, X_rest2, y_test_w_9, y_rest_w_9_2 = train_test_split(X_rest1, y_rest_w_9_1, test_size = 1500, random_state=965)
X_calib, X_new, y_calib_w_9, y_new_w_9 = train_test_split(X_rest2, y_rest_w_9_2, test_size = 500, random_state=965)

# X_train, X_test, y_train_g_9, y_test_g_9 = train_test_split(X, y_g_9, test_size=0.2, random_state=965)
X_train, X_rest1, y_train_g_9, y_rest_g_9_1 = train_test_split(X,y_g_9, test_size=2000, random_state=965)
X_test, X_rest2, y_test_g_9, y_rest_g_9_2 = train_test_split(X_rest1, y_rest_g_9_1, test_size = 1500, random_state=965)
X_calib, X_new, y_calib_g_9, y_new_g_9 = train_test_split(X_rest2, y_rest_g_9_2, test_size = 500, random_state=965)

# X_train, X_test, y_train_o_9, y_test_o_9 = train_test_split(X, y_o_9, test_size=0.2, random_state=965)
X_train, X_rest1, y_train_o_9, y_rest_o_9_1 = train_test_split(X,y_o_9, test_size=2000, random_state=965)
X_test, X_rest2, y_test_o_9, y_rest_o_9_2 = train_test_split(X_rest1, y_rest_o_9_1, test_size = 1500, random_state=965)
X_calib, X_new, y_calib_o_9, y_new_o_9 = train_test_split(X_rest2, y_rest_o_9_2, test_size = 500, random_state=965)

# X_train, X_test, y_train_w_12, y_test_w_12 = train_test_split(X, y_w_12, test_size=0.2, random_state=965)
X_train, X_rest1, y_train_w_12, y_rest_w_12_1 = train_test_split(X,y_w_12, test_size=2000, random_state=965)
X_test, X_rest2, y_test_w_12, y_rest_w_12_2 = train_test_split(X_rest1, y_rest_w_12_1, test_size = 1500, random_state=965)
X_calib, X_new, y_calib_w_12, y_new_w_12 = train_test_split(X_rest2, y_rest_w_12_2, test_size = 500, random_state=965)

# X_train, X_test, y_train_g_12, y_test_g_12 = train_test_split(X, y_g_12, test_size=0.2, random_state=965)
X_train, X_rest1, y_train_g_12, y_rest_g_12_1 = train_test_split(X,y_g_12, test_size=2000, random_state=965)
X_test, X_rest2, y_test_g_12, y_rest_g_12_2 = train_test_split(X_rest1, y_rest_g_12_1, test_size = 1500, random_state=965)
X_calib, X_new, y_calib_g_12, y_new_g_12 = train_test_split(X_rest2, y_rest_g_12_2, test_size = 500, random_state=965)

# X_train, X_test, y_train_o_12, y_test_o_12 = train_test_split(X, y_o_12, test_size=0.2, random_state=965)
X_train, X_rest1, y_train_o_12, y_rest_o_12_1 = train_test_split(X,y_o_12, test_size=2000, random_state=965)
X_test, X_rest2, y_test_o_12, y_rest_o_12_2 = train_test_split(X_rest1, y_rest_o_12_1, test_size = 1500, random_state=965)
X_calib, X_new, y_calib_o_12, y_new_o_12 = train_test_split(X_rest2, y_rest_o_12_2, test_size = 500, random_state=965)

# X_train, X_test, y_train_w_peak, y_test_w_peak = train_test_split(X, y_w_peak, test_size=0.2, random_state=965)
X_train, X_rest1, y_train_w_peak, y_rest_w_peak_1 = train_test_split(X,y_w_peak, test_size=2000, random_state=965)
X_test, X_rest2, y_test_w_peak, y_rest_w_peak_2 = train_test_split(X_rest1, y_rest_w_peak_1, test_size = 1500, random_state=965)
X_calib, X_new, y_calib_w_peak, y_new_w_peak = train_test_split(X_rest2, y_rest_w_peak_2, test_size = 500, random_state=965)

# X_train, X_test, y_train_g_peak, y_test_g_peak = train_test_split(X, y_g_peak, test_size=0.2, random_state=965)
X_train, X_rest1, y_train_g_peak, y_rest_g_peak_1 = train_test_split(X,y_g_peak, test_size=2000, random_state=965)
X_test, X_rest2, y_test_g_peak, y_rest_g_peak_2 = train_test_split(X_rest1, y_rest_g_peak_1, test_size = 1500, random_state=965)
X_calib, X_new, y_calib_g_peak, y_new_g_peak = train_test_split(X_rest2, y_rest_g_peak_2, test_size = 500, random_state=965)

# X_train, X_test, y_train_o_peak, y_test_o_peak = train_test_split(X, y_o_peak, test_size=0.2, random_state=965)
X_train, X_rest1, y_train_o_peak, y_rest_o_peak_1 = train_test_split(X,y_o_peak, test_size=2000, random_state=965)
X_test, X_rest2, y_test_o_peak, y_rest_o_peak_2 = train_test_split(X_rest1, y_rest_o_peak_1, test_size = 1500, random_state=965)
X_calib, X_new, y_calib_o_peak, y_new_o_peak = train_test_split(X_rest2, y_rest_o_peak_2, test_size = 500, random_state=965)

# X_train, X_test, y_train_w_cum, y_test_w_cum = train_test_split(X, y_w_cum, test_size=0.2, random_state=965)
X_train, X_rest1, y_train_w_cum, y_rest_w_cum_1 = train_test_split(X,y_w_cum, test_size=2000, random_state=965)
X_test, X_rest2, y_test_w_cum, y_rest_w_cum_2 = train_test_split(X_rest1, y_rest_w_cum_1, test_size = 1500, random_state=965)
X_calib, X_new, y_calib_w_cum, y_new_w_cum = train_test_split(X_rest2, y_rest_w_cum_2, test_size = 500, random_state=965)

# X_train, X_test, y_train_g_cum, y_test_g_cum = train_test_split(X, y_g_cum, test_size=0.2, random_state=965)
X_train, X_rest1, y_train_g_cum, y_rest_g_cum_1 = train_test_split(X,y_g_cum, test_size=2000, random_state=965)
X_test, X_rest2, y_test_g_cum, y_rest_g_cum_2 = train_test_split(X_rest1, y_rest_g_cum_1, test_size = 1500, random_state=965)
X_calib, X_new, y_calib_g_cum, y_new_g_cum = train_test_split(X_rest2, y_rest_g_cum_2, test_size = 500, random_state=965)

# X_train, X_test, y_train_o_cum, y_test_o_cum = train_test_split(X, y_o_cum, test_size=0.2, random_state=965)
X_train, X_rest1, y_train_o_cum, y_rest_o_cum_1 = train_test_split(X,y_o_cum, test_size=2000, random_state=965)
X_test, X_rest2, y_test_o_cum, y_rest_o_cum_2 = train_test_split(X_rest1, y_rest_o_cum_1, test_size = 1500, random_state=965)
X_calib, X_new, y_calib_o_cum, y_new_o_cum = train_test_split(X_rest2, y_rest_o_cum_2, test_size = 500, random_state=965)


### Fine Tuning Hyper Parameters using Loops

In [14]:
# Create an empty dictionary to store XGBRegressor instances
gb_mod_10 = {}

# Loop through the range and create/update dictionary entries
for num in range(10):
    var_name = f'gb_mod_10_{num}'  # Construct variable name dynamically
    gb_mod_10[var_name] = XGBRegressor(learning_rate=0.06 + (0.005 * num), n_estimators=500, max_depth=10, random_state=965, alpha=0.5)
    gb_mod_10[var_name].fit(X_train, y_train_o_3)
    print("XG Boost (learning rate = ",0.06 + (0.005 * num),") Train R2: ", gb_mod_10[var_name].score(X_train, y_train_o_3))
    print("XG Boost (learning rate = ",0.06 + (0.005 * num),") Test R2: ", gb_mod_10[var_name].score(X_test, y_test_o_3))

XG Boost (learning rate =  0.06 ) Train R2:  0.9915351081391621
XG Boost (learning rate =  0.06 ) Test R2:  0.389739744481144
XG Boost (learning rate =  0.065 ) Train R2:  0.9941380638561781
XG Boost (learning rate =  0.065 ) Test R2:  0.3901930522812673
XG Boost (learning rate =  0.06999999999999999 ) Train R2:  0.9944145744795003
XG Boost (learning rate =  0.06999999999999999 ) Test R2:  0.3839234965577798
XG Boost (learning rate =  0.075 ) Train R2:  0.996575609450063
XG Boost (learning rate =  0.075 ) Test R2:  0.4170027528616582
XG Boost (learning rate =  0.08 ) Train R2:  0.9975892304042269
XG Boost (learning rate =  0.08 ) Test R2:  0.386707034626112
XG Boost (learning rate =  0.08499999999999999 ) Train R2:  0.9977076538978837
XG Boost (learning rate =  0.08499999999999999 ) Test R2:  0.3785232305442783
XG Boost (learning rate =  0.09 ) Train R2:  0.9986296414364447
XG Boost (learning rate =  0.09 ) Test R2:  0.38405263015327107
XG Boost (learning rate =  0.095 ) Train R2:  0.9

In [15]:
# Create an empty dictionary to store XGBRegressor instances
gb_mod_10 = {}

# Loop through the range and create/update dictionary entries
for num in range(6):
    var_name = f'gb_mod_10_{num}'  # Construct variable name dynamically
    gb_mod_10[var_name] = XGBRegressor(learning_rate=0.03 + (0.005 * num), n_estimators=500, max_depth=10, random_state=965, alpha=0.5)
    gb_mod_10[var_name].fit(X_train, y_train_o_3)
    print("XG Boost (learning rate = ",0.03 + (0.005 * num),") Train R2: ", gb_mod_10[var_name].score(X_train, y_train_o_3))
    print("XG Boost (learning rate = ",0.03 + (0.005 * num),") Test R2: ", gb_mod_10[var_name].score(X_test, y_test_o_3))

XG Boost (learning rate =  0.03 ) Train R2:  0.9433097596538988
XG Boost (learning rate =  0.03 ) Test R2:  0.40809563188110953
XG Boost (learning rate =  0.034999999999999996 ) Train R2:  0.9612023862859818
XG Boost (learning rate =  0.034999999999999996 ) Test R2:  0.39582213162308477
XG Boost (learning rate =  0.04 ) Train R2:  0.9717119064619312
XG Boost (learning rate =  0.04 ) Test R2:  0.3824954207086242
XG Boost (learning rate =  0.045 ) Train R2:  0.9791250198357623
XG Boost (learning rate =  0.045 ) Test R2:  0.37735883996126607
XG Boost (learning rate =  0.05 ) Train R2:  0.984966851453586
XG Boost (learning rate =  0.05 ) Test R2:  0.39701698950565123
XG Boost (learning rate =  0.055 ) Train R2:  0.9887290673336463
XG Boost (learning rate =  0.055 ) Test R2:  0.40310401730129986


In [16]:
lr = 0.035 # just optimized for this
md = 10
est = 200
alp = 0.5

# Create an empty dictionary to store XGBRegressor instances
gb_mod_10 = {}

# Loop through the range and create/update dictionary entries
for num in range(10):
    var_name = f'gb_mod_10_{num}'  # Construct variable name dynamically
    gb_mod_10[var_name] = XGBRegressor(learning_rate=lr, n_estimators=est + (50 * num), max_depth=md, random_state=965, alpha=alp)
    gb_mod_10[var_name].fit(X_train, y_train_o_3)
    print("XG Boost ( estimators = ",est + (50 * num),") Train R2: ", gb_mod_10[var_name].score(X_train, y_train_o_3))
    print("XG Boost ( estimators = ",est + (50 * num),") Test R2: ", gb_mod_10[var_name].score(X_test, y_test_o_3))

XG Boost ( estimators =  200 ) Train R2:  0.8567536331760716
XG Boost ( estimators =  200 ) Test R2:  0.3991756596763064
XG Boost ( estimators =  250 ) Train R2:  0.8893196358390982
XG Boost ( estimators =  250 ) Test R2:  0.39786930957781286
XG Boost ( estimators =  300 ) Train R2:  0.9097475156772324
XG Boost ( estimators =  300 ) Test R2:  0.39593591179862986
XG Boost ( estimators =  350 ) Train R2:  0.9282470238718458
XG Boost ( estimators =  350 ) Test R2:  0.3959080218298021
XG Boost ( estimators =  400 ) Train R2:  0.9390811327196601
XG Boost ( estimators =  400 ) Test R2:  0.396128983367496
XG Boost ( estimators =  450 ) Train R2:  0.9520749442368077
XG Boost ( estimators =  450 ) Test R2:  0.397595264384737
XG Boost ( estimators =  500 ) Train R2:  0.9612023862859818
XG Boost ( estimators =  500 ) Test R2:  0.39582213162308477
XG Boost ( estimators =  550 ) Train R2:  0.9685696705768135
XG Boost ( estimators =  550 ) Test R2:  0.3967559358525582
XG Boost ( estimators =  600 ) 

In [17]:
lr = 0.035  #optimized
md = 10     #optiimized
est = 450   #optimized
alp = 0.0
#lam = 1

# Create an empty dictionary to store XGBRegressor instances
gb_mod_10 = {}

# Loop through the range and create/update dictionary entries
for num in range(6):
    var_name = f'gb_mod_10_{num}'  # Construct variable name dynamically
    gb_mod_10[var_name] = XGBRegressor(learning_rate=lr, n_estimators=est, max_depth=md, random_state=965, alpha=alp + (0.2*num))
    gb_mod_10[var_name].fit(X_train, y_train_o_3)
    print("XG Boost ( alpha = ", alp + (0.2*num),") Train R2: ", gb_mod_10[var_name].score(X_train, y_train_o_3))
    print("XG Boost ( alpha = ", alp + (0.2*num),") Test R2: ", gb_mod_10[var_name].score(X_test, y_test_o_3))

XG Boost ( alpha =  0.0 ) Train R2:  0.9496583873610573
XG Boost ( alpha =  0.0 ) Test R2:  0.39972961634160575
XG Boost ( alpha =  0.2 ) Train R2:  0.9482064740930605
XG Boost ( alpha =  0.2 ) Test R2:  0.3985010602174768
XG Boost ( alpha =  0.4 ) Train R2:  0.9471028329323509
XG Boost ( alpha =  0.4 ) Test R2:  0.40273923779366894
XG Boost ( alpha =  0.6000000000000001 ) Train R2:  0.9470239506732545
XG Boost ( alpha =  0.6000000000000001 ) Test R2:  0.3897839514401895
XG Boost ( alpha =  0.8 ) Train R2:  0.9514188453584278
XG Boost ( alpha =  0.8 ) Test R2:  0.3977853757733085
XG Boost ( alpha =  1.0 ) Train R2:  0.9522787815608128
XG Boost ( alpha =  1.0 ) Test R2:  0.3862685104777397


In [18]:
lr = 0.035  #optimized
md = 10     #optiimized
est = 450   #optimized
alp = 0.6   #optimized
lam = 0.25

# Create an empty dictionary to store XGBRegressor instances
gb_mod_10 = {}

# Loop through the range and create/update dictionary entries
for num in range(8):
    var_name = f'gb_mod_10_{num}'  # Construct variable name dynamically
    gb_mod_10[var_name] = XGBRegressor(learning_rate=lr, n_estimators=est, max_depth=md, random_state=965, alpha=alp, reg_lambda = lam + (0.25*num))
    gb_mod_10[var_name].fit(X_train, y_train_o_3)
    print("XG Boost ( lambda = ", lam + (0.25*num),") Train R2: ", gb_mod_10[var_name].score(X_train, y_train_o_3))
    print("XG Boost ( lambda = ", lam + (0.25*num),") Test R2: ", gb_mod_10[var_name].score(X_test, y_test_o_3))

XG Boost ( lambda =  0.25 ) Train R2:  0.9643088746716435
XG Boost ( lambda =  0.25 ) Test R2:  0.37763983325360284
XG Boost ( lambda =  0.5 ) Train R2:  0.9620527970863152
XG Boost ( lambda =  0.5 ) Test R2:  0.3644175061372895
XG Boost ( lambda =  0.75 ) Train R2:  0.9548315058380666
XG Boost ( lambda =  0.75 ) Test R2:  0.3862444708903645
XG Boost ( lambda =  1.0 ) Train R2:  0.9470239506732545
XG Boost ( lambda =  1.0 ) Test R2:  0.3897839514401895
XG Boost ( lambda =  1.25 ) Train R2:  0.9519147075608531
XG Boost ( lambda =  1.25 ) Test R2:  0.3797754918516003
XG Boost ( lambda =  1.5 ) Train R2:  0.9463730134200795
XG Boost ( lambda =  1.5 ) Test R2:  0.3873681506595742
XG Boost ( lambda =  1.75 ) Train R2:  0.9447155922972836
XG Boost ( lambda =  1.75 ) Test R2:  0.3808718982752074
XG Boost ( lambda =  2.0 ) Train R2:  0.9406035545645792
XG Boost ( lambda =  2.0 ) Test R2:  0.3989889706772416


In [19]:
lr = 0.035  #optimized
md = 10     #optiimized
est = 450   #optimized
alp = 0.6   #optimized
lam = 1

# Create an empty dictionary to store XGBRegressor instances
gb_mod_10 = {}

# Loop through the range and create/update dictionary entries
for num in range(1):
    var_name = f'gb_mod_10_{num}'  # Construct variable name dynamically
    gb_mod_10[var_name] = XGBRegressor(learning_rate=lr, n_estimators=est, max_depth=md, random_state=965, alpha=alp)
    gb_mod_10[var_name].fit(X_train, y_train_o_3)
    print("XG Boost ( lambda = ", lam ,") Train R2: ", gb_mod_10[var_name].score(X_train, y_train_o_3))
    print("XG Boost ( lambda = ", lam ,") Test R2: ", gb_mod_10[var_name].score(X_test, y_test_o_3))

XG Boost ( lambda =  1 ) Train R2:  0.9470239506732545
XG Boost ( lambda =  1 ) Test R2:  0.3897839514401895


In [20]:
X_train.head()

Unnamed: 0,TrueVerticalDepth_FT,MeasuredDepth_FT,UpperPerforation_FT,LowerPerforation_FT,PerforationInterval_FT,LateralLength_FT,ProppantLoad_LBSPerGAL,ProppantIntensity_LBSPerFT,TotalProppant_LBS,TotalWaterPumped_GAL,WaterIntensity_GALPerFT,TotalFluidPumped_BBL,FluidIntensity_BBLPerFT,AcidVolume_BBL,OilTest_Method_FLOWING,OilTest_Method_GAS LIFT,OilTest_Method_PUMPING,FractureStages,AvgStageSpacing_FT,InitialProductionDateNum,DrillingStartDateNum
9443,7205.0,17756.0,7504.0,17653.0,10149.0,10307.0,1.005714,1983.571429,12797900.0,12606148.0,1650.285714,299715.428571,39.142857,695.285714,0.0,0.0,1.0,,,1569888000.0,1561766000.0
12236,8936.0,19601.0,8928.0,19550.0,10622.0,10424.0,0.9,2501.0,26565430.0,29591200.0,2786.0,704552.0,66.0,21.0,0.0,1.0,0.0,,,1617235000.0,1581206000.0
5052,8466.0,19199.0,8800.0,19021.0,10221.0,10487.0,0.89,1480.0,15123220.0,17028800.0,1666.0,405448.0,40.0,1333.0,0.0,1.0,0.0,63.0,166.0,1512086000.0,1502237000.0
12777,8835.0,19553.0,9081.0,19458.0,10377.0,10504.0,0.99,1514.0,15705600.0,15901284.0,1532.0,378602.0,36.0,469.0,0.0,0.0,1.0,,,1625098000.0,1594166000.0
6439,9314.0,14163.0,9670.0,14074.0,4404.0,4650.0,0.7,1583.0,6970653.0,9930660.0,2255.0,236444.0,54.0,1286.142857,0.0,0.0,1.0,,,1530403000.0,1519776000.0


## We have the optimized values below. Now we need to create a model for all time periods and all elements

In [21]:
# Great this works

list_of_train_columns=[
y_train_o_3,
y_train_o_6,
y_train_o_9,
y_train_o_12,
y_train_o_cum,
y_train_o_peak,
y_train_w_3,
y_train_w_6,
y_train_w_9,
y_train_w_12,
y_train_w_cum,
y_train_w_peak,
y_train_g_3,
y_train_g_6,
y_train_g_9,
y_train_g_12,
y_train_g_cum,
y_train_g_peak
]

list_of_test_columns = [
y_test_o_3,
y_test_o_6,
y_test_o_9,
y_test_o_12,
y_test_o_cum,
y_test_o_peak,
y_test_w_3,
y_test_w_6,
y_test_w_9,
y_test_w_12,
y_test_w_cum,
y_test_w_peak,
y_test_g_3,
y_test_g_6,
y_test_g_9,
y_test_g_12,
y_test_g_cum,
y_test_g_peak
]

list_of_y_calib_columns = [
y_calib_o_3,
y_calib_o_6,
y_calib_o_9,
y_calib_o_12,
y_calib_o_cum,
y_calib_o_peak,
y_calib_w_3,
y_calib_w_6,
y_calib_w_9,
y_calib_w_12,
y_calib_w_cum,
y_calib_w_peak,
y_calib_g_3,
y_calib_g_6,
y_calib_g_9,
y_calib_g_12,
y_calib_g_cum,
y_calib_g_peak    
]

list_of_y_new_columns = [
y_new_o_3,
y_new_o_6,
y_new_o_9,
y_new_o_12,
y_new_o_cum,
y_new_o_peak,
y_new_w_3,
y_new_w_6,
y_new_w_9,
y_new_w_12,
y_new_w_cum,
y_new_w_peak,
y_new_g_3,
y_new_g_6,
y_new_g_9,
y_new_g_12,
y_new_g_cum,
y_new_g_peak 
]

In [22]:
display_list_columns=[
'o_3',
'o_6',
'o_9',
'o_12',
'o_cum',
'o_peak',
'w_3',
'w_6',
'w_9',
'w_12',
'w_cum',
'w_peak',
'g_3',
'g_6',
'g_9',
'g_12',
'g_cum',
'g_peak'
]

In [23]:
# Create a list of tuples by zipping train_list and test_list
data_tuples = []
for i in range(min(len(list_of_train_columns), len(list_of_test_columns))):
    data_tuples.append((list_of_train_columns[i], list_of_test_columns[i]))

In [24]:
# Loop to create our final models
lr = 0.035  #optimized
md = 10     #optiimized
est = 450   #optimized
alp = 0.6   #optimized
lam = 1     #optimized

# Create an empty dictionary to store XGBRegressor instances
boosted_models_list = {}
y_pred = {}
# Loop through the range and create/update dictionary entries
for i in range(min(len(list_of_train_columns), len(list_of_test_columns))):
    var_name = f'xgb_mod_{display_list_columns[i]}'  # Construct variable name dynamically
    boosted_models_list[var_name] = XGBRegressor(learning_rate=lr, n_estimators=est, max_depth=md, random_state=965, alpha=alp)
    train_ref = data_tuples[i][0]
    test_ref = data_tuples[i][1]
    boosted_models_list[var_name].fit(X_train, train_ref)
    y_pred[i] = boosted_models_list[var_name].predict(X_test)
    print("XG Boost ( train set = ", display_list_columns[i],") Train R2: ", boosted_models_list[var_name].score(X_train, train_ref))
    print("XG Boost ( test set = ", display_list_columns[i],") Test R2: ", boosted_models_list[var_name].score(X_test, test_ref))

XG Boost ( train set =  o_3 ) Train R2:  0.9470239506732545
XG Boost ( test set =  o_3 ) Test R2:  0.3897839514401895
XG Boost ( train set =  o_6 ) Train R2:  0.9567971259485112
XG Boost ( test set =  o_6 ) Test R2:  0.44254937146942275
XG Boost ( train set =  o_9 ) Train R2:  0.9597185762764434
XG Boost ( test set =  o_9 ) Test R2:  0.4605571708281735
XG Boost ( train set =  o_12 ) Train R2:  0.9607609597324354
XG Boost ( test set =  o_12 ) Test R2:  0.4808869868618745
XG Boost ( train set =  o_cum ) Train R2:  0.9569279803767942
XG Boost ( test set =  o_cum ) Test R2:  0.39443193255661024
XG Boost ( train set =  o_peak ) Train R2:  0.9555697360528334
XG Boost ( test set =  o_peak ) Test R2:  0.31668587018285155
XG Boost ( train set =  w_3 ) Train R2:  0.9511878339142115
XG Boost ( test set =  w_3 ) Test R2:  0.40253536746506857
XG Boost ( train set =  w_6 ) Train R2:  0.9545163278407647
XG Boost ( test set =  w_6 ) Test R2:  0.42533082561123314
XG Boost ( train set =  w_9 ) Train R2:

In [25]:
print("Shape of X_train:", X_train.shape)
print("Shape of train_ref:", train_ref.shape)

Shape of X_train: (11940, 21)
Shape of train_ref: (11940,)


In [26]:
list_of_y_calib_columns[2]

258       56494.0
3450     129882.0
15077    164244.0
3098     126674.0
9749     241967.0
3022      37957.0
6024     174607.0
1051      50142.0
8109      73308.0
7386      33013.0
11734    199472.0
15523    189172.0
13251    233745.0
3285     160975.0
1345     109935.0
13713    191237.0
13705    190302.0
15098    208906.0
13177    161126.0
6846      94064.0
12413     76591.0
75        41316.0
13490    189819.0
5211      94120.0
1057      53771.0
14439    121618.0
11850    116288.0
434       18119.0
8499     137536.0
1992     124115.0
9556     191835.0
11885    200754.0
2884      74856.0
4211      92618.0
1881     157110.0
12925    164354.0
4547     133584.0
770       33205.0
8353      74387.0
3498      97884.0
1277      19883.0
11602    113848.0
11758     31841.0
11973     90349.0
6069      91459.0
14595    212886.0
15591    121069.0
7590     154268.0
14499     69695.0
5981      79257.0
91        30516.0
13247    176058.0
1429      21256.0
13021     72965.0
2431      68575.0
456      1

In [27]:
print(type(boosted_models_list))

<class 'dict'>


In [28]:
list(boosted_models_list.items())[1][1]

XGBRegressor(alpha=0.6, base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.035, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=10, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=450, n_jobs=None,
             num_parallel_tree=None, ...)

### Performing Conformal Predictions

In [29]:
# pip install mapie

In [30]:
from mapie.regression import MapieRegressor

In [32]:
# Need to create the mapie regressor from the gb_mods
# This somehow then needs to look through associated x and ys on the fit
n=0
mapie_reg_list = {}
for guy in display_list_columns:
    var_name = f'mapie_reg_{guy}'
    mapie_reg_list[var_name] = MapieRegressor(estimator=list(boosted_models_list.items())[n][1],cv="prefit").fit(X_calib, list_of_y_calib_columns[n])
    # y_pred[n], y_pis = mapie_reg_list[var_name].predict(X_new,alpha=1/3)
    n = n+1

In [33]:
type(boosted_models_list)

dict

In [34]:
first_key = next(iter(boosted_models_list))

In [46]:
boosted_models_list[first_key]

XGBRegressor(alpha=0.6, base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.035, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=10, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=450, n_jobs=None,
             num_parallel_tree=None, ...)

In [36]:
for key, value in boosted_models_list.items():
    # Do something with key and value
    print(key, value)  # For example, print the key and value

xgb_mod_o_3 XGBRegressor(alpha=0.6, base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.035, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=10, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=450, n_jobs=None,
             num_parallel_tree=None, ...)
xgb_mod_o_6 XGBRegressor(alpha=0.6, base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical

In [48]:
list(mapie_reg_list.items())[2]

('mapie_reg_o_9',
 MapieRegressor(cv='prefit',
                estimator=XGBRegressor(alpha=0.6, base_score=None, booster=None,
                                       callbacks=None, colsample_bylevel=None,
                                       colsample_bynode=None,
                                       colsample_bytree=None, device=None,
                                       early_stopping_rounds=None,
                                       enable_categorical=False,
                                       eval_metric=None, feature_types=None,
                                       gamma=None, grow_policy=None,
                                       importance_type=None,
                                       interaction_constraints=None,
                                       learning_rate=0.035, max_bin=None,
                                       max_cat_threshold=None,
                                       max_cat_to_onehot=None,
                                       max_delta

In [117]:
# Need to loop through the pis as well
n = 0
al = 1/2
y_pis = {}
for guy in display_list_columns:
    y_pred[n],y_pis[n] = list(mapie_reg_list.items())[n][1].predict(X_new,alpha=al)
    n = n+1

In [118]:
for i in range(60,65):
    print(f"Well {i+1}")
    #print(X_new_o_3.iloc[i])  # Print the features of the i-th well
    print(f"Predicted Oil 12 Months: {y_pred[3][i]:.2f}")  # Print the predicted oil
    print(f"Actual Oil 12 Months: {y_test_o_12.iloc[i]}") # Print the actual oil
    #interval = y_pis[i].flatten()  # Flatten the interval if it's not already a 1D array
    print(f"{al*100}% interval: [{float(y_pis[3][i][0]):.2f};{float(y_pis[3][i][1]):.2f}]")  # Print the 67% prediction interval
    if (float(y_pis[3][i][0]) <= y_test_o_12.iloc[i] <= float(y_pis[3][i][1])):
        print("In Range")
    else:
        print("Out of Range")
    print("---") # Separate the Well data
    

Well 61
Predicted Oil 12 Months: 169592.05
Actual Oil 12 Months: 169692.0
50.0% interval: [143572.18;195611.91]
In Range
---
Well 62
Predicted Oil 12 Months: 83600.21
Actual Oil 12 Months: 106220.0
50.0% interval: [57580.34;109620.08]
In Range
---
Well 63
Predicted Oil 12 Months: 228560.75
Actual Oil 12 Months: 226964.0
50.0% interval: [202540.88;254580.62]
In Range
---
Well 64
Predicted Oil 12 Months: 90485.12
Actual Oil 12 Months: 113044.0
50.0% interval: [64465.26;116504.99]
In Range
---
Well 65
Predicted Oil 12 Months: 205151.84
Actual Oil 12 Months: 84157.0
50.0% interval: [179131.98;231171.71]
Out of Range
---


In [119]:
for i in range(60,65):
    print(f"Well {i+1}")
    #print(X_new_o_3.iloc[i])  # Print the features of the i-th well
    print(f"Predicted Oil 9 Months: {y_pred[2][i]:.2f}")  # Print the predicted oil
    print(f"Actual Oil 9 Months: {y_test_o_9.iloc[i]}") # Print the actual oil
    #interval = y_pis[i].flatten()  # Flatten the interval if it's not already a 1D array
    print(f"{al*100}% interval: [{float(y_pis[2][i][0]):.2f};{float(y_pis[2][i][1]):.2f}]")  # Print the 67% prediction interval
    if (float(y_pis[2][i][0]) <= y_test_o_9.iloc[i] <= float(y_pis[2][i][1])):
        print("In Range")
    else:
        print("Out of Range")
        
    print("---") # Separate the Well data
    

Well 61
Predicted Oil 9 Months: 149274.42
Actual Oil 9 Months: 160868.0
50.0% interval: [125675.89;172872.95]
In Range
---
Well 62
Predicted Oil 9 Months: 70921.02
Actual Oil 9 Months: 86384.0
50.0% interval: [47322.49;94519.55]
In Range
---
Well 63
Predicted Oil 9 Months: 204431.22
Actual Oil 9 Months: 188187.0
50.0% interval: [180832.69;228029.75]
In Range
---
Well 64
Predicted Oil 9 Months: 79148.86
Actual Oil 9 Months: 94273.0
50.0% interval: [55550.33;102747.39]
In Range
---
Well 65
Predicted Oil 9 Months: 170137.25
Actual Oil 9 Months: 73289.0
50.0% interval: [146538.72;193735.78]
Out of Range
---
