# Project Geminae MidPoint Model
## Gradient Boosted Regression Model for 3 and 6 month projections

Tom Gregg

2024-02-25

## Setting Up The Model

In [1]:
# Import Basic Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
from datetime import datetime

In [2]:
# Importing Libraries and Packages to perform Boosted Tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
from xgboost import XGBRegressor

In [3]:
# Max Display 
pd.options.display.max_columns = None
pd.options.display.max_rows = None

## Importing and Preparing Data

In [4]:
# Creating our file path for the CSV
file_path = 'https://raw.githubusercontent.com/tbgregg000/Capstone/main/Cleaned_GenericWellData.csv'
df = pd.read_csv(file_path).copy()


In [5]:
dff = pd.read_csv(file_path).copy()

In [6]:
df.head(10)

Unnamed: 0,Well Index,InitialProductionDate,DrillingStartDate,DrillingCompletionDate,TrueVerticalDepth_FT,MeasuredDepth_FT,InitialProductionYear,UpperPerforation_FT,LowerPerforation_FT,PerforationInterval_FT,LateralLength_FT,FractureStages,AvgStageSpacing_FT,ProppantLoad_LBSPerGAL,ProppantIntensity_LBSPerFT,TotalProppant_LBS,TotalWaterPumped_GAL,WaterIntensity_GALPerFT,TotalFluidPumped_BBL,FluidIntensity_BBLPerFT,AcidVolume_BBL,YearOfDrilling,DrillingDuration_DAYS,ProductionMonthsCount,InitialProductionMonth,OilTest_Method,First36MonthWater_BBL,First3MonthProd_BOEPer1000FT,First3MonthGas_MCF,First3MonthGas_MCFPer1000FT,First3MonthProd_MCFE,First3MonthProd_MCFEPer1000FT,First3MonthOil_BBL,First3MonthOil_BBLPer1000FT,First3MonthWater_BBL,First6MonthProd_BOE,First6MonthProd_BOEPer1000FT,First6MonthGas_MCF,First6MonthGas_MCFPer1000FT,First6MonthProd_MCFE,First6MonthProd_MCFEPer1000FT,First6MonthOil_BBL,First6MonthOil_BBLPer1000FT,First6MonthWater_BBL,First9MonthProd_BOE,First9MonthProd_BOEPer1000FT,First9MonthGas_MCF,First9MonthGas_MCFPer1000FT,First9MonthProd_MCFE,First9MonthProd_MCFEPer1000FT,First9MonthOil_BBL,First9MonthOil_BBLPer1000FT,First9MonthWater_BBL,First12MonthProd_BOE,First12MonthProd_BOEPer1000FT,First12MonthGas_MCF,First12MonthGas_MCFPer1000FT,First12MonthProd_MCFE,First12MonthProd_MCFEPer1000FT,First12MonthOil_BBL,First12MonthOil_BBLPer1000FT,First12MonthWater_BBL,First36MonthProd_BOE,First36MonthProd_BOEPer1000FT,First36MonthGas_MCFPer1000FT,First36MonthProd_MCFE,First36MonthProd_MCFEPer1000FT,First36MonthOil_BBLPer1000FT,First36MonthWaterProductionBBLPer1000Ft,PeakProd_BOE,PeakProd_BOEPer1000FT,PeakGas_MCF,PeakGas_MCFPer1000FT,PeakProd_MCFE,PeakProd_MCFEPer1000FT,PeakOil_BBL,PeakOil_BBLPer1000FT,PeakWater_BBL,CumProd_BOE,CumProd_BOEPer1000FT,CumGas_MCF,CumGas_MCFPer1000FT,CumProd_MCFE,CumProd_MCFEPer1000FT,CumOil_BBL,CumOil_BBLPer1000FT,CumWater_BBL
0,19,2011-08-01,2011-05-21,2011-08-16,9261.0,15153.0,2011,9441.0,15050.0,5609.0,5786.0,39.0,215.0,0.55,764.0,4283838.0,7722388.0,1377.0,183866.0,33.0,417.0,2011,17.0,147,2011 / 08,GAS LIFT,184345.0,7110.0,56955,10154.0,239271,42658.0,30386,5417.0,59745,76729,13680.0,130580,23280.0,460376,82078.0,54966,9800.0,102241,89776.0,16006.0,162192.0,28916.0,538656.0,96034.0,62744.0,11186.0,115261.0,104446.0,18621.0,190091.0,33890.0,626675.0,111727.0,72764.0,12973.0,131265.0,175084.0,31215.0,68934.0,1050506.0,187289.0,19726.0,32866.0,18939,3377.0,28125,5014.0,113635,20259.0,14701,2621.0,31916,285722,50940.0,770923,137444.0,1714334,305640.0,157235,28033.0,217954
1,20,2011-08-01,2011-05-20,2011-08-11,9357.0,13438.0,2011,9764.0,13320.0,3556.0,3791.0,39.0,215.0,0.5,750.0,2666320.0,5322585.0,1497.0,126728.0,36.0,417.0,2011,17.0,147,2011 / 08,PUMPING,323803.0,7794.0,41472,11663.0,166290,46763.0,20803,5850.0,135634,41144,11570.0,62726,17639.0,246866,69422.0,30690,8630.0,191455,51869.0,14586.0,83317.0,23430.0,311215.0,87518.0,37983.0,10681.0,228034.0,61623.0,17329.0,100815.0,28351.0,369741.0,103977.0,44821.0,12604.0,257872.0,98613.0,27731.0,49650.0,591676.0,166388.0,19456.0,91058.0,10261,2886.0,16274,4576.0,61568,17314.0,7967,2240.0,52906,133831,37635.0,256962,72262.0,802986,225812.0,91004,25592.0,331001
2,21,2011-08-01,2011-04-17,2011-08-03,9284.0,13537.0,2011,9617.0,13440.0,3823.0,3993.0,39.0,215.0,0.42,597.0,2284100.0,5413999.0,1416.0,128905.0,34.0,417.0,2011,17.0,147,2011 / 08,GAS LIFT,68113.0,4889.0,25324,6624.0,112135,29332.0,14468,3785.0,19672,31327,8194.0,50757,13277.0,187962,49166.0,22867,5982.0,30598,39960.0,10453.0,67822.0,17740.0,239761.0,62715.0,28656.0,7496.0,37786.0,45262.0,11839.0,77216.0,20198.0,271571.0,71036.0,32392.0,8473.0,42222.0,90533.0,23681.0,50123.0,543196.0,142086.0,15327.0,17817.0,7901,2067.0,15576,4074.0,47406,12400.0,5305,1388.0,7213,151182,39545.0,378152,98915.0,907089,237272.0,88156,23059.0,81555
3,25,2011-09-01,2011-07-02,2011-09-25,9274.0,13665.0,2011,9879.0,13570.0,3691.0,4208.0,39.0,215.0,0.4,464.0,1712421.0,4284989.0,1161.0,102024.0,28.0,417.0,2011,17.0,146,2011 / 09,PUMPING,57252.0,4899.0,15653,4241.0,108491,29393.0,15473,4192.0,18400,29732,8055.0,28749,7789.0,178395,48332.0,24941,6757.0,26994,37130.0,10060.0,38561.0,10447.0,222779.0,60357.0,30703.0,8318.0,32183.0,46763.0,12669.0,47117.0,12765.0,280577.0,76016.0,38910.0,10542.0,39505.0,73392.0,19884.0,22563.0,440349.0,119303.0,16123.0,15511.0,9281,2514.0,6038,1636.0,55684,15086.0,8278,2243.0,11822,111848,30303.0,150014,40643.0,671090,181818.0,86846,23529.0,77528
4,29,2011-10-01,2011-08-02,2011-10-08,9355.0,13710.0,2011,9797.0,13613.0,3816.0,4054.0,39.0,215.0,0.0,1.0,4400.0,7102499.0,1861.0,169107.0,44.0,417.0,2011,17.0,145,2011 / 10,PUMPING,59050.0,12635.0,55234,14474.0,289294,75811.0,39010,10223.0,21416,70423,18455.0,93313,24453.0,422539,110728.0,54871,14379.0,28917,82063.0,21505.0,127426.0,33393.0,492376.0,129029.0,60825.0,15939.0,31829.0,95676.0,25072.0,160354.0,42021.0,574054.0,150433.0,68950.0,18069.0,35936.0,162800.0,42663.0,85624.0,976802.0,255975.0,28392.0,15474.0,18361,4812.0,25445,6668.0,110165,28869.0,14643,3837.0,10083,269409,70600.0,662306,173560.0,1616454,423599.0,159025,41673.0,114237
5,30,2011-10-01,2011-08-11,2011-10-13,7926.0,15092.0,2011,8120.0,15050.0,6930.0,7010.0,39.0,215.0,0.9,1105.0,7659950.0,8509746.0,1228.0,202613.0,29.0,417.0,2011,17.0,145,2011 / 10,FLOWING,125052.0,740.0,6692,966.0,30779,4441.0,4015,579.0,13499,9235,1333.0,13186,1903.0,55412,7996.0,7038,1016.0,23439,11741.0,1694.0,17513.0,2527.0,70447.0,10165.0,8822.0,1273.0,29167.0,20286.0,2927.0,25113.0,3624.0,121716.0,17564.0,16101.0,2323.0,51835.0,48914.0,7058.0,6989.0,293484.0,42350.0,5893.0,18045.0,21334,3079.0,16885,2437.0,128006,18471.0,18520,2672.0,56531,157268,22694.0,79150,11421.0,943609,136163.0,144077,20790.0,307183
6,36,2011-11-01,2011-09-08,2011-11-11,9290.0,13563.0,2011,9448.0,13460.0,4012.0,3997.0,39.0,215.0,0.63,1122.0,4500000.0,7157999.0,1784.0,170429.0,42.0,417.0,2011,17.0,144,2011 / 11,PUMPING,138736.0,5561.0,12967,3232.0,133873,33368.0,20151,5023.0,43508,38574,9615.0,29210,7281.0,231446,57688.0,33706,8401.0,72224,40982.0,10215.0,32349.0,8063.0,245893.0,61289.0,35591.0,8871.0,76108.0,44799.0,11166.0,35952.0,8961.0,268792.0,66997.0,38807.0,9673.0,82607.0,83813.0,20891.0,22145.0,502876.0,125343.0,17200.0,34580.0,11097,2766.0,6683,1666.0,66581,16595.0,9983,2488.0,21617,142930,35626.0,212021,52847.0,857580,213754.0,107593,26818.0,186385
7,37,2011-11-01,2011-08-29,2011-11-01,8264.0,15937.0,2011,8391.0,15823.0,7432.0,7512.0,39.0,215.0,0.92,1107.0,8227320.0,8961372.0,1206.0,213366.0,29.0,417.0,2011,17.0,144,2011 / 11,FLOWING,104769.0,530.0,2063,278.0,23655,3183.0,3599,484.0,7988,15944,2145.0,8795,1183.0,95662,12872.0,14478,1948.0,34533,24900.0,3350.0,13906.0,1871.0,149397.0,20102.0,22582.0,3038.0,53997.0,33771.0,4544.0,17486.0,2353.0,202624.0,27264.0,30856.0,4152.0,73389.0,50134.0,6746.0,3476.0,300806.0,40474.0,6166.0,14097.0,6155,828.0,3380,455.0,36932,4969.0,5592,752.0,13684,135027,18168.0,38968,5243.0,810163,109010.0,128533,17294.0,221715
8,40,2011-12-01,2011-10-15,2011-12-17,9127.0,13175.0,2011,9440.0,13072.0,3632.0,3791.0,39.0,215.0,0.96,2004.0,15059017.0,6638298.0,1828.0,158055.0,44.0,417.0,2011,17.0,143,2011 / 12,PUMPING,136959.0,3414.0,15641,4307.0,74393,20483.0,9792,2696.0,31831,25144,6923.0,38296,10544.0,150862,41537.0,18761,5165.0,49897,35125.0,9671.0,46363.0,12765.0,210751.0,58026.0,27398.0,7544.0,66895.0,41263.0,11361.0,52223.0,14379.0,247577.0,68166.0,32559.0,8964.0,76813.0,87765.0,24164.0,34946.0,526588.0,144986.0,18340.0,37709.0,6104,1681.0,10192,2806.0,36623,10083.0,5387,1483.0,22858,175899,48430.0,367024,101053.0,1055395,290582.0,114729,31588.0,193234
9,43,2012-01-01,2011-11-28,2012-01-25,9256.0,13613.0,2012,9473.0,13459.0,3986.0,4119.0,39.0,215.0,0.0,1.0,4800.0,6149999.0,1543.0,146429.0,37.0,417.0,2012,17.0,142,2012 / 01,GAS LIFT,89212.0,6212.0,32743,8214.0,148559,37270.0,19303,4843.0,29014,45933,11524.0,65390,16405.0,275598,69141.0,35035,8789.0,46640,48055.0,12056.0,66803.0,16759.0,288331.0,72336.0,36921.0,9263.0,48662.0,61070.0,15321.0,94432.0,23691.0,366420.0,91927.0,45331.0,11373.0,57250.0,124105.0,31135.0,63072.0,744630.0,186811.0,20623.0,22381.0,16713,4193.0,22242,5580.0,100278,25158.0,13006,3263.0,21763,240867,60428.0,673531,168974.0,1445199,362569.0,128611,32266.0,108989


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11557 entries, 0 to 11556
Data columns (total 87 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Well Index                               11557 non-null  int64  
 1   InitialProductionDate                    11557 non-null  object 
 2   DrillingStartDate                        11557 non-null  object 
 3   DrillingCompletionDate                   11557 non-null  object 
 4   TrueVerticalDepth_FT                     11557 non-null  float64
 5   MeasuredDepth_FT                         11557 non-null  float64
 6   InitialProductionYear                    11557 non-null  int64  
 7   UpperPerforation_FT                      11557 non-null  float64
 8   LowerPerforation_FT                      11557 non-null  float64
 9   PerforationInterval_FT                   11557 non-null  float64
 10  LateralLength_FT                         11557

In [8]:
# Dropping Columns After Column Index 43 Since Those Are All 9+ Months
df = df.iloc[:, :44]  # Select columns up to index 42 (excluding 43)  
df.drop(df.columns[26], axis=1, inplace=True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11557 entries, 0 to 11556
Data columns (total 43 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Well Index                     11557 non-null  int64  
 1   InitialProductionDate          11557 non-null  object 
 2   DrillingStartDate              11557 non-null  object 
 3   DrillingCompletionDate         11557 non-null  object 
 4   TrueVerticalDepth_FT           11557 non-null  float64
 5   MeasuredDepth_FT               11557 non-null  float64
 6   InitialProductionYear          11557 non-null  int64  
 7   UpperPerforation_FT            11557 non-null  float64
 8   LowerPerforation_FT            11557 non-null  float64
 9   PerforationInterval_FT         11557 non-null  float64
 10  LateralLength_FT               11557 non-null  float64
 11  FractureStages                 11557 non-null  float64
 12  AvgStageSpacing_FT             11557 non-null 

In [9]:
df_cleaned = df.copy()

In [10]:
# Splitting data into Water, Gas, and Oil 
# Splitting data into 3 month and 6 month
y_w_3 = df_cleaned['First3MonthWater_BBL']
y_g_3 = df_cleaned['First3MonthGas_MCF']
y_o_3 = df_cleaned['First3MonthOil_BBL']
y_w_6 = df_cleaned['First6MonthWater_BBL']
y_g_6 = df_cleaned['First6MonthGas_MCF']
y_o_6 = df_cleaned['First6MonthOil_BBL']

In [11]:
# Creating X using just the non-production columns
X = df_cleaned.iloc[:, :26]
X = X.drop("Well Index", axis=1)

# Date Cleanup
columns_to_change = ['InitialProductionDate','DrillingStartDate','DrillingCompletionDate']

# Loop through specific columns and rename
for col in columns_to_change:
    new_name = col + 'Num'
    X.rename(columns={col: new_name}, inplace=True)
    X[new_name] = X[new_name].apply(lambda x: datetime.strptime(x, "%Y-%m-%d").timestamp())


# Dropping a few unnecessary columns
X = X.drop('InitialProductionMonth', axis = 1)
X = X.drop('DrillingCompletionDateNum', axis = 1)
X = X.drop('DrillingDuration_DAYS', axis = 1)
X = X.drop('ProductionMonthsCount', axis = 1)
X = X.drop('YearOfDrilling', axis = 1)
X = X.drop('InitialProductionYear', axis = 1)


# Dummy Variables for OilTest_Method
# Use pd.get_dummies to create dummy variables
dummy_vars = pd.get_dummies(X['OilTest_Method'], prefix='OilTest_Method', drop_first=True)

# Add the dummy variables as new columns to your DataFrame
X = pd.concat([X.drop("OilTest_Method", axis=1), dummy_vars], axis=1)

# Converting Objects to Ints
for col in X.columns:
    if pd.api.types.is_object_dtype(X[col]):
        X[col] = X[col].str.replace(',', '')
        X[col] = X[col].str.replace(' ', '')
        X[col] = X[col].astype(float)

In [12]:
X.head()

Unnamed: 0,InitialProductionDateNum,DrillingStartDateNum,TrueVerticalDepth_FT,MeasuredDepth_FT,UpperPerforation_FT,LowerPerforation_FT,PerforationInterval_FT,LateralLength_FT,FractureStages,AvgStageSpacing_FT,ProppantLoad_LBSPerGAL,ProppantIntensity_LBSPerFT,TotalProppant_LBS,TotalWaterPumped_GAL,WaterIntensity_GALPerFT,TotalFluidPumped_BBL,FluidIntensity_BBLPerFT,AcidVolume_BBL,OilTest_Method_GAS LIFT,OilTest_Method_PUMPING
0,1312171000.0,1305950000.0,9261.0,15153.0,9441.0,15050.0,5609.0,5786.0,39.0,215.0,0.55,764.0,4283838.0,7722388.0,1377.0,183866.0,33.0,417.0,1,0
1,1312171000.0,1305864000.0,9357.0,13438.0,9764.0,13320.0,3556.0,3791.0,39.0,215.0,0.5,750.0,2666320.0,5322585.0,1497.0,126728.0,36.0,417.0,0,1
2,1312171000.0,1303013000.0,9284.0,13537.0,9617.0,13440.0,3823.0,3993.0,39.0,215.0,0.42,597.0,2284100.0,5413999.0,1416.0,128905.0,34.0,417.0,1,0
3,1314850000.0,1309579000.0,9274.0,13665.0,9879.0,13570.0,3691.0,4208.0,39.0,215.0,0.4,464.0,1712421.0,4284989.0,1161.0,102024.0,28.0,417.0,0,1
4,1317442000.0,1312258000.0,9355.0,13710.0,9797.0,13613.0,3816.0,4054.0,39.0,215.0,0.0,1.0,4400.0,7102499.0,1861.0,169107.0,44.0,417.0,0,1


In [13]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11557 entries, 0 to 11556
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   InitialProductionDateNum    11557 non-null  float64
 1   DrillingStartDateNum        11557 non-null  float64
 2   TrueVerticalDepth_FT        11557 non-null  float64
 3   MeasuredDepth_FT            11557 non-null  float64
 4   UpperPerforation_FT         11557 non-null  float64
 5   LowerPerforation_FT         11557 non-null  float64
 6   PerforationInterval_FT      11557 non-null  float64
 7   LateralLength_FT            11557 non-null  float64
 8   FractureStages              11557 non-null  float64
 9   AvgStageSpacing_FT          11557 non-null  float64
 10  ProppantLoad_LBSPerGAL      11557 non-null  float64
 11  ProppantIntensity_LBSPerFT  11557 non-null  float64
 12  TotalProppant_LBS           11557 non-null  float64
 13  TotalWaterPumped_GAL        115

In [14]:
# Creating the test and train split using seed 99
# Quite nice how we can just use the exact same X set
X_train, X_test, y_train_w_3, y_test_w_3 = train_test_split(X, y_w_3, test_size=0.2, random_state=99)
X_train, X_test, y_train_g_3, y_test_g_3 = train_test_split(X, y_g_3, test_size=0.2, random_state=99)
X_train, X_test, y_train_o_3, y_test_o_3 = train_test_split(X, y_o_3, test_size=0.2, random_state=99)

X_train, X_test, y_train_w_6, y_test_w_6 = train_test_split(X, y_w_6, test_size=0.2, random_state=99)
X_train, X_test, y_train_g_6, y_test_g_6 = train_test_split(X, y_g_6, test_size=0.2, random_state=99)
X_train, X_test, y_train_o_6, y_test_o_6 = train_test_split(X, y_o_6, test_size=0.2, random_state=99)


## Boosted Tree Model

Scikit-learn reference:

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html#sklearn-ensemble-gradientboostingregressor

### Doing a GridSearchCV


In [15]:
# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.75, 0.1, 0.25],
    'n_estimators': [300, 400, 500, 750],
    'max_depth': [5, 7, 9, 11],
    'alpha': [0.1, 0.5, 0.75, 0.999]
}
gb_mod_t = XGBRegressor(random_state=99)
grid_search = GridSearchCV(estimator=gb_mod_t, param_grid=param_grid, cv = 2, scoring='r2')
# Fit the grid search to your data


In [16]:
# Grid Search, which is worth skipping

# grid_search.fit(X_train, y_train_w_3)

KeyboardInterrupt: 

In [17]:
# Get the best model and its parameters
# best_model = grid_search.best_estimator_
# best_params = grid_search.best_params_

# Print the best parameters and score
print("Best parameters:", best_params)
print("Best score:", grid_search.best_score_)

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [None]:
# pd.DataFrame(grid_search.cv_results_)

### Doing a Much faster RandomSearchCV

In [18]:
# Define distributions for hyperparameters
# from scipy.stats import uniform, randint
# param_dist = {
#     'learning_rate': uniform(0.05, 0.80),
#     'n_estimators': randint(300, 1000),
#     'max_depth': randint(5, 13),
#     'alpha': uniform(0.2, 0.8)
# }

In [19]:
# # Specify the number of iterations for random search
# n_iter_search = 10

# # Create the RandomizedSearchCV object
# random_search = RandomizedSearchCV(estimator=gb_mod_t, param_distributions=param_dist, n_iter=n_iter_search, cv=5)

In [20]:
# random_search.fit(X_train, y_train_w_3)

KeyboardInterrupt: 

In [None]:
# best_model = random_search.best_estimator_
# best_score = random_search.best_score_
# # 
# # Print the best parameters and score
# print("Best parameters:", best_params)
# print("Best score:", best_score)

In [None]:
# pd.DataFrame(random_search.cv_results_)

### We will do Water First

In [21]:
gb_mod_0 = XGBRegressor(learning_rate=0.1, n_estimators= 300, max_depth = 7, random_state=99, alpha = 0.99)
gb_mod_0.fit(X_train, y_train_w_3)
print("XG Boost (default parameters) Train R2: ", gb_mod_0.score(X_train, y_train_w_3))
print("XG Boost (default parameters) Test R2: ", gb_mod_0.score(X_test, y_test_w_3))

XG Boost (default parameters) Train R2:  0.9162921065915742
XG Boost (default parameters) Test R2:  0.45921692172857564


In [22]:
gb_mod_1 = XGBRegressor(learning_rate=0.01, n_estimators= 300, max_depth = 7, random_state=99, alpha = 0.99)
gb_mod_1.fit(X_train, y_train_w_3)
print("XG Boost (default parameters) Train R2: ", gb_mod_1.score(X_train, y_train_w_3))
print("XG Boost (default parameters) Test R2: ", gb_mod_1.score(X_test, y_test_w_3))

XG Boost (default parameters) Train R2:  0.549776597784744
XG Boost (default parameters) Test R2:  0.4103800942351259


In [23]:
gb_mod_2 = XGBRegressor(learning_rate=1, n_estimators= 300, max_depth = 7, random_state=99, alpha = 0.99)
gb_mod_2.fit(X_train, y_train_w_3)
print("XG Boost (default parameters) Train R2: ", gb_mod_2.score(X_train, y_train_w_3))
print("XG Boost (default parameters) Test R2: ", gb_mod_2.score(X_test, y_test_w_3))

XG Boost (default parameters) Train R2:  0.9999965412403539
XG Boost (default parameters) Test R2:  0.09361186419741274


In [24]:
gb_mod_3 = XGBRegressor(learning_rate=0.1, n_estimators= 300, max_depth = 9, random_state=99, alpha = 0.99)
gb_mod_3.fit(X_train, y_train_w_3)
print("XG Boost (default parameters) Train R2: ", gb_mod_3.score(X_train, y_train_w_3))
print("XG Boost (default parameters) Test R2: ", gb_mod_3.score(X_test, y_test_w_3))

XG Boost (default parameters) Train R2:  0.9828692848654899
XG Boost (default parameters) Test R2:  0.4668922906015066


In [25]:
gb_mod_4 = XGBRegressor(learning_rate=0.075, n_estimators= 500, max_depth = 9, random_state=99, alpha = 0.99)
gb_mod_4.fit(X_train, y_train_w_3)
print("XG Boost (default parameters) Train R2: ", gb_mod_4.score(X_train, y_train_w_3))
print("XG Boost (default parameters) Test R2: ", gb_mod_4.score(X_test, y_test_w_3))

XG Boost (default parameters) Train R2:  0.9915818216403615
XG Boost (default parameters) Test R2:  0.47351664986561437


In [26]:
gb_mod_5 = XGBRegressor(learning_rate=0.075, n_estimators= 500, max_depth = 9, random_state=99, alpha = 0.99)
gb_mod_5.fit(X_train, y_train_w_3)
print("XG Boost (default parameters) Train R2: ", gb_mod_5.score(X_train, y_train_w_3))
print("XG Boost (default parameters) Test R2: ", gb_mod_5.score(X_test, y_test_w_3))

XG Boost (default parameters) Train R2:  0.9915818216403615
XG Boost (default parameters) Test R2:  0.47351664986561437


In [27]:
gb_mod_6 = XGBRegressor(learning_rate=0.1, n_estimators= 50, max_depth = 8, random_state=99, alpha = 0.99)
gb_mod_6.fit(X_train, y_train_w_3)
print("XG Boost (default parameters) Train R2: ", gb_mod_6.score(X_train, y_train_w_3))
print("XG Boost (default parameters) Test R2: ", gb_mod_6.score(X_test, y_test_w_3))

XG Boost (default parameters) Train R2:  0.6903300872855789
XG Boost (default parameters) Test R2:  0.4329306285765565


### Fucking Oil Man

In [28]:
gb_mod_7 = XGBRegressor(learning_rate=0.075, n_estimators= 400, max_depth = 7, random_state=99, alpha = 0.5)
gb_mod_7.fit(X_train, y_train_o_3)
print("XG Boost (default parameters) Train R2: ", gb_mod_7.score(X_train, y_train_o_3))
print("XG Boost (default parameters) Test R2: ", gb_mod_7.score(X_test, y_test_o_3))

XG Boost (default parameters) Train R2:  0.8904321054796389
XG Boost (default parameters) Test R2:  0.43327355965113146


In [None]:
gb_mod_8 = XGBRegressor(learning_rate=0.075, n_estimators= 500, max_depth = 10, random_state=99, alpha = 0.5)
gb_mod_8.fit(X_train, y_train_o_3)
print("XG Boost (default parameters) Train R2: ", gb_mod_8.score(X_train, y_train_o_3))
print("XG Boost (default parameters) Test R2: ", gb_mod_8.score(X_test, y_test_o_3))

In [None]:
gb_mod_9 = XGBRegressor(learning_rate=0.1, n_estimators= 300, max_depth = 8, random_state=99, alpha = 0.5)
gb_mod_9.fit(X_train, y_train_o_3)
print("XG Boost (default parameters) Train R2: ", gb_mod_9.score(X_train, y_train_o_3))
print("XG Boost (default parameters) Test R2: ", gb_mod_9.score(X_test, y_test_o_3))

In [None]:
gb_mod_10 = XGBRegressor(learning_rate=0.1, n_estimators= 500, max_depth = 10, random_state=99, alpha = 0.5)
gb_mod_10.fit(X_train, y_train_o_3)
print("XG Boost (default parameters) Train R2: ", gb_mod_10.score(X_train, y_train_o_3))
print("XG Boost (default parameters) Test R2: ", gb_mod_10.score(X_test, y_test_o_3))

In [None]:
gb_mod_11 = XGBRegressor(learning_rate=0.075, n_estimators= 400, max_depth = 10, random_state=99, alpha = 0.5)
gb_mod_11.fit(X_train, y_train_o_3)
print("XG Boost (default parameters) Train R2: ", gb_mod_11.score(X_train, y_train_o_3))
print("XG Boost (default parameters) Test R2: ", gb_mod_11.score(X_test, y_test_o_3))

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_pred = gb_mod_11.predict(X_test)
y_test = y_test_o_3

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {round(mae,2)}")
print(f"Mean Squared Error (MSE): {round(mse,2)}")
print(f"Root Mean Squared Error (RMSE): {round(rmse,2)}")
print(f"R-squared (R²): {round(r2,6)}")

## Let's make some fucking charts

In [None]:
feature_names = X_train.columns
# Extract feature importances from the model
importances = gb_mod_11.feature_importances_
# Sort features and importances in descending order of importance
sorted_idx = importances.argsort()[::-1]
sorted_names = [feature_names[i] for i in sorted_idx][::-1]
sorted_importances = importances[sorted_idx][::-1]

# Create the bar plot
plt.figure(figsize=(10, 6))  # Adjust figure size as needed
plt.barh(sorted_names, sorted_importances)
plt.xlabel('Feature Importance')
plt.ylabel('Feature Name')
plt.title('Feature Importance for Gradient Boosting Model')
plt.xticks(rotation=45, ha='right', fontsize = 8)  # Rotate feature names for better readability
plt.yticks(fontsize = 8)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.tree import plot_tree

# Choose the tree index to visualize (between 0 and number of trees - 1)
tree_index = 4  # Change this to the desired tree index

# Extract the tree object from the model
tree = gb_mod_5.estimators_[tree_index]

In [None]:
from sklearn.tree import export_graphviz
export_graphviz(
        gb_mod_5,
        out_file="tree.dot",
        feature_names=X_train.columns,
        impurity=False,
        rounded=True,
        filled=True
    )
Source.from_file("tree.dot")

In [None]:
dff.describe()

In [None]:
df.head()

In [None]:
# Sample data (modify with your actual data)
var1 = dff['TrueVerticalDepth_FT']
var2 = dff['MeasuredDepth_FT']

# Create the plot
plt.hist(var1, bins='auto', alpha=0.5, label='Vertical Depth')
plt.hist(var2, bins='auto', alpha=0.5, label='Full Measured Length')
plt.xlabel('Feet')
plt.ylabel('Frequency')
plt.title('Frequency Distribution of Well Depth')
plt.legend()
plt.grid(False)
plt.show()

In [None]:
# Sample data (modify with your actual data)
var1 = dff['CumOil_BBL']

# Create the plot
plt.hist(var1, bins='auto', alpha=0.5)
plt.xlabel('Barrels of Oil')
plt.ylabel('Frequency')
plt.title('Frequency Distribution of Oil Production in Barrels')
plt.legend()
plt.grid(False)
plt.show()

In [None]:
# Sample data (modify with your actual data)
var1 = dff['ProductionMonthsCount']

# Create the plot
plt.hist(var1, bins='auto', alpha=0.5)
plt.xlabel('Number of Months')
plt.ylabel('Frequency')
plt.title('Frequency Distribution of Production Timeline per Well')
plt.legend()
plt.grid(False)
plt.show()

In [None]:
# Create the bar plot
# new imports
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
scaler = StandardScaler()
scaler.fit(X_train)
std_x_train = X_train.copy()
std_x_test = X_test.copy()

std_train_array = scaler.transform(std_x_train)
std_test_array = scaler.transform(std_x_test)

std_x_train[:] = std_train_array
std_x_test[:] = std_test_array

# Apply PCA
pca = PCA(n_components=len(X_train.columns))
pca.fit(std_x_train[:])


# Example data: Explained variance ratio for each principal component
explained_variance_ratio = np.array(pca.explained_variance_ratio_)

# Cumulative explained variance
cumulative_explained_variance = pca.explained_variance_ratio_.cumsum()

# Number of components
components = range(1, len(explained_variance_ratio) + 1)

# Creating the plot
plt.figure(figsize=(10, 6))
plt.bar(components, explained_variance_ratio, alpha=0.5, label='Individual explained variance')
plt.plot(components, cumulative_explained_variance, marker='o', linestyle='-', color='r', label='Cumulative explained variance')

plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('PCA Explained Variance')
plt.xticks(components, X_train.columns[:pca.n_components_], rotation=45, fontsize = 8, ha='right')
plt.legend(loc='best')

plt.show()
