# Project Geminae MidPoint Model
## Gradient Boosted Regression Model for 3 and 6 month projections

Tom Gregg

2024-02-25

## Setting Up The Model

In [None]:
# Import Basic Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
from datetime import datetime

In [None]:
# Importing Libraries and Packages to perform Boosted Tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
from xgboost import XGBRegressor

In [None]:
# Disable truncation when DataFrames are displayed: show every column and every row.
for option_name in ("display.max_columns", "display.max_rows"):
    pd.set_option(option_name, None)

## Importing and Preparing Data

In [None]:
# Location of the cleaned well-data CSV (raw file hosted on GitHub).
file_path = 'https://raw.githubusercontent.com/tbgregg000/Capstone/main/Cleaned_GenericWellData.csv'
# read_csv already returns a brand-new DataFrame, so the extra .copy() only doubled memory use.
df = pd.read_csv(file_path)


In [None]:
# Second, independent load of the same CSV. `dff` is used by the describe/histogram cells,
# while `df` is truncated further down. read_csv returns a new frame, so no .copy() is needed.
dff = pd.read_csv(file_path)

In [None]:
# Preview the first 10 rows of the loaded frame.
df.head(10)

In [None]:
# Column names, dtypes and non-null counts for the frame as loaded.
df.info()

In [None]:
# Keep only the first 44 columns (positions 0-43); per the original note, the columns
# after that are all 9+ month production figures.
df = df.iloc[:, :44]
# Drop the column sitting at position 26 (NOTE(review): selected by position, not by name -
# confirm which column this is). A non-inplace drop avoids mutating the iloc slice above,
# which can trigger SettingWithCopyWarning.
# Re-running this cell without reloading `df` will drop a further column.
df = df.drop(columns=df.columns[26])
# info() prints its report itself and returns None, so wrapping it in print() added a stray "None".
df.info()

In [None]:
# Independent copy of the truncated frame; targets and features below are taken from this copy.
df_cleaned = df.copy()

In [None]:
# Target series, one per fluid (water / gas / oil) and per horizon (3 or 6 months).
# Naming scheme: y_<fluid initial>_<months>.
y_w_3, y_g_3, y_o_3 = (
    df_cleaned[target_col]
    for target_col in ('First3MonthWater_BBL', 'First3MonthGas_MCF', 'First3MonthOil_BBL')
)
y_w_6, y_g_6, y_o_6 = (
    df_cleaned[target_col]
    for target_col in ('First6MonthWater_BBL', 'First6MonthGas_MCF', 'First6MonthOil_BBL')
)

In [None]:
# Feature matrix: the first 26 columns of df_cleaned (the non-production columns),
# without the "Well Index" identifier.
X = df_cleaned.iloc[:, :26].drop(columns="Well Index")

# Date cleanup: each date string column is renamed to '<name>Num' and replaced by the
# number of seconds since 1970-01-01.
# The conversion is done with pandas instead of datetime.strptime(...).timestamp():
# timestamp() interprets naive datetimes in the machine's local time zone, so the feature
# values changed from machine to machine (and it can fail for pre-1970 dates on Windows).
# The values now differ from the old ones by the local UTC offset only.
columns_to_change = ['InitialProductionDate', 'DrillingStartDate', 'DrillingCompletionDate']
epoch = pd.Timestamp("1970-01-01")
one_second = pd.Timedelta(seconds=1)

for col in columns_to_change:
    new_name = col + 'Num'
    # rename() keeps the column in its original position, so feature order is unchanged.
    X = X.rename(columns={col: new_name})
    parsed_dates = pd.to_datetime(X[new_name], format="%Y-%m-%d")
    X[new_name] = (parsed_dates - epoch) / one_second

# Dropping a few unnecessary columns (one call instead of six).
X = X.drop(columns=[
    'InitialProductionMonth',
    'DrillingCompletionDateNum',
    'DrillingDuration_DAYS',
    'ProductionMonthsCount',
    'YearOfDrilling',
    'InitialProductionYear',
])

# Dummy variables for OilTest_Method; drop_first=True leaves out the first category.
dummy_vars = pd.get_dummies(X['OilTest_Method'], prefix='OilTest_Method', drop_first=True)

# Swap the original categorical column for its dummy columns.
X = pd.concat([X.drop(columns="OilTest_Method"), dummy_vars], axis=1)

# Remaining object columns hold numbers stored as text: strip commas and spaces,
# then cast to float. regex=False makes the literal replacement explicit.
for col in X.columns:
    if pd.api.types.is_object_dtype(X[col]):
        X[col] = (
            X[col]
            .str.replace(',', '', regex=False)
            .str.replace(' ', '', regex=False)
            .astype(float)
        )

In [None]:
# Preview the first rows of the prepared feature matrix.
X.head()

In [None]:
# Dtypes and non-null counts of the prepared feature matrix.
X.info()

In [None]:
# Creating the test and train split using seed 99.
# train_test_split accepts several arrays at once and applies the same row partition to
# all of them, so a single call replaces the six separate calls that each re-split X and
# overwrote X_train / X_test. The outputs come back as (train, test) pairs in the same
# order as the inputs.
(
    X_train, X_test,
    y_train_w_3, y_test_w_3,
    y_train_g_3, y_test_g_3,
    y_train_o_3, y_test_o_3,
    y_train_w_6, y_test_w_6,
    y_train_g_6, y_test_g_6,
    y_train_o_6, y_test_o_6,
) = train_test_split(
    X, y_w_3, y_g_3, y_o_3, y_w_6, y_g_6, y_o_6,
    test_size=0.2, random_state=99,
)


## Boosted Tree Model

Scikit-learn reference:

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html#sklearn-ensemble-gradientboostingregressor

### Doing a GridSearchCV


In [None]:
# # Define the parameter grid
# param_grid = {
#     'learning_rate': [0.01, 0.75, 0.1, 0.25],
#     'n_estimators': [300, 400, 500, 750],
#     'max_depth': [5, 7, 9, 11],
#     'alpha': [0.1, 0.5, 0.75, 0.999]
# }
# gb_mod_t = GradientBoostingRegressor(random_state=99)
# grid_search = GridSearchCV(estimator=gb_mod_t, param_grid=param_grid, cv = 2, scoring='r2')
# # Fit the grid search to your data


In [None]:
# grid_search.fit(X_train, y_train_w_3)

In [None]:
# Get the best model and its parameters
# best_model = grid_search.best_estimator_
# best_params = grid_search.best_params_

# Print the best parameters and score
# print("Best parameters:", best_params)
# print("Best score:", grid_search.best_score_)

In [None]:
# pd.DataFrame(grid_search.cv_results_)

### Doing a Much Faster RandomizedSearchCV

In [None]:
# Define distributions for hyperparameters
# from scipy.stats import uniform, randint
# param_dist = {
#     'learning_rate': uniform(0.05, 0.80),
#     'n_estimators': randint(300, 1000),
#     'max_depth': randint(5, 13),
#     'alpha': uniform(0.2, 0.8)
# }

In [None]:
# # Specify the number of iterations for random search
# n_iter_search = 10

# # Create the RandomizedSearchCV object
# random_search = RandomizedSearchCV(estimator=gb_mod_t, param_distributions=param_dist, n_iter=n_iter_search, cv=5)

In [None]:
# random_search.fit(X_train, y_train_w_3)

In [None]:
# best_model = random_search.best_estimator_
# best_params = random_search.best_params_
# best_score = random_search.best_score_

# # Print the best parameters and score
# print("Best parameters:", best_params)
# print("Best score:", best_score)

In [None]:
# pd.DataFrame(random_search.cv_results_)

### We will do Water First

In [None]:
# Water, 3-month target: learning_rate=0.1, 300 trees, max_depth=7.
# NOTE(review): per the scikit-learn docs, `alpha` is only used by loss='huber'/'quantile';
# with the default loss used here it should have no effect - confirm before tuning it.
gb_mod_0 = GradientBoostingRegressor(learning_rate=0.1, n_estimators=300, max_depth=7, random_state=99, alpha=0.99)
gb_mod_0.fit(X_train, y_train_w_3)
# The label is built from the fitted model so it cannot drift from the real settings
# (the old "default parameters" label was wrong: this model does not use the defaults).
gb_label_0 = f"Water 3M Gradient Boost (learning_rate={gb_mod_0.learning_rate}, n_estimators={gb_mod_0.n_estimators}, max_depth={gb_mod_0.max_depth})"
print(f"{gb_label_0} Train R2: ", gb_mod_0.score(X_train, y_train_w_3))
print(f"{gb_label_0} Test R2: ", gb_mod_0.score(X_test, y_test_w_3))

In [None]:
# Water, 3-month target: much smaller learning rate (0.01) with the same 300 trees, max_depth=7.
# NOTE(review): per the scikit-learn docs, `alpha` is only used by loss='huber'/'quantile';
# with the default loss used here it should have no effect - confirm before tuning it.
gb_mod_1 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=300, max_depth=7, random_state=99, alpha=0.99)
gb_mod_1.fit(X_train, y_train_w_3)
# The label is built from the fitted model; the old "default parameters" label was wrong.
gb_label_1 = f"Water 3M Gradient Boost (learning_rate={gb_mod_1.learning_rate}, n_estimators={gb_mod_1.n_estimators}, max_depth={gb_mod_1.max_depth})"
print(f"{gb_label_1} Train R2: ", gb_mod_1.score(X_train, y_train_w_3))
print(f"{gb_label_1} Test R2: ", gb_mod_1.score(X_test, y_test_w_3))

In [None]:
# Water, 3-month target: very large learning rate (1.0), 300 trees, max_depth=7.
# NOTE(review): per the scikit-learn docs, `alpha` is only used by loss='huber'/'quantile';
# with the default loss used here it should have no effect - confirm before tuning it.
gb_mod_2 = GradientBoostingRegressor(learning_rate=1, n_estimators=300, max_depth=7, random_state=99, alpha=0.99)
gb_mod_2.fit(X_train, y_train_w_3)
# The label is built from the fitted model; the old "default parameters" label was wrong.
gb_label_2 = f"Water 3M Gradient Boost (learning_rate={gb_mod_2.learning_rate}, n_estimators={gb_mod_2.n_estimators}, max_depth={gb_mod_2.max_depth})"
print(f"{gb_label_2} Train R2: ", gb_mod_2.score(X_train, y_train_w_3))
print(f"{gb_label_2} Test R2: ", gb_mod_2.score(X_test, y_test_w_3))

In [None]:
# Water, 3-month target: deeper trees (max_depth=9), learning_rate=0.1, 300 trees.
# NOTE(review): per the scikit-learn docs, `alpha` is only used by loss='huber'/'quantile';
# with the default loss used here it should have no effect - confirm before tuning it.
gb_mod_3 = GradientBoostingRegressor(learning_rate=0.1, n_estimators=300, max_depth=9, random_state=99, alpha=0.99)
gb_mod_3.fit(X_train, y_train_w_3)
# The label is built from the fitted model; the old "default parameters" label was wrong.
gb_label_3 = f"Water 3M Gradient Boost (learning_rate={gb_mod_3.learning_rate}, n_estimators={gb_mod_3.n_estimators}, max_depth={gb_mod_3.max_depth})"
print(f"{gb_label_3} Train R2: ", gb_mod_3.score(X_train, y_train_w_3))
print(f"{gb_label_3} Test R2: ", gb_mod_3.score(X_test, y_test_w_3))

In [None]:
# Water, 3-month target: learning_rate=0.075, 500 trees, max_depth=9.
# NOTE(review): per the scikit-learn docs, `alpha` is only used by loss='huber'/'quantile';
# with the default loss used here it should have no effect - confirm before tuning it.
gb_mod_4 = GradientBoostingRegressor(learning_rate=0.075, n_estimators=500, max_depth=9, random_state=99, alpha=0.99)
gb_mod_4.fit(X_train, y_train_w_3)
# The label is built from the fitted model; the old "default parameters" label was wrong.
gb_label_4 = f"Water 3M Gradient Boost (learning_rate={gb_mod_4.learning_rate}, n_estimators={gb_mod_4.n_estimators}, max_depth={gb_mod_4.max_depth})"
print(f"{gb_label_4} Train R2: ", gb_mod_4.score(X_train, y_train_w_3))
print(f"{gb_label_4} Test R2: ", gb_mod_4.score(X_test, y_test_w_3))

In [None]:
# Water, 3-month target: learning_rate=0.075, 500 trees, max_depth=9.
# NOTE(review): these are exactly the hyperparameters (and seed) of gb_mod_4, so this
# refit looks redundant - confirm whether a different setting was intended. It is kept
# as a separate fitted model because the tree-visualisation cells reference gb_mod_5.
# NOTE(review): per the scikit-learn docs, `alpha` is only used by loss='huber'/'quantile';
# with the default loss used here it should have no effect - confirm before tuning it.
gb_mod_5 = GradientBoostingRegressor(learning_rate=0.075, n_estimators=500, max_depth=9, random_state=99, alpha=0.99)
gb_mod_5.fit(X_train, y_train_w_3)
# The label is built from the fitted model; the old "default parameters" label was wrong.
gb_label_5 = f"Water 3M Gradient Boost (learning_rate={gb_mod_5.learning_rate}, n_estimators={gb_mod_5.n_estimators}, max_depth={gb_mod_5.max_depth})"
print(f"{gb_label_5} Train R2: ", gb_mod_5.score(X_train, y_train_w_3))
print(f"{gb_label_5} Test R2: ", gb_mod_5.score(X_test, y_test_w_3))

In [None]:
# Water, 3-month target: small ensemble (50 trees), learning_rate=0.1, max_depth=8.
# NOTE(review): per the scikit-learn docs, `alpha` is only used by loss='huber'/'quantile';
# with the default loss used here it should have no effect - confirm before tuning it.
gb_mod_6 = GradientBoostingRegressor(learning_rate=0.1, n_estimators=50, max_depth=8, random_state=99, alpha=0.99)
gb_mod_6.fit(X_train, y_train_w_3)
# The label is built from the fitted model; the old "default parameters" label was wrong.
gb_label_6 = f"Water 3M Gradient Boost (learning_rate={gb_mod_6.learning_rate}, n_estimators={gb_mod_6.n_estimators}, max_depth={gb_mod_6.max_depth})"
print(f"{gb_label_6} Train R2: ", gb_mod_6.score(X_train, y_train_w_3))
print(f"{gb_label_6} Test R2: ", gb_mod_6.score(X_test, y_test_w_3))

### Oil Models (3-Month)

In [None]:
# Oil, 3-month target: learning_rate=0.075, 400 trees, max_depth=7.
# NOTE(review): per the scikit-learn docs, `alpha` is only used by loss='huber'/'quantile';
# with the default loss used here it should have no effect - confirm before tuning it.
gb_mod_7 = GradientBoostingRegressor(learning_rate=0.075, n_estimators=400, max_depth=7, random_state=99, alpha=0.5)
gb_mod_7.fit(X_train, y_train_o_3)
# The label is built from the fitted model; the old "default parameters" label was wrong.
gb_label_7 = f"Oil 3M Gradient Boost (learning_rate={gb_mod_7.learning_rate}, n_estimators={gb_mod_7.n_estimators}, max_depth={gb_mod_7.max_depth})"
print(f"{gb_label_7} Train R2: ", gb_mod_7.score(X_train, y_train_o_3))
print(f"{gb_label_7} Test R2: ", gb_mod_7.score(X_test, y_test_o_3))

In [None]:
# Oil, 3-month target: learning_rate=0.075, 500 trees, max_depth=10.
# NOTE(review): per the scikit-learn docs, `alpha` is only used by loss='huber'/'quantile';
# with the default loss used here it should have no effect - confirm before tuning it.
gb_mod_8 = GradientBoostingRegressor(learning_rate=0.075, n_estimators=500, max_depth=10, random_state=99, alpha=0.5)
gb_mod_8.fit(X_train, y_train_o_3)
# The label is built from the fitted model; the old "default parameters" label was wrong.
gb_label_8 = f"Oil 3M Gradient Boost (learning_rate={gb_mod_8.learning_rate}, n_estimators={gb_mod_8.n_estimators}, max_depth={gb_mod_8.max_depth})"
print(f"{gb_label_8} Train R2: ", gb_mod_8.score(X_train, y_train_o_3))
print(f"{gb_label_8} Test R2: ", gb_mod_8.score(X_test, y_test_o_3))

In [None]:
# Oil, 3-month target: learning_rate=0.1, 300 trees, max_depth=8.
# NOTE(review): per the scikit-learn docs, `alpha` is only used by loss='huber'/'quantile';
# with the default loss used here it should have no effect - confirm before tuning it.
gb_mod_9 = GradientBoostingRegressor(learning_rate=0.1, n_estimators=300, max_depth=8, random_state=99, alpha=0.5)
gb_mod_9.fit(X_train, y_train_o_3)
# The label is built from the fitted model; the old "default parameters" label was wrong.
gb_label_9 = f"Oil 3M Gradient Boost (learning_rate={gb_mod_9.learning_rate}, n_estimators={gb_mod_9.n_estimators}, max_depth={gb_mod_9.max_depth})"
print(f"{gb_label_9} Train R2: ", gb_mod_9.score(X_train, y_train_o_3))
print(f"{gb_label_9} Test R2: ", gb_mod_9.score(X_test, y_test_o_3))

In [None]:
# Oil, 3-month target: learning_rate=0.1, 500 trees, max_depth=10.
# NOTE(review): per the scikit-learn docs, `alpha` is only used by loss='huber'/'quantile';
# with the default loss used here it should have no effect - confirm before tuning it.
gb_mod_10 = GradientBoostingRegressor(learning_rate=0.1, n_estimators=500, max_depth=10, random_state=99, alpha=0.5)
gb_mod_10.fit(X_train, y_train_o_3)
# The label is built from the fitted model; the old "default parameters" label was wrong.
gb_label_10 = f"Oil 3M Gradient Boost (learning_rate={gb_mod_10.learning_rate}, n_estimators={gb_mod_10.n_estimators}, max_depth={gb_mod_10.max_depth})"
print(f"{gb_label_10} Train R2: ", gb_mod_10.score(X_train, y_train_o_3))
print(f"{gb_label_10} Test R2: ", gb_mod_10.score(X_test, y_test_o_3))

In [None]:
# Oil, 3-month target: learning_rate=0.075, 400 trees, max_depth=10.
# This is the model evaluated and charted in the cells that follow.
# NOTE(review): per the scikit-learn docs, `alpha` is only used by loss='huber'/'quantile';
# with the default loss used here it should have no effect - confirm before tuning it.
gb_mod_11 = GradientBoostingRegressor(learning_rate=0.075, n_estimators=400, max_depth=10, random_state=99, alpha=0.5)
gb_mod_11.fit(X_train, y_train_o_3)
# The label is built from the fitted model; the old "default parameters" label was wrong.
gb_label_11 = f"Oil 3M Gradient Boost (learning_rate={gb_mod_11.learning_rate}, n_estimators={gb_mod_11.n_estimators}, max_depth={gb_mod_11.max_depth})"
print(f"{gb_label_11} Train R2: ", gb_mod_11.score(X_train, y_train_o_3))
print(f"{gb_label_11} Test R2: ", gb_mod_11.score(X_test, y_test_o_3))

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Held-out evaluation of gb_mod_11 against the 3-month oil target.
y_pred = gb_mod_11.predict(X_test)
y_test = y_test_o_3

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is taken as the square root of the MSE rather than via
# mean_squared_error(..., squared=False): that keyword is deprecated in recent
# scikit-learn releases and removed in newer ones, where the call raises a TypeError.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {round(mae,2)}")
print(f"Mean Squared Error (MSE): {round(mse,2)}")
print(f"Root Mean Squared Error (RMSE): {round(rmse,2)}")
print(f"R-squared (R²): {round(r2,6)}")

## Let's Make Some Charts

In [None]:
# Horizontal bar chart of gb_mod_11's feature importances.
feature_names = X_train.columns
importances = gb_mod_11.feature_importances_

# sorted_idx ranks features from most to least important; the bars are fed in the
# reverse (ascending) order so that the most important feature ends up at the top.
sorted_idx = np.argsort(importances)[::-1]
ascending_idx = sorted_idx[::-1]
sorted_names = [feature_names[pos] for pos in ascending_idx]
sorted_importances = importances[ascending_idx]

fig, ax = plt.subplots(figsize=(10, 6))  # Adjust figure size as needed
ax.barh(sorted_names, sorted_importances)
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Feature Name')
ax.set_title('Feature Importance for Gradient Boosting Model')
# Tilt the numeric x tick labels; the feature names sit on the y axis.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=8)
plt.setp(ax.get_yticklabels(), fontsize=8)
fig.tight_layout()
plt.show()

In [None]:
from sklearn.tree import plot_tree

# Choose the tree index to visualize (between 0 and number of trees - 1)
tree_index = 4  # Change this to the desired tree index

# Extract the tree object from the model.
# For a regressor, estimators_ is a 2-D array of shape (n_estimators, 1) per the
# scikit-learn docs, so indexing with tree_index alone returned a length-1 array
# rather than the fitted tree itself.
tree = gb_mod_5.estimators_[tree_index, 0]

In [None]:
from sklearn.tree import export_graphviz, plot_tree

# export_graphviz / plot_tree operate on a single decision tree, not on the whole boosted
# ensemble, so pick one fitted tree out of gb_mod_5 (estimators_ has shape (n_estimators, 1)).
single_tree = gb_mod_5.estimators_[tree_index, 0]
tree_feature_names = list(X_train.columns)  # a plain list is accepted by every sklearn version

# Write the Graphviz description to disk; render it externally with `dot -Tpng tree.dot`.
export_graphviz(
        single_tree,
        out_file="tree.dot",
        feature_names=tree_feature_names,
        impurity=False,
        rounded=True,
        filled=True
    )

# Inline preview with matplotlib. The previous `Source.from_file("tree.dot")` relied on a
# `Source` name that was never imported. Only the top levels are drawn (max_depth=3) so
# that the figure stays legible.
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(
    single_tree,
    feature_names=tree_feature_names,
    impurity=False,
    rounded=True,
    filled=True,
    max_depth=3,
    fontsize=8,
    ax=ax,
)
ax.set_title(f"gb_mod_5, tree {tree_index} (top levels)")
plt.show()

In [None]:
# Summary statistics for dff, the second load of the CSV.
dff.describe()

In [None]:
# First rows of df; by this point df has been cut to the leading columns (see the truncation cell).
df.head()

In [None]:
# Overlaid histograms of the two depth columns in dff.
var1 = dff['TrueVerticalDepth_FT']
var2 = dff['MeasuredDepth_FT']

fig, ax = plt.subplots()
# Semi-transparent bars so both distributions stay visible where they overlap.
for depth_values, series_label in ((var1, 'Vertical Depth'), (var2, 'Full Measured Length')):
    ax.hist(depth_values, bins='auto', alpha=0.5, label=series_label)
ax.set_xlabel('Feet')
ax.set_ylabel('Frequency')
ax.set_title('Frequency Distribution of Well Depth')
ax.legend()
ax.grid(False)
plt.show()

In [None]:
# Histogram of the CumOil_BBL column in dff.
var1 = dff['CumOil_BBL']

# Create the plot
plt.hist(var1, bins='auto', alpha=0.5)
plt.xlabel('Barrels of Oil')
plt.ylabel('Frequency')
plt.title('Frequency Distribution of Oil Production in Barrels')
# No plt.legend(): the single series carries no label, so the call only produced a
# "no artists with labels" warning and drew nothing.
plt.grid(False)
plt.show()

In [None]:
# Histogram of the ProductionMonthsCount column in dff.
var1 = dff['ProductionMonthsCount']

# Create the plot
plt.hist(var1, bins='auto', alpha=0.5)
plt.xlabel('Number of Months')
plt.ylabel('Frequency')
plt.title('Frequency Distribution of Production Timeline per Well')
# No plt.legend(): the single series carries no label, so the call only produced a
# "no artists with labels" warning and drew nothing.
plt.grid(False)
plt.show()

In [None]:
# PCA on the standardised training features, plus an explained-variance chart.
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

std_train_array = scaler.transform(X_train)
std_test_array = scaler.transform(X_test)

# Build fresh float frames from the scaled arrays. Writing floats into copies of X via
# `frame[:] = array` forces pandas to cast into the original column dtypes, which warns
# (or fails) on newer pandas when a column is not float.
std_x_train = pd.DataFrame(std_train_array, index=X_train.index, columns=X_train.columns)
std_x_test = pd.DataFrame(std_test_array, index=X_test.index, columns=X_test.columns)

# Apply PCA, keeping as many components as there are features.
pca = PCA(n_components=len(X_train.columns))
pca.fit(std_x_train)

# Share of variance explained by each principal component, and the running total.
explained_variance_ratio = np.array(pca.explained_variance_ratio_)
cumulative_explained_variance = explained_variance_ratio.cumsum()

# Component numbers, starting at 1.
components = range(1, len(explained_variance_ratio) + 1)

# Creating the plot
plt.figure(figsize=(10, 6))
plt.bar(components, explained_variance_ratio, alpha=0.5, label='Individual explained variance')
plt.plot(components, cumulative_explained_variance, marker='o', linestyle='-', color='r', label='Cumulative explained variance')

plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('PCA Explained Variance')
# Label ticks PC1..PCn. Principal components are combinations of all features, so
# labelling them with the original column names (as before) was misleading.
plt.xticks(components, [f"PC{number}" for number in components], rotation=45, fontsize=8, ha='right')
plt.legend(loc='best')

plt.show()
