In [3]:
import pandas as pd
import numpy as np

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import pickle

# -----------------------------
# File path
# -----------------------------
file_path = "cleaned_wells_final.xlsx"

# -----------------------------
# Column mapping
# -----------------------------
# Maps each raw header variant seen in the workbook onto one canonical name.
column_map = {
    "Depth": "Depth",
    "Depth(m)": "Depth",
    "Bit Weight(klb)": "WOB",
    "Rotary RPM(RPM)": "RPM",
    "RPM(RPM)": "RPM",
    "Flow In Rate(galUS/min)": "Flow",
    "ROP - Average(m/hr)": "ROP"
}

# Canonical columns a sheet must provide (after renaming) to be kept.
# Defined once here because it does not change between sheets.
required = ["Depth", "WOB", "RPM", "Flow", "ROP"]

# -----------------------------
# Read all sheets and clean
# -----------------------------
all_wells = []

# Open the workbook once, parse every sheet from that same handle, and let the
# context manager close it afterwards (instead of re-reading the file from
# disk for each sheet and leaving the handle open).
with pd.ExcelFile(file_path) as excel:
    for sheet in excel.sheet_names:
        df = excel.parse(sheet)
        # astype(str) keeps .str.strip() safe when a header is not a string
        df.columns = df.columns.astype(str).str.strip()

        # Rename columns (keys that are absent from the sheet are ignored)
        df = df.rename(columns=column_map)

        # Two raw headers can map to the same canonical name (e.g. both RPM
        # variants); keep the first occurrence so column labels stay unique.
        # NOTE(review): confirm "first wins" is the right choice if any sheet
        # really contains both variants.
        df = df.loc[:, ~df.columns.duplicated()]

        # Keep only required columns; report skipped sheets instead of
        # dropping them silently.
        missing = [col for col in required if col not in df.columns]
        if missing:
            print(f"Skipping sheet '{sheet}': missing columns {missing}")
            continue

        df = df[required].dropna()
        df = df[(df > 0).all(axis=1)]  # remove negative / zero values

        # Store the well (sheet) name; assign() returns a new frame, which
        # avoids chained-assignment warnings on the filtered data.
        df = df.assign(Well=sheet)

        all_wells.append(df)

# Fail with a clear message rather than pandas' generic concat error.
if not all_wells:
    raise ValueError(
        f"No sheet in {file_path!r} contains all required columns: {required}"
    )

# Combine all wells into one dataframe
global_df = pd.concat(all_wells, ignore_index=True)
print("Total samples across all wells:", len(global_df))


Total samples across all wells: 7292


In [8]:
# Target is ROP; predictors are depth plus the three drilling parameters
y = global_df["ROP"]
X = global_df[["Depth", "WOB", "RPM", "Flow"]]

# Hold out 20 % of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Random Forest: configure and train in one step (fit returns the estimator)
rf_model = RandomForestRegressor(
    n_jobs=-1,
    random_state=42,
    min_samples_leaf=2,
    max_depth=None,
    n_estimators=400,
).fit(X_train, y_train)

# -----------------------------
# Evaluate on the held-out rows
# -----------------------------
y_pred = rf_model.predict(X_test)

r2, mse, mae = (
    metric(y_test, y_pred)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)

print(f"R² Score : {r2:.4f}")
print(f"MSE      : {mse:.4f}")
print(f"MAE      : {mae:.4f}")

# -----------------------------
# Persist the trained model
# -----------------------------
with open("rf_model_depth_global.pkl", "wb") as model_file:
    pickle.dump(rf_model, model_file)

print("✅ Model saved as rf_model_depth_global.pkl")


R² Score : 0.5420
MSE      : 65.4646
MAE      : 3.2204
✅ Model saved as rf_model_depth_global.pkl


In [9]:
# One hand-written operating point, laid out with the same column names and
# order the forward model was trained on.
single_test = pd.DataFrame(
    [{"Depth": 1405, "WOB": 8.5, "RPM": 67.2, "Flow": 672}]
)

# predict() returns a length-1 array for a single row; unpack its only value
(predicted_rop,) = rf_model.predict(single_test)
print("Predicted ROP:", round(predicted_rop, 2), "m/hr")


Predicted ROP: 10.62 m/hr


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
import pickle


In [11]:
# Rebuild the combined frame from the per-well frames with a fresh 0..n-1 index
global_df = pd.concat(objs=all_wells, axis=0, ignore_index=True)


In [12]:
# Inverse setup: every model below shares the same two inputs (Depth, ROP)
X = global_df[["Depth", "ROP"]]

# One target series per drilling parameter to be predicted
y_wob, y_rpm, y_flow = (global_df[name] for name in ("WOB", "RPM", "Flow"))


In [13]:
def train_and_save_model(X, y, model_name):
    """Fit a random forest on an 80/20 split, report hold-out scores, pickle it.

    Parameters
    ----------
    X : feature table passed to ``train_test_split``.
    y : target values aligned with ``X``.
    model_name : str
        Used both as the label in the printed messages and as the path of the
        pickle file the fitted model is written to.

    Returns
    -------
    None. Results are printed and the model is written to ``model_name``.
    """
    features_fit, features_eval, target_fit, target_eval = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    forest_settings = dict(
        n_estimators=400,
        max_depth=None,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1,
    )
    # fit() returns the estimator itself, so construction and training chain
    forest = RandomForestRegressor(**forest_settings).fit(features_fit, target_fit)

    predictions = forest.predict(features_eval)

    print(f"{model_name} R²:", r2_score(target_eval, predictions))
    print(f"{model_name} MAE:", mean_absolute_error(target_eval, predictions))

    with open(model_name, "wb") as out_file:
        pickle.dump(forest, out_file)

    print(f"✅ Saved {model_name}")


In [14]:
# Train and pickle one inverse model per drilling parameter, in the same order
for target, filename in (
    (y_wob, "rf_model_rop_to_wob.pkl"),
    (y_rpm, "rf_model_rop_to_rpm.pkl"),
    (y_flow, "rf_model_rop_to_flow.pkl"),
):
    train_and_save_model(X, target, filename)


rf_model_rop_to_wob.pkl R²: 0.45627659065259696
rf_model_rop_to_wob.pkl MAE: 4.082917353427782
✅ Saved rf_model_rop_to_wob.pkl
rf_model_rop_to_rpm.pkl R²: 0.32716732750299615
rf_model_rop_to_rpm.pkl MAE: 13.96438571829606
✅ Saved rf_model_rop_to_rpm.pkl
rf_model_rop_to_flow.pkl R²: 0.2906272011338503
rf_model_rop_to_flow.pkl MAE: 32.04507152141063
✅ Saved rf_model_rop_to_flow.pkl


In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# Stack the per-well frames into a single table with a fresh index
global_df = pd.concat(objs=all_wells, axis=0, ignore_index=True)

# Shared inputs for all ANN experiments
X = global_df[["Depth", "ROP"]]

# One target series per drilling parameter
y_wob, y_rpm, y_flow = (global_df[name] for name in ("WOB", "RPM", "Flow"))


def train_and_test_ann(X, y, model_name):
    """Fit a two-hidden-layer MLP on a 90/10 split and print hold-out scores.

    Parameters
    ----------
    X : feature table passed to ``train_test_split``.
    y : target values aligned with ``X``.
    model_name : str
        Heading printed above the scores.

    Returns
    -------
    None. R² and MAE on the held-out 10 % are printed; nothing is saved.
    """
    features_fit, features_eval, target_fit, target_eval = train_test_split(
        X, y, test_size=0.1, random_state=42
    )

    mlp_settings = dict(
        hidden_layer_sizes=(64, 64),
        activation="relu",
        solver="adam",
        learning_rate_init=0.001,
        max_iter=500,
        random_state=42,
    )
    # The inputs are fed to the network as-is (no scaling step in this variant)
    network = MLPRegressor(**mlp_settings).fit(features_fit, target_fit)

    predictions = network.predict(features_eval)

    print(f"\n{model_name}")
    print("R²:", r2_score(target_eval, predictions))
    print("MAE:", mean_absolute_error(target_eval, predictions))


# Evaluate the plain ANN once per drilling parameter
for target, heading in (
    (y_wob, "ANN: ROP → WOB"),
    (y_rpm, "ANN: ROP → RPM"),
    (y_flow, "ANN: ROP → Flow"),
):
    train_and_test_ann(X, target, heading)



ANN: ROP → WOB
R²: 0.021364602583086145
MAE: 5.744929467467702

ANN: ROP → RPM
R²: -0.028006369211563742
MAE: 19.35465312862436

ANN: ROP → Flow
R²: 0.04959027550156481
MAE: 38.06050505331637


In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [18]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

def train_and_test_deep_ann(X, y, model_name):
    """Fit a scaled three-hidden-layer MLP on an 80/20 split and print scores.

    The estimator is a two-step pipeline: ``StandardScaler`` followed by an
    ``MLPRegressor`` with early stopping enabled.

    Parameters
    ----------
    X : feature table passed to ``train_test_split``.
    y : target values aligned with ``X``.
    model_name : str
        Heading printed above the scores.

    Returns
    -------
    None. R² and MAE on the held-out 20 % are printed; nothing is saved.
    """
    features_fit, features_eval, target_fit, target_eval = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    network = MLPRegressor(
        hidden_layer_sizes=(128, 64, 32),
        activation="relu",
        solver="adam",
        learning_rate_init=0.001,
        alpha=1e-4,              # L2 regularization
        max_iter=800,
        early_stopping=True,
        n_iter_no_change=20,
        random_state=42,
    )
    # Scaling lives inside the pipeline, so it is fitted on the training
    # portion only and re-applied automatically at predict time.
    pipeline = Pipeline(steps=[("scaler", StandardScaler()), ("ann", network)])
    pipeline.fit(features_fit, target_fit)

    predictions = pipeline.predict(features_eval)

    print(f"\n{model_name}")
    print("R²:", r2_score(target_eval, predictions))
    print("MAE:", mean_absolute_error(target_eval, predictions))


# Evaluate the scaled, deeper ANN once per drilling parameter
for target, heading in (
    (y_wob, "Deep ANN: ROP → WOB"),
    (y_rpm, "Deep ANN: ROP → RPM"),
    (y_flow, "Deep ANN: ROP → Flow"),
):
    train_and_test_deep_ann(X, target, heading)



Deep ANN: ROP → WOB
R²: 0.08378435623404012
MAE: 5.288436006641427

Deep ANN: ROP → RPM
R²: 0.08735173763596515
MAE: 18.577320070151192

Deep ANN: ROP → Flow
R²: 0.18800091367388638
MAE: 36.06422873295022


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
import pickle
import numpy as np

def train_rf_with_leaf_tuning(
    X,
    y,
    base_model_name,
    leaf_values=(2, 5, 10, 20, 30, 50),
    test_size=0.2,
    random_state=42,
    n_estimators=400,
):
    """Sweep ``min_samples_leaf`` for a random forest and pickle the best model.

    For each candidate leaf size a forest is trained on the training split,
    train/test R² and MAE are printed together with the R² gap, and the model
    with the highest test R² is written to
    ``f"{base_model_name}_leaf{best_leaf}.pkl"``.

    NOTE: the winner is chosen using the same test split that is reported, so
    the printed "Best Test R²" is an optimistic estimate of generalisation.

    Parameters
    ----------
    X : feature table passed to ``train_test_split``.
    y : target values aligned with ``X``.
    base_model_name : str
        Prefix of the pickle file name.
    leaf_values : iterable of int, optional
        Candidate ``min_samples_leaf`` values; defaults to the original grid.
    test_size : float, optional
        Hold-out fraction passed to ``train_test_split``.
    random_state : int, optional
        Seed used for both the split and every forest.
    n_estimators : int, optional
        Number of trees in every forest.

    Returns
    -------
    None. Metrics are printed and the best model is pickled.

    Raises
    ------
    ValueError
        If ``leaf_values`` is empty.
    RuntimeError
        If no candidate produced a comparable (non-NaN) test R², in which case
        nothing is written.
    """
    # Validate before any expensive work; also lets generators be passed in.
    leaf_values = list(leaf_values)
    if not leaf_values:
        raise ValueError("leaf_values must contain at least one candidate")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    best_model = None
    best_r2 = -np.inf
    best_leaf = None

    for leaf in leaf_values:
        model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=None,
            min_samples_leaf=leaf,
            random_state=random_state,
            n_jobs=-1
        )

        model.fit(X_train, y_train)

        # Predictions on both splits so the train/test gap can be reported
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        train_r2 = r2_score(y_train, y_train_pred)
        test_r2 = r2_score(y_test, y_test_pred)

        train_mae = mean_absolute_error(y_train, y_train_pred)
        test_mae = mean_absolute_error(y_test, y_test_pred)

        print(f"\nmin_samples_leaf = {leaf}")
        print(f"Train R²: {train_r2:.4f} | Test R²: {test_r2:.4f}")
        print(f"Train MAE: {train_mae:.4f} | Test MAE: {test_mae:.4f}")
        print(f"Overfit gap (R²): {train_r2 - test_r2:.4f}")

        # Strict '>' keeps the first candidate on ties; a NaN score never wins
        if test_r2 > best_r2:
            best_r2 = test_r2
            best_model = model
            best_leaf = leaf

    # Without this guard a run where every score is NaN would pickle None
    # into a file named "<base>_leafNone.pkl".
    if best_model is None:
        raise RuntimeError(
            "No candidate produced a valid test R²; no model was saved"
        )

    # Save best model only
    model_name = f"{base_model_name}_leaf{best_leaf}.pkl"
    with open(model_name, "wb") as f:
        pickle.dump(best_model, f)

    print(f"\n✅ Best model saved: {model_name}")
    print(f"Best Test R²: {best_r2:.4f}")


In [22]:
# Run the leaf-size sweep once per drilling parameter, in the same order
for target, prefix in (
    (y_wob, "rf_rop_to_wob"),
    (y_rpm, "rf_rop_to_rpm"),
    (y_flow, "rf_rop_to_flow"),
):
    train_rf_with_leaf_tuning(X, target, prefix)



min_samples_leaf = 2
Train R²: 0.8297 | Test R²: 0.4563
Train MAE: 2.2690 | Test MAE: 4.0829
Overfit gap (R²): 0.3735

min_samples_leaf = 5
Train R²: 0.6035 | Test R²: 0.3884
Train MAE: 3.2944 | Test MAE: 4.2546
Overfit gap (R²): 0.2151

min_samples_leaf = 10
Train R²: 0.4126 | Test R²: 0.3019
Train MAE: 3.9386 | Test MAE: 4.4130
Overfit gap (R²): 0.1106

min_samples_leaf = 20
Train R²: 0.2882 | Test R²: 0.2420
Train MAE: 4.3900 | Test MAE: 4.5710
Overfit gap (R²): 0.0462

min_samples_leaf = 30
Train R²: 0.2355 | Test R²: 0.2062
Train MAE: 4.6310 | Test MAE: 4.6898
Overfit gap (R²): 0.0293

min_samples_leaf = 50
Train R²: 0.1876 | Test R²: 0.1689
Train MAE: 4.8079 | Test MAE: 4.7662
Overfit gap (R²): 0.0187

✅ Best model saved: rf_rop_to_wob_leaf2.pkl
Best Test R²: 0.4563

min_samples_leaf = 2
Train R²: 0.8018 | Test R²: 0.3272
Train MAE: 7.6693 | Test MAE: 13.9644
Overfit gap (R²): 0.4746

min_samples_leaf = 5
Train R²: 0.6242 | Test R²: 0.3386
Train MAE: 10.9707 | Test MAE: 14.2407
