In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

warnings.filterwarnings("ignore")

%matplotlib inline

In [2]:
df = pd.read_csv("Drilling_parameter.csv", index_col=[0])

In [3]:
df.head()

Unnamed: 0,WOB_klbf,RPM,Torque_kNm,SPP_psi,Flow_Rate_gpm,Mud_Weight_ppg,Formation,ROP_ft_per_hr
0,21.854305,85.91861,11.542642,2345.405988,585.997939,10.074542,Sandstone,143.430049
1,47.782144,135.866133,11.17447,2593.362794,702.716165,10.393743,Sandstone,209.794849
2,37.939727,182.212417,27.656365,1500.935798,680.080465,11.91819,Shale,203.043355
3,31.939632,162.511484,11.238655,2249.748199,376.949952,9.860018,Sandstone,144.515207
4,12.020839,172.918561,11.798743,2143.491966,374.624735,11.978599,Sandstone,132.20183


In [4]:
df.isnull().sum()

WOB_klbf          0
RPM               0
Torque_kNm        0
SPP_psi           0
Flow_Rate_gpm     0
Mud_Weight_ppg    0
Formation         0
ROP_ft_per_hr     0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   WOB_klbf        1000 non-null   float64
 1   RPM             1000 non-null   float64
 2   Torque_kNm      1000 non-null   float64
 3   SPP_psi         1000 non-null   float64
 4   Flow_Rate_gpm   1000 non-null   float64
 5   Mud_Weight_ppg  1000 non-null   float64
 6   Formation       1000 non-null   object 
 7   ROP_ft_per_hr   1000 non-null   float64
dtypes: float64(7), object(1)
memory usage: 70.3+ KB


In [6]:
df.shape

(1000, 8)

In [7]:
df['Formation'].unique()

array(['Sandstone', 'Shale', 'Dolomite', 'Limestone'], dtype=object)

In [8]:
from sklearn.model_selection import train_test_split
X = df.drop(['ROP_ft_per_hr'], axis=1)
y = df['ROP_ft_per_hr']

In [9]:
X.head()

Unnamed: 0,WOB_klbf,RPM,Torque_kNm,SPP_psi,Flow_Rate_gpm,Mud_Weight_ppg,Formation
0,21.854305,85.91861,11.542642,2345.405988,585.997939,10.074542,Sandstone
1,47.782144,135.866133,11.17447,2593.362794,702.716165,10.393743,Sandstone
2,37.939727,182.212417,27.656365,1500.935798,680.080465,11.91819,Shale
3,31.939632,162.511484,11.238655,2249.748199,376.949952,9.860018,Sandstone
4,12.020839,172.918561,11.798743,2143.491966,374.624735,11.978599,Sandstone


In [15]:
# Create Column Transformer with 3 types of transformers
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# Select numerical and object features
num_features = X.select_dtypes(exclude="object").columns
object_features = ['Formation']

# Define transformers
numeric_transformer = StandardScaler()
categorical_transformer = OrdinalEncoder()

# Combine them using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("OrdinalEncoder", categorical_transformer, object_features),
        ("StandardScaler", numeric_transformer, num_features)
    ],
    remainder='passthrough'
)


In [16]:
X=preprocessor.fit_transform(X)

In [17]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6
0,2.0,-0.396301,-1.102179,-0.828489,0.636740,0.271711,-0.362667
1,2.0,1.576957,0.119447,-0.879179,1.069707,1.086025,-0.086590
2,3.0,0.827893,1.252992,1.390047,-0.837822,0.928101,1.231907
3,2.0,0.371251,0.771143,-0.870342,0.469708,-1.186764,-0.548210
4,2.0,-1.144685,1.025681,-0.793229,0.284170,-1.202986,1.284155
...,...,...,...,...,...,...,...
995,3.0,-1.365364,0.513409,1.249067,-0.168565,1.406833,0.643386
996,3.0,1.462568,1.539486,-1.187946,-0.544704,-1.307179,0.016454
997,1.0,-1.210440,-1.499978,-0.662991,-0.334570,-0.579986,0.922623
998,2.0,1.575324,-1.540737,-0.730944,0.138174,1.122001,-0.045576


In [18]:
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((800, 7), (200, 7))

In [19]:
X_train

array([[ 0.        , -1.51992955,  1.1022258 , ...,  0.49557903,
        -0.46615428, -1.30483053],
       [ 3.        ,  1.55658488, -1.31874487, ..., -1.25965087,
         0.32751056, -1.36360895],
       [ 0.        ,  0.41221607, -1.50266862, ..., -1.60739481,
         1.34584718, -0.65387234],
       ...,
       [ 1.        ,  0.97698495,  0.9225677 , ...,  1.13253157,
         0.45812164, -0.14900383],
       [ 3.        ,  0.72164003, -1.63346655, ..., -0.35322496,
        -0.7447836 ,  0.83416898],
       [ 2.        , -0.60241726,  1.29011001, ...,  1.4813321 ,
         0.18851132,  0.2506228 ]])

In [20]:
## Model Training And Model Selection

In [30]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0mm
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.2


In [31]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [32]:
##Create a Function to Evaluate Model
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [33]:
## Beginning Model Training
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "Adaboost Regressor":AdaBoostRegressor(),
    "Graident BoostRegressor":GradientBoostingRegressor(),
    "Xgboost Regressor":XGBRegressor()  
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 10.4635
- Mean Absolute Error: 8.2145
- R2 Score: 0.9220
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 9.7200
- Mean Absolute Error: 7.7222
- R2 Score: 0.9320


Lasso
Model performance for Training set
- Root Mean Squared Error: 10.6947
- Mean Absolute Error: 8.3406
- R2 Score: 0.9185
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 9.9284
- Mean Absolute Error: 7.9122
- R2 Score: 0.9291


Ridge
Model performance for Training set
- Root Mean Squared Error: 10.4635
- Mean Absolute Error: 8.2146
- R2 Score: 0.9220
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 9.7187
- Mean Absolute Error: 7.7209
- R2 Score: 0.9320


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 12.3657
- Mean Absolute Error: 9.9588
- R2 Score: 0.8910
-------------------

In [34]:
#Initialize few parameter for Hyperparamter tuning
knn_params = {"n_neighbors": [2, 3, 10, 20, 40, 50]}
rf_params = {"max_depth": [5, 8, 15, None, 10],
             "max_features": [5, 7, "auto", 8],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500, 1000]}
xgboost_params = {"learning_rate": [0.1, 0.01],
                  "max_depth": [5, 8, 12, 20, 30],
                  "n_estimators": [100, 200, 300],
                  "colsample_bytree": [0.5, 0.8, 1, 0.3, 0.4]}

In [35]:
# Models list for Hyperparameter tuning
randomcv_models = [('KNN', KNeighborsRegressor(), knn_params),
                   ("RF", RandomForestRegressor(), rf_params),
                   ("XGB", XGBRegressor(), xgboost_params )
                   
                   ]

In [36]:
##Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                   param_distributions=params,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2,
                                   n_jobs=-1)
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])

Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END ......................................n_neighbors=2; total time=   0.0s
[CV] END .....................................n_neighbors=40; total time=   0.0s
[CV] END max_depth=10, max_features=auto, min_samples_split=8, n_estimators=200; total time=   0.0s
[CV] END max_depth=10, max_features=5, min_samples_split=2, n_estimators=500; total time=   2.5s
[CV] END max_depth=5, max_features=auto, min_samples_split=20, n_estimators=500; total time=   0.0s
[CV] END max_depth=5, max_features=auto, min_samples_split=20, n_estimators=500; total time=   0.0s
[CV] END max_depth=5, max_features=auto, min_samples_split=20, n_estimators=500; total time=   0.0s
[CV] END max_depth=None, max_features=7, min_samples_split=20, n_estimators=200; total time=   0.7s
[CV] END max_depth=None, max_features=5, min_samples_sp

In [37]:
## Retraining the models with best parameters
models = {
    "Random Forest Regressor": RandomForestRegressor(n_estimators =1000, min_samples_split= 2, max_features =5, max_depth= 10),
     "K-Neighbors Regressor": KNeighborsRegressor(n_neighbors=10, n_jobs=-1),
    "XGBoost Regressor": XGBRegressor(n_estimators = 100, max_depth=5, learning_rate = 0.1, colsample_bytree= 1)
    
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)
    
    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    
    print('='*35)
    print('\n')

Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 4.7298
- Mean Absolute Error: 3.7758
- R2 Score: 0.9841
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 11.6583
- Mean Absolute Error: 9.3742
- R2 Score: 0.9022


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 13.6554
- Mean Absolute Error: 10.8099
- R2 Score: 0.8671
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 14.5991
- Mean Absolute Error: 11.5546
- R2 Score: 0.8467


XGBoost Regressor
Model performance for Training set
- Root Mean Squared Error: 4.1472
- Mean Absolute Error: 3.0915
- R2 Score: 0.9877
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 12.0484
- Mean Absolute Error: 9.4981
- R2 Score: 0.8956


[CV] END .....................................n_neighbors=10; total time=   0.0s
[CV] END .................................

In [41]:
!pip install joblib



In [42]:
# Example: Create a scenario input
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scenario = pd.DataFrame([{
    'WOB_klbf': 30,
    'RPM': 150,
    'Torque_kNm': 8,
    'SPP_psi': 3000,
    'Flow_Rate_gpm': 600,
    'Mud_Weight_ppg': 10,
    'Formation_Encoded': 1,  # e.g., 1 for Sandstone
}])


# Scale scenario input same as training data
scenario_scaled = scaler.fit_transform(scenario)

best_model = models["Random Forest Regressor"]

# Predict using tuned XGBoost
rop_pred = best_model.predict(scenario_scaled)
print(f"Predicted ROP for this scenario: {rop_pred[0]:.2f} ft/hr")

Predicted ROP for this scenario: 153.01 ft/hr


In [43]:
import joblib

# Save model and scaler
joblib.dump(best_model, 'Drilling Efficiency.pkl')
joblib.dump(scaler, 'scaler.pkl')

print("Model and scaler saved.")

Model and scaler saved.


In [44]:
rf_model = joblib.load("Drilling Efficiency.pkl")

In [45]:
! pip install streamlit

Collecting streamlit
  Downloading streamlit-1.46.0-py3-none-any.whl.metadata (9.0 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Downloading altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting cachetools<7,>=4.0 (from streamlit)
  Downloading cachetools-6.1.0-py3-none-any.whl.metadata (5.4 kB)
Collecting click<9,>=7.0 (from streamlit)
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting protobuf<7,>=3.20 (from streamlit)
  Downloading protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Collecting pyarrow>=7.0 (from streamlit)
  Downloading pyarrow-20.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting tenacity<10,>=8.1.0 (from streamlit)
  Downloading tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.meta

In [46]:
import streamlit as st
import pandas as pd
import numpy as np

st.title("Drilling Efficiency Optimization")

st.header("Sample Data")
data = pd.DataFrame({
    'WOB_klbf': np.random.uniform(5, 35, 10),
    'RPM': np.random.uniform(60, 180, 10),
    'ROP_ft_per_hr': np.random.uniform(30, 120, 10)
})
st.dataframe(data)

st.header("Mean ROP")
st.write("Average ROP:", data["ROP_ft_per_hr"].mean())


2025-06-24 13:16:43.947 
  command:

    streamlit run /opt/conda/lib/python3.12/site-packages/ipykernel_launcher.py [ARGUMENTS]
