In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
columns = [
    'Weight on Bit (klbs)',
    'Rotary RPM (RPM)',
    'Total Pump Output (gal_per_min)',
    'Rate Of Penetration (ft_per_hr)',
    'Standpipe Pressure (psi)',
    'Rotary Torque (kft_lb)', 
    'Hole Depth (feet)', 
    'Bit Depth (feet)'
]

In [3]:
datasets = [
    "78B-32 1 sec data 27200701.csv",
    "56-32 1sec data 27029986.csv"
]

In [4]:
df = pd.read_csv(f"MaskedAutoencoder\\{datasets[0]}")
df = df[columns]
df

Unnamed: 0,Weight on Bit (klbs),Rotary RPM (RPM),Total Pump Output (gal_per_min),Rate Of Penetration (ft_per_hr),Standpipe Pressure (psi),Rotary Torque (kft_lb),Hole Depth (feet),Bit Depth (feet)
0,0.0,0.03,0.0,0.0,0.0,0.00,55.9,16.5
1,0.0,0.00,0.0,0.0,0.0,0.00,55.9,16.5
2,0.0,0.00,0.0,0.0,0.0,0.00,55.9,16.5
3,0.0,0.03,0.0,0.0,0.0,0.00,55.9,16.5
4,0.0,0.00,0.0,0.0,0.0,0.00,55.9,16.5
...,...,...,...,...,...,...,...,...
2928972,0.0,-999.25,0.0,0.0,0.0,-999.25,9500.0,654.3
2928973,0.0,-999.25,0.0,0.0,0.0,-999.25,9500.0,654.3
2928974,0.0,-999.25,0.0,0.0,0.0,-999.25,9500.0,654.3
2928975,0.0,-999.25,0.0,0.0,0.0,-999.25,9500.0,654.3


In [5]:
df[280000:280010]

Unnamed: 0,Weight on Bit (klbs),Rotary RPM (RPM),Total Pump Output (gal_per_min),Rate Of Penetration (ft_per_hr),Standpipe Pressure (psi),Rotary Torque (kft_lb),Hole Depth (feet),Bit Depth (feet)
280000,61.6,80.36,981.85,69.4,3132.48,9.996,2748.4,2748.4
280001,60.9,80.79,981.85,68.36,3126.56,9.814,2748.4,2748.4
280002,61.5,80.64,987.98,66.62,3115.99,9.992,2748.4,2748.4
280003,63.0,80.39,985.93,65.55,3111.07,9.973,2748.5,2748.5
280004,62.9,80.42,983.89,63.49,3107.85,9.992,2748.5,2748.5
280005,62.4,80.58,978.74,62.18,3105.66,10.109,2748.5,2748.5
280006,62.4,80.39,978.83,60.8,3102.79,10.324,2748.5,2748.5
280007,61.6,80.51,980.78,59.73,3113.92,10.562,2748.5,2748.5
280008,62.4,80.7,975.72,58.64,3117.53,10.296,2748.5,2748.5
280009,60.9,80.55,977.76,56.77,3121.89,10.071,2748.5,2748.5


In [6]:
## Drilling only
##Split into drilling segments

base_mask = (
    (df["Hole Depth (feet)"].rolling(10000).mean().diff() > 0) &
    (df["Hole Depth (feet)"] == df["Bit Depth (feet)"]) &
    (df["Hole Depth (feet)"] > 1000)
)

window = 100       # Rolling window size
threshold = 0.3    # Keep if rolling average > threshold

# Compute rolling average of the mask (convert to 0/1 first)
rolling_avg = base_mask.astype(float).rolling(window).mean()

# Final mask based on rolling average threshold
final_mask = (rolling_avg > threshold).fillna(0)

final_mask = final_mask.astype(float).rolling(20000).mean() > 0.6

masked_hole_depth = df["Hole Depth (feet)"].where(final_mask, np.nan)

gap_threshold = 100  # maximum number of consecutive NaNs to merge segments

# Identify indices of non-NaN values
not_nan_idx = masked_hole_depth[masked_hole_depth.notna()].index

# Grouping non-NaN indices based on closeness
groups = []
current_group = []

for i, idx in enumerate(not_nan_idx):
    if i == 0:
        current_group.append(idx)
        continue

    # Check gap from previous index
    if idx - not_nan_idx[i-1] <= gap_threshold:
        current_group.append(idx)
    else:
        groups.append(current_group)
        current_group = [idx]

# Append last group
if current_group:
    groups.append(current_group)

# Fix all NaNs
drilling_segments = [  ]
window_size = 100
for group in groups:
    dfg = df.loc[group].copy()
    
    for col in dfg.columns:
        if np.issubdtype(dfg[col].dtype, np.number):
            series = dfg[col]      
            rolling_mean = series.rolling(window=window_size, min_periods=1, center=True).mean()
            dfg[col] = series.fillna(rolling_mean).bfill(  ).ffill()

    drilling_segments.append(dfg)

# Min Max Normalization
global_min = pd.concat(drilling_segments).min()
global_max = pd.concat(drilling_segments).max()

# Step 2: Normalize each dataframe
normalized_drilling_segments = []
for df in drilling_segments:
    normalized_df = (df - global_min) / (global_max - global_min)
    normalized_drilling_segments.append(normalized_df)

len(normalized_drilling_segments)

9

In [7]:
# Segment into windows
window_size = 60 * 10  # 10 minutes

windows = []

for df in normalized_drilling_segments:
    for i in range(len(df) - window_size + 1):
        window = df.iloc[i:i + window_size]
        windows.append(window.to_numpy())

len(windows)

332561

In [8]:
windows[0]

array([[0.16514523, 0.99633446, 0.78131211, ..., 0.96181687, 0.        ,
        0.05170063],
       [0.1560166 , 0.99663992, 0.78168403, ..., 0.96175895, 0.        ,
        0.05170063],
       [0.1439834 , 0.99685554, 0.78286377, ..., 0.96062324, 0.        ,
        0.05170063],
       ...,
       [0.14854772, 0.99691843, 0.77236608, ..., 0.96327735, 0.00239205,
        0.053969  ],
       [0.1560166 , 0.99680164, 0.77391774, ..., 0.96187005, 0.00240587,
        0.05398212],
       [0.15477178, 0.99622665, 0.77314191, ..., 0.96491919, 0.0024197 ,
        0.05399523]], shape=(600, 8))