In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Channels kept from each raw CSV, in the order they appear in every window.
# csv_to_windows needs both depth channels to build its drilling mask.
columns = [
    "Weight on Bit (klbs)",
    "Rotary RPM (RPM)",
    "Total Pump Output (gal_per_min)",
    "Rate Of Penetration (ft_per_hr)",
    "Standpipe Pressure (psi)",
    "Rotary Torque (kft_lb)",
    "Hole Depth (feet)",
    "Bit Depth (feet)",
]

In [9]:
def csv_to_windows(
    dataset,
    columns,
    *,
    data_dir=None,
    trend_window=10000,
    min_depth=1000,
    smooth_window=100,
    smooth_threshold=0.3,
    keep_window=20000,
    keep_threshold=0.6,
    gap_threshold=100,
    fill_window=100,
    window_size=60 * 10,
):
    """Turn one drilling CSV into a list of min-max normalized sliding windows.

    Pipeline:
      1. Read ``data_dir / dataset`` and keep only ``columns``.
      2. Build a per-row mask: the rolling mean of hole depth is increasing,
         bit depth equals hole depth, and hole depth exceeds ``min_depth``.
      3. Smooth that mask twice with rolling averages and thresholds.
      4. Split the surviving row labels into segments wherever two
         consecutive kept labels are more than ``gap_threshold`` apart.
      5. Fill NaNs in numeric columns of each segment (centered rolling mean,
         then back-fill / forward-fill).
      6. Min-max normalize every segment with the min/max taken over all
         segments of THIS file.
      7. Cut every segment into stride-1 windows of ``window_size`` rows.

    Parameters
    ----------
    dataset : str
        CSV file name inside ``data_dir``.
    columns : list of str
        Columns to keep. Must contain "Hole Depth (feet)" and
        "Bit Depth (feet)", which the mask reads.
    data_dir : str or path-like, optional
        Directory holding the CSV. Defaults to ``Datasets/MaskedAutoencoder``
        relative to the working directory.
    trend_window : int
        Rolling window (rows) for the hole-depth trend test.
    min_depth : float
        Rows with hole depth at or below this value are masked out.
    smooth_window, smooth_threshold : int, float
        First smoothing pass: keep a row when the rolling mean of the base
        mask over ``smooth_window`` rows is above ``smooth_threshold``.
    keep_window, keep_threshold : int, float
        Second smoothing pass, applied to the result of the first.
    gap_threshold : int
        Largest label distance between consecutive kept rows that still
        counts as the same segment.
    fill_window : int
        Window (rows) of the centered rolling mean used to fill NaNs.
    window_size : int
        Rows per output window. The default is 600 (10 minutes, assuming
        one row per second -- confirm against the data).

    Returns
    -------
    list of numpy.ndarray
        Arrays of shape ``(window_size, len(columns))``. Each array is a slice
        of one per-segment array, so copy before modifying in place. The list
        is empty when no rows survive the mask or every segment is shorter
        than ``window_size``.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.

    Notes
    -----
    * Rows that fall inside a bridged gap are not part of the segment; only
      the rows that passed the mask are selected.
    * Normalization statistics are per file, so windows produced from two
      different files are not on a shared scale.
    * Segment splitting subtracts index labels and therefore assumes the
      default integer index produced by ``pd.read_csv``.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    if data_dir is None:
        data_dir = Path("Datasets") / "MaskedAutoencoder"

    data = pd.read_csv(Path(data_dir) / dataset)[columns]
    hole_depth = data["Hole Depth (feet)"]
    bit_depth = data["Bit Depth (feet)"]

    # Row-level drilling test.
    base_mask = (
        (hole_depth.rolling(trend_window).mean().diff() > 0)
        & (hole_depth == bit_depth)
        & (hole_depth > min_depth)
    )

    # Two smoothing passes. Comparisons against NaN (incomplete leading
    # windows) evaluate to False, so no explicit fill is required.
    smoothed = base_mask.astype(float).rolling(smooth_window).mean() > smooth_threshold
    final_mask = smoothed.astype(float).rolling(keep_window).mean() > keep_threshold

    # A row is kept when the mask is set and its hole depth is present.
    kept = data.index[(final_mask & hole_depth.notna()).to_numpy()].to_numpy()

    # Split the kept labels wherever the jump to the next label is too large.
    if kept.size:
        split_points = np.flatnonzero(np.diff(kept) > gap_threshold) + 1
        groups = np.split(kept, split_points)
    else:
        groups = []
    print(f"Groups: {len(groups)}")

    # Fill NaNs inside each segment: local rolling mean first, then the
    # nearest valid value for anything the rolling mean could not cover.
    drilling_segments = []
    for labels in groups:
        segment = data.loc[labels].copy()
        for col in segment.columns:
            if np.issubdtype(segment[col].dtype, np.number):
                series = segment[col]
                local_mean = series.rolling(
                    window=fill_window, min_periods=1, center=True
                ).mean()
                segment[col] = series.fillna(local_mean).bfill().ffill()
        drilling_segments.append(segment)
    print(f"Drilling Segments: {len(drilling_segments)}")

    # pd.concat raises on an empty list, so stop here when nothing survived.
    if not drilling_segments:
        return []

    # Min-max statistics over all segments of this file (computed once).
    stacked = pd.concat(drilling_segments)
    global_min = stacked.min()
    global_max = stacked.max()
    # A constant column has zero range; divide by 1 so it maps to 0, not NaN.
    value_range = (global_max - global_min).replace(0, 1)

    windows = []
    for segment in drilling_segments:
        values = ((segment - global_min) / value_range).to_numpy()
        # range() is empty when the segment is shorter than one window.
        for start in range(len(values) - window_size + 1):
            windows.append(values[start:start + window_size])

    return windows

In [10]:
# Autoencoder training: 78B-32 1 sec data 27200701.csv, 27029986-3.csv
# Task Header 1 (DAS Stickslip): 27029986-4.csv
# Task Header 2 (Temp OUT (Degrees)): 27029986-5.csv

In [None]:
# Window each autoencoder-training file separately, report the per-file
# window counts, then pool both lists into a single training set.
windows1 = csv_to_windows("27029986-3.csv", columns)
windows2 = csv_to_windows("78B-32 1 sec data 27200701.csv", columns)
print(len(windows1), len(windows2), sep="\t")
windows = [*windows1, *windows2]

Groups: 3
Drilling Segments: 3
Groups: 9
Drilling Segments: 9


In [None]:
# Reload one training file and keep only the selected channels so the
# resulting column index can be inspected.
# Forward slashes are accepted on Windows as well as POSIX, unlike the
# backslash-separated path used before.
df = pd.read_csv("Datasets/MaskedAutoencoder/27029986-3.csv")
df = df[columns]
df.columns