In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np
import matplotlib.pyplot as plt
from keras.layers import LSTM, RepeatVector, TimeDistributed, Dense
from keras.models import Sequential
from keras import Input
import pandas as pd
from sklearn.model_selection import train_test_split
import copy
import collections
from random import shuffle
import itertools
from os import listdir
import random
import string
import statistics
import pickle
from pathlib import Path

In [2]:
columns = [
    'Weight on Bit (klbs)',
    'Rotary RPM (RPM)',
    'Total Pump Output (gal_per_min)',
    'Rate Of Penetration (ft_per_hr)',
    'Standpipe Pressure (psi)',
    'Rotary Torque (kft_lb)', 
    'Hole Depth (feet)', 
    'Bit Depth (feet)'
]

In [3]:
def csv_to_windows(dataset, columns):
    df = pd.read_csv(f"Datasets\\MaskedAutoencoder\\{dataset}")
    df = df[columns]

    base_mask = (
        (df["Hole Depth (feet)"].rolling(10000).mean().diff() > 0) &
        (df["Hole Depth (feet)"] == df["Bit Depth (feet)"]) &
        (df["Hole Depth (feet)"] > 1000)
    )
    
    window = 100       # Rolling window size
    threshold = 0.3    # Keep if rolling average > threshold
    
    # Compute rolling average of the mask (convert to 0/1 first)
    rolling_avg = base_mask.astype(float).rolling(window).mean()
    
    # Final mask based on rolling average threshold
    final_mask = (rolling_avg > threshold).fillna(0)
    
    final_mask = final_mask.astype(float).rolling(20000).mean() > 0.6
    
    masked_hole_depth = df["Hole Depth (feet)"].where(final_mask, np.nan)
    
    gap_threshold = 100  # maximum number of consecutive NaNs to merge segments
    
    # Identify indices of non-NaN values
    not_nan_idx = masked_hole_depth[masked_hole_depth.notna()].index
    
    # Grouping non-NaN indices based on closeness
    groups = []
    current_group = []
    
    for i, idx in enumerate(not_nan_idx):
        if i == 0:
            current_group.append(idx)
            continue
    
        # Check gap from previous index
        if idx - not_nan_idx[i-1] <= gap_threshold:
            current_group.append(idx)
        else:
            groups.append(current_group)
            current_group = [idx]
    
    # Append last group
    if current_group:
        groups.append(current_group)

    # Fix all NaNs
    drilling_segments = [  ]
    window_size = 100
    for group in groups:
        dfg = df.loc[group].copy()
        
        for col in dfg.columns:
            if np.issubdtype(dfg[col].dtype, np.number):
                series = dfg[col]      
                rolling_mean = series.rolling(window=window_size, min_periods=1, center=True).mean()
                dfg[col] = series.fillna(rolling_mean).bfill(  ).ffill()
    
        drilling_segments.append(dfg)
    
    # Min Max Normalization
    global_min = pd.concat(drilling_segments).min()
    global_max = pd.concat(drilling_segments).max()
    
    # Step 2: Normalize each dataframe
    print(f"Drilling Segments: {len(drilling_segments)}")
    normalized_drilling_segments = []
    for df in drilling_segments:
        normalized_df = (df - global_min) / (global_max - global_min)
        normalized_drilling_segments.append(normalized_df)

    window_size = 60 * 10  # 10 minutes

    windows = []
    count = 1
    for df in normalized_drilling_segments:
        print(f"\t{count}")
        count += 1
        for i in range(len(df) - window_size + 1):
            window = df.iloc[i:i + window_size]
            windows.append(window.to_numpy())

    print(f"Windows: {len(windows):,}".replace(',', ' ')) 
    print(f"Windows per Segment: {len(windows) / len(drilling_segments):,.2f}".replace(',', ' '))
    
    return windows

In [4]:
def mask_data(data, MASKING_PERCENT=0.8):
    masked_data = []
    mask_indices_all = []
    
    for i in range(len(data)):
        arr = data[i]
        print(f"{round( ( i / len(data) ) * 100, 4}")
        # Convert to numpy array if needed
        arr = np.array(arr, dtype=float)
        
        # Create a copy to avoid modifying original
        masked_arr = arr.copy()
        
        # Get total number of elements
        total_elements = arr.size
        n_mask = int(total_elements * MASKING_PERCENT)
        
        # Generate random indices to mask
        flat_indices = np.random.choice(total_elements, size=n_mask, replace=False)
        
        # Convert flat indices to 2D indices
        mask_indices = np.unravel_index(flat_indices, arr.shape)
        
        # Apply mask (set to NaN)
        masked_arr[mask_indices] = np.nan
        
        masked_data.append(masked_arr)
        mask_indices_all.append(list(zip(mask_indices[0], mask_indices[1])))
    
    return masked_data, mask_indices_all

In [5]:
# Autoencoder training: 78B-32 1 sec data 27200701.csv, 27029986-3.csv
# Task Header 1 (DAS Stickslip): 27029986-4.csv
# Task Header 2 (Temp OUT (Degrees)): 27029986-5.csv

In [6]:
windows1 = csv_to_windows("27029986-3.csv", columns)
windows2 = csv_to_windows("78B-32 1 sec data 27200701.csv", columns)

# Shuffle both lists
random.seed(42)
random.shuffle(windows1)
random.shuffle(windows2)

# Take the same amount from each (the minimum length)
min_length = min(len(windows1), len(windows2))
windows1_sampled = windows1[:min_length]
windows2_sampled = windows2[:min_length]

# Combine them
windows = windows1_sampled + windows2_sampled

# Shuffle the combined list
random.shuffle(windows)

print(f"Sampled {min_length:,} from each list".replace(',', ' '))
print(f"Total windows: {len(windows):,}".replace(',', ' '))

Drilling Segments: 3
	1
	2
	3
Windows: 131 072
Windows per Segment: 43 690.67
Drilling Segments: 9
	1
	2
	3
	4
	5
	6
	7
	8
	9
Windows: 332 561
Windows per Segment: 36 951.22
Sampled 131 072 from each list
Total windows: 262 144


In [7]:
train_windows_y, test_windows_y = train_test_split(windows, test_size=0.2, random_state=42)
train_windows_x = mask_data(train_windows_y, MASKING_PERCENT=0.8)
test_windows_x = mask_data(test_windows_y, MASKING_PERCENT=0.8)

0 / 209715
1 / 209715
2 / 209715
3 / 209715
4 / 209715
5 / 209715
6 / 209715
7 / 209715
8 / 209715
9 / 209715
10 / 209715
11 / 209715
12 / 209715
13 / 209715
14 / 209715
15 / 209715
16 / 209715
17 / 209715
18 / 209715
19 / 209715
20 / 209715
21 / 209715
22 / 209715
23 / 209715
24 / 209715
25 / 209715
26 / 209715
27 / 209715
28 / 209715
29 / 209715
30 / 209715
31 / 209715
32 / 209715
33 / 209715
34 / 209715
35 / 209715
36 / 209715
37 / 209715
38 / 209715
39 / 209715
40 / 209715
41 / 209715
42 / 209715
43 / 209715
44 / 209715
45 / 209715
46 / 209715
47 / 209715
48 / 209715
49 / 209715
50 / 209715
51 / 209715
52 / 209715
53 / 209715
54 / 209715
55 / 209715
56 / 209715
57 / 209715
58 / 209715
59 / 209715
60 / 209715
61 / 209715
62 / 209715
63 / 209715
64 / 209715
65 / 209715
66 / 209715
67 / 209715
68 / 209715
69 / 209715
70 / 209715
71 / 209715
72 / 209715
73 / 209715
74 / 209715
75 / 209715
76 / 209715
77 / 209715
78 / 209715
79 / 209715
80 / 209715
81 / 209715
82 / 209715
83 / 209715
84


KeyboardInterrupt



In [None]:
# Make model

model = Sequential()
model.add( LSTM(128, activation='tanh', input_shape=(train_windows[0].shape[0], train_windows[0].shape[1]), return_sequences=True ) )
model.add( LSTM(64, activation='tanh', return_sequences=False ) )
model.add( RepeatVector(train_windows[0].shape[0]) )
model.add( LSTM(64, activation='tanh', return_sequences=True ) )
model.add( LSTM(128, activation='tanh', return_sequences=True ) )
model.add( TimeDistributed( Dense( train_windows[0].shape[0] ) ) )

model.compile(optimizer='adam', loss='mse')
model.summary()

In [None]:
# Make model

fitted_model = model.fit(train_windows_x, train_windows_y, epochs=3, batch_size=32, validation_split=0.1, verbose=1)

plt.plot(fitted_model.history['loss'], label='Training loss')
plt.plot(fitted_model.history['val_loss'], label='Validation loss')
plt.legend()