In [None]:
import pandas as pd
import numpy as np

# fix paths:
import fix_path

from experimentation.generate_data import generate_data_csv
from oilwells.prepare.prepare import add_time_to_failure

%matplotlib inline
import matplotlib.pyplot as plt

import os
import seaborn as sns
sns.set()

### Brief data exploration

More details are available in the [predictive-maintenance-sim repository](https://github.com/joehahn/predictive-maintenance-sim); this section is a quick run-through to show that we're getting similar data.

In [None]:
# Write a simulated dataset to ../data/oilwells.csv, then load it back and
# pass it through the add_time_to_failure preprocessing step.
data_dir = '../data'
path = os.path.join(data_dir, 'oilwells.csv')
os.makedirs(data_dir, exist_ok=True)

# Small run for now: 50 wells over 3 years.
generate_data_csv(path, n_wells=50, days=3 * 365, n_technicians=5, seed=17)

# Our data preprocessing step, run locally on the freshly parsed CSV.
df = add_time_to_failure(pd.read_csv(path, parse_dates=['time']))
df

### Pick a device and plot it

In [None]:
# Plot a single well: production rate on the left axis, the hours-to-failure
# columns on a twin right axis, and a marker wherever an issue is recorded.
well_id = 0
well = df.query(f'id == {well_id}')

fig, ax = plt.subplots(1, 1, figsize=(15, 6))
ax2 = ax.twinx()
ax2.grid(False)  # keep only the primary axis grid

# Baseline curves over the well's full history.
ax.plot(well.time, well.production_rate, linestyle='-', label='production')
ax2.plot(well.time, well.hours_to_failure, linestyle='-', label='hours to failure', color='black')

for issue in well.issue.unique().tolist():
    if pd.notnull(issue):
        with_issue = well[well.issue == issue]
        if len(with_issue) > 0:
            # Mark the rows where this issue is recorded.
            ax.plot(with_issue.time, with_issue.production_rate,
                    linestyle='none', marker='o', markersize=10, label=issue)
        # Per-issue hours-to-failure column, drawn across the whole history.
        ax2.plot(well.time, well[f"{issue}_hours_to_failure"],
                 linestyle=':', label=f'{issue} hours to failure', linewidth=5)

ax.set_title(f'production efficiency vs time for deviceID={well_id}')
ax.set_xlabel('time')
ax.set_ylabel('production efficiency')
ax.legend(loc='center left')
ax2.legend(loc='center right')
plt.show();

### Do jammed rotor faults occur at abnormal pressures/temperatures?

In [None]:
# Scatter temperature against pressure: a random background sample of all rows
# ('operating') with the rows recording the selected issue overlaid in red.

# The issue to highlight is set explicitly so this cell does not depend on a
# loop variable left behind by an earlier cell (which may be NaN or a
# different issue).
issue = 'jammed_rotor'

# Background sample size: 10% of the rows or 10,000, whichever is larger, but
# never more rows than the frame holds.
n_rows = df.shape[0]
n_samples = min(n_rows, max(10000, n_rows // 10))
# Seeded local generator so the figure is reproducible across runs; sampling
# without replacement avoids drawing the same point twice.
rng = np.random.default_rng(17)
sample_idx = rng.choice(n_rows, size=n_samples, replace=False)

# sample_idx holds row positions, so index positionally rather than by label.
x = df.temperature.iloc[sample_idx]
y = df.pressure.iloc[sample_idx]
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
p = ax.plot(x, y, marker='.', markersize=1, linestyle='none', alpha=0.4, label='operating')

has_issue = df.issue == issue
x = df.temperature[has_issue]
y = df.pressure[has_issue]
p = ax.plot(x, y, marker='.', markersize=3, linestyle='none', alpha=0.4, label=issue, color='red')
p = ax.plot(0, 0, marker='+', linestyle='none', markersize=15, color='black', mew=1, label='sweet spot')
p = ax.set_title(issue + ' occurances')
p = ax.set_xlabel(r'temperature T')
p = ax.set_ylabel(r'pressure P')
p = ax.set_xlim(-1.5, 1.5)
p = ax.set_ylim(-1.5, 1.5)
leg = ax.legend(numpoints=1)

# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and later
# removed the old name; support both so the cell runs on either version.
handles = getattr(leg, 'legend_handles', None)
if handles is None:
    handles = leg.legendHandles
# Make the legend entries opaque and larger than the faint plotted markers.
for lh in handles:
    lh.set_alpha(1.0)
    lh.set_linewidth(2.0)
    lh.set_marker('o')
    lh.set_markersize(5.0)
plt.show();