# Exploratory Data Analysis - Volve Production Data

This notebook explores the Volve field production dataset to understand:
- Data structure and quality
- Production trends by wellbore
- Seasonal patterns
- Key statistics and distributions

In [None]:
# Setup
import sys
from pathlib import Path

# Make the project's packages importable from this notebook.
# The project root is taken to be the parent of the current working directory.
cwd = Path.cwd()
project_root = cwd.parent
sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from src.data_prep import prepare_data, load_processed_data, aggregate_total_production
from src.features import engineer_features
from src.config import RAW_DATA_DIR, PROCESSED_DATA_DIR

# Display settings: never truncate columns, and render floats with
# thousands separators and two decimal places.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:,.2f}'.format

## 1. Load and Prepare Data

In [None]:
# Locate the raw Volve production CSV and build the working DataFrame `df`.
# Candidate locations are checked in order; the first one that exists is used.
raw_paths = [
    RAW_DATA_DIR / "Volve production data.csv",
    project_root.parent / "Course Notebooks" / "Data" / "Volve production data.csv",
]

raw_path = next((path for path in raw_paths if path.exists()), None)

if raw_path is None:
    # Fail here, naming the searched locations, rather than leaving `df`
    # undefined and letting a later cell fail with an unrelated NameError.
    searched = "\n".join(f"  - {path}" for path in raw_paths)
    raise FileNotFoundError(
        "Raw data not found. Please check data/README.md for instructions.\n"
        f"Searched:\n{searched}"
    )

# save_output=True is forwarded to the project's prepare_data helper.
df = prepare_data(raw_path, save_output=True)
print(f"Loaded data from: {raw_path}")

In [None]:
# Quick orientation: dimensions, column names, date coverage, wellbore ids
print(f"Shape: {df.shape}")

column_names = df.columns.tolist()
print(f"\nColumns: {column_names}")

dates = df['date']
print(f"\nDate range: {dates.min()} to {dates.max()}")

wellbore_ids = df['wellbore'].unique()
print(f"\nWellbores: {wellbore_ids}")

In [None]:
# Preview the first 10 rows of the DataFrame returned by prepare_data
df.head(10)

## 2. Data Quality Assessment

In [None]:
# Null count per column, alongside its share of all rows (percent, 2 decimals)
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100).round(2)
pd.DataFrame({
    'Missing Count': missing,
    'Missing %': missing_pct,
})

In [None]:
# Summary statistics from DataFrame.describe() with its default settings
df.describe()

In [None]:
# Per-wellbore coverage (first/last date, record count) and total oil,
# displayed with the largest total oil first
by_wellbore = df.groupby('wellbore')
wellbore_counts = by_wellbore.agg(**{
    'First Month': ('date', 'min'),
    'Last Month': ('date', 'max'),
    'Months': ('date', 'count'),
    'Total Oil': ('oil', 'sum'),
}).round(0)
wellbore_counts.sort_values(by='Total Oil', ascending=False)

## 3. Production Trends

In [None]:
# Field-wide oil, gas, and water over time, one panel per stream on a shared x-axis
total_prod = aggregate_total_production(df)

fig = make_subplots(
    rows=3,
    cols=1,
    shared_xaxes=True,
    subplot_titles=['Oil Production', 'Gas Production', 'Water Production'],
)

# (column, legend name, line color), listed in top-to-bottom panel order
trace_specs = [
    ('oil', 'Oil', 'green'),
    ('gas', 'Gas', 'red'),
    ('water', 'Water', 'blue'),
]
for panel_row, (column, legend_name, line_color) in enumerate(trace_specs, start=1):
    trace = go.Scatter(
        x=total_prod['date'],
        y=total_prod[column],
        mode='lines',
        name=legend_name,
        line=dict(color=line_color),
    )
    fig.add_trace(trace, row=panel_row, col=1)

fig.update_layout(height=700, title_text='Volve Field Total Production')
fig.show()

In [None]:
# One oil line per wellbore over time
fig = px.line(
    df,
    x='date',
    y='oil',
    color='wellbore',
    labels=dict(oil='Oil (Sm³)', date='Date'),
    title='Oil Production by Wellbore',
)
fig.show()

In [None]:
# Stacked area chart: each wellbore's oil per date, stacked into one group.
# Date/wellbore combinations with no value are filled with 0 before stacking.
pivot_df = (
    df.pivot_table(index='date', columns='wellbore', values='oil', aggfunc='sum')
    .fillna(0)
)

fig = go.Figure()
for wellbore_name, oil_series in pivot_df.items():
    area = go.Scatter(
        x=pivot_df.index,
        y=oil_series,
        mode='lines',
        stackgroup='one',
        name=wellbore_name,
    )
    fig.add_trace(area)

fig.update_layout(
    title='Oil Production Contribution by Wellbore',
    xaxis_title='Date',
    yaxis_title='Oil (Sm³)',
)
fig.show()

## 4. Seasonal Analysis

In [None]:
# Build the feature-augmented frame with the project's engineer_features helper.
# NOTE(review): the columns it adds are not visible in this notebook —
# confirm against src/features.py.
df_features = engineer_features(df)

In [None]:
# Average field-wide oil for each calendar month, pooled across all years
total_features = aggregate_total_production(df_features)
total_features['month'] = total_features['date'].dt.month

oil_by_month = total_features.groupby('month')['oil']
monthly_avg = oil_by_month.mean().reset_index()

fig = px.bar(
    monthly_avg,
    x='month',
    y='oil',
    labels=dict(oil='Avg Oil (Sm³)', month='Month'),
    title='Average Oil Production by Month',
)
# Put a tick on every month number instead of plotly's automatic spacing
fig.update_xaxes(tickmode='linear', tick0=1, dtick=1)
fig.show()

In [None]:
# Calendar-year sums of oil, gas, and water; only oil is charted here
total_features['year'] = total_features['date'].dt.year

streams = ['oil', 'gas', 'water']
yearly_totals = total_features.groupby('year')[streams].sum().reset_index()

fig = px.bar(
    yearly_totals,
    x='year',
    y='oil',
    labels=dict(oil='Total Oil (Sm³)', year='Year'),
    title='Annual Oil Production',
)
fig.show()

## 5. Production Distributions

In [None]:
# Histogram of per-record oil values in 50 bins, colored by wellbore
fig = px.histogram(
    df,
    x='oil',
    color='wellbore',
    nbins=50,
    labels=dict(oil='Oil (Sm³)'),
    title='Distribution of Monthly Oil Production',
)
fig.show()

In [None]:
# Spread of oil values within each wellbore, one box per wellbore
fig = px.box(
    df,
    x='wellbore',
    y='oil',
    labels=dict(oil='Oil (Sm³)', wellbore='Wellbore'),
    title='Oil Production Distribution by Wellbore',
)
fig.show()

## 6. Correlations

In [None]:
# Pairwise correlation (DataFrame.corr defaults) between the production
# streams and on_stream, shown as an annotated heatmap
numeric_cols = ['oil', 'gas', 'water', 'on_stream']
corr_matrix = df.loc[:, numeric_cols].corr()

fig = px.imshow(
    corr_matrix,
    color_continuous_scale='RdBu_r',
    text_auto='.2f',
    title='Correlation Matrix',
)
fig.show()

In [None]:
# Gas against oil for every record, colored by wellbore
fig = px.scatter(
    df,
    x='oil',
    y='gas',
    color='wellbore',
    labels=dict(oil='Oil (Sm³)', gas='Gas (Sm³)'),
    title='Oil vs Gas Production',
)
fig.show()

## 7. Key Findings Summary

### Data Overview
- **Time period**: 2007-2016 (varies by wellbore)
- **Wellbores**: 7 production wells
- **Data quality**: Minimal missing values in core production columns

### Production Characteristics
- Production peaked around 2014-2015
- Decline observed towards end of field life (2016)
- Well 15/9-F-11 was the highest producer
- Strong correlation between oil and gas production (associated gas)

### Seasonality
- Some monthly variation, but not strongly seasonal
- Uptime variations contribute to production fluctuations

### Next Steps
- Proceed to the forecasting notebook (`02_forecast_backtest.ipynb`)
- Use exponential smoothing to capture the production trend
- Consider wellbore-level forecasts for a more detailed analysis