# Forecasting

To build models, run quality checks, estimate reserves, and so on, we need forecasts of well production. This notebook demonstrates:

1. **Data Processing**: Preparing well production data for forecasting
2. **Arps Decline Curves**: Automatically fitting exponential, hyperbolic, and harmonic decline curves
3. **Forecasting**: Generating production forecasts for individual wells and for all wells in batch, then exporting the results

## Arps Decline Curves

The Arps decline curve equations are fundamental tools in petroleum engineering for forecasting oil and gas production:

- **Exponential Decline (b=0)**: `q(t) = qi * exp(-Di * t)`
- **Hyperbolic Decline (0<b<1)**: `q(t) = qi * (1 + b * Di * t)^(-1/b)`
- **Harmonic Decline (b=1)**: `q(t) = qi / (1 + Di * t)`

Where:
- `qi` = initial production rate
- `Di` = initial decline rate
- `b` = decline exponent
- `t` = time

In [None]:
# NOTE(review): `csv_path_pattern` is not defined in any cell visible in this
# notebook, and this cell comes before the imports, so on a fresh kernel
# (Restart & Run All) it looks like it would raise NameError. Presumably a
# leftover from an interactive session -- confirm, then either define the
# name in a configuration cell or remove this cell.
csv_path_pattern

'Volumes/shm/petrinex/bronze/conventional/*.CSV'

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from databricks.connect import DatabricksSession as SparkSession

from petrinex.config import load_config
from petrinex.forecast import (
    prepare_well_data,
    forecast_well_production,
    forecast_multiple_wells,
    export_forecast_summary_table,
    export_forecasted_production_table,
    combine_historical_and_forecast
)

In [20]:
# Get (or create) the Spark session. `SparkSession` is the alias the imports
# cell gives to databricks.connect's DatabricksSession.
# NOTE(review): in the visible notebook `spark` is only referenced from
# commented-out production loading code -- confirm it is needed for fixture runs.
spark = SparkSession.builder.getOrCreate()
# Load the project configuration from "config.yaml".
# NOTE(review): `config` is not used by any other cell visible in this notebook.
config = load_config("config.yaml")

In [None]:
# Development/testing path: read the NGL volumes from the local parquet fixture
fixture_path = "../fixtures/ngl_vol_bronze_cvx.parquet"
ngl_df = pd.read_parquet(fixture_path)

# Report how much data was loaded (row count and number of distinct wells)
record_count = len(ngl_df)
well_count = ngl_df['WellID'].nunique()
print(f"Loaded NGL data: {record_count} records, {well_count} unique wells")

# For production use with Databricks, uncomment the following:
# conv_df = spark.table("shm.petrinex.conv_vol_bronze").toPandas()
# ngl_df = spark.table("shm.petrinex.ngl_vol_bronze").toPandas()

In [None]:
# Step 1: Prepare well data for forecasting

# Value passed to prepare_well_data as `min_months`. Defined once so the call
# and the progress message below cannot drift apart.
MIN_MONTHS = 12

processed_data = prepare_well_data(ngl_df, min_months=MIN_MONTHS)
print(f"After processing: {processed_data['WellID'].nunique()} wells with {MIN_MONTHS}+ months of data")

# Every later step needs at least one well. Fail here with a clear message
# instead of an opaque IndexError from `.iloc[0]` further down.
if len(processed_data) == 0:
    raise ValueError(
        f"prepare_well_data returned no rows for min_months={MIN_MONTHS}; "
        "there is nothing to forecast."
    )

# Show sample of processed data: first 10 rows of the first well in the table
print("\nSample processed data:")
sample_well = processed_data['WellID'].iloc[0]
sample_columns = ['ProductionMonth', 'WellID', 'OilProduction', 'GasProduction', 'DaysFromFirst']
sample_data = processed_data[processed_data['WellID'] == sample_well][sample_columns].head(10)
print(sample_data)

In [None]:
# Step 2: Forecast a single well to demonstrate the process
def _report_single_forecast(label, result, well_id, header_prefix=""):
    """Print the fit summary of one single-well forecast, or its error.

    Args:
        label: Product label used in the messages, e.g. "Oil" or "Gas".
        result: Forecast result mapping. Reads 'success', and then either
            'curve_type', 'r_squared' and 'parameters' (on success) or
            'error' (on failure).
        well_id: Identifier of the well, shown in the heading.
        header_prefix: Text placed before the success heading (used to
            insert a blank line between consecutive reports).

    Returns:
        None; output goes to stdout.
    """
    if not result['success']:
        print(f"{label} forecast failed: {result['error']}")
        return

    print(f"{header_prefix}{label} Forecast for {well_id}:")
    print(f"  Curve Type: {result['curve_type']}")
    print(f"  R-squared: {result['r_squared']:.3f}")
    print(f"  Parameters: {result['parameters']}")


# Work on a copy so the forecast cannot modify `processed_data`
sample_well_data = processed_data[processed_data['WellID'] == sample_well].copy()

# Forecast oil production
oil_forecast = forecast_well_production(
    sample_well_data,
    forecast_months=24,
    production_column='OilProduction'
)
_report_single_forecast("Oil", oil_forecast, sample_well)

# Forecast gas production
gas_forecast = forecast_well_production(
    sample_well_data,
    forecast_months=24,
    production_column='GasProduction'
)
# The leading newline separates the gas report from the oil report.
_report_single_forecast("Gas", gas_forecast, sample_well, header_prefix="\n")

In [None]:
# Step 3: Visualize the forecast
def plot_forecast(forecast_result, production_column='OilProduction'):
    """Plot a well's historical production together with its fitted forecast.

    Args:
        forecast_result: Result mapping for one well. Reads the keys
            'success', 'error' (only on failure), 'historical_data',
            'forecast', 'curve_type', 'well_id' and 'r_squared'.
        production_column: Name of the historical production column. The
            forecast series is read from '<production_column>_Forecast'.

    Returns:
        None. Shows the figure, or prints a message and returns early when
        the forecast was not successful.
    """
    if not forecast_result['success']:
        print(f"Cannot plot: {forecast_result['error']}")
        return

    fig, ax = plt.subplots(figsize=(12, 6))

    # Historical data: only strictly positive production values are drawn
    hist_data = forecast_result['historical_data']
    hist_data_filtered = hist_data[hist_data[production_column] > 0]

    ax.scatter(
        hist_data_filtered['DaysFromFirst'],
        hist_data_filtered[production_column],
        alpha=0.7,
        label='Historical Data',
        color='blue'
    )

    # Forecast
    curve_label = forecast_result["curve_type"].title()
    forecast_data = forecast_result['forecast']
    ax.plot(
        forecast_data['DaysFromFirst'],
        forecast_data[f'{production_column}_Forecast'],
        'r-',
        label=f'{curve_label} Forecast',
        linewidth=2
    )

    ax.set_xlabel('Days from First Production')
    ax.set_ylabel(f'{production_column} Rate')
    # Two-line title. This must be a real newline escape ('\n'); the previous
    # '\\n' rendered a literal backslash followed by "n" in the figure title.
    ax.set_title(
        f'{forecast_result["well_id"]} - {production_column} Forecast\n'
        f'{curve_label} Curve (R² = {forecast_result["r_squared"]:.3f})'
    )
    ax.legend()
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Plot each single-well forecast that was fitted successfully (oil, then gas)
for single_forecast, column_name in (
    (oil_forecast, 'OilProduction'),
    (gas_forecast, 'GasProduction'),
):
    if single_forecast['success']:
        plot_forecast(single_forecast, column_name)


In [None]:
# Step 4: Batch forecast multiple wells
print("Running batch forecasting for all wells...")

# Settings shared by the oil and the gas batch runs
batch_settings = {
    'forecast_months': 24,
    'curve_type': 'auto',
    'min_r_squared': 0.5,
}

# Forecast oil production for all wells
oil_forecasts = forecast_multiple_wells(
    processed_data, production_column='OilProduction', **batch_settings
)

# Forecast gas production for all wells
gas_forecasts = forecast_multiple_wells(
    processed_data, production_column='GasProduction', **batch_settings
)

# Report how many wells each run returned
print(f"\nOil Forecasting Results:")
print(f"  Successfully forecast {len(oil_forecasts)} wells")

print(f"\nGas Forecasting Results:")
print(f"  Successfully forecast {len(gas_forecasts)} wells")


In [None]:
# Step 5: Analyze forecasting results
def analyze_forecast_results(forecasts, production_type):
    """Print summary statistics for a batch of forecasts.

    Args:
        forecasts: Mapping whose values each provide 'curve_type' and
            'r_squared'. A falsy value (e.g. an empty dict) only prints a
            notice.
        production_type: Label used in the printed headings, e.g. "Oil".

    Returns:
        None; all results are written to stdout.
    """
    from collections import Counter

    # Nothing to summarise
    if not forecasts:
        print(f"No successful {production_type} forecasts to analyze")
        return

    # Gather the two per-well fields reported below
    fit_results = forecasts.values()
    curve_types = [result['curve_type'] for result in fit_results]
    r_squared_values = [result['r_squared'] for result in fit_results]
    total_wells = len(forecasts)

    # Goodness-of-fit overview
    print(f"\n{production_type} Forecasting Analysis:")
    print(f"  Total wells forecast: {total_wells}")
    print(f"  Average R-squared: {np.mean(r_squared_values):.3f}")
    print(f"  Median R-squared: {np.median(r_squared_values):.3f}")
    print(f"  Min R-squared: {np.min(r_squared_values):.3f}")
    print(f"  Max R-squared: {np.max(r_squared_values):.3f}")

    # Share of wells per fitted curve type, most frequent first
    print(f"  Curve type distribution:")
    ranked_curves = Counter(curve_types).most_common()
    for curve_type, count in ranked_curves:
        pct = 100 * count / total_wells
        print(f"    {curve_type.title()}: {count} wells ({pct:.1f}%)")

# Analyze results (oil first, then gas)
for production_label, batch_forecasts in (("Oil", oil_forecasts), ("Gas", gas_forecasts)):
    analyze_forecast_results(batch_forecasts, production_label)


In [None]:
# Step 6: Export forecast results as structured tables

# Build one summary table per product
oil_summary_table = export_forecast_summary_table(oil_forecasts, "OilProduction")
gas_summary_table = export_forecast_summary_table(gas_forecasts, "GasProduction")

oil_summary_count = len(oil_summary_table)
gas_summary_count = len(gas_summary_table)

print("Forecast Summary Tables:")
print(f"Oil forecasts: {oil_summary_count} wells")
print(f"Gas forecasts: {gas_summary_count} wells")

# Display sample summary data: every column for oil, selected columns for gas
if oil_summary_count > 0:
    print("\nOil Forecast Summary (first 3 wells):")
    print(oil_summary_table.head(3))

if gas_summary_count > 0:
    gas_summary_columns = [
        'WellID',
        'CurveType',
        'RSquared',
        'InitialRate_qi',
        'DeclineRate_di',
        'ForecastCumulative12Month',
    ]
    print("\nGas Forecast Summary (first 3 wells):")
    print(gas_summary_table.head(3)[gas_summary_columns])

# Export to CSV (written even when a table has no rows)
for summary_table, csv_name in (
    (oil_summary_table, "oil_forecast_summary.csv"),
    (gas_summary_table, "gas_forecast_summary.csv"),
):
    summary_table.to_csv(csv_name, index=False)
print(f"\nExported summary tables to CSV files")


In [None]:
# Step 7: Export forecasted production data with same schema as input

# Export forecasted production tables (same schema as original NGL data)
oil_forecast_production = export_forecasted_production_table(oil_forecasts, "OilProduction", processed_data)
gas_forecast_production = export_forecasted_production_table(gas_forecasts, "GasProduction", processed_data)

print("Forecasted Production Tables:")
print(f"Oil production forecasts: {len(oil_forecast_production)} records")
print(f"Gas production forecasts: {len(gas_forecast_production)} records")

# Everything below needs at least one forecasted gas record
if len(gas_forecast_production) > 0:
    # Display sample of forecasted production data
    print("\nSample Gas Forecasted Production Data:")
    sample_forecast = gas_forecast_production.head(5)
    print(sample_forecast[['WellID', 'ProductionMonth', 'GasProduction', 'DataType', 'ForecastMethod', 'ForecastRSquared']])

    # Combine historical and forecast data for gas production
    combined_gas_data = combine_historical_and_forecast(
        processed_data,
        gas_forecast_production,
        "GasProduction"
    )

    print(f"\nCombined historical + forecast data: {len(combined_gas_data)} records")
    print(f"Historical records: {(combined_gas_data['DataType'] == 'Historical').sum()}")
    print(f"Forecast records: {(combined_gas_data['DataType'] == 'Forecast').sum()}")

    # Show sample of combined data for one well.
    # A dedicated name is used here so the notebook-level `sample_well`
    # chosen in Step 1 is not overwritten; otherwise re-running the
    # single-well cells after this one would silently switch wells.
    combined_sample_well = gas_forecast_production['WellID'].iloc[0]
    well_combined = combined_gas_data[combined_gas_data['WellID'] == combined_sample_well]
    print(f"\nSample combined data for well {combined_sample_well}:")
    print(well_combined[['ProductionMonth', 'GasProduction', 'DataType', 'ForecastMethod']].tail(10))


In [None]:
# Step 8: Export all tables to files

# Oil: forecasted production as CSV and parquet
oil_record_count = len(oil_forecast_production)
if oil_record_count > 0:
    oil_forecast_production.to_csv("oil_forecasted_production.csv", index=False)
    oil_forecast_production.to_parquet("oil_forecasted_production.parquet", index=False)
    print(f"Exported oil forecasted production: {oil_record_count} records")

# Gas: forecasted production, then the combined historical + forecast table.
# Both are written under the same "gas forecasts exist" condition.
gas_record_count = len(gas_forecast_production)
if gas_record_count > 0:
    gas_forecast_production.to_csv("gas_forecasted_production.csv", index=False)
    gas_forecast_production.to_parquet("gas_forecasted_production.parquet", index=False)
    print(f"Exported gas forecasted production: {gas_record_count} records")

    combined_gas_data.to_csv("combined_gas_historical_forecast.csv", index=False)
    combined_gas_data.to_parquet("combined_gas_historical_forecast.parquet", index=False)
    print(f"Exported combined gas data: {len(combined_gas_data)} records")

# Closing summary of the file names used by the export steps
for summary_line in (
    "\nExported Files:",
    "- Forecast summaries: oil_forecast_summary.csv, gas_forecast_summary.csv",
    "- Forecasted production: oil_forecasted_production.csv/.parquet, gas_forecasted_production.csv/.parquet",
    "- Combined data: combined_gas_historical_forecast.csv/.parquet",
    "\nAll tables maintain schema compatibility with original NGL input data!",
):
    print(summary_line)
