# Libs

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from tqdm import tqdm

from scipy.stats import norm
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Notebook-wide pandas display settings: floats are rendered with five decimals,
# and at most 15 columns / 6 rows of a frame are shown.
pd.set_option('display.precision', 3)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option('display.max_columns', 15)
pd.set_option('display.max_rows', 6)

# Data

In [56]:
# Load the two general log tables (df8, df10) and the two layer tables (df8_ntd, df10_ntd).
# NOTE(review): these are absolute Windows paths, so the notebook only runs on a machine
# with this exact directory layout.
df8 = pd.read_csv(r'C:\jupyter\SPP\inputoutput\general_logs\logs8_ntd_v5.csv')
df10 = pd.read_csv(r'C:\jupyter\SPP\inputoutput\general_logs\logs10_ntd_v4.csv')
# The layer files carry an 'Unnamed: 0' column (presumably a saved index - confirm); it is dropped on load.
df8_ntd = pd.read_csv(r'C:\jupyter\SPP\inputoutput\layers\ntd_top_phi_bot8_bp_v4.csv').drop(columns=['Unnamed: 0'])
df10_ntd = pd.read_csv(r'C:\jupyter\SPP\inputoutput\layers\ntd_top_phi_bot10_bp_v4.csv').drop(columns=['Unnamed: 0'])

In [None]:
import math

# Unique (well, formation_up, xmean, ymean) rows of df8 for wells C01 and C01AY.
test = df8[df8.well.isin(['C01','C01AY'])][['well','formation_up','xmean','ymean']].drop_duplicates()
# Show the table together with the xmean of its second row (needs at least two rows).
display(test, test['xmean'].iloc[1])
def calculate_distance(x1, y1, x2, y2):
    """Return the straight-line (Euclidean) distance between (x1, y1) and (x2, y2)."""
    delta_x = x2 - x1
    delta_y = y2 - y1
    squared_length = delta_x ** 2 + delta_y ** 2
    return math.sqrt(squared_length)

# Distance between the (xmean, ymean) points of the first two rows of `test`.
calculate_distance(test['xmean'].iloc[0], test['ymean'].iloc[0], test['xmean'].iloc[1], test['ymean'].iloc[1])

In [None]:
# Column names of the two general log tables, shown side by side.
df8.columns, df10.columns

In [None]:
# Column names of the two layer tables, shown side by side.
df8_ntd.columns, df10_ntd.columns

# P50 calc

In [None]:
def cdf(df, wellname):
    """Plot the CDF of a normal distribution fitted to one well's ``htst`` values.

    The distribution is parameterised by the sample mean and sample standard
    deviation of ``htst`` for the rows where ``well == wellname``; the curve is
    drawn on the current matplotlib axes over mean +/- 3 standard deviations.

    Parameters
    ----------
    df : DataFrame with ``well`` and ``htst`` columns.
    wellname : value matched against the ``well`` column.

    Returns
    -------
    None. The well name, mean, standard deviation and the 0.5 quantile of the
    fitted normal are printed.
    """
    thickness = df.loc[df.well == wellname, 'htst']
    mean_htst = thickness.mean()
    std_htst = thickness.std()

    # 100 evenly spaced evaluation points covering three standard deviations each side.
    grid = np.linspace(mean_htst - 3 * std_htst, mean_htst + 3 * std_htst, 100)
    cumulative = norm.cdf(grid, mean_htst, std_htst)
    median_htst = norm.ppf(0.5, mean_htst, std_htst)

    plt.plot(grid, cumulative)
    plt.xlabel('htst')
    plt.ylabel('Cumulative Probability')
    plt.title(f'CDF of htst for well {wellname}')
    print(wellname, mean_htst, std_htst, median_htst)

def ecdf(df, wellname, p, cutoff=0.5):
    """Plot the empirical CDF of one well's ``htst`` values and mark a percentile.

    Only rows with ``well == wellname`` and ``htst > cutoff`` are used. The
    x axis is min-max normalised so that several wells can be overlaid on the
    same axes; the returned values are NOT normalised.

    Parameters
    ----------
    df : DataFrame with ``well`` and ``htst`` columns.
    wellname : value matched against the ``well`` column; also the legend label.
    p : fraction (e.g. 0.5 for P50); it is multiplied by 100 for ``np.percentile``.
    cutoff : ``htst`` values must be strictly greater than this (default 0.5).

    Returns
    -------
    (x, y) : the sorted ``htst`` values and their cumulative probabilities.

    Raises
    ------
    ValueError : if no ``htst`` value of the well exceeds ``cutoff``.
    """
    values = df.loc[(df.well == wellname) & (df.htst > cutoff), 'htst']
    x = np.sort(values)
    if x.size == 0:
        # Fail with a readable message instead of numpy's zero-size reduction error.
        raise ValueError(f'No htst values above {cutoff} for well {wellname}')

    lowest = x.min()
    span = x.max() - lowest
    # When every sample is equal the span is 0; use all zeros instead of 0/0 = NaN.
    x_norm = (x - lowest) / span if span > 0 else x - lowest

    # ECDF: the i-th sorted sample has cumulative probability i / n.
    y = np.arange(1, len(x) + 1) / len(x)

    vnorm_p = np.percentile(x_norm, p * 100)
    value_p = np.percentile(x, p * 100)

    plt.plot(x_norm, y, marker='.', linestyle='none', label=wellname)
    plt.scatter(vnorm_p, p, s=50, alpha=0.5, ec='black')
    plt.xlabel('htst norm')
    plt.legend()
    plt.ylabel('Cumulative Probability')
    plt.title('ECDF of htst for well')
    plt.margins(0.02)  # Keeps data off plot edges
    # Label the printed value with the percentile actually requested, not a fixed "p50".
    print(wellname, f'value p{p * 100:.0f}:', value_p)
    return x, y

# ECDF with the P50 point marked for three wells of the bal8 layer table; no new
# figure is created between the calls, so the curves land on the same axes.
b39 = ecdf(df8_ntd, 'B39', 0.5)
d37 = ecdf(df8_ntd, 'D37', 0.5)
# NOTE(review): the variable is called b01y but holds the result for well 'C16' - confirm which was intended.
b01y = ecdf(df8_ntd, 'C16', 0.5)

In [6]:
def quntile_calc(df, cutoff, quant):
    """Return one ``htst`` quantile per well, using only layers thicker than ``cutoff``.

    Parameters
    ----------
    df : DataFrame with ``well`` and ``htst`` columns.
    cutoff : rows are kept when ``htst`` is strictly greater than this value.
    quant : quantile as a fraction (e.g. 0.5 for the median).

    Returns
    -------
    DataFrame with a ``well`` column and a ``htst_p<quant*100>_<cutoff>m`` column,
    one row per distinct well in order of first appearance in ``df``. Wells that
    have no ``htst`` value above the cutoff are kept with NaN.
    """
    column = f'htst_p{quant*100:.0f}_{cutoff}m'
    wells = pd.unique(df['well'])

    # One pass over the filtered rows instead of re-scanning the frame for every well.
    above_cutoff = df.loc[df['htst'] > cutoff]
    per_well = above_cutoff.groupby('well', sort=False)['htst'].quantile(quant)

    # Reindex on every well so those filtered out entirely still get a (NaN) row.
    return (
        per_well
        .reindex(wells)
        .rename(column)
        .rename_axis('well')
        .reset_index()
    )
# Median htst per well of the bal8 layer table, counting only layers with htst > 1.
htst_p50 = quntile_calc(df8_ntd, 1, 0.5)
# df8_ntd_p50 = df8_ntd.set_index('well').join(htst_p50.set_index('well'))

# Agg functions bal8

In [None]:
# Column names of the bal8 general log table.
df8.columns

In [None]:
# Build a (well, field, formation) lookup from both log tables.
# The df8 field names carry a numeric prefix ('2_CENTRAL AZERI', ...) that is
# replaced with the plain names before the two tables are stacked.
df8_field = (
    df8[['well', 'formation_up', 'field']]
    .drop_duplicates()
    .replace({'field': {'2_CENTRAL AZERI': 'CENTRAL AZERI', '3_EAST AZERI': 'EAST AZERI', '1_WEST AZERI': 'WEST AZERI'}})
)
df10_field = df10[['well', 'formation_up', 'field']].drop_duplicates()

df_field = pd.concat([df8_field, df10_field]).drop_duplicates().reset_index(drop=True)
# Short formation code; any formation_up other than the two listed stays NaN.
df_field['formation'] = df_field['formation_up'].map({'Balakhany VIII': 'bal8', 'Balakhany X': 'bal10'})
df_field = df_field.drop(columns=['formation_up'])
df_field

In [58]:
def fluid_assign(df, fm):
    """Assign each well its most frequent ``fluid_code`` and a text label for it.

    Parameters
    ----------
    df : DataFrame with ``well`` and ``fluid_code`` columns.
    fm : value written to the ``formation`` column of every returned row.

    Returns
    -------
    DataFrame with columns ``well``, ``fluid_code``, ``fluid``, ``formation`` and
    at most one row per well. Codes outside 1-8 are labelled ``'un'``. A well
    whose ``fluid_code`` values are all missing has no mode and yields no row.
    """
    fluid_names = {
        1: 'gas',
        2: 'oil',
        3: 'water',
        4: 'tr_gas',
        5: 'res_gas',
        6: 'spt_oil',
        7: 'lcg',
        8: 'lco',
    }

    # Series.mode() returns every tied value. Keeping all of them gave a well
    # several rows, which then multiplied rows in joins on (well, formation).
    # head(1) keeps a single mode per well (pandas returns modes sorted, so a
    # tie resolves to the smallest code - verify on the pandas version in use).
    df_fluid = (
        df.groupby('well')['fluid_code']
        .apply(lambda codes: codes.mode().head(1))
        .reset_index()
        .drop(columns='level_1')
    )

    df_fluid['fluid'] = 'un'
    for code, label in fluid_names.items():
        df_fluid.loc[df_fluid.fluid_code == code, 'fluid'] = label
    df_fluid['formation'] = fm
    return df_fluid

# Most frequent fluid per well for each formation; the numeric fluid_code is
# dropped so only the text label and the formation tag remain, then both are stacked.
df8_fluid = fluid_assign(df8, 'bal8').drop(columns='fluid_code')
df10_fluid = fluid_assign(df10, 'bal10').drop(columns='fluid_code')
df_fluid = pd.concat([df8_fluid, df10_fluid]).reset_index(drop=True)

In [None]:
def agg_func_ntd(df, fm):
    """Aggregate a layer table to one row of summary statistics per well.

    Parameters
    ----------
    df : DataFrame with ``well``, ``phit_avg``, ``vsh_avg``, ``perm_avg``,
        ``htst`` and ``khtst`` columns.
    fm : value written to the ``formation`` column of every returned row.

    Returns
    -------
    DataFrame with ``well``, one ``<column>_<statistic>`` column per requested
    statistic (e.g. ``perm_avg_gmean_func``, ``htst_p50_1m``) and ``formation``.
    """
    def _named_percentile(name, q, cutoff=None):
        # pandas names the output column after the callable's __name__, so each
        # generated function is given the name its column suffix must carry.
        def stat(x):
            kept = x if cutoff is None else x[x > cutoff]
            return np.percentile(kept, q)
        stat.__name__ = name
        return stat

    def gmean_func(x):
        # Geometric mean: exponential of the mean of the logs.
        logs = np.log(x)
        return np.exp(np.mean(logs))

    # Percentiles over all values of a well ...
    p25 = _named_percentile('p25', 25)
    p50 = _named_percentile('p50', 50)
    p75 = _named_percentile('p75', 75)
    # ... and over values strictly greater than 1 only.
    p25_1m = _named_percentile('p25_1m', 25, cutoff=1)
    p50_1m = _named_percentile('p50_1m', 50, cutoff=1)
    p75_1m = _named_percentile('p75_1m', 75, cutoff=1)

    stats_by_column = {
        'phit_avg': ['mean', 'median', 'sum'],
        'vsh_avg': ['mean', 'median', 'sum'],
        'perm_avg': ['sum', gmean_func, p25, p50, p75],
        'htst': ['sum', 'count', p50, p25_1m, p50_1m, p75_1m],
        'khtst': ['sum'],
    }

    per_well = df.groupby('well').agg(stats_by_column).reset_index()
    # Flatten the (column, statistic) MultiIndex into 'column_statistic' names;
    # the group key becomes 'well_' and is renamed back to 'well'.
    per_well.columns = ['_'.join(parts).strip() for parts in per_well.columns.values]
    per_well = per_well.rename(columns={'well_': 'well'})
    per_well['formation'] = fm
    return per_well

# Aggregate each formation's layer table per well, then stack both results
# into one frame (the 'formation' column tells the rows apart).
df8_ntd_agg = agg_func_ntd(df8_ntd, 'bal8')
df10_ntd_agg = agg_func_ntd(df10_ntd, 'bal10')
df_ntd_agg = pd.concat([df8_ntd_agg, df10_ntd_agg]).reset_index(drop=True)
df_ntd_agg

In [None]:
# Pairwise scatter plots of selected per-well statistics, coloured by formation.
sns.pairplot(df_ntd_agg[['phit_avg_mean', 'vsh_avg_mean', 'perm_avg_p50', 'htst_sum', 'htst_count', 'khtst_sum', 'htst_p50_1m', 'formation']], hue='formation');

In [None]:
def dbscan_run(df, eps_run, samples_run):
    """Cluster the rows of ``df`` with DBSCAN on standardised features.

    Every column of ``df`` is used as a feature, so the caller must pass only
    the numeric columns to cluster on. The input frame is left untouched.

    Parameters
    ----------
    df : DataFrame of numeric feature columns.
    eps_run : ``eps`` passed to ``DBSCAN``.
    samples_run : ``min_samples`` passed to ``DBSCAN``.

    Returns
    -------
    A copy of ``df`` (same index) with an added ``cluster`` column holding the
    DBSCAN labels; -1 marks outliers/noise. A pairplot of the result, coloured
    by cluster, is drawn as a side effect.
    """
    # Work on a copy: writing the 'cluster' column into `df` itself would mutate
    # the caller's frame (or a temporary column slice of it).
    features = df.copy()

    # Standardise first - DBSCAN is distance based, so column scales matter.
    features_scaled = StandardScaler().fit_transform(features)

    dbscan = DBSCAN(eps=eps_run, min_samples=samples_run)
    dbscan.fit(features_scaled)

    # Labels are in row order of `features`; -1 indicates outliers/noise.
    features['cluster'] = dbscan.labels_

    sns.pairplot(features, hue='cluster', palette='viridis');
    return features

# Cluster the wells on (phit_avg_mean, htst_p50_1m) with eps=0.7 and min_samples=3.
dbscan_res = dbscan_run(df_ntd_agg[['phit_avg_mean', 'htst_p50_1m']], 0.7, 3)
# join() aligns on the row index, which dbscan_res shares with df_ntd_agg.
df_ntd_agg_v2 = df_ntd_agg.join(dbscan_res['cluster'])

In [None]:
# Column names after the cluster labels were attached.
df_ntd_agg_v2.columns

In [None]:
# Drop the rows DBSCAN labelled as noise (cluster == -1), then re-plot.
# NOTE: this overwrites df_ntd_agg_v2, so later cells only see the filtered frame.
df_ntd_agg_v2 = df_ntd_agg_v2[df_ntd_agg_v2.cluster != -1]
sns.pairplot(df_ntd_agg_v2[['phit_avg_mean', 'vsh_avg_mean', 'perm_avg_gmean_func', 
                            'htst_sum', 'htst_count', 'khtst_sum', 'htst_p50_1m', 
                            'formation']], hue='formation');

In [None]:
# Column names of the noise-filtered frame.
df_ntd_agg_v2.columns

In [15]:
# NOTE(review): this reads the same file that the Data section already loaded as df8,
# and log8_ntd is not used by any of the cells shown here - confirm whether it is still needed.
log8_ntd = pd.read_csv(r'C:\jupyter\SPP\inputoutput\general_logs\logs8_ntd_v5.csv')
# log8_ntd[['well','khtst', 'formation_up']].groupby('well').first()

In [None]:
# Pairwise scatter plots of the noise-filtered per-well statistics, including
# the htst percentiles, coloured by formation.
sns.pairplot(df_ntd_agg_v2[['phit_avg_mean', 'vsh_avg_mean', 'perm_avg_p50',
                            'khtst_sum',
                            'htst_sum', 'htst_count', 'htst_p50',
                            'htst_p25_1m', 'htst_p50_1m', 'htst_p75_1m',
                            'formation']], hue='formation');

In [64]:
# Attach the fluid label and the field name to every (well, formation) row.
# Both joins are index joins on (well, formation) and keep all rows of df_ntd_agg_v2.
df_ntd_agg_v3 = (
    df_ntd_agg_v2
    .set_index(['well', 'formation'])
    .join(df_fluid.set_index(['well', 'formation']))
    .join(df_field.set_index(['well', 'formation']))
    .reset_index()
)

In [None]:
# Distinct field names present after the join (NaN would indicate wells without a field match).
df_ntd_agg_v3.field.unique()

In [66]:
# Write the final per-well table to CSV without the row index.
# NOTE(review): absolute Windows path - only valid on a machine with this directory layout.
df_ntd_agg_v3.to_csv(r'C:\jupyter\SPP\inputoutput\layers\ntd_agg_v3.csv', index=False)