# Libs

In [2]:
import warnings
warnings.filterwarnings('ignore')

import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from sklearn.metrics.pairwise import euclidean_distances

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from pykrige.rk import RegressionKriging
from sklearn.model_selection import GridSearchCV
from pykrige.rk import Krige
from tqdm.notebook import tqdm
import plotly.graph_objects as go
import plotly.io as pio
import os
import matplotlib.image as mpimg

from sklearn.datasets import fetch_california_housing

# Notebook-wide pandas display settings (float rendering and truncation of wide/long frames).
pd.set_option('display.precision', 3)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option('display.max_columns', 15)
pd.set_option('display.max_rows', 6)

# Data loading

In [3]:
# --- Balakhany VIII well logs ---
# The Windows paths are raw strings: in a normal literal the backslash pairs in this path
# ('\j', '\S', '\i', '\g', '\d') are invalid escape sequences.
df_bal8_v4 = pd.read_csv(r'C:\jupyter\SPP\inputoutput\general_logs\df_bal8_azr_v4.csv')
df_bal8_v4.columns = df_bal8_v4.columns.str.lower()
# Replace the long formation names with short codes.
bal8_formation_codes = {
    'Balakhany VIII sand': '1_bal8_sand',
    'Balakhany VIII 25': '2_bal8_25',
    'Balakhany VIII 20': '3_bal8_20',
    'Balakhany VIII 15': '4_bal8_15',
    'Balakhany VIII 10': '5_bal8_10',
    'Balakhany VIII 5': '6_bal8_5',
}
for formation_name, formation_code in bal8_formation_codes.items():
    df_bal8_v4.loc[df_bal8_v4.formation == formation_name, 'formation'] = formation_code
# Wells that have at least one sample with phit_flag == 1.
well_phit_flag8 = df_bal8_v4[df_bal8_v4.phit_flag==1].groupby('well')['phit_flag'].apply(lambda x: x.iloc[0]).reset_index().well.unique()
df_bal8_v4_flag = df_bal8_v4[df_bal8_v4.well.isin(well_phit_flag8)]
# Remove the wells excluded from the Balakhany VIII working set.
df_bal8_v4_flag = df_bal8_v4_flag[~df_bal8_v4_flag.well.isin(['B01ST1', 'D01', 'C14', 'C01A', 'B06', 'C13Z', 'C06', 'D01Z','C07'])]

# --- Balakhany X well logs ---
df_bal10_v4 = pd.read_csv(r'C:\jupyter\SPP\inputoutput\general_logs\df_bal10_vshclp2_v4.csv')
df_bal10_v4.columns = df_bal10_v4.columns.str.lower()
# NOTE(review): 'Balakhany X 50' and 'Balakhany X 40' both map to '2_bal10_40' —
# presumably an intentional merge of the two units; confirm.
bal10_formation_codes = {
    'Balakhany X sand': '1_bal10_sand',
    'Balakhany X 50': '2_bal10_40',
    'Balakhany X 40': '2_bal10_40',
    'Balakhany X 20': '3_bal10_20',
}
for formation_name, formation_code in bal10_formation_codes.items():
    df_bal10_v4.loc[df_bal10_v4.formation == formation_name, 'formation'] = formation_code
# Wells that have at least one sample with phit_flag == 1.
well_phit_flag10 = df_bal10_v4[df_bal10_v4.phit_flag==1].groupby('well')['phit_flag'].apply(lambda x: x.iloc[0]).reset_index().well.unique()
df_bal10_v4_flag = df_bal10_v4[df_bal10_v4.well.isin(well_phit_flag10)]

In [4]:
# Balakhany VIII layer table: drop the saved index column, lower-case the headers and
# remove the excluded wells.
bal8_layer_excluded_wells = ['B01ST1', 'D01', 'C14', 'C01A', 'B06', 'C13Z', 'C06', 'D01Z','C07']
ntd_top_phi_bot8_bp_v4 = pd.read_csv(r'C:\jupyter\SPP\inputoutput\layers\ntd_top_phi_bot8_bp_v4.csv')
ntd_top_phi_bot8_bp_v4 = ntd_top_phi_bot8_bp_v4.drop(columns='Unnamed: 0')
ntd_top_phi_bot8_bp_v4.columns = ntd_top_phi_bot8_bp_v4.columns.str.lower()
bal8_layer_rows_kept = ~ntd_top_phi_bot8_bp_v4.well.isin(bal8_layer_excluded_wells)
ntd_top_phi_bot8_bp_v4 = ntd_top_phi_bot8_bp_v4[bal8_layer_rows_kept]

# Balakhany X layer table: same clean-up, no wells excluded.
ntd_top_phi_bot10_bp_v4 = pd.read_csv(r'C:\jupyter\SPP\inputoutput\layers\ntd_top_phi_bot10_bp_v4.csv')
ntd_top_phi_bot10_bp_v4 = ntd_top_phi_bot10_bp_v4.drop(columns='Unnamed: 0')
ntd_top_phi_bot10_bp_v4.columns = ntd_top_phi_bot10_bp_v4.columns.str.lower()

In [5]:
def well_dist_calc(dataset, fm):
    """Build a long table of pairwise well-to-well distances.

    One ('xmean', 'ymean') location per well is taken with ``groupby('well').first()``;
    wells without a complete location are dropped. Euclidean distances are then computed
    between every pair of remaining wells.

    Parameters
    ----------
    dataset : DataFrame
        Must contain 'well', 'xmean' and 'ymean' columns.
    fm : scalar
        Value written to the 'formation_up' column of every returned row.

    Returns
    -------
    DataFrame
        Columns 'well', 'well_offset', 'dist', 'formation_up', sorted by 'well' and then
        'dist'. Rows with a distance of exactly 0 (which includes every well paired with
        itself) are removed.
    """
    locations = dataset.groupby('well')[['xmean', 'ymean']].first().reset_index().dropna()
    well_names = list(locations.well)
    # Square matrix: one column per well, rows in the same well order.
    matrix = pd.DataFrame(euclidean_distances(locations[['xmean', 'ymean']]), columns=well_names)
    matrix.insert(0, 'well_offset', well_names)
    pairs = matrix.melt(id_vars='well_offset', var_name='well', value_name='dist')
    pairs = pairs[['well', 'well_offset', 'dist']]
    nonzero = pairs.dist != 0
    pairs = pairs[nonzero].sort_values(by=['well', 'dist'])
    return pairs.assign(formation_up=fm)

# Pairwise well-distance tables for the two formations, built from the phit-flagged wells.
dist_bal8 = well_dist_calc(df_bal8_v4_flag, 'Balakhany VIII')
dist_bal10 = well_dist_calc(df_bal10_v4_flag, 'Balakhany X')

# Well pairs

In [5]:
# df_lst = []
# for wellname in dist_bal8.well.unique():
#     data = dist_bal8[dist_bal8.well == wellname].iloc[0:1]
#     df_lst.append(data)
# well_pairs = pd.concat(df_lst).reset_index(drop=True).drop_duplicates(subset=['dist'])
# well_pairs_v2 = well_pairs[well_pairs.dist < 500] # 235m is the max distance between wells selected by elbow plot with distance 500m
# # well_pairs.hist(column='dist', bins=50)
# well_pairs_v2.sort_values(by='dist', ascending=True)

In [6]:
# fig = plt.figure(figsize=(20, 5))
# sns.lineplot(data=well_pairs_v2.sort_values(by='dist', ascending=False), x='well', y='dist')
# plt.xticks(rotation=90)
# plt.grid()

In [7]:
# df_lst = []
# for wellname in dist_bal8.well.unique():
#     data = dist_bal8[dist_bal8.well == wellname].iloc[0:1]
#     df_lst.append(data)
# well_pairs = pd.concat(df_lst).reset_index(drop=True).drop_duplicates(subset=['dist'])
# well_pairs_v3 = well_pairs[well_pairs.dist < 230] # 235m is the max distance between wells selected by elbow plot with distance 500m
# # well_pairs.hist(column='dist', bins=50)
# well_pairs_v3 = well_pairs_v3.sort_values(by='dist', ascending=True).reset_index(drop=True)
# well_pairs_v3

In [8]:
# def well_offset_comparison_dashboard_pairs(dataset_wells, dataset_layers, dist_df,  well_target, offset_qty, fm_name, print_flag):
#     offset_well_list = dist_df[dist_df.well == well_target].iloc[:offset_qty]['well_offset'].values.tolist()
#     # offset_well_list = dist_df[dist_df.well == well_target].iloc[:offset_qty]
#     well_list = [well_target] + offset_well_list
#     data_logs = dataset_wells[(dataset_wells.well.isin(well_list)) & (dataset_wells.phit != 0)]
#     data_layers = dataset_layers[   (dataset_layers.well.isin(well_list)) & 
#                                     (dataset_layers.htst > 1)]
#     khtst_logs = data_logs.groupby(['well','formation'])[['khtst']].apply(lambda x: x.iloc[0] - x.iloc[-1]).reset_index()

#     def khtst_layer_calculation(data_logs):
#         data = data_logs[data_logs.net == 1]
#         df_lst = []
#         for wellname in data.well.unique():
#             well_data = data_logs[data_logs.well == wellname]
#             well_data['tst_index_rev'] = [i for i in range(len(well_data['tst']))[::-1]]
#             df_lst.append(well_data)
#         data_logs_khtst = pd.concat(df_lst)
#         return data_logs_khtst
#     data_logs_khtst = khtst_layer_calculation(data_logs)

#     def well_dist_title(dist_df):
#         offset_well_list = dist_df[dist_df.well == well_target].iloc[:offset_qty]
#         well = offset_well_list['well'].iloc[0]
#         well1 = offset_well_list.iloc[0,1]
#         dist1 = offset_well_list.iloc[0,2].round(0).astype(int)
#         # well2 = offset_well_list.iloc[1,1]
#         # dist2 = offset_well_list.iloc[1,2].round(0).astype(int)
#         # well3 = offset_well_list.iloc[2,1]
#         # dist3 = offset_well_list.iloc[2,2].round(0).astype(int)
#         return f"target well {well} : offsets {well1} - {dist1}m orange;"

#     fig = plt.figure(figsize=(22, 10))
#     gs = gridspec.GridSpec(2, 4, figure=fig)
#     ax1 = fig.add_subplot(gs[0, 0])
#     ax2 = fig.add_subplot(gs[0, 1])
#     ax3 = fig.add_subplot(gs[0, 2])
#     ax4 = fig.add_subplot(gs[0, 3])
#     ax5 = fig.add_subplot(gs[1, :3])

#     custom_palette = {well_target: 'red', offset_well_list[0]: 'orange'}
#     sns.histplot(data=data_logs, x='phit', hue='well', bins=50, kde=True, ax=ax1, palette=custom_palette)
#     sns.scatterplot(data=data_layers, x='htst', y='perm_avg', hue='well', s=75, ax=ax2, alpha=0.5, ec='black', palette=custom_palette)
#     sns.lineplot(data=data_logs_khtst, x='tst_index_rev', y='khtst', hue='well', ax=ax3, palette=custom_palette)
#     sns.barplot(data = khtst_logs, x='formation', y='khtst', hue='well', ax=ax4, palette=custom_palette)
#     ax1.set_yticklabels(ax1.get_yticklabels(), rotation=90, va='center')
#     ax2.set_yscale('log')
#     ax2.grid(True, which='both', linestyle='--', linewidth=0.5)
#     ax2.set_yticklabels(ax2.get_yticklabels(), rotation=90, va='center')
#     ax3.grid(True, which='both', linestyle='--', linewidth=0.5)
#     ax3.set_yticklabels(ax3.get_yticklabels(), rotation=90, va='center')

#     x = np.arange(len(khtst_logs.formation.unique()))
#     fms = khtst_logs.formation.unique()
#     ax4.set_xticks(x, fms, rotation=45, fontsize=6)
#     ax4.set_yticklabels(ax4.get_yticklabels(), rotation=90, va='center')

#     offset_well_list = dist_df[dist_df.well == well_target].iloc[:offset_qty]['well_offset'].values.tolist()
#     x = dataset_wells[dataset_wells.phit_flag == 1]['xmean']
#     y = dataset_wells[dataset_wells.phit_flag == 1]['ymean']
#     x_target = dataset_wells[dataset_wells.well == well_target]['xmean'].iloc[0]
#     y_target = dataset_wells[dataset_wells.well == well_target]['ymean'].iloc[0]
#     x_well1 = dataset_wells[dataset_wells.well == offset_well_list[0]]['xmean'].iloc[0]
#     y_well1 = dataset_wells[dataset_wells.well == offset_well_list[0]]['ymean'].iloc[0]
#     # x_well2 = dataset_wells[dataset_wells.well == offset_well_list[1]]['xmean'].iloc[0]
#     # y_well2 = dataset_wells[dataset_wells.well == offset_well_list[1]]['ymean'].iloc[0]
#     # x_well3 = dataset_wells[dataset_wells.well == offset_well_list[2]]['xmean'].iloc[0]
#     # y_well3 = dataset_wells[dataset_wells.well == offset_well_list[2]]['ymean'].iloc[0]
#     ax5.scatter(x, y, color='gray', s=10)
#     ax5.scatter(x_target, y_target, color='red', s=50, ec='black')
#     ax5.scatter(x_well1, y_well1, color='orange')
#     # ax5.scatter(x_well2, y_well2, color='green')
#     # ax5.scatter(x_well3, y_well3, color='#0797eb')

#     plt.suptitle(well_dist_title(dist_df), fontsize=16, y=0.92, x=0.32)
#     if print_flag == 'print':
#         plt.savefig(f'C:/jupyter/SPP/plots/offset_dashboard/{fm_name}_{well_target}_offset_dashboard.png');

# # df_bal8_v4_flag = df_bal8_v4_flag[~df_bal8_v4_flag.well.isin(['E31Z', 'D01Z'])]
# dist_bal8 = well_dist_calc(df_bal8_v4_flag, 'Balakhany VIII').round(0)
# for wellname in well_pairs_v3.well:
#     try:
#         well_offset_comparison_dashboard_pairs(df_bal8_v4_flag, ntd_top_phi_bot8_bp_v4, well_pairs_v3, wellname, 1, 'bal8','dontprint')
#     except:
#         print(f"error in {wellname}")

# Gas wells

In [6]:
# Per-layer thickness-times-porosity product; kept as a column on the layer table.
ntd_top_phi_bot8_bp_v4['htst*phit_avg'] = ntd_top_phi_bot8_bp_v4.htst.mul(ntd_top_phi_bot8_bp_v4.phit_avg)

# One row per well: mean layer porosity, total thickness and summed product.
ntd8_layer_totals = ntd_top_phi_bot8_bp_v4.groupby('well').agg(**{
    'phit_avg': ('phit_avg', 'mean'),
    'htst': ('htst', 'sum'),
    'htst*phit_avg': ('htst*phit_avg', 'sum'),
}).reset_index()

# Well location and field name, one value per well.
xy = df_bal8_v4_flag.groupby('well')[['xmean','ymean']].first().reset_index()
field = df_bal8_v4.groupby('well')['field'].first().reset_index()

ntd8 = ntd8_layer_totals.merge(xy, on='well').round({'xmean':0, 'ymean':0})
# Thickness-weighted average porosity per well.
ntd8 = ntd8.assign(phit_w_avg=ntd8['htst*phit_avg'] / ntd8['htst'])
ntd8 = ntd8.merge(field, on='well')
ntd8 = ntd8[['well', 'phit_avg', 'htst', 'phit_w_avg', 'xmean', 'ymean', 'field']]
# One indicator column per field.
ntd8 = pd.get_dummies(ntd8, columns=['field'])

In [None]:
# Number of log samples per (well, fluid_code) and per well.
points_by_fluid = df_bal8_v4_flag.groupby(['well', 'fluid_code'])['tst'].count()
gas_wells = points_by_fluid.reset_index().rename(columns={'tst':'fluid_code_points'})
points_per_well = df_bal8_v4_flag.groupby('well')['tst'].count()
total = points_per_well.reset_index().rename(columns={'tst':'total_points'})

# Share of each fluid code within its well.
gas_wells_v2 = gas_wells.set_index('well').join(total.set_index('well')).reset_index()
gas_wells_v2['fluid_code%'] = gas_wells_v2['fluid_code_points'].div(gas_wells_v2['total_points'])
# A well is flagged when fluid_code 1 covers more than half of its samples
# (fluid_code 1 is presumably gas — TODO confirm).
mostly_fluid_code_1 = gas_wells_v2.fluid_code.eq(1) & gas_wells_v2['fluid_code%'].gt(0.5)
gas_wells_v2 = gas_wells_v2[mostly_fluid_code_1]
names_gas_wells_v2 = gas_wells_v2.well.unique()
print('total gas wells: ', len(names_gas_wells_v2))

ntd8['gas_well'] = 0
ntd8.loc[ntd8.well.isin(names_gas_wells_v2), 'gas_well'] = 1

# Total thickness vs thickness-weighted porosity, coloured by the gas-well flag.
custom_palette = {1: 'red', 0: 'green'}
sns.scatterplot(data=ntd8, x='htst', y='phit_w_avg', hue='gas_well', s=50, alpha=0.5, ec='black', palette=custom_palette)
for well_label, x_pos, y_pos in zip(ntd8.well, ntd8.htst, ntd8.phit_w_avg):
    plt.annotate(well_label, (x_pos, y_pos), fontsize=6)
plt.title('phit_w_avg vs htst')
plt.grid()
plt.xlim(20, 80)
plt.ylim(0.16, 0.30);

# New_phit testing

In [None]:
# New porosity curve: skip the first data row and the dataset-name column.
phit_new = pd.read_csv(r'C:\jupyter\SPP\input\ACG_phit_xover_comb.csv').iloc[1:].drop(columns='datasetName')
phit_new.columns = phit_new.columns.str.lower()
phit_new = phit_new.rename(columns={'wellname':'well','depth':'md'})
phit_new = phit_new.astype({'well':'string','md':'float', 'lperm_xover_new':'float','phit_xover_new':'float'})
phit_new['md'] = phit_new['md'].round(1)
# -9999 marks missing values in the new curve.
phit_new = phit_new[phit_new.phit_xover_new != -9999.]

# Attach the new curve to the flagged Balakhany VIII logs by (well, md).
# NOTE(review): only the new curve's md is rounded before this float-key join — confirm
# the log table's md is already on the same 0.1 grid.
join_keys = ['well','md']
df_bal8_v4_flag_new_phit = df_bal8_v4_flag.set_index(join_keys).join(phit_new.set_index(join_keys)).reset_index()
df_bal8_v4_flag_new_phit = df_bal8_v4_flag_new_phit[['well','md','tst','net','phit','phit_xover_new']].dropna()

# Per-well mean of both porosity curves over net samples, and their difference.
net_samples = df_bal8_v4_flag_new_phit[df_bal8_v4_flag_new_phit.net==1]
phit_new_group = net_samples.groupby('well')[['phit','phit_xover_new']].mean().reset_index()
phit_new_group['diff'] = phit_new_group.phit_xover_new - phit_new_group.phit
phit_new_group['gas_well'] = 0
phit_new_group.loc[phit_new_group.well.isin(names_gas_wells_v2), 'gas_well'] = 1

def plot_phit(phit_new_group):
    """Show an interactive scatter of per-well phit against phit_xover_new.

    Wells with gas_well == 0 are drawn in green and wells with gas_well == 1 in red.
    Two dashed reference lines (y = x and y = x + 0.01) are added.

    Parameters
    ----------
    phit_new_group : DataFrame
        Needs the columns 'well', 'phit', 'phit_xover_new', 'diff' and 'gas_well'.
    """
    hover = (
        'Well: %{text}<br>' +
        'phit_xover_new: %{y:.3f}<br>' +
        'phit: %{x:.3f}<br>' +
        'Diff: %{customdata:.3f}<extra></extra>'
    )
    well_groups = (
        (0, 'green', 'phit vs phit_xover_new oil wells'),
        (1, 'red', 'phit vs phit_xover_new gas wells'),
    )

    fig = go.Figure()
    for flag, colour, label in well_groups:
        subset = phit_new_group[phit_new_group.gas_well == flag]
        fig.add_trace(go.Scatter(
            x=subset['phit'],
            y=subset['phit_xover_new'],
            mode='markers',
            text=subset['well'],
            textposition='top center',
            hovertemplate=hover,
            customdata=subset['diff'].values,
            marker=dict(size=10, color=colour, opacity=0.5,
                        line=dict(color='black', width=1)),
            name=label
        ))

    # Reference lines: y=x and y=x+0.01
    a = 0.01
    reference_lines = (
        ([0.12, 0.25], 'Red', 'y=x'),
        ([0.12 + a, 0.25 + a], 'Orange', 'y=x+0.01'),
    )
    for y_values, colour, label in reference_lines:
        fig.add_trace(go.Scatter(
            x=[0.12, 0.25],
            y=y_values,
            mode='lines',
            line=dict(color=colour, dash='dash'),
            name=label
        ))

    legend_style = dict(
        x=0.01,  # legend anchored near the top-left corner
        y=0.99,
        bgcolor='rgba(255, 255, 255, 0.5)',  # semi-transparent white background
        bordercolor='Black',
        borderwidth=1
    )
    fig.update_layout(
        title='Plot of phit vs phit_xover_new',
        xaxis_title='phit',
        yaxis_title='phit_xover_new',
        showlegend=True,
        legend=legend_style,
        xaxis=dict(showgrid=True),
        yaxis=dict(showgrid=True),
        margin=dict(l=0, r=10, t=50, b=0),
        width=800,   # figure width
        height=600   # figure height
    )
    fig.show()
    # pio.write_html(fig, file='plots/plot_phit_v2.html', auto_open=True)

# Render the phit vs phit_xover_new comparison for the per-well means.
plot_phit(phit_new_group)

In [None]:
# Net samples only.
df_bal8_v4_flag_new_phit_v2 = df_bal8_v4_flag_new_phit[df_bal8_v4_flag_new_phit.net==1]

# Per well: mean of both porosity curves, and the net-sample count scaled by 0.1
# (presumably a 0.1 m sample step, giving net thickness — TODO confirm).
phit_new_group_v2 = df_bal8_v4_flag_new_phit_v2.groupby('well').agg(
    phit=('phit', 'mean'),
    phit_xover_new=('phit_xover_new', 'mean'),
    net=('net', lambda x: x.sum()*0.1),
).reset_index()
phit_new_group_v2['gas_well'] = 0
phit_new_group_v2.loc[phit_new_group_v2.well.isin(names_gas_wells_v2), 'gas_well'] = 1
# Keep wells whose scaled net value exceeds 25.
phit_new_group_v2 = phit_new_group_v2[phit_new_group_v2.net > 25 ]

def plot_phit_net(phit_new_group_v2):
    """Show an interactive scatter of per-well net against phit_xover_new.

    Wells with gas_well == 0 are drawn in green and wells with gas_well == 1 in red;
    hovering a marker shows the well name and both plotted values.

    Parameters
    ----------
    phit_new_group_v2 : DataFrame
        Needs the columns 'well', 'net', 'phit_xover_new' and 'gas_well'.
    """
    hover = (
        'Well: %{text}<br>' +
        'phit_xover_new: %{y:.3f}<br>' +
        'net: %{x:.3f}<br><extra></extra>'
    )
    well_groups = (
        (0, 'green', 'net vs phit_xover_new oil wells'),
        (1, 'red', 'net vs phit_xover_new gas wells'),
    )

    fig = go.Figure()
    for flag, colour, label in well_groups:
        subset = phit_new_group_v2[phit_new_group_v2.gas_well == flag]
        fig.add_trace(go.Scatter(
            x=subset['net'],
            y=subset['phit_xover_new'],
            mode='markers',
            # The hover label is the well name column (the first trace used to pass the
            # whole frame here, so '%{text}' did not show the well).
            text=subset['well'],
            textposition='top center',
            hovertemplate=hover,
            marker=dict(size=10, color=colour, opacity=0.5,
                        line=dict(color='black', width=1)),
            name=label
        ))

    fig.update_layout(
        title='Plot of net vs phit_xover_new',
        xaxis_title='net',  # the x axis carries 'net', not 'phit'
        yaxis_title='phit_xover_new',
        showlegend=True,
        legend=dict(
            x=0.65,  # legend anchored near the top-right area
            y=0.99,
            bgcolor='rgba(255, 255, 255, 0.5)',  # semi-transparent white background
            bordercolor='Black',
            borderwidth=1
        ),
        xaxis=dict(showgrid=True),
        yaxis=dict(showgrid=True),
        margin=dict(l=0, r=10, t=50, b=0),
        width=800,   # figure width
        height=600   # figure height
    )
    fig.show()
    # pio.write_html(fig, file='plot_phit_net.html', auto_open=True)

# plot_phit_net(phit_new_group_v2)

def plot_phit_net_sns(phit_new_group_v2, palette=None):
    """Draw two labelled scatter panels: phit vs net (left) and phit_xover_new vs net (right).

    Parameters
    ----------
    phit_new_group_v2 : DataFrame
        Needs the columns 'well', 'net', 'phit', 'phit_xover_new' and 'gas_well'.
    palette : dict, optional
        Maps each 'gas_well' value to a colour. Defaults to ``{1: 'red', 0: 'green'}``.
    """
    if palette is None:
        # Fixed here rather than read from the notebook-level `custom_palette`: that
        # global is reassigned with the colours swapped in a later cell, which would
        # silently recolour this plot on a re-run.
        palette = {1: 'red', 0: 'green'}

    fig, ax = plt.subplots(1,2, figsize=(15, 5))
    panels = (
        (ax[0], 'phit', 'phit vs net'),
        (ax[1], 'phit_xover_new', 'phit_xover_new vs net'),
    )
    for axis, column, title in panels:
        sns.scatterplot(data=phit_new_group_v2, x='net', y=column, hue='gas_well',
                        s=50, alpha=0.5, ec='black', palette=palette, ax=axis)
        # Label every marker with its well name.
        for well_label, x_pos, y_pos in zip(phit_new_group_v2.well,
                                            phit_new_group_v2.net,
                                            phit_new_group_v2[column]):
            axis.annotate(well_label, (x_pos, y_pos), fontsize=6)
        axis.set_title(title)
        axis.grid()
        # Same fixed window on both panels so they can be compared directly.
        axis.set_xlim(20, 85)
        axis.set_ylim(0.16, 0.30)

# Side-by-side comparison of the two porosity curves against net for the filtered wells.
plot_phit_net_sns(phit_new_group_v2)

# Anisotropy test 42

In [12]:
# param_dict = {
#     "method": ["ordinary", "universal"],
#     "variogram_model": ["linear", "exponential", "power", "gaussian", "spherical"],
#     "n_closest_points": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
#     "anisotropy_scaling": [(0,1), (0.5,1), (1,1), (1,0.5), (1,0)],
#     "anisotropy_angle": [(0, 0, 0), (30, 0, 0), (60, 0, 0), (90, 0, 0)],
# }

# feature_bal8 = ntd8[['htst','field_CENTRAL AZERI', 'field_EAST AZERI', 'field_WEST AZERI']].values.reshape(-1,4)
# coord_bal8 = ntd8[['xmean', 'ymean']].values
# target_bal8 = ntd8['phit_w_avg'].values

# f_train, f_test, xy_train, xy_test, target_train, target_test = train_test_split(
#     feature_bal8, coord_bal8, target_bal8, test_size=0.3, random_state=42
# )

# estimator = GridSearchCV(Krige(), param_dict, verbose=True, return_train_score=True)
# estimator.fit(X=xy_train, y=target_train)

# if hasattr(estimator, "best_score_"):
#     print("best_score R² = {:.3f}".format(estimator.best_score_))
#     print("best_params = ", estimator.best_params_)

In [13]:
# best_params = estimator.best_params_
# model = Krige(**best_params)
# model.fit(xy_train, target_train)
# pred_krige = model.predict(xy_test)
# pred_krige_df = pd.DataFrame(zip(pred_krige, target_test), columns=['phit_w_avg_pred', 'phit_w_avg_true'])
# xy_krige_df = pd.DataFrame(xy_test, columns=['x', 'y'])
# pred_krige = pd.concat([xy_krige_df, pred_krige_df], axis=1)
# pred_krige['up5%'] = pred_krige.phit_w_avg_true+0.0115
# pred_krige['down5%'] = pred_krige.phit_w_avg_true-0.0115
# pred_krige['qc'] = np.where((pred_krige.phit_w_avg_pred >= pred_krige['down5%']) & (pred_krige.phit_w_avg_pred <= pred_krige['up5%']), 1, 0)
# pred_krige['model'] = 'krige_42'
# display(pred_krige.value_counts('qc', normalize=True))

# custom_palette = {0: 'red', 1: 'green'}
# sns.scatterplot(pred_krige, x='phit_w_avg_true', y='phit_w_avg_pred', hue='qc', s=50, alpha=0.5, ec='black', palette=custom_palette)
# plt.plot([0.15,0.27], [0.15,0.27], color='red', ls='--')
# plt.plot([0.15,0.27], [0.15+0.0115,0.27+0.0115], color='green', ls='--')
# plt.plot([0.15,0.27], [0.15-0.0115,0.27-0.0115], color='green', ls='--')
# plt.grid()
# plt.xlim(0.15,0.27)
# plt.ylim(0.15,0.27)

# Testing the influence of ML, kriging, and ML+kriging on prediction

## Target based on nn-average

In [11]:
def add_phit_dist_df(df_ntd, df_dist):
    """Attach per-well porosity and thickness summaries to a well-pair distance table.

    For every well listed in ``df_dist.well_offset`` the layers in ``df_ntd`` are reduced to
    a total thickness (sum of 'htst') and a thickness-weighted mean of 'phit_avg'. Those two
    values are then joined onto each pair twice: once for the offset well and once for the
    target well.

    Parameters
    ----------
    df_ntd : DataFrame
        Layer table with 'well', 'htst' and 'phit_avg' columns.
    df_dist : DataFrame
        Pair table with 'well', 'well_offset', 'dist' and 'formation_up' columns.

    Returns
    -------
    DataFrame
        One row per row of ``df_dist`` with the columns 'well', 'phit_target', 'well_offset',
        'htst_target', 'dist_offset', 'phit_offset', 'htst_offset', 'formation_up'.
    """
    def _summarise(well_name):
        layers = df_ntd.loc[df_ntd.well == well_name, ['htst', 'phit_avg']]
        thickness = layers['htst'].sum()
        weighted_sum = (layers['htst'] * layers['phit_avg']).sum()
        return well_name, weighted_sum / thickness, thickness

    summary = pd.DataFrame([_summarise(name) for name in df_dist.well_offset.unique()],
                           columns=['well', 'phit_wavg', 'htst_sum'])
    # Same summary, labelled once for the offset side and once for the target side.
    offset_side = summary.rename(columns={'well': 'well_offset',
                                          'phit_wavg': 'phit_offset',
                                          'htst_sum': 'htst_offset'})
    target_side = summary.rename(columns={'phit_wavg': 'phit_target',
                                          'htst_sum': 'htst_target'})

    pairs = df_dist.merge(offset_side, on='well_offset', how='left')
    pairs = pairs.merge(target_side, on='well', how='left')
    pairs = pairs.rename(columns={'dist': 'dist_offset'})
    return pairs[['well', 'phit_target', 'well_offset', 'htst_target',
                  'dist_offset', 'phit_offset', 'htst_offset', 'formation_up']]

def add_htst_bins(df_ntd):
    """Count, per well, how many layers fall into each of four thickness classes.

    The class edges are 0, the 25 %, 50 % and 75 % quantiles of 'htst' (computed over layers
    with htst > 2 and rounded to whole numbers) and 100.

    Side effect: a categorical 'htst_bins' column is added to ``df_ntd`` in place.

    NOTE(review): the class labels are fixed text while the edges come from the data —
    confirm that the labels still describe the edges for the current data set.

    Parameters
    ----------
    df_ntd : DataFrame
        Layer table with 'well' and 'htst' columns.

    Returns
    -------
    DataFrame
        One row per well (in order of first appearance in ``df_ntd``) with the integer
        count columns 'htst_<3m', 'htst_3-4m', 'htst_4_7m', 'htst_>7m' followed by 'well'.
        A class with no layers in a well is reported as 0.
    """
    labels = ['htst_<3m', 'htst_3-4m', 'htst_4_7m', 'htst_>7m']
    quartiles = df_ntd.loc[df_ntd.htst > 2, 'htst'].quantile([0.25, 0.5, 0.75]).round(0)
    bins = [0, *quartiles.tolist(), 100]
    df_ntd['htst_bins'] = pd.cut(df_ntd.htst, bins=bins, labels=labels)

    wells = df_ntd.well.unique()
    # observed=False is spelled out so that classes a well never hits still come back as
    # 0 counts regardless of the pandas default for categorical groupers.
    counts = (df_ntd.groupby(['well', 'htst_bins'], observed=False)['htst']
                    .count()
                    .unstack('htst_bins', fill_value=0))
    counts.columns = counts.columns.astype(str)
    counts = counts.reindex(index=wells, columns=labels, fill_value=0)

    htst_bins_df = counts.reset_index(drop=True)
    htst_bins_df.columns = list(labels)
    htst_bins_df['well'] = wells
    return htst_bins_df

def selection_quantity_well(df_dist_v2, q):
    """Keep the first ``q`` rows of every well.

    Wells are processed in order of first appearance and the per-well pieces are stacked
    in that order, so the input should already be sorted the way the caller wants
    (e.g. by distance) within each well.

    Parameters
    ----------
    df_dist_v2 : DataFrame
        Table with a 'well' column.
    q : int
        Number of leading rows to keep per well.

    Returns
    -------
    DataFrame
        The selected rows with a fresh 0..n-1 index.
    """
    pieces = [df_dist_v2.loc[df_dist_v2.well == well_name].head(q)
              for well_name in df_dist_v2.well.unique()]
    return pd.concat(pieces).reset_index(drop=True)

def _offset_weighted_mean(values, distances, weighting):
    """Weighted mean of the offset-well ``values`` belonging to one target well."""
    values = np.asarray(values, dtype=float)
    distances = np.asarray(distances, dtype=float)
    if weighting == 'inverse_distance':
        coincident = distances == 0
        if coincident.any():
            # 1/0 would give an infinite weight; a coincident offset is used directly.
            return float(values[coincident].mean())
        weights = 1.0 / distances
    elif weighting == 'distance':
        weights = distances
    else:
        raise ValueError("weighting must be 'inverse_distance' or 'distance'")
    return float(np.average(values, weights=weights))


def _assess_by_offset_avg(df_dist_v3, prefix, quality_range, weighting):
    """Shared implementation of the phit/htst offset-average assessments (``prefix`` picks the columns)."""
    target_col = f'{prefix}_target'
    offset_col = f'{prefix}_offset'
    high_col = f'{prefix}_target_high'
    low_col = f'{prefix}_target_low'
    qc_col = f'{prefix}_pred_qc'

    # One estimate per target well, repeated on each of its rows. sort=False keeps the
    # wells in order of first appearance.
    frames = []
    for _, well_rows in df_dist_v3.groupby('well', sort=False):
        estimate = _offset_weighted_mean(well_rows[offset_col].values,
                                         well_rows['dist_offset'].values,
                                         weighting)
        frames.append(well_rows.assign(weighted_average=estimate))
    dist_v4 = pd.concat(frames).reset_index(drop=True)

    dist_v4[high_col] = dist_v4[target_col] + quality_range
    dist_v4[low_col] = dist_v4[target_col] - quality_range
    dist_v4[qc_col] = 'in_range'
    dist_v4.loc[dist_v4.weighted_average <= dist_v4[low_col], qc_col] = 'under_estimated'
    dist_v4.loc[dist_v4.weighted_average >= dist_v4[high_col], qc_col] = 'over_estimated'
    # NaN fails both comparisons above and would otherwise be counted as a hit.
    undefined = dist_v4.weighted_average.isna() | dist_v4[target_col].isna()
    dist_v4.loc[undefined, qc_col] = 'no_data'

    dist_v4 = dist_v4[['well', target_col, 'well_offset', 'dist_offset', offset_col, 'formation_up',
                       high_col, 'weighted_average', low_col, qc_col]]

    # Share of wells (not rows) per QC label, sorted by label.
    per_well = dist_v4[['well', qc_col]].drop_duplicates()
    result = (per_well[qc_col].value_counts(normalize=True)
                              .rename_axis(qc_col)
                              .rename('proportion')
                              .reset_index())
    result = result.sort_values(by=qc_col, ascending=True).reset_index(drop=True)
    return dist_v4, result


def assesment_phit_by_offset_avg(df_dist_v3, quality_range, weighting='inverse_distance'):
    """Predict each well's porosity from its offset wells and grade the prediction.

    The prediction is the weighted mean of 'phit_offset' over the rows of each well. It is
    graded against 'phit_target' +/- ``quality_range``.

    Parameters
    ----------
    df_dist_v3 : DataFrame
        Pair table with 'well', 'well_offset', 'dist_offset', 'phit_offset', 'phit_target'
        and 'formation_up' columns.
    quality_range : float
        Absolute tolerance around the target.
    weighting : {'inverse_distance', 'distance'}, optional
        'inverse_distance' (default) weights each offset by 1/dist so that nearer wells
        count more. 'distance' weights by dist itself, i.e. farther wells count more; it
        reproduces the earlier behaviour of this function.

    Returns
    -------
    (DataFrame, DataFrame)
        The pair table with 'weighted_average', 'phit_target_high', 'phit_target_low' and
        'phit_pred_qc' added, and a table with the columns 'phit_pred_qc' and 'proportion'
        giving the share of wells per label. Labels are 'in_range', 'under_estimated',
        'over_estimated', and 'no_data' when the prediction or the target is NaN.

    Raises
    ------
    ValueError
        If ``weighting`` is not one of the two supported values.
    """
    return _assess_by_offset_avg(df_dist_v3, 'phit', quality_range, weighting)


def assesment_htst_by_offset_avg(df_dist_v3, quality_range, weighting='inverse_distance'):
    """Predict each well's total thickness from its offset wells and grade the prediction.

    Same procedure as ``assesment_phit_by_offset_avg`` applied to the 'htst_offset' and
    'htst_target' columns; the added columns are 'weighted_average', 'htst_target_high',
    'htst_target_low' and 'htst_pred_qc'.

    Parameters
    ----------
    df_dist_v3 : DataFrame
        Pair table with 'well', 'well_offset', 'dist_offset', 'htst_offset', 'htst_target'
        and 'formation_up' columns.
    quality_range : float
        Absolute tolerance around the target.
    weighting : {'inverse_distance', 'distance'}, optional
        See ``assesment_phit_by_offset_avg``.

    Returns
    -------
    (DataFrame, DataFrame)
        The graded pair table and the per-label share of wells
        (columns 'htst_pred_qc' and 'proportion').

    Raises
    ------
    ValueError
        If ``weighting`` is not one of the two supported values.
    """
    return _assess_by_offset_avg(df_dist_v3, 'htst', quality_range, weighting)

# NOTE(review): this re-defines well_dist_calc with the same logic as the definition in an
# earlier cell; the later definition is the one in effect once this cell has run.
def well_dist_calc(dataset, fm):
    """Build a long table of pairwise well-to-well distances.

    One ('xmean', 'ymean') location per well is taken with ``groupby('well').first()``;
    wells without a complete location are dropped. Euclidean distances are then computed
    between every pair of remaining wells.

    Parameters
    ----------
    dataset : DataFrame
        Must contain 'well', 'xmean' and 'ymean' columns.
    fm : scalar
        Value written to the 'formation_up' column of every returned row.

    Returns
    -------
    DataFrame
        Columns 'well', 'well_offset', 'dist', 'formation_up', sorted by 'well' and then
        'dist'. Rows with a distance of exactly 0 (which includes every well paired with
        itself) are removed.
    """
    locations = dataset.groupby('well')[['xmean', 'ymean']].first().reset_index().dropna()
    well_names = list(locations.well)
    # Square matrix: one column per well, rows in the same well order.
    matrix = pd.DataFrame(euclidean_distances(locations[['xmean', 'ymean']]), columns=well_names)
    matrix.insert(0, 'well_offset', well_names)
    pairs = matrix.melt(id_vars='well_offset', var_name='well', value_name='dist')
    pairs = pairs[['well', 'well_offset', 'dist']]
    nonzero = pairs.dist != 0
    pairs = pairs[nonzero].sort_values(by=['well', 'dist'])
    return pairs.assign(formation_up=fm)

In [None]:
dist_bal8 = well_dist_calc(df_bal8_v4_flag, 'Balakhany VIII')
dist_bal10 = well_dist_calc(df_bal10_v4_flag, 'Balakhany X')

# Absolute tolerances used to grade the predictions (also shown in the plot titles).
phit_qc_range = 0.0115
htst_qc_range = 4.25

# The offset table does not depend on the number of offsets, so it is built once
# instead of on every loop iteration.
dist_bal8_v2 = add_phit_dist_df(ntd_top_phi_bot8_bp_v4, dist_bal8)

# Grade the offset-average prediction for 1..15 nearest offset wells.
df_range_phit_lst = []
df_range_htst_lst = []
for q in range(1,16):
    dist_bal8_v3 = selection_quantity_well(dist_bal8_v2, q)
    dist_bal8_v4_phit, result_bal8_phit = assesment_phit_by_offset_avg(dist_bal8_v3, phit_qc_range)
    dist_bal8_v4_htst, result_bal8_htst = assesment_htst_by_offset_avg(dist_bal8_v3, htst_qc_range)
    result_bal8_phit['offset_qty'] = q
    result_bal8_htst['offset_qty'] = q
    df_range_phit_lst.append(result_bal8_phit)
    df_range_htst_lst.append(result_bal8_htst)

df_range_phit = pd.concat(df_range_phit_lst).reset_index(drop=True)
df_range_htst = pd.concat(df_range_htst_lst).reset_index(drop=True)
# Best in-range share over all offset counts; the column is selected by name rather
# than by position.
in_range_share = df_range_phit.loc[df_range_phit.phit_pred_qc == 'in_range', 'proportion']
qc_phit_avg = np.round(in_range_share.max(), 3)

fig, ax = plt.subplots(1,2, figsize=(14,4))
sns.lineplot(data=df_range_phit, x='offset_qty', y='proportion', hue='phit_pred_qc', ax=ax[0])
ax[0].grid()
ax[0].set_title(f'Prediction by weighted avg offset {phit_qc_range}')
sns.lineplot(data=df_range_htst, x='offset_qty', y='proportion', hue='htst_pred_qc', ax=ax[1])
ax[1].grid()
ax[1].set_title(f'Prediction by weighted avg offset {htst_qc_range}');

## Calc nn-avg

In [None]:
# ntd_top_phi_bot8_bp_v4 = ntd_top_phi_bot8_bp_v4[~ntd_top_phi_bot8_bp_v4.well.isin(['B14Z','B19','B13ST2'])]
# dist_bal8 = well_dist_calc(df_bal8_v4_flag[~df_bal8_v4_flag.well.isin(['B14Z','B19','B13ST2'])], 'Balakhany VIII')

# Number of nearest offset wells used for the porosity (q1) and thickness (q2) estimates.
q1, q2 = 2, 2
dist_bal8_v2 = add_phit_dist_df(ntd_top_phi_bot8_bp_v4, dist_bal8)
dist_bal8_v3_phit = selection_quantity_well(dist_bal8_v2, q1)
dist_bal8_v3_htst = selection_quantity_well(dist_bal8_v2, q2)
dist_bal8_v4_phit, result_bal8_phit = assesment_phit_by_offset_avg(dist_bal8_v3_phit, 0.0115)
dist_bal8_v4_htst, result_bal8_htst = assesment_htst_by_offset_avg(dist_bal8_v3_htst, 4.25)
result_bal8_phit['offset_qty'] = q1
result_bal8_htst['offset_qty'] = q2

# Binary QC flag: 1 when the estimate is within tolerance of the target.
dist_bal8_v4_phit['qc'] = dist_bal8_v4_phit.phit_pred_qc.eq('in_range').astype(int)
dist_bal8_v4_htst['qc'] = dist_bal8_v4_htst.htst_pred_qc.eq('in_range').astype(int)

# Collapse the pair tables to one row per well (the offset-specific columns are dropped).
phit_well_columns = ['well', 'phit_target', 'formation_up', 'phit_target_high', 'weighted_average',
                     'phit_target_low', 'phit_pred_qc', 'qc']
htst_well_columns = ['well', 'htst_target', 'formation_up', 'htst_target_high', 'weighted_average',
                     'htst_target_low', 'htst_pred_qc', 'qc']
dist_bal8_v4_phit = dist_bal8_v4_phit[phit_well_columns].drop_duplicates().reset_index(drop=True)
dist_bal8_v4_htst = dist_bal8_v4_htst[htst_well_columns].drop_duplicates().reset_index(drop=True)

# Inputs for the train/test split: the per-well tables and their target columns.
feature_bal8_phit = dist_bal8_v4_phit[phit_well_columns]
target_bal8_phit = dist_bal8_v4_phit['phit_target'].to_numpy()

feature_bal8_htst = dist_bal8_v4_htst[htst_well_columns]
target_bal8_htst = dist_bal8_v4_htst['htst_target'].to_numpy()

def train_test_split_custom(feature, target, size, state):
    """Split `feature` and `target` with scikit-learn's `train_test_split`.

    Parameters
    ----------
    feature, target : array-likes of equal length to be split together.
    size : forwarded as `test_size`.
    state : forwarded as `random_state`.

    Returns
    -------
    tuple
        (feature_train, feature_test, target_train, target_test)
    """
    parts = train_test_split(feature, target, test_size=size, random_state=state)
    return tuple(parts)

# Same 50/50 split (random_state=42) for both properties.
f_train_phit, f_test_phit, target_train_phit, target_test_phit = train_test_split_custom(
    feature_bal8_phit, target_bal8_phit, 0.5, 42
)
f_train_htst, f_test_htst, target_train_htst, target_test_htst = train_test_split_custom(
    feature_bal8_htst, target_bal8_htst, 0.5, 42
)

fig, ax = plt.subplots(1, 2, figsize=(15, 5))
ax_phit, ax_htst = ax
custom_palette = {0: 'red', 1: 'green'}

# Left panel: phit target vs offset-weighted average on the test wells.
qc_calc_nn_avg_phit = f_test_phit['qc'].value_counts(normalize=True).reset_index()
phit_pass_share = qc_calc_nn_avg_phit.loc[qc_calc_nn_avg_phit.qc == 1, 'proportion'].values[0].round(2)
sns.scatterplot(data=f_test_phit, x='phit_target', y='weighted_average', hue='qc',
                palette=custom_palette, s=30, alpha=0.5, ec='black', ax=ax_phit)
sns.lineplot(x=[0.16, 0.27], y=[0.16, 0.27], color='blue', ls='--', ax=ax_phit)
ax_phit.set_title(f'nn_points {q1} nn_avg_phit qc = 1 ' + str(phit_pass_share))
ax_phit.grid()
for txt, x_pos, y_pos in zip(f_test_phit.well, f_test_phit.phit_target, f_test_phit.weighted_average):
    ax_phit.annotate(txt, (x_pos, y_pos), fontsize=6)

# Right panel: same view for htst.
qc_calc_nn_avg_htst = f_test_htst['qc'].value_counts(normalize=True).reset_index()
htst_pass_share = qc_calc_nn_avg_htst.loc[qc_calc_nn_avg_htst.qc == 1, 'proportion'].values[0].round(2)
sns.scatterplot(data=f_test_htst, x='htst_target', y='weighted_average', hue='qc',
                palette=custom_palette, s=30, alpha=0.5, ec='black', ax=ax_htst)
sns.lineplot(x=[30, 80], y=[30, 80], color='blue', ls='--', ax=ax_htst)
ax_htst.set_title(f'nn_points {q2} nn_avg_htst qc = 1 ' + str(htst_pass_share))
ax_htst.grid()
for txt, x_pos, y_pos in zip(f_test_htst.well, f_test_htst.htst_target, f_test_htst.weighted_average):
    ax_htst.annotate(txt, (x_pos, y_pos), fontsize=6);

## Feature creation

In [16]:
dist_bal8_v2_phit = add_phit_dist_df(ntd_top_phi_bot8_bp_v4, dist_bal8)
dist_bal8_v2_htstbins = add_htst_bins(ntd_top_phi_bot8_bp_v4)

# Attach the htst-bin columns of each *offset* well: the left side is keyed by
# 'well_offset', the right side by 'well'.
offsets_by_name = dist_bal8_v2_phit.set_index('well_offset')
bins_by_name = dist_bal8_v2_htstbins.set_index('well')
dist_bal8_v2 = offsets_by_name.join(bins_by_name).reset_index()

feature_creation_cols = [
    'well', 'phit_target', 'htst_target', 'well_offset', 'dist_offset',
    'phit_offset', 'htst_offset', 'htst_<3m', 'htst_3-4m',
    'htst_4_7m', 'htst_>7m', 'formation_up',
]
dist_bal8_v2 = dist_bal8_v2[feature_creation_cols].sort_values(by=['well', 'dist_offset'])
dist_bal8_v3 = selection_quantity_well(dist_bal8_v2, 2)

In [None]:
# Reshape the long (well, offset) table into one row per well: the first offset row
# becomes the *_w1 columns, the second the *_w2 columns, followed by the targets.
offset_feature_cols = ['well', 'well_offset', 'dist_offset', 'phit_offset', 'htst_offset',
                       'htst_<3m', 'htst_3-4m', 'htst_4_7m', 'htst_>7m']
df_lst = []
for wellname in dist_bal8_v3.well.unique():
    well_rows = dist_bal8_v3[dist_bal8_v3.well == wellname]
    offsets = well_rows[offset_feature_cols]
    first_offset = (offsets.iloc[0:1].reset_index(drop=True)
                    .add_suffix('_w1').rename(columns={'well_w1': 'well'}))
    second_offset = (offsets.iloc[1:2].reset_index(drop=True)
                     .add_suffix('_w2').drop('well_w2', axis=1))
    targets = well_rows[['phit_target', 'htst_target']].iloc[0:1].reset_index(drop=True)
    df_lst.append(pd.concat([first_offset, second_offset, targets], axis=1))
dist_bal8_v4 = pd.concat(df_lst).reset_index(drop=True)
dist_bal8_v4.columns

In [18]:
# Column groups of the wide per-well table; the _w1 / _w2 suffix marks the first /
# second offset row of each well.
_offset_suffixes = ('w1', 'w2')
_offset_base_cols = ['dist_offset', 'phit_offset', 'htst_offset']
_htst_bin_cols = ['htst_<3m', 'htst_3-4m', 'htst_4_7m', 'htst_>7m']

# feature_list1: distance / phit / htst of both offsets plus the target
# (the htst-bin columns are deliberately left out of this set).
feature_list1 = [f'{col}_{sfx}' for sfx in _offset_suffixes for col in _offset_base_cols]
feature_list1.append('phit_target')

# feature_list2: htst-bin columns of both offsets plus the target
# (distance / phit / htst offsets are deliberately left out of this set).
feature_list2 = [f'{col}_{sfx}' for sfx in _offset_suffixes for col in _htst_bin_cols]
feature_list2.append('phit_target')

In [None]:
# Pairwise scatter matrix of the feature_list2 columns, KDE on the diagonal.
sns.pairplot(dist_bal8_v4[feature_list2], plot_kws={'alpha': 0.5}, diag_kind='kde')

In [21]:
# 1. write a function that computes the prediction with different random_state values for the default kriging and model settings   +
# 2. evaluate the stability of the predictions over 100 different random_state values                                              +
# 3. compute the prediction separately for regression, regression + kriging, and kriging alone - default settings everywhere      +
# 4. add a porosity estimate based on neighbouring wells                                                                           +

# Phit prediction

## Data preparation

In [22]:
# Several statistical metrics are commonly used to describe the distribution of a dataset. 
# These metrics provide insights into the shape, central tendency, and spread of the data. Here are some of the key metrics:

# Mean: The average of all data points. It provides a measure of central tendency.

# Median: The middle value when the data points are arranged in order. It is another measure of central tendency that 
# is less affected by outliers than the mean.

# Mode: The most frequently occurring value(s) in the dataset. It can be used to understand the most common or 
# popular values in a distribution.

# Standard Deviation (SD): Measures the amount of variation or dispersion of a set of values. A low SD indicates that 
# the values tend to be close to the mean, while a high SD indicates that the values are spread out over a wider range.

# Variance: The square of the standard deviation. It measures how far each number in the set is from the mean and thus 
# from every other number in the set.

# Range: The difference between the highest and lowest values in the dataset. It gives a sense of the spread of the data.

# Interquartile Range (IQR): The difference between the 75th percentile (Q3) and the 25th percentile (Q1) in the data. 
# It is a measure of statistical dispersion and is less affected by outliers.

# Skewness: A measure of the asymmetry of the probability distribution of a real-valued random variable. Positive skew indicates 
# a distribution with an asymmetric tail extending towards more positive values, while negative skew indicates a tail extending 
# towards more negative values.

# Kurtosis: A measure of the "tailedness" of the probability distribution. High kurtosis means a distribution has heavy tails 
# and a sharp peak, while low kurtosis means a distribution has light tails and a flat peak.

# Percentiles/Quartiles: Points in the distribution below which a certain percentage of the data falls. Quartiles are specific 
# percentiles: the 25th percentile (Q1), the 50th percentile (median or Q2), and the 75th percentile (Q3).

In [19]:
# Thickness-weighted porosity numerator per row (added to the source frame in place).
ntd_top_phi_bot8_bp_v4['htst*phit_avg'] = (
    ntd_top_phi_bot8_bp_v4['htst'].mul(ntd_top_phi_bot8_bp_v4['phit_avg'])
)

# Per-well totals: mean phit, summed htst and summed htst*phit.
well_agg_spec = {'phit_avg': 'mean', 'htst': 'sum', 'htst*phit_avg': 'sum'}
ntd8 = (
    ntd_top_phi_bot8_bp_v4
    .groupby('well')[['phit_avg', 'htst', 'htst*phit_avg']]
    .agg(well_agg_spec)
    .reset_index()
)

# First recorded coordinates / field name of every well.
xy = df_bal8_v4_flag.groupby('well')[['xmean', 'ymean']].first().reset_index()
field = df_bal8_v4.groupby('well')['field'].first().reset_index()

ntd8 = (
    ntd8
    .merge(xy, on='well')
    .round({'xmean': 0, 'ymean': 0})
    # htst-weighted average porosity = sum(htst * phit) / sum(htst)
    .assign(phit_w_avg=lambda frame: frame['htst*phit_avg'] / frame['htst'])
    .loc[:, ['well', 'phit_avg', 'htst', 'phit_w_avg', 'xmean', 'ymean']]
    .merge(field, on='well')
    .loc[:, ['well', 'phit_avg', 'htst', 'phit_w_avg', 'xmean', 'ymean', 'field']]
    .pipe(pd.get_dummies, columns=['field'])
)

## No features

In [24]:
# ntd8 = ntd8[~ntd8.well.isin(['B14Z','B19','B13ST2'])]

### One-to-all ml-kriging

In [241]:
# df_lst = []
# for idx in range(len(ntd8)):
#     feature_bal8_all = np.delete(ntd8[['htst']].values.reshape(-1,1), idx, axis=0)
#     coord_bal8_all = np.delete(ntd8[['xmean', 'ymean']].values, idx, axis=0)
#     target_bal8_all = np.delete(ntd8['phit_w_avg'].values, idx, axis=0)

#     feature_bal8_one = ntd8[['htst']].values[idx].reshape(-1,1)
#     coord_bal8_one = ntd8[['xmean', 'ymean']].values[idx].reshape(-1,2)
#     target_bal8_one = ntd8['phit_w_avg'].values[idx]

#     m_rk = RegressionKriging(regression_model=SVR(), n_closest_points=4, verbose=False)
#     m_rk.fit(feature_bal8_all, coord_bal8_all, target_bal8_all)
#     pred_rk_loo = m_rk.predict(feature_bal8_one, coord_bal8_one)
#     pred_rk_loo_df = pd.DataFrame({ 'model': SVR().__class__.__name__,
#                                     'nn_points': 4,
#                                     'well': ntd8.well[idx],
#                                     'phit_w_avg_pred':pred_rk_loo, 
#                                     'phit_w_avg_true':target_bal8_one}, index=[0])
#     df_lst.append(pred_rk_loo_df)
# df_1 = pd.concat(df_lst).reset_index(drop=True)
# df_1

In [242]:
# svr_model = SVR()
# rf_model = RandomForestRegressor()
# lr_model = LinearRegression()
# model_list = [svr_model, rf_model, lr_model]

# model_lst = []
# for model in tqdm(model_list):  
#     nn_lst = []
#     for nn_points in range(2,31,1):
#         df_lst = []
#         for idx in range(len(ntd8)):
#             feature_bal8_all = np.delete(ntd8[['htst']].values.reshape(-1,1), idx, axis=0)
#             coord_bal8_all = np.delete(ntd8[['xmean', 'ymean']].values, idx, axis=0)
#             target_bal8_all = np.delete(ntd8['phit_w_avg'].values, idx, axis=0)

#             feature_bal8_one = ntd8[['htst']].values[idx].reshape(-1,1)
#             coord_bal8_one = ntd8[['xmean', 'ymean']].values[idx].reshape(-1,2)
#             target_bal8_one = ntd8['phit_w_avg'].values[idx]

#             m_rk = RegressionKriging(regression_model=rf_model, n_closest_points=nn_points, verbose=False)
#             m_rk.fit(feature_bal8_all, coord_bal8_all, target_bal8_all)
#             pred_rk_loo = m_rk.predict(feature_bal8_one, coord_bal8_one)
#             pred_rk_loo_df = pd.DataFrame({ 'model': model.__class__.__name__,
#                                             'nn_points': nn_points,
#                                             'well': ntd8['well'].values[idx],
#                                             'phit_w_avg_pred':pred_rk_loo, 
#                                             'phit_w_avg_true':target_bal8_one}, index=[0])
#             df_lst.append(pred_rk_loo_df)
#         df_1 = pd.concat(df_lst).reset_index(drop=True)
#         nn_lst.append(df_1)
#     df_2 = pd.concat(nn_lst).reset_index(drop=True)
#     model_lst.append(df_2)
# result = pd.concat(model_lst).reset_index(drop=True)

# result['up_1.15pu'] = result.phit_w_avg_true+0.0115
# result['down_1.15pu'] = result.phit_w_avg_true-0.0115
# result['qc'] = np.where((result.phit_w_avg_pred >= result['down_1.15pu']) 
#                                 & (result.phit_w_avg_pred <= result['up_1.15pu']), 1, 0)
# result.to_csv('output/result_pred_ml_kr_loo.csv', index=False)

In [None]:
result = pd.read_csv('output/result_pred_ml_kr_loo.csv')

# Share of in-range (qc == 1) and out-of-range (qc == 0) predictions for every
# (model, nn_points) combination present in the saved leave-one-out results.
qc_share_records = []
for model in result.model.unique():
    for nn in result.nn_points.unique():
        group = result[(result.model == model) & (result.nn_points == nn)]
        if group.empty:
            # This model was not run for this nn_points value: nothing to summarise.
            continue
        shares = group['qc'].value_counts(normalize=True)
        # .get(..., 0.0): a group with only passes (or only fails) has no entry for the
        # other class; plain label indexing would raise KeyError there.
        qc_share_records.append({
            'model': model,
            'nn_points': nn,
            'qc_1': shares.get(1, 0.0),
            'qc_0': shares.get(0, 0.0),
        })
result_v2 = pd.DataFrame(qc_share_records, columns=['model', 'nn_points', 'qc_1', 'qc_0'])

def display_results_loo(result_v2):
    """Draw one step plot per model of the qc_1 share against nn_points.

    `result_v2` must provide the columns 'model', 'nn_points' and 'qc_1'.
    Every panel uses the same fixed y-range (0.45 to 0.55).
    """
    # (model name used as filter and title, legend label, line colour)
    panels = (
        ('SVR', 'SVR', 'blue'),
        ('RandomForestRegressor', 'RRF', 'red'),
        ('LinearRegression', 'LR', 'green'),
    )
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    for axis, (model_name, legend_label, line_color) in zip(axes, panels):
        model_rows = result_v2[result_v2.model == model_name]
        axis.step(model_rows['nn_points'], model_rows['qc_1'],
                  where='mid', label=legend_label, color=line_color)
        axis.grid()
        axis.set_title(model_name)
        axis.set_ylim(0.45, 0.55)
display_results_loo(result_v2)

In [None]:
# Leave-one-out SVR predictions for two neighbourhood sizes, shown side by side.
svr_13 = result.loc[result.model.eq('SVR') & result.nn_points.eq(13)]
display(svr_13)
svr_5 = result.loc[result.model.eq('SVR') & result.nn_points.eq(5)]

fig, ax = plt.subplots(1, 2, figsize=(15, 5))
for axis, svr_rows, nn_value in zip(ax, (svr_13, svr_5), (13, 5)):
    sns.scatterplot(data=svr_rows, x='phit_w_avg_true', y='phit_w_avg_pred', hue='qc',
                    palette=custom_palette, s=50, alpha=0.5, ec='black', ax=axis)
    # 1:1 reference line
    sns.lineplot(x=[0.15, 0.27], y=[0.15, 0.27], color='blue', ls='--', ax=axis)
    axis.set_title(f'SVR nn_points {nn_value}')
    for rec in svr_rows.itertuples(index=False):
        axis.text(rec.phit_w_avg_true, rec.phit_w_avg_pred, rec.well, fontsize=6);

### ml + kriging

In [22]:
def ml_kriging_no_features():
    """Regression-kriging of `phit_w_avg` from `htst` only; plot predicted vs true.

    Reads the module-level `ntd8` frame. SVR, RandomForestRegressor and
    LinearRegression are each wrapped in `RegressionKriging` and scored for
    `n_closest_points` in 2..15 on a 50/50 split (random_state=42). The
    combination with the highest `rk_score` is refitted and its test-set
    predictions get `qc = 1` when they lie within +/-0.0115 of the true value.

    NOTE(review): the best combination is selected on the same test split that
    is then used for the QC share, so that share is likely optimistic.

    Returns
    -------
    tuple
        (qc_calc_mlk_nof, prediction_df_nof, ml_kriging_nof): normalized `qc`
        value counts, per-well predictions of the best combination, and the
        score table of all (model, n_closest_points) combinations.
    """
    def ml_kriging_prediction_bal8(models, random_state_value, test_size_value):
        """Split `ntd8`, score all combinations, and predict with the best one."""
        # Column 0 carries the well name so it survives the split; models only see column 1+.
        feature_bal8 = ntd8[['well', 'htst']].values
        coord_bal8 = ntd8[['xmean', 'ymean']].values
        target_bal8 = ntd8['phit_w_avg'].values

        f_train, f_test, xy_train, xy_test, target_train, target_test = train_test_split(
            feature_bal8, coord_bal8, target_bal8, test_size=test_size_value, random_state=random_state_value
        )

        def model_mlkrige_run(f_train, f_test, xy_train, xy_test, target_train, target_test, models):
            """Return the regression and regression-kriging test scores per model and nn."""
            df_final_lst = []
            for m in models:
                reg_score_lst, rk_score_lst, nn_lst, m_lst = [], [], [], []
                for nn in range(2, 16):
                    print("n_closest_points:", nn)
                    m_rk = RegressionKriging(regression_model=m, n_closest_points=nn, verbose=False)
                    m_rk.fit(f_train[:, 1:], xy_train, target_train)
                    reg_score_lst.append(m_rk.regression_model.score(f_test[:, 1:], target_test))
                    rk_score_lst.append(m_rk.score(f_test[:, 1:], xy_test, target_test))
                    nn_lst.append(nn)
                    # The estimator object itself is stored so the best one can be refitted later.
                    m_lst.append(m)
                result = pd.DataFrame({'model': m_lst, 'n_closest_points': nn_lst,
                                       'reg_score': reg_score_lst, 'rk_score': rk_score_lst})
                result['random_state'] = random_state_value
                result['test_size'] = test_size_value
                df_final_lst.append(result)
            # Concatenate once, after all models have been scored.
            return pd.concat(df_final_lst)
        ml_kriging = model_mlkrige_run(f_train, f_test, xy_train, xy_test, target_train, target_test, models)

        def model_mlkrige_best_res(f_train, f_test, xy_train, xy_test, target_train, target_test, models_df):
            """Refit the top-scoring combination and return its QC-flagged test predictions."""
            best_row = models_df.sort_values(by='rk_score', ascending=False).iloc[0]
            best_model = best_row['model']
            nn_points = int(best_row['n_closest_points'])
            m_rk = RegressionKriging(regression_model=best_model, n_closest_points=nn_points)
            m_rk.fit(f_train[:, 1:], xy_train, target_train)

            pred = m_rk.predict(f_test[:, 1:], xy_test)
            pred_df = pd.DataFrame({'phit_w_avg_pred': pred, 'phit_w_avg_true': target_test})
            pred_df['well'] = f_test[:, 0]
            # Store the class name, not the estimator: a fitted ensemble is iterable, so pandas
            # would treat it as a list-like column and fail on the length mismatch.
            pred_df['model'] = type(best_model).__name__
            pred_df['n_closest_points'] = nn_points
            pred_df['random_state'] = random_state_value
            pred_df['test_size'] = test_size_value
            pred_df['up_1.15pu'] = pred_df.phit_w_avg_true + 0.0115
            pred_df['down_1.15pu'] = pred_df.phit_w_avg_true - 0.0115
            pred_df['qc'] = np.where((pred_df.phit_w_avg_pred >= pred_df['down_1.15pu'])
                                     & (pred_df.phit_w_avg_pred <= pred_df['up_1.15pu']), 1, 0)
            return pred_df
        prediction_df = model_mlkrige_best_res(f_train, f_test, xy_train, xy_test, target_train, target_test, ml_kriging)

        return prediction_df, ml_kriging

    svr_model = SVR()
    # Seeded so that repeated runs of the notebook give the same forest.
    rf_model = RandomForestRegressor(random_state=42)
    lr_model = LinearRegression()

    models_test = [svr_model, rf_model, lr_model]
    prediction_df_nof, ml_kriging_nof = ml_kriging_prediction_bal8(models_test, 42, 0.5)
    qc_calc_mlk_nof = prediction_df_nof.qc.value_counts(normalize=True).reset_index()
    # Mean of the 0/1 flag equals the share of qc == 1 and is defined even when no row passes.
    pass_share = round(float(prediction_df_nof.qc.mean()), 2)
    custom_palette = {0: 'red', 1: 'green'}
    sns.scatterplot(data=prediction_df_nof, x='phit_w_avg_true', y='phit_w_avg_pred', s=30, hue='qc',
                    alpha=0.5, ec='black', palette=custom_palette)
    sns.lineplot(x=[0.16, 0.27], y=[0.16, 0.27], color='blue', ls='--')
    plt.title('qc = 1 ' + str(pass_share))
    plt.grid()
    for txt, x_true, y_pred in zip(prediction_df_nof.well, prediction_df_nof.phit_w_avg_true,
                                   prediction_df_nof.phit_w_avg_pred):
        plt.annotate(txt, (x_true, y_pred), fontsize=6)
    return qc_calc_mlk_nof, prediction_df_nof, ml_kriging_nof

qc_calc_mlk_nof, pred_ml_kri_nof_df, ml_kri_nof = ml_kriging_no_features()
pred_ml_kri_nof_df['type'] = 'ml_kriging_no_features'
# pred_ml_nof_df

### ml

In [23]:
def ml_no_features():
    """Predict `phit_w_avg` from `htst` alone with a random forest and plot the result.

    Reads the module-level `ntd8` frame, uses a 50/50 split (random_state=42) and
    flags a test prediction `qc = 1` when it lies within +/-0.0115 of the true value.

    Returns
    -------
    tuple
        (qc_calc_ml_nof, pred_ml_nof_df): normalized `qc` value counts and the
        per-well test predictions.
    """
    # Seeded so that repeated runs of the notebook give the same forest.
    ml_model_nof = RandomForestRegressor(random_state=42)

    # Column 0 carries the well name through the split; the model only sees column 1+.
    feature_bal8 = ntd8[['well', 'htst']].values
    target_bal8 = ntd8['phit_w_avg'].values

    f_train, f_test, target_train, target_test = train_test_split(
        feature_bal8, target_bal8, test_size=0.5, random_state=42
    )
    ml_model_nof.fit(f_train[:, 1:], target_train)
    pred_ml_nof = ml_model_nof.predict(f_test[:, 1:])

    pred_ml_nof_df = pd.DataFrame({'phit_w_avg_pred': pred_ml_nof, 'phit_w_avg_true': target_test})
    pred_ml_nof_df['well'] = f_test[:, 0]
    pred_ml_nof_df['model'] = ml_model_nof.__class__.__name__
    pred_ml_nof_df['random_state'] = 42
    pred_ml_nof_df['test_size'] = 0.5
    pred_ml_nof_df['up_1.15pu'] = pred_ml_nof_df.phit_w_avg_true + 0.0115
    pred_ml_nof_df['down_1.15pu'] = pred_ml_nof_df.phit_w_avg_true - 0.0115
    pred_ml_nof_df['qc'] = np.where((pred_ml_nof_df.phit_w_avg_pred >= pred_ml_nof_df['down_1.15pu'])
                                    & (pred_ml_nof_df.phit_w_avg_pred <= pred_ml_nof_df['up_1.15pu']), 1, 0)

    qc_calc_ml_nof = pred_ml_nof_df.qc.value_counts(normalize=True).reset_index()
    # Mean of the 0/1 flag equals the share of qc == 1 and is defined even when no row passes.
    pass_share = round(float(pred_ml_nof_df.qc.mean()), 2)
    custom_palette = {0: 'red', 1: 'green'}
    sns.scatterplot(data=pred_ml_nof_df, x='phit_w_avg_true', y='phit_w_avg_pred', s=30, hue='qc',
                    alpha=0.5, ec='black', palette=custom_palette)
    sns.lineplot(x=[0.16, 0.27], y=[0.16, 0.27], color='blue', ls='--')
    plt.title('qc = 1 ' + str(pass_share))
    plt.grid()
    for txt, x_true, y_pred in zip(pred_ml_nof_df.well, pred_ml_nof_df.phit_w_avg_true,
                                   pred_ml_nof_df.phit_w_avg_pred):
        plt.annotate(txt, (x_true, y_pred), fontsize=6)

    return qc_calc_ml_nof, pred_ml_nof_df

qc_calc_ml_nof, pred_ml_nof_df = ml_no_features()
pred_ml_nof_df['type'] = 'ml_no_features'

### kriging

In [24]:
def kriging_no_features():
    """Ordinary kriging of `phit_w_avg` from well coordinates only; plot predicted vs true.

    Reads the module-level `ntd8` frame, uses a 50/50 split (random_state=42),
    a linear variogram with `n_closest_points=2`, and flags a test prediction
    `qc = 1` when it lies within +/-0.0115 of the true value.

    Returns
    -------
    tuple
        (qc_calc_k_nof, kriging_nof_df): normalized `qc` value counts and the
        per-well test predictions.
    """
    htst_values = ntd8[['htst']].values.reshape(-1, 1)
    # Column 0 carries the well name through the split; kriging only sees columns 1+.
    named_coords = ntd8[['well', 'xmean', 'ymean']].values
    phit_values = ntd8['phit_w_avg'].values

    f_train, f_test, xy_train, xy_test, target_train, target_test = train_test_split(
        htst_values, named_coords, phit_values, test_size=0.5, random_state=42
    )

    kriging_nof = Krige(n_closest_points=2, method='ordinary', variogram_model='linear')
    kriging_nof.fit(x=xy_train[:, 1:], y=target_train)
    pred_kriging_nof = kriging_nof.predict(xy_test[:, 1:])

    kriging_nof_df = pd.DataFrame(zip(pred_kriging_nof, target_test),
                                  columns=['phit_w_avg_pred', 'phit_w_avg_true'])
    kriging_nof_df['well'] = xy_test[:, 0]
    kriging_nof_df['model'] = type(kriging_nof).__name__
    kriging_nof_df['n_closest_points'] = 2
    kriging_nof_df['random_state'] = 42
    kriging_nof_df['test_size'] = 0.5
    kriging_nof_df['up_1.15pu'] = kriging_nof_df['phit_w_avg_true'] + 0.0115
    kriging_nof_df['down_1.15pu'] = kriging_nof_df['phit_w_avg_true'] - 0.0115
    above_lower = kriging_nof_df['phit_w_avg_pred'] >= kriging_nof_df['down_1.15pu']
    below_upper = kriging_nof_df['phit_w_avg_pred'] <= kriging_nof_df['up_1.15pu']
    kriging_nof_df['qc'] = np.where(above_lower & below_upper, 1, 0)

    qc_calc_k_nof = kriging_nof_df['qc'].value_counts(normalize=True).reset_index()
    pass_share = qc_calc_k_nof.loc[qc_calc_k_nof.qc == 1, 'proportion'].values[0].round(2)
    custom_palette = {0: 'red', 1: 'green'}
    sns.scatterplot(data=kriging_nof_df, x='phit_w_avg_true', y='phit_w_avg_pred', hue='qc',
                    palette=custom_palette, s=30, alpha=0.5, ec='black')
    sns.lineplot(x=[0.16, 0.27], y=[0.16, 0.27], color='blue', ls='--')
    plt.title('qc = 1 ' + str(pass_share))
    plt.grid()
    for txt, x_true, y_pred in zip(kriging_nof_df.well, kriging_nof_df.phit_w_avg_true,
                                   kriging_nof_df.phit_w_avg_pred):
        plt.annotate(txt, (x_true, y_pred), fontsize=6)

    return qc_calc_k_nof, kriging_nof_df

qc_calc_k_nof, pred_kri_nof_df = kriging_no_features()
pred_kri_nof_df['type'] = 'kriging_no_features'

## conclusions no feature

In [25]:
def _qc_pass_share(qc_table):
    """Share of qc == 1 rows in a normalized value-counts table, rounded to 3 decimals."""
    return qc_table.loc[qc_table.qc == 1, 'proportion'].values[0].round(3)

# Side-by-side share of in-range predictions for every "no features" approach.
print('phit_w_avg prediction no features \nkriging:', _qc_pass_share(qc_calc_k_nof),
      'ml:', _qc_pass_share(qc_calc_ml_nof),
      'ml + kriging:', _qc_pass_share(qc_calc_mlk_nof),
      'phit_simple_avg', _qc_pass_share(qc_calc_nn_avg_phit))

In [None]:
# Stack the per-well predictions of the three "no features" approaches into one table.
no_feature_frames = [pred_kri_nof_df, pred_ml_nof_df, pred_ml_kri_nof_df]
df_no_features = pd.concat(no_feature_frames, ignore_index=True)
df_no_features

## 1 simple feature

In [26]:
# Indicator column: 1 for wells listed in names_gas_wells_v2, 0 for all others.
ntd8['gas_well'] = ntd8['well'].isin(names_gas_wells_v2).astype('int64')

### ml+kriging

In [None]:
def ml_kriging_feature_1():
    """Regression-kriging of `phit_w_avg` from htst, field dummies and the gas-well flag.

    Reads the module-level `ntd8` frame. SVR, RandomForestRegressor and
    LinearRegression are each wrapped in `RegressionKriging` and scored for
    `n_closest_points` in 2..15 on a 50/50 split (random_state=42). The
    combination with the highest `rk_score` is refitted and its test-set
    predictions get `qc = 1` when they lie within +/-0.0115 of the true value.

    NOTE(review): the best combination is selected on the same test split that
    is then used for the QC share, so that share is likely optimistic.

    Returns
    -------
    tuple
        (qc_calc_ml_k_f, prediction_f_df, ml_f_kriging): normalized `qc` value
        counts, per-well predictions of the best combination, and the score
        table of all (model, n_closest_points) combinations.
    """
    def ml_kriging_prediction_feature_bal8(models, random_state_value, test_size_value):
        """Split `ntd8`, score all combinations, and predict with the best one."""
        # Column 0 carries the well name so it survives the split; models only see column 1+.
        feature_bal8 = ntd8[['well', 'htst', 'field_CENTRAL AZERI', 'field_EAST AZERI',
                             'field_WEST AZERI', 'gas_well']].values
        coord_bal8 = ntd8[['xmean', 'ymean']].values
        target_bal8 = ntd8['phit_w_avg'].values

        f_train, f_test, xy_train, xy_test, target_train, target_test = train_test_split(
            feature_bal8, coord_bal8, target_bal8, test_size=test_size_value, random_state=random_state_value
        )

        def model_mlkrige_run(f_train, f_test, xy_train, xy_test, target_train, target_test, models):
            """Return the regression and regression-kriging test scores per model and nn."""
            df_final_lst = []
            for m in models:
                reg_score_lst, rk_score_lst, nn_lst, m_lst = [], [], [], []
                for nn in range(2, 16):
                    print("n_closest_points:", nn)
                    m_rk = RegressionKriging(regression_model=m, n_closest_points=nn, verbose=False)
                    m_rk.fit(f_train[:, 1:], xy_train, target_train)
                    reg_score_lst.append(m_rk.regression_model.score(f_test[:, 1:], target_test))
                    rk_score_lst.append(m_rk.score(f_test[:, 1:], xy_test, target_test))
                    nn_lst.append(nn)
                    # The estimator object itself is stored so the best one can be refitted later.
                    m_lst.append(m)
                result = pd.DataFrame({'model': m_lst, 'n_closest_points': nn_lst,
                                       'reg_score': reg_score_lst, 'rk_score': rk_score_lst})
                result['random_state'] = random_state_value
                result['test_size'] = test_size_value
                df_final_lst.append(result)
            # Concatenate once, after all models have been scored.
            return pd.concat(df_final_lst)
        ml_kriging = model_mlkrige_run(f_train, f_test, xy_train, xy_test, target_train, target_test, models)

        def model_mlkrige_best_res(f_train, f_test, xy_train, xy_test, target_train, target_test, models_df):
            """Refit the top-scoring combination and return its QC-flagged test predictions."""
            best_row = models_df.sort_values(by='rk_score', ascending=False).iloc[0]
            best_model = best_row['model']
            nn_points = int(best_row['n_closest_points'])
            m_rk = RegressionKriging(regression_model=best_model, n_closest_points=nn_points)
            m_rk.fit(f_train[:, 1:], xy_train, target_train)

            pred = m_rk.predict(f_test[:, 1:], xy_test)
            pred_df = pd.DataFrame({'phit_w_avg_pred': pred, 'phit_w_avg_true': target_test})
            pred_df['model'] = best_model.__class__.__name__
            pred_df['well'] = f_test[:, 0]
            pred_df['n_closest_points'] = nn_points
            pred_df['random_state'] = random_state_value
            pred_df['test_size'] = test_size_value
            pred_df['up_1.15pu'] = pred_df.phit_w_avg_true + 0.0115
            pred_df['down_1.15pu'] = pred_df.phit_w_avg_true - 0.0115
            pred_df['qc'] = np.where((pred_df.phit_w_avg_pred >= pred_df['down_1.15pu'])
                                     & (pred_df.phit_w_avg_pred <= pred_df['up_1.15pu']), 1, 0)
            return pred_df
        prediction_df = model_mlkrige_best_res(f_train, f_test, xy_train, xy_test, target_train, target_test, ml_kriging)

        return prediction_df, ml_kriging

    svr_model = SVR()
    # Seeded so that repeated runs of the notebook give the same forest.
    rf_model = RandomForestRegressor(random_state=42)
    lr_model = LinearRegression()
    models_test = [svr_model, rf_model, lr_model]
    prediction_f_df, ml_f_kriging = ml_kriging_prediction_feature_bal8(models_test, 42, 0.5)
    qc_calc_ml_k_f = prediction_f_df.qc.value_counts(normalize=True).reset_index()
    # Mean of the 0/1 flag equals the share of qc == 1 and is defined even when no row passes.
    pass_share = round(float(prediction_f_df.qc.mean()), 2)
    custom_palette = {0: 'red', 1: 'green'}
    sns.scatterplot(data=prediction_f_df, x='phit_w_avg_true', y='phit_w_avg_pred', s=30, hue='qc',
                    alpha=0.5, ec='black', palette=custom_palette)
    sns.lineplot(x=[0.18, 0.27], y=[0.18, 0.27], color='blue', ls='--')
    plt.title('ml + kriging qc = 1 ' + str(pass_share))
    plt.grid()
    for txt, x_true, y_pred in zip(prediction_f_df.well, prediction_f_df.phit_w_avg_true,
                                   prediction_f_df.phit_w_avg_pred):
        plt.annotate(txt, (x_true, y_pred), fontsize=6)
    return qc_calc_ml_k_f, prediction_f_df, ml_f_kriging

qc_calc_ml_k_f, pred_ml_kri_f_df, ml_kri_f = ml_kriging_feature_1()
pred_ml_kri_f_df['type'] = 'ml_kriging_feature_1'

In [None]:
# Regression-only score against regression-kriging score for every scored combination.
sns.scatterplot(x='rk_score', y='reg_score', hue='model', data=ml_kri_f)

### ml

In [None]:
def ml_feature_1_v1():
    """Predict `phit_w_avg` with an intercept-free linear regression and plot the result.

    Features are htst, the three field dummies and the gas-well flag from the
    module-level `ntd8` frame. Uses a 50/50 split (random_state=42) and flags a
    test prediction `qc = 1` when it lies within +/-0.0115 of the true value.

    Returns
    -------
    tuple
        (qc_calc_ml_f, pred_ml_f_df): normalized `qc` value counts and the
        per-well test predictions.
    """
    ml_model_f = LinearRegression(fit_intercept=False)

    # Column 0 carries the well name through the split; the model only sees column 1+.
    feature_bal8 = ntd8[['well', 'htst', 'field_CENTRAL AZERI', 'field_EAST AZERI',
                         'field_WEST AZERI', 'gas_well']].values
    target_bal8 = ntd8['phit_w_avg'].values

    f_train, f_test, target_train, target_test = train_test_split(
        feature_bal8, target_bal8, test_size=0.5, random_state=42
    )
    ml_model_f.fit(f_train[:, 1:], target_train)
    pred_ml_f = ml_model_f.predict(f_test[:, 1:])

    pred_ml_f_df = pd.DataFrame({'phit_w_avg_pred': pred_ml_f, 'phit_w_avg_true': target_test})
    pred_ml_f_df['well'] = f_test[:, 0]
    pred_ml_f_df['model'] = ml_model_f.__class__.__name__
    pred_ml_f_df['random_state'] = 42
    pred_ml_f_df['test_size'] = 0.5
    pred_ml_f_df['up_1.15pu'] = pred_ml_f_df.phit_w_avg_true + 0.0115
    pred_ml_f_df['down_1.15pu'] = pred_ml_f_df.phit_w_avg_true - 0.0115
    pred_ml_f_df['qc'] = np.where((pred_ml_f_df.phit_w_avg_pred >= pred_ml_f_df['down_1.15pu'])
                                  & (pred_ml_f_df.phit_w_avg_pred <= pred_ml_f_df['up_1.15pu']), 1, 0)

    score_ml = ml_model_f.score(f_test[:, 1:], target_test)
    qc_calc_ml_f = pred_ml_f_df.qc.value_counts(normalize=True).reset_index()
    # Mean of the 0/1 flag equals the share of qc == 1 and is defined even when no row passes.
    pass_share = round(float(pred_ml_f_df.qc.mean()), 2)
    custom_palette = {0: 'red', 1: 'green'}
    sns.scatterplot(data=pred_ml_f_df, x='phit_w_avg_true', y='phit_w_avg_pred', s=30, hue='qc',
                    alpha=0.5, ec='black', palette=custom_palette)
    sns.lineplot(x=[0.18, 0.27], y=[0.18, 0.27], color='blue', ls='--')
    # Built-in round() works whether score() returns a Python float or a numpy scalar.
    plt.title('only ml qc = 1 ' + str(pass_share) + ' / R2: ' + str(round(score_ml, 2)))
    plt.grid()
    for txt, x_true, y_pred in zip(pred_ml_f_df.well, pred_ml_f_df.phit_w_avg_true,
                                   pred_ml_f_df.phit_w_avg_pred):
        plt.annotate(txt, (x_true, y_pred), fontsize=6)

    return qc_calc_ml_f, pred_ml_f_df

qc_calc_ml_f, pred_ml_f_df = ml_feature_1_v1()
pred_ml_f_df['type'] = 'ml_feature_1'

In [None]:
def ml_feature_1_v2():
    """Predict `phit_w_avg` with a random forest and plot the result.

    Features are htst, the three field dummies and the gas-well flag from the
    module-level `ntd8` frame. Uses a 50/50 split (random_state=42) and flags a
    test prediction `qc = 1` when it lies within +/-0.0115 of the true value.

    Returns
    -------
    tuple
        (qc_calc_ml_f_v2, pred_ml_f_v2_df): normalized `qc` value counts and the
        per-well test predictions.
    """
    # Seeded so that repeated runs of the notebook give the same forest.
    ml_model_f_v2 = RandomForestRegressor(random_state=42)

    # Column 0 carries the well name through the split; the model only sees column 1+.
    feature_bal8 = ntd8[['well', 'htst', 'field_CENTRAL AZERI', 'field_EAST AZERI',
                         'field_WEST AZERI', 'gas_well']].values
    target_bal8 = ntd8['phit_w_avg'].values

    f_train, f_test, target_train, target_test = train_test_split(
        feature_bal8, target_bal8, test_size=0.5, random_state=42
    )
    ml_model_f_v2.fit(f_train[:, 1:], target_train)
    pred_ml_f_v2 = ml_model_f_v2.predict(f_test[:, 1:])

    pred_ml_f_v2_df = pd.DataFrame({'phit_w_avg_pred': pred_ml_f_v2, 'phit_w_avg_true': target_test})
    pred_ml_f_v2_df['model'] = ml_model_f_v2.__class__.__name__
    pred_ml_f_v2_df['well'] = f_test[:, 0]
    pred_ml_f_v2_df['random_state'] = 42
    pred_ml_f_v2_df['test_size'] = 0.5
    pred_ml_f_v2_df['up_1.15pu'] = pred_ml_f_v2_df.phit_w_avg_true + 0.0115
    pred_ml_f_v2_df['down_1.15pu'] = pred_ml_f_v2_df.phit_w_avg_true - 0.0115
    pred_ml_f_v2_df['qc'] = np.where((pred_ml_f_v2_df.phit_w_avg_pred >= pred_ml_f_v2_df['down_1.15pu'])
                                     & (pred_ml_f_v2_df.phit_w_avg_pred <= pred_ml_f_v2_df['up_1.15pu']), 1, 0)

    score_ml = ml_model_f_v2.score(f_test[:, 1:], target_test)
    qc_calc_ml_f_v2 = pred_ml_f_v2_df.qc.value_counts(normalize=True).reset_index()
    # Mean of the 0/1 flag equals the share of qc == 1 and is defined even when no row passes.
    pass_share = round(float(pred_ml_f_v2_df.qc.mean()), 2)
    custom_palette = {0: 'red', 1: 'green'}
    sns.scatterplot(data=pred_ml_f_v2_df, x='phit_w_avg_true', y='phit_w_avg_pred', s=30, hue='qc',
                    alpha=0.5, ec='black', palette=custom_palette)
    sns.lineplot(x=[0.18, 0.27], y=[0.18, 0.27], color='blue', ls='--')
    # Built-in round() works whether score() returns a Python float or a numpy scalar.
    plt.title('qc = 1 ' + str(pass_share) + ' / R2: ' + str(round(score_ml, 2)))
    plt.grid()
    for txt, x_true, y_pred in zip(pred_ml_f_v2_df.well, pred_ml_f_v2_df.phit_w_avg_true,
                                   pred_ml_f_v2_df.phit_w_avg_pred):
        plt.annotate(txt, (x_true, y_pred), fontsize=6)

    return qc_calc_ml_f_v2, pred_ml_f_v2_df

qc_calc_ml_f_v2, pred_ml_f_v2_df = ml_feature_1_v2()
pred_ml_f_v2_df['type'] = 'ml_feature_1_v2'

### kriging

In [None]:
def kriging_feature_1():
    """Ordinary-kriging baseline for phit_w_avg that uses well coordinates only.

    Splits ``ntd8`` 50/50 (random_state=42), fits ``Krige`` on the training
    coordinates, predicts the held-out wells and flags a prediction with
    ``qc = 1`` when it lies within +/-0.0115 of the true value. A
    true-vs-predicted scatter plot is drawn on the current matplotlib axes.

    Returns
    -------
    tuple
        ``(qc_calc_k_f, pred_kriging_f_df)``: normalized value counts of the
        ``qc`` flag and the per-well prediction table.
    """
    well_names = ntd8['well'].values
    # Keep coordinates in their own float array: storing the well name in the
    # same array turns it into an object array before it reaches the solver.
    coord_bal8 = ntd8[['xmean', 'ymean']].values.astype(float)
    target_bal8 = ntd8['phit_w_avg'].values

    # The split depends only on the number of rows and random_state, so the
    # wells land in the same train/test halves as in the sibling experiments.
    well_train, well_test, xy_train, xy_test, target_train, target_test = train_test_split(
        well_names, coord_bal8, target_bal8, test_size=0.5, random_state=42
    )

    kriging_f = Krige(n_closest_points=2, method='ordinary', variogram_model='linear')
    kriging_f.fit(x=xy_train, y=target_train)
    pred_kriging_f = kriging_f.predict(xy_test)

    pred_kriging_f_df = pd.DataFrame(zip(pred_kriging_f, target_test), columns=['phit_w_avg_pred', 'phit_w_avg_true'])
    pred_kriging_f_df['well'] = well_test
    pred_kriging_f_df['model'] = kriging_f.__class__.__name__
    pred_kriging_f_df['n_closest_points'] = 2
    pred_kriging_f_df['random_state'] = 42
    pred_kriging_f_df['test_size'] = 0.5
    pred_kriging_f_df['up_1.15pu'] = pred_kriging_f_df.phit_w_avg_true+0.0115
    pred_kriging_f_df['down_1.15pu'] = pred_kriging_f_df.phit_w_avg_true-0.0115
    pred_kriging_f_df['qc'] = np.where((pred_kriging_f_df.phit_w_avg_pred >= pred_kriging_f_df['down_1.15pu']) 
                                    & (pred_kriging_f_df.phit_w_avg_pred <= pred_kriging_f_df['up_1.15pu']), 1, 0)

    qc_calc_k_f = pred_kriging_f_df.qc.value_counts(normalize=True).reset_index()
    # The mean of the 0/1 flag is the share of qc == 1 and is still defined
    # (0.0) when no prediction passes, unlike a lookup of the qc == 1 row.
    qc_share = round(float(pred_kriging_f_df.qc.mean()), 2)
    custom_palette = {0: 'red', 1: 'green'}
    sns.scatterplot(data=pred_kriging_f_df, x='phit_w_avg_true', y='phit_w_avg_pred', s=30, hue='qc', alpha=0.5, ec='black', palette=custom_palette)
    sns.lineplot(x=[0.16,0.27], y=[0.16,0.27], color='blue', ls='--')
    plt.title('only kriging qc = 1 ' + str(qc_share))
    plt.grid()
    # Label every plotted row with its own well name; iterating over the unique
    # names would shift the labels as soon as one well appears twice.
    for well_name, phit_true, phit_pred in zip(pred_kriging_f_df['well'],
                                               pred_kriging_f_df['phit_w_avg_true'],
                                               pred_kriging_f_df['phit_w_avg_pred']):
        plt.annotate(well_name, (phit_true, phit_pred), fontsize=6)
    
    return qc_calc_k_f, pred_kriging_f_df

qc_calc_k_f, pred_kri_f_df = kriging_feature_1()
pred_kri_f_df['type'] = 'kriging_feature_1'

## conclusions 1 smpl feature

In [39]:
# When the 'B14Z', 'B19', 'B13ST2' wells are excluded from the ntd8 dataset, the prediction quality decreases to:
# phit_w_avg prediction
# kriging: 0.6 ml: 0.533 ml + kriging: 0.511 phit_simple_avg 0.725

In [None]:
# Share of hold-out wells predicted within tolerance (qc == 1) for each model
# of the single-simple-feature experiment, rounded to three decimals.
_feature_1_report = []
for _label, _qc_table in [
    ('phit_w_avg prediction 1 smpl feature \nkriging:', qc_calc_k_f),
    ('ml:', qc_calc_ml_f),
    ('ml v2:', qc_calc_ml_f_v2),
    ('ml + kriging:', qc_calc_ml_k_f),
    ('phit_simple_avg', qc_calc_nn_avg_phit),
]:
    _share = _qc_table.loc[_qc_table.qc == 1, 'proportion'].values[0].round(3)
    _feature_1_report.extend([_label, _share])
print(*_feature_1_report)

In [None]:
# Stack the prediction tables of the single-feature experiments into one frame
# with a fresh 0..n-1 row index.
df_feature_1 = pd.concat(
    [pred_kri_f_df, pred_ml_f_df, pred_ml_f_v2_df, pred_ml_kri_f_df],
    ignore_index=True,
)
df_feature_1

In [41]:
# phit_w_avg prediction
# kriging: 0.739 ml: 0.587 ml + kriging: 0.696 phit_simple_avg 0.652

## Feature htst_bins

In [None]:
# Per-well field label and mean coordinates (first non-null value of each well).
fields = df_bal8_v4.groupby('well', as_index=False)['field'].first()
xy = df_bal8_v4.groupby('well', as_index=False)[['xmean', 'ymean']].first()
# One-hot encode the field name so it can be used as a regression feature.
fields_xy = pd.get_dummies(pd.merge(fields, xy, on='well'), columns=['field'])
# Attach the field dummies and coordinates to the offset-well table by well name.
dist_bal8_v5 = (
    dist_bal8_v4.set_index('well')
    .join(fields_xy.set_index('well'))
    .reset_index()
)
# gas_well is 1 for wells listed in names_gas_wells_v2 and 0 otherwise.
dist_bal8_v5['gas_well'] = dist_bal8_v5['well'].isin(names_gas_wells_v2).astype('int64')
dist_bal8_v5.columns

### ml+kriging

In [43]:
# feature_bal8 = dist_bal8_v5[[   
#                                 # 'phit_offset_w1',
#                                 'htst_offset_w1', 
#                                 'htst_<3m_w1', 'htst_3-4m_w1', 'htst_4_7m_w1', 'htst_>7m_w1', 
#                                 # 'phit_offset_w2',
#                                 'htst_offset_w2', 
#                                 'htst_<3m_w2', 'htst_3-4m_w2', 'htst_4_7m_w2', 'htst_>7m_w2',
#                                 'field_CENTRAL AZERI', 'field_EAST AZERI', 'field_WEST AZERI']].values.reshape(-1,13)
# coord_bal8 = dist_bal8_v5[['xmean', 'ymean']].values
# target_bal8 = dist_bal8_v5['htst_target'].values

# f_train, f_test, xy_train, xy_test, target_train, target_test = train_test_split(
#     feature_bal8, coord_bal8, target_bal8, test_size=0.5, random_state=42
# )

# def model_mlkrige_run(f_train, f_test, xy_train, xy_test, target_train, target_test, models):
#     df_final_lst = []
#     for m in models:
#         reg_score_lst, rk_score_lst, nn_lst, m_lst = [], [], [], []
#         for nn in range(2,16):
#             print("n_closest_points:", nn)
#             m_rk = RegressionKriging(regression_model=m, n_closest_points=nn, verbose=False)
#             m_rk.fit(f_train, xy_train, target_train)
#             reg_score_lst.append(m_rk.regression_model.score(f_test, target_test))
#             rk_score_lst.append(m_rk.score(f_test, xy_test, target_test))
#             nn_lst.append(nn)
#             m_lst.append(m)
#         result = pd.DataFrame({'model':m_lst,'n_closest_points':nn_lst, 'reg_score':reg_score_lst,'rk_score':rk_score_lst})
#         result['random_state'] = 42
#         result['test_size'] = 0.5
#         df_final_lst.append(result)
#         df_final = pd.concat(df_final_lst)
#     return df_final
# ml_kriging = model_mlkrige_run(f_train, f_test, xy_train, xy_test, target_train, target_test, models)
# model_sorted = ml_kriging.sort_values(by='rk_score', ascending=False)
# model_sorted

In [None]:
def ml_kriging_hbins():
    """Regression kriging of phit from offset-well porosity and htst-bin features.

    Scores three regressors (SVR, random forest, linear regression) combined
    with kriging for n_closest_points = 2..15 on a 50/50 split of
    ``dist_bal8_v5``, refits the combination with the highest
    regression-kriging score and plots true vs predicted phit. A prediction
    gets ``qc = 1`` when it lies within +/-0.0115 of the true value.

    Returns
    -------
    tuple
        ``(qc_calc_ml_k_bins, prediction_hbins_df, ml_hbins_kriging)``:
        normalized ``qc`` value counts, the per-well predictions of the best
        combination, and the table of all scored combinations.
    """
    feature_cols = [
        'phit_offset_w1',
        'htst_offset_w1', 'htst_<3m_w1', 'htst_3-4m_w1', 'htst_4_7m_w1', 'htst_>7m_w1',
        'phit_offset_w2',
        'htst_offset_w2', 'htst_<3m_w2', 'htst_3-4m_w2', 'htst_4_7m_w2', 'htst_>7m_w2',
        'field_CENTRAL AZERI', 'field_EAST AZERI', 'field_WEST AZERI',
        'gas_well',
    ]

    def ml_kriging_prediction_feature_bal8(models, random_state_value, test_size_value):
        """Score every model / n_closest_points pair and refit the best one."""
        # Well names travel in their own array so the feature matrix stays a
        # plain float matrix instead of an object array with a string column.
        well_bal8 = dist_bal8_v5['well'].values
        feature_bal8 = dist_bal8_v5[feature_cols].values.astype(float)
        coord_bal8 = dist_bal8_v5[['xmean', 'ymean']].values
        target_bal8 = dist_bal8_v5['phit_target'].values

        (well_train, well_test, f_train, f_test,
         xy_train, xy_test, target_train, target_test) = train_test_split(
            well_bal8, feature_bal8, coord_bal8, target_bal8,
            test_size=test_size_value, random_state=random_state_value
        )

        def model_mlkrige_run():
            """Return one row of scores per (model, n_closest_points) pair."""
            df_final_lst = []
            for m in models:
                reg_score_lst, rk_score_lst, nn_lst, m_lst = [], [], [], []
                for nn in range(2,16):
                    print("n_closest_points:", nn)
                    m_rk = RegressionKriging(regression_model=m, n_closest_points=nn, verbose=False)
                    m_rk.fit(f_train, xy_train, target_train)
                    reg_score_lst.append(m_rk.regression_model.score(f_test, target_test))
                    rk_score_lst.append(m_rk.score(f_test, xy_test, target_test))
                    nn_lst.append(nn)
                    m_lst.append(m)
                result = pd.DataFrame({'model':m_lst,'n_closest_points':nn_lst, 'reg_score':reg_score_lst,'rk_score':rk_score_lst})
                result['random_state'] = random_state_value
                result['test_size'] = test_size_value
                df_final_lst.append(result)
            # Concatenate once after the loop instead of rebuilding the frame for
            # every model; pd.concat raises ValueError for an empty model list.
            return pd.concat(df_final_lst)
        ml_kriging = model_mlkrige_run()

        def model_mlkrige_best_res(models_df):
            """Refit the top-scoring combination and tabulate its predictions."""
            model_sorted = models_df.sort_values(by='rk_score', ascending=False)
            model_name = model_sorted.iloc[0]['model']
            # The row is object-typed (it holds the estimator), so cast the
            # neighbour count back to a built-in int before reuse.
            nn_points = int(model_sorted.iloc[0]['n_closest_points'])
            m_rk = RegressionKriging(regression_model=model_name, n_closest_points=nn_points)
            m_rk.fit(f_train, xy_train, target_train)

            pred = m_rk.predict(f_test, xy_test)
            pred_df = pd.DataFrame(zip(pred, target_test), columns=['phit_w_avg_pred', 'phit_w_avg_true'])
            pred_df['well'] = well_test
            pred_df['model'] = model_name.__class__.__name__
            pred_df['n_closest_points'] = nn_points
            pred_df['random_state'] = random_state_value
            pred_df['test_size'] = test_size_value
            pred_df['up_1.15pu'] = pred_df.phit_w_avg_true+0.0115
            pred_df['down_1.15pu'] = pred_df.phit_w_avg_true-0.0115
            pred_df['qc'] = np.where((pred_df.phit_w_avg_pred >= pred_df['down_1.15pu']) & (pred_df.phit_w_avg_pred <= pred_df['up_1.15pu']), 1, 0)
            return pred_df
        prediction_df = model_mlkrige_best_res(ml_kriging)

        return prediction_df, ml_kriging

    svr_model = SVR()
    # Seeded so that the refit of the best combination reproduces the forest
    # that was scored in the grid, and so reruns give the same numbers.
    rf_model = RandomForestRegressor(random_state=42)
    lr_model = LinearRegression()
    models_test = [svr_model, rf_model, lr_model]
    prediction_hbins_df, ml_hbins_kriging = ml_kriging_prediction_feature_bal8(models_test, 42, 0.5)
    qc_calc_ml_k_bins = prediction_hbins_df.qc.value_counts(normalize=True).reset_index()
    # Mean of the 0/1 flag = share of qc == 1; defined even if no well passes.
    qc_share = round(float(prediction_hbins_df.qc.mean()), 2)
    custom_palette = {0: 'red', 1: 'green'}
    sns.scatterplot(data=prediction_hbins_df, x='phit_w_avg_true', y='phit_w_avg_pred', s=30, hue='qc', alpha=0.5, ec='black', palette=custom_palette)
    sns.lineplot(x=[0.16,0.27], y=[0.16,0.27], color='blue', ls='--')
    plt.title('ml + kriging hbins qc = 1 ' + str(qc_share))
    plt.grid()
    for well_name, phit_true, phit_pred in zip(prediction_hbins_df.well,
                                               prediction_hbins_df.phit_w_avg_true,
                                               prediction_hbins_df.phit_w_avg_pred):
        plt.annotate(well_name, (phit_true, phit_pred), fontsize=6)
    
    return qc_calc_ml_k_bins, prediction_hbins_df, ml_hbins_kriging

qc_calc_ml_k_bins, pred_ml_kri_bins_df, ml_kri_bins = ml_kriging_hbins()
pred_ml_kri_bins_df['type'] = 'ml_kriging_hbins'

In [None]:
# Regression-only score vs regression-kriging score for every scored combination.
sns.scatterplot(data=ml_kri_bins, x='rk_score', y='reg_score', hue='model')

### ml

In [None]:
def ml_hbins():
    """Random-forest prediction of phit from offset-well and htst-bin features.

    Fits a ``RandomForestRegressor`` on a 50/50 split of ``dist_bal8_v5``
    (random_state=42), predicts the held-out wells, flags predictions within
    +/-0.0115 of the true value with ``qc = 1`` and draws a true-vs-predicted
    scatter plot on the current matplotlib axes.

    Returns
    -------
    tuple
        ``(qc_calc_ml_hbins, pred_ml_hbins_df)``: normalized ``qc`` value
        counts and the per-well prediction table.
    """
    # Seed the forest as well: the output table records random_state = 42, which
    # is only meaningful when the model itself is reproducible.
    ml_model_hbins = RandomForestRegressor(random_state=42)
    feature_cols = [
        'phit_offset_w1',
        'htst_offset_w1', 'htst_<3m_w1', 'htst_3-4m_w1', 'htst_4_7m_w1', 'htst_>7m_w1',
        'phit_offset_w2',
        'htst_offset_w2', 'htst_<3m_w2', 'htst_3-4m_w2', 'htst_4_7m_w2', 'htst_>7m_w2',
        'field_CENTRAL AZERI', 'field_EAST AZERI', 'field_WEST AZERI',
        'gas_well',
    ]
    # Well names are split alongside the features instead of inside them, so the
    # feature matrix is a float matrix rather than an object array.
    well_bal8 = dist_bal8_v5['well'].values
    feature_bal8 = dist_bal8_v5[feature_cols].values.astype(float)
    target_bal8 = dist_bal8_v5['phit_target'].values

    well_train, well_test, f_train, f_test, target_train, target_test = train_test_split(
        well_bal8, feature_bal8, target_bal8, test_size=0.5, random_state=42
    )
    ml_model_hbins.fit(f_train, target_train)
    pred_ml_hbins = ml_model_hbins.predict(f_test)
    pred_ml_hbins_df = pd.DataFrame(zip(pred_ml_hbins, target_test), columns=['phit_w_avg_pred', 'phit_w_avg_true'])
    pred_ml_hbins_df['well'] = well_test
    pred_ml_hbins_df['model'] = ml_model_hbins.__class__.__name__
    pred_ml_hbins_df['random_state'] = 42
    pred_ml_hbins_df['test_size'] = 0.5
    pred_ml_hbins_df['up_1.15pu'] = pred_ml_hbins_df.phit_w_avg_true+0.0115
    pred_ml_hbins_df['down_1.15pu'] = pred_ml_hbins_df.phit_w_avg_true-0.0115
    pred_ml_hbins_df['qc'] = np.where((pred_ml_hbins_df.phit_w_avg_pred >= pred_ml_hbins_df['down_1.15pu']) 
                                    & (pred_ml_hbins_df.phit_w_avg_pred <= pred_ml_hbins_df['up_1.15pu']), 1, 0)

    # Built-in round() works whether score() returns a NumPy scalar or a plain
    # Python float (a plain float has no .round method).
    score_ml = round(ml_model_hbins.score(f_test, target_test), 2)
    qc_calc_ml_hbins = pred_ml_hbins_df.qc.value_counts(normalize=True).reset_index()
    # Mean of the 0/1 flag = share of qc == 1; defined even if no well passes.
    qc_share = round(float(pred_ml_hbins_df.qc.mean()), 2)
    custom_palette = {0: 'red', 1: 'green'}
    sns.scatterplot(data=pred_ml_hbins_df, x='phit_w_avg_true', y='phit_w_avg_pred', s=30, hue='qc', alpha=0.5, ec='black', palette=custom_palette)
    sns.lineplot(x=[0.16,0.27], y=[0.16,0.27], color='blue', ls='--')
    plt.title('only ml qc = 1 ' + str(qc_share) + ' / R2: ' + str(score_ml))
    plt.grid()
    # Pair each label with its own row values rather than indexing by label.
    for well_name, phit_true, phit_pred in zip(pred_ml_hbins_df.well,
                                               pred_ml_hbins_df.phit_w_avg_true,
                                               pred_ml_hbins_df.phit_w_avg_pred):
        plt.annotate(well_name, (phit_true, phit_pred), fontsize=6)
    
    return qc_calc_ml_hbins, pred_ml_hbins_df

qc_calc_ml_hbins, pred_ml_hbins_df = ml_hbins()
pred_ml_hbins_df['type'] = 'ml_hbins'

### kriging

In [None]:
def kriging_hbins():
    """Ordinary-kriging baseline for ``phit_target`` on the ``dist_bal8_v5`` wells.

    Only the well coordinates are used. The table is split 50/50
    (random_state=42), ``Krige`` is fitted on the training coordinates and the
    held-out wells are predicted. A prediction gets ``qc = 1`` when it lies
    within +/-0.0115 of the true value. A true-vs-predicted scatter plot is
    drawn on the current matplotlib axes.

    Returns
    -------
    tuple
        ``(qc_calc_k_hbins, kriging_hbins_df)``: normalized ``qc`` value counts
        and the per-well prediction table.
    """
    well_bal8 = dist_bal8_v5['well'].values
    # Coordinates stay in a float array of their own; combining them with the
    # well name would hand an object array to the kriging solver.
    coord_bal8 = dist_bal8_v5[['xmean', 'ymean']].values.astype(float)
    target_bal8 = dist_bal8_v5['phit_target'].values

    # The split depends only on row count and random_state, so the wells fall in
    # the same halves as in the feature-based experiments.
    well_train, well_test, xy_train, xy_test, target_train, target_test = train_test_split(
        well_bal8, coord_bal8, target_bal8, test_size=0.5, random_state=42
    )

    # Named kriging_model so the local does not shadow this function's own name.
    kriging_model = Krige(n_closest_points=2, method='ordinary', variogram_model='linear')
    kriging_model.fit(x=xy_train, y=target_train)
    pred_kriging_hbins = kriging_model.predict(xy_test)

    kriging_hbins_df = pd.DataFrame(zip(pred_kriging_hbins, target_test), columns=['phit_w_avg_pred', 'phit_w_avg_true'])
    kriging_hbins_df['well'] = well_test
    kriging_hbins_df['model'] = kriging_model.__class__.__name__
    kriging_hbins_df['n_closest_points'] = 2
    kriging_hbins_df['random_state'] = 42
    kriging_hbins_df['test_size'] = 0.5
    kriging_hbins_df['up_1.15pu'] = kriging_hbins_df.phit_w_avg_true+0.0115
    kriging_hbins_df['down_1.15pu'] = kriging_hbins_df.phit_w_avg_true-0.0115
    kriging_hbins_df['qc'] = np.where((kriging_hbins_df.phit_w_avg_pred >= kriging_hbins_df['down_1.15pu']) 
                                    & (kriging_hbins_df.phit_w_avg_pred <= kriging_hbins_df['up_1.15pu']), 1, 0)

    qc_calc_k_hbins = kriging_hbins_df.qc.value_counts(normalize=True).reset_index()
    # Mean of the 0/1 flag = share of qc == 1; defined even if no well passes.
    qc_share = round(float(kriging_hbins_df.qc.mean()), 2)
    custom_palette = {0: 'red', 1: 'green'}
    sns.scatterplot(data=kriging_hbins_df, x='phit_w_avg_true', y='phit_w_avg_pred', s=30, hue='qc', alpha=0.5, ec='black', palette=custom_palette)
    sns.lineplot(x=[0.16,0.27], y=[0.16,0.27], color='blue', ls='--')
    plt.title('only kriging qc = 1 ' + str(qc_share))
    plt.grid()
    # Label every row with its own well; enumerating the unique names would
    # shift the labels as soon as a well appears twice.
    for well_name, phit_true, phit_pred in zip(kriging_hbins_df['well'],
                                               kriging_hbins_df['phit_w_avg_true'],
                                               kriging_hbins_df['phit_w_avg_pred']):
        plt.annotate(well_name, (phit_true, phit_pred), fontsize=6)
    
    return qc_calc_k_hbins, kriging_hbins_df

qc_calc_k_hbins, pred_kri_hbins_df = kriging_hbins()
pred_kri_hbins_df['type'] = 'kriging_hbins'

## conclusions htst_bins

In [None]:
# Share of hold-out wells predicted within tolerance (qc == 1) per model of the
# htst-bins experiment; each entry carries its own rounding precision.
_hbins_report = []
for _label, _qc_table, _digits in [
    ('phit_w_avg prediction hbins \nkriging:', qc_calc_k_hbins, 2),
    ('ml:', qc_calc_ml_hbins, 2),
    ('ml + kriging:', qc_calc_ml_k_bins, 2),
    ('phit_simple_avg', qc_calc_nn_avg_phit, 3),
]:
    _share = _qc_table.loc[_qc_table.qc == 1, 'proportion'].values[0].round(_digits)
    _hbins_report.extend([_label, _share])
print(*_hbins_report)

In [40]:
# Stack the prediction tables of the htst-bins experiments with a fresh row index.
df_hbins = pd.concat(
    [pred_kri_hbins_df, pred_ml_hbins_df, pred_ml_kri_bins_df],
    ignore_index=True,
)

# Phit prediction final df

In [26]:
# Combine the predictions of all three experiment families into one table.
pred_phit_final = pd.concat([df_no_features, df_feature_1, df_hbins]).reset_index(drop=True)
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs('output', exist_ok=True)
pred_phit_final.to_csv('output/pred_phit_final.csv', index=False)
pred_phit_final.type.unique()

In [None]:
# True vs predicted phit for the models that use no extra features.
set_1 = ['kriging_no_features', 'ml_no_features']
fig, ax = plt.subplots(figsize=(10,7))
custom_markers = {0: 'X', 1: 'o'}

# Filter once and reuse: re-filtering the full table for every annotated point
# made the labelling loop quadratic in the number of rows.
subset_1 = pred_phit_final[pred_phit_final.type.isin(set_1)]
sns.scatterplot(data=subset_1, 
                x='phit_w_avg_true', y='phit_w_avg_pred', s=30, 
                hue='type', style='qc', markers=custom_markers,
                alpha=0.75, ec='black', ax=ax)
sns.lineplot(x=[0.18,0.27], y=[0.18,0.27], color='blue', ls='--', ax=ax)
ax.grid()
ml = qc_calc_ml_nof[qc_calc_ml_nof.qc==1]['proportion'].values[0].round(3)
kriging = qc_calc_k_nof[qc_calc_k_nof.qc==1]['proportion'].values[0].round(3)
ax.set_title(f'phit_w_avg pred_no_features | ml:{ml} | kriging:{kriging}')
for well_name, phit_true, phit_pred in zip(subset_1.well, subset_1.phit_w_avg_true, subset_1.phit_w_avg_pred):
    ax.annotate(well_name, (phit_true, phit_pred), fontsize=6)

In [None]:
# True vs predicted phit for the models built on the single simple feature set.
set_2 = ['kriging_feature_1', 'ml_feature_1']
fig, ax = plt.subplots(figsize=(10,7))
custom_markers = {0: 'X', 1: 'o'}

# Filter once and reuse: re-filtering the full table for every annotated point
# made the labelling loop quadratic in the number of rows.
subset_2 = pred_phit_final[pred_phit_final.type.isin(set_2)]
sns.scatterplot(data=subset_2, 
                x='phit_w_avg_true', y='phit_w_avg_pred', s=30, 
                hue='type', style='qc', markers=custom_markers,
                alpha=0.75, ec='black', ax=ax)
sns.lineplot(x=[0.18,0.27], y=[0.18,0.27], color='blue', ls='--', ax=ax)
ax.grid()
ml = qc_calc_ml_f[qc_calc_ml_f.qc==1]['proportion'].values[0].round(3)
kriging = qc_calc_k_f[qc_calc_k_f.qc==1]['proportion'].values[0].round(3)
ax.set_title(f'phit_w_avg prediction feature_1 | ml:{ml} | kriging:{kriging}')
for well_name, phit_true, phit_pred in zip(subset_2.well, subset_2.phit_w_avg_true, subset_2.phit_w_avg_pred):
    ax.annotate(well_name, (phit_true, phit_pred), fontsize=6)

In [None]:
# True vs predicted phit for the models built on the htst-bins feature set.
set_3 = ['kriging_hbins', 'ml_hbins']
fig, ax = plt.subplots(figsize=(10,7))
custom_markers = {0: 'X', 1: 'o'}

# Filter once and reuse: re-filtering the full table for every annotated point
# made the labelling loop quadratic in the number of rows.
subset_3 = pred_phit_final[pred_phit_final.type.isin(set_3)]
sns.scatterplot(data=subset_3, 
                x='phit_w_avg_true', y='phit_w_avg_pred', s=30, 
                hue='type', style='qc', markers=custom_markers,
                alpha=0.75, ec='black', ax=ax)
sns.lineplot(x=[0.18,0.27], y=[0.18,0.27], color='blue', ls='--', ax=ax)
ax.grid()
ml = qc_calc_ml_hbins[qc_calc_ml_hbins.qc==1]['proportion'].values[0].round(3)
kriging = qc_calc_k_hbins[qc_calc_k_hbins.qc==1]['proportion'].values[0].round(3)
ax.set_title(f'phit_w_avg prediction hbins | ml:{ml} | kriging:{kriging}')
for well_name, phit_true, phit_pred in zip(subset_3.well, subset_3.phit_w_avg_true, subset_3.phit_w_avg_pred):
    ax.annotate(well_name, (phit_true, phit_pred), fontsize=6)

## Detailed analysis

In [None]:
# Build one row per well that places the predictions of the three feature_1
# models side by side, then mark the wells that every model got wrong.
# NOTE: this rebinds set_2, which an earlier plotting cell used for a two-item list.
set_2 = [   
            'ml_kriging_feature_1',
            'kriging_feature_1', 
            'ml_feature_1'
         ]
# Only ml_kri keeps the true value; the other two frames contribute predictions.
ml_kri = pred_phit_final[pred_phit_final.type.isin([set_2[0]])][['well', 'phit_w_avg_true', 'phit_w_avg_pred', 'type','qc']]
ml = pred_phit_final[pred_phit_final.type.isin([set_2[2]])][['well', 'phit_w_avg_pred', 'type','qc']]
kri = pred_phit_final[pred_phit_final.type.isin([set_2[1]])][['well', 'phit_w_avg_pred', 'type','qc']]

# First join: the shared column names clash, so they receive the _ml_kri / _ml suffixes.
# Second join: nothing clashes any more, so the kri columns keep their plain names and
# rsuffix is not applied; the rename on the next line adds the _kri suffix by hand.
# NOTE(review): assumes each well occurs once per model type; duplicates would multiply rows.
test = ml_kri.set_index('well').join(ml.set_index('well'), lsuffix='_ml_kri', rsuffix='_ml').join(kri.set_index('well'), rsuffix='_kri').reset_index()
test = test.rename(columns={'phit_w_avg_pred':'phit_w_avg_pred_kri', 'type':'type_kri', 'qc':'qc_kri'})
# gas_well is 1 for wells listed in names_gas_wells_v2, otherwise 0.
test['gas_well'] = 0
test.loc[test.well.isin(names_gas_wells_v2), 'gas_well'] = 1

# A "no chance" well is one where all three models missed the tolerance band (qc == 0).
no_chance_well = test[(test.qc_ml_kri==0) & (test.qc_ml==0) & (test.qc_kri==0)].well
test['no_chance_well'] = 0
test.loc[test.well.isin(no_chance_well), 'no_chance_well'] = 1
well_bad_list = test[test.no_chance_well==1].well.values
display(test[test.no_chance_well==1])
display(well_bad_list)

In [None]:
# To-do list for this figure (items ending with "+" are done):
# 1. plot curves instead of points +
# 2. show the true values as a curve as well +
# 3. compute the average over all porosity estimates
# 4. add the information for the new well

# Order the wells by true porosity so the curves read left to right.
test = test.sort_values(by='phit_w_avg_true')
fig, ax = plt.subplots(figsize=(20,7))
# (column, legend label, line styling) in drawing order
curve_specs = [
    ('phit_w_avg_pred_ml_kri', 'ml_kri', {'c': '#d121b4'}),
    ('phit_w_avg_pred_ml', 'ml', {'c': 'green'}),
    ('phit_w_avg_pred_kri', 'kri', {'c': 'blue'}),
    ('phit_w_avg_true', 'true', {'alpha': 0.5, 'color': 'red'}),
]
for curve_column, curve_label, curve_style in curve_specs:
    sns.lineplot(data=test, x='well', y=curve_column, label=curve_label, **curve_style)
# Error bars show the +/-0.0115 tolerance band around the true values.
ax.errorbar(test['well'], test['phit_w_avg_true'], yerr=0.0115, fmt='s', color='r', alpha=0.5)
plt.legend()
plt.xticks(rotation=45);  # Rotate x-axis tick labels by 45 degrees
# plt.grid()

In [None]:
# Show the saved log plots of the wells that every model failed to predict.
image_dir = r'C:\jupyter\SPP\plots\logs_vsh_bal8'
all_files = os.listdir(image_dir)
image_files = [f for f in all_files if f.endswith('.png')]

# Keep every PNG whose name contains one of the bad wells. The set drops the
# duplicates that appear when one file name matches several wells (a short well
# name can be a substring of a longer one).
image_files_bad_wells = sorted(
    {image for image in image_files for well in well_bad_list if well in image}
)

# Size the grid from the number of images: a fixed 2x3 grid silently dropped
# everything after the sixth image.
n_cols = 3
n_rows = max(1, -(-len(image_files_bad_wells) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 7.5 * n_rows), squeeze=False)
axes = axes.flatten()
for panel_idx, ax in enumerate(axes):
    if panel_idx < len(image_files_bad_wells):
        img = mpimg.imread(os.path.join(image_dir, image_files_bad_wells[panel_idx]))
        ax.imshow(img)
    # Hide the frame on every panel, including the unused ones.
    ax.axis('off')
fig.tight_layout()
plt.show()

In [None]:
# Map of the averaged phit grid with all wells (+) and the badly predicted wells
# coloured by their ml prediction.
nn_w_phit_map_v2 = pd.read_csv('C:/jupyter/SPP/inputoutput/nn_w_phit_map_v2.csv')
fig, ax = plt.subplots(figsize=(15,7))
# One colour scale for the grid and the overlay, so the colorbar applies to both.
phit_vmin = nn_w_phit_map_v2.phit_avg.min()
phit_vmax = nn_w_phit_map_v2.phit_avg.max()
sc = ax.scatter(nn_w_phit_map_v2.x, nn_w_phit_map_v2.y, s=2, c=nn_w_phit_map_v2.phit_avg,
                cmap='viridis', alpha=0.5, vmin=phit_vmin, vmax=phit_vmax)
fig.colorbar(sc, ax=ax)
ax.scatter(ntd8.xmean, ntd8.ymean, s=50, c='black', marker='+', lw=0.5)

# Match coordinates and predictions by well name. The rows of `test` and `ntd8`
# are ordered differently, so pairing them by position colours the wrong wells.
bad_wells_map = ntd8.loc[ntd8.well.isin(well_bad_list), ['well', 'xmean', 'ymean']].merge(
    test.loc[test.no_chance_well == 1, ['well', 'phit_w_avg_pred_ml']],
    on='well', how='left',
)
ax.scatter(bad_wells_map.xmean, bad_wells_map.ymean, s=50,
           c=bad_wells_map.phit_w_avg_pred_ml, marker='o', cmap='viridis',
           vmin=phit_vmin, vmax=phit_vmax, ec='black', lw=0.5)
ax.set_title('phit_w_avg')
for well_name, x_coord, y_coord in zip(bad_wells_map.well, bad_wells_map.xmean, bad_wells_map.ymean):
    ax.annotate(well_name, (x_coord, y_coord), fontsize=6)

# Synthetic data for kriging

In [None]:
# Sanity check of regression kriging on the California housing data.
svr_model = SVR(C=0.1, gamma="auto")
# Seeded so the printed scores are reproducible between runs.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
lr_model = LinearRegression(copy_X=True, fit_intercept=False)

models = [svr_model, rf_model, lr_model]
housing = fetch_california_housing()

# take the first 5000 as Kriging is memory intensive
# p = every column except the last two; x = the last two columns, which are
# passed to the kriging step as coordinates.
p = housing["data"][:5000, :-2]
x = housing["data"][:5000, -2:]
target = housing["target"][:5000]

p_train, p_test, x_train, x_test, target_train, target_test = train_test_split(
    p, x, target, test_size=0.3, random_state=42
)

for m in models:
    print("=" * 40)
    print("regression model:", m.__class__.__name__)
    m_rk = RegressionKriging(regression_model=m, n_closest_points=10)
    m_rk.fit(p_train, x_train, target_train)
    print("Regression Score: ", m_rk.regression_model.score(p_test, target_test))
    print("RK score: ", m_rk.score(p_test, x_test, target_test))

In [None]:
# Random forest alone on the synthetic (California housing) data.
# Seeded so score_rf is reproducible between runs.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
p = housing["data"][:5000, :-2]
x = housing["data"][:5000, -2:]
target = housing["target"][:5000]

p_train, p_test, x_train, x_test, target_train, target_test = train_test_split(
    p, x, target, test_size=0.3, random_state=42
)

rf_model.fit(p_train, target_train)
pred_rf = rf_model.predict(p_test)
# Built-in round() works whether score() returns a NumPy scalar or a plain
# Python float (a plain float has no .round method).
score_rf = round(rf_model.score(p_test, target_test), 3)
print("RandomForest Score: ", score_rf)

In [None]:
# Ordinary kriging alone on the synthetic (California housing) data.
p = housing["data"][:5000, :-2]
x = housing["data"][:5000, -2:]
target = housing["target"][:5000]

p_train, p_test, x_train, x_test, target_train, target_test = train_test_split(
    p, x, target, test_size=0.3, random_state=42
)

kriging = Krige(method='ordinary', variogram_model='linear')
kriging.fit(x=x_train, y=target_train)
pred_kriging = kriging.predict(x_test)
# Built-in round() works whether score() returns a NumPy scalar or a plain
# Python float (a plain float has no .round method).
score_kriging = round(kriging.score(x_test, target_test), 3)
print("Kriging Score: ", score_kriging)
# print('RFR + kriging:',)

In [None]:
# Random forest + kriging of the residuals on the synthetic data.
# Seeded so score_rf_kriging is reproducible between runs.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
p = housing["data"][:5000, :-2]
x = housing["data"][:5000, -2:]
target = housing["target"][:5000]

p_train, p_test, x_train, x_test, target_train, target_test = train_test_split(
    p, x, target, test_size=0.3, random_state=42
)

m_rk = RegressionKriging(regression_model=rf_model)
m_rk.fit(p_train, x_train, target_train)
# Built-in round() works whether score() returns a NumPy scalar or a plain
# Python float (a plain float has no .round method).
score_rf_kriging = round(m_rk.score(p_test, x_test, target_test), 3)
print("RK score: ", score_rf_kriging)

In [None]:
# One-line comparison of the three scores obtained on the synthetic data.
print(f'synthetic dataset\nkriging: {score_kriging} ml: {score_rf} ml + kriging: {score_rf_kriging}')

# Htst prediction

## ml+kriging

In [None]:
def ml_kriging_prediction_htst_bal8(models, random_state_value, test_size_value):
    """Score regression-kriging models for ``htst_target`` and refit the best one.

    Parameters
    ----------
    models : list
        Regressor instances to combine with kriging; each one is refitted for
        every n_closest_points in 2..15.
    random_state_value : int
        Seed passed to ``train_test_split``.
    test_size_value : float
        Hold-out fraction passed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(prediction_df, ml_kriging)``: hold-out predictions of the combination
        with the highest regression-kriging score (``qc = 1`` when the
        prediction is within +/-4.25 of the true value), and the table of all
        scored combinations.

    Raises
    ------
    ValueError
        From ``pd.concat`` when ``models`` is empty.
    """
    feature_bal8 = dist_bal8_v5[[   
                                    # 'phit_offset_w1',
                                    'htst_offset_w1', 
                                    'htst_<3m_w1', 'htst_3-4m_w1', 'htst_4_7m_w1', 'htst_>7m_w1', 
                                    # 'phit_offset_w2',
                                    'htst_offset_w2', 
                                    'htst_<3m_w2', 'htst_3-4m_w2', 'htst_4_7m_w2', 'htst_>7m_w2',
                                    'field_CENTRAL AZERI', 'field_EAST AZERI', 'field_WEST AZERI']].values.reshape(-1, 13)
    coord_bal8 = dist_bal8_v5[['xmean', 'ymean']].values
    target_bal8 = dist_bal8_v5['htst_target'].values

    f_train, f_test, xy_train, xy_test, target_train, target_test = train_test_split(
        feature_bal8, coord_bal8, target_bal8, test_size=test_size_value, random_state=random_state_value
    )

    def model_mlkrige_run(f_train, f_test, xy_train, xy_test, target_train, target_test, models):
        """Return one row of scores per (model, n_closest_points) pair."""
        df_final_lst = []
        for m in models:
            reg_score_lst, rk_score_lst, nn_lst, m_lst = [], [], [], []
            for nn in range(2,16):
                print("n_closest_points:", nn)
                m_rk = RegressionKriging(regression_model=m, n_closest_points=nn, verbose=False)
                m_rk.fit(f_train, xy_train, target_train)
                reg_score_lst.append(m_rk.regression_model.score(f_test, target_test))
                rk_score_lst.append(m_rk.score(f_test, xy_test, target_test))
                nn_lst.append(nn)
                m_lst.append(m)
            result = pd.DataFrame({'model':m_lst,'n_closest_points':nn_lst, 'reg_score':reg_score_lst,'rk_score':rk_score_lst})
            result['random_state'] = random_state_value
            result['test_size'] = test_size_value
            df_final_lst.append(result)
        # Concatenate once after the loop instead of rebuilding the frame for
        # every model.
        return pd.concat(df_final_lst)
    ml_kriging = model_mlkrige_run(f_train, f_test, xy_train, xy_test, target_train, target_test, models)

    def model_mlkrige_best_res(f_train, f_test, xy_train, xy_test, target_train, target_test, models_df):
        """Refit the top-scoring combination and tabulate its predictions."""
        model_sorted = models_df.sort_values(by='rk_score', ascending=False)
        model_name = model_sorted.iloc[0]['model']
        # The row is object-typed (it holds the estimator), so cast the
        # neighbour count back to a built-in int before reuse.
        nn_points = int(model_sorted.iloc[0]['n_closest_points'])
        m_rk = RegressionKriging(regression_model=model_name, n_closest_points=nn_points)
        m_rk.fit(f_train, xy_train, target_train)

        pred = m_rk.predict(f_test, xy_test)
        pred_df = pd.DataFrame(zip(pred, target_test), columns=['htst_sum_pred', 'htst_sum_true'])
        pred_df['model'] = model_name.__class__.__name__
        pred_df['n_closest_points'] = nn_points
        pred_df['random_state'] = random_state_value
        pred_df['test_size'] = test_size_value
        pred_df['up_4.25m'] = pred_df.htst_sum_true+4.25
        pred_df['down_4.25m'] = pred_df.htst_sum_true-4.25
        pred_df['qc'] = np.where((pred_df.htst_sum_pred >= pred_df['down_4.25m']) & (pred_df.htst_sum_pred <= pred_df['up_4.25m']), 1, 0)
        return pred_df
    prediction_df = model_mlkrige_best_res(f_train, f_test, xy_train, xy_test, target_train, target_test, ml_kriging)

    return prediction_df, ml_kriging

# Run the htst grid for three regressors and plot the best combination.
svr_model = SVR()
# Seeded so the refit of the best combination reproduces the forest that was
# scored in the grid, and so reruns give the same numbers.
rf_model = RandomForestRegressor(random_state=42)
lr_model = LinearRegression()
models_test = [svr_model, rf_model, lr_model]
prediction_htst_df, ml_htst_kriging = ml_kriging_prediction_htst_bal8(models_test, 42, 0.5)
qc_htst_ml_kri2 = prediction_htst_df.qc.value_counts(normalize=True).reset_index()
custom_palette = {0: 'red', 1: 'green'}
sns.scatterplot(data=prediction_htst_df, x='htst_sum_true', y='htst_sum_pred', s=30, hue='qc', alpha=0.5, ec='black', palette=custom_palette)
sns.lineplot(x=[20, 80], y=[20, 80], color='blue', ls='--')
# Mean of the 0/1 flag = share of qc == 1; defined even if no well passes.
plt.title('ml + kriging qc = 1 ' + str(round(float(prediction_htst_df.qc.mean()), 2)));

In [None]:
# The two best combinations, ranked by RK score and then by regression score.
ml_htst_kriging.sort_values(by=['rk_score', 'reg_score'], ascending=False).iloc[:2]

In [None]:
# Regression-only score vs regression-kriging score for every scored combination.
sns.scatterplot(data=ml_htst_kriging, x='rk_score', y='reg_score', hue='model')

## kriging

In [None]:
# Ordinary-kriging baseline for htst_target: only the coordinates are used by the
# model; the feature matrix is split alongside but is not passed to Krige.
feature_bal8 = dist_bal8_v5[[   'phit_offset_w1','htst_offset_w1', 'htst_<3m_w1', 'htst_3-4m_w1', 'htst_4_7m_w1', 'htst_>7m_w1', 
                                'phit_offset_w2','htst_offset_w2', 'htst_<3m_w2', 'htst_3-4m_w2', 'htst_4_7m_w2', 'htst_>7m_w2',
                                'field_CENTRAL AZERI', 'field_EAST AZERI', 'field_WEST AZERI']].values.reshape(-1,15)
coord_bal8 = dist_bal8_v5[['xmean', 'ymean']].values
target_bal8 = dist_bal8_v5['htst_target'].values

f_train, f_test, xy_train, xy_test, target_train, target_test = train_test_split(
    feature_bal8, coord_bal8, target_bal8, test_size=0.5, random_state=42
)
nn_points = 5
kriging = Krige(n_closest_points=nn_points, method='ordinary', variogram_model='linear')
kriging.fit(x=xy_train, y=target_train)
pred_kriging = kriging.predict(xy_test)

kriging_df = pd.DataFrame(zip(pred_kriging, target_test), columns=['htst_sum_pred', 'htst_sum_true'])
kriging_df['model'] = kriging.__class__.__name__
kriging_df['n_closest_points'] = nn_points
kriging_df['random_state'] = 42
kriging_df['test_size'] = 0.5
# qc = 1 when the prediction is within +/-4.25 of the true value.
kriging_df['up_4.25m'] = kriging_df.htst_sum_true+4.25
kriging_df['down_4.25m'] = kriging_df.htst_sum_true-4.25
kriging_df['qc'] = np.where((kriging_df.htst_sum_pred >= kriging_df['down_4.25m']) & (kriging_df.htst_sum_pred <= kriging_df['up_4.25m']), 1, 0)

qc_calc_kriging = kriging_df.qc.value_counts(normalize=True).reset_index()
custom_palette = {0: 'red', 1: 'green'}
# True on x and predicted on y, the same orientation as the ml + kriging plot,
# so the two figures can be compared directly.
sns.scatterplot(data=kriging_df, x='htst_sum_true', y='htst_sum_pred', s=30, hue='qc', alpha=0.5, ec='black', palette=custom_palette)
sns.lineplot(x=[20, 80], y=[20, 80], color='blue', ls='--')
# Mean of the 0/1 flag = share of qc == 1; defined even if no well passes.
plt.title('only kriging qc = 1 ' + str(round(float(kriging_df.qc.mean()), 2)));

# Looking for a best vsh_cutoff

In [None]:
logs8_ntd_v5 = pd.read_csv(r"C:\jupyter\SPP\inputoutput\general_logs\logs8_ntd_v5.csv")
# .copy() makes gr_cube_htst an independent frame, so the column assignments
# below never target a slice of logs8_ntd_v5.
gr_cube_htst = logs8_ntd_v5[['well', 'formation', 'md', 'tst', 'formation_up','vsh','net','vsh_cube']].copy()
# One constant column per candidate cutoff, 0.30 .. 0.59 in steps of 0.01.
# Integer percent steps avoid the floating-point end-point drift of a float range.
for cutoff_pct in range(30, 60):
    cutoff = cutoff_pct / 100
    gr_cube_htst['cutoff_' + str(cutoff)] = cutoff
gr_cube_htst.columns

In [None]:
# For every candidate cutoff, flag a sample as net (1) when its vsh_cube does
# not exceed that cutoff, otherwise 0.
cutoff_cols = ['cutoff_' + str(cutoff_pct / 100) for cutoff_pct in range(30, 60)]
for val in cutoff_cols:
    below_cutoff = gr_cube_htst['vsh_cube'] <= gr_cube_htst[val]
    gr_cube_htst['net_' + val] = np.where(below_cutoff, 1, 0)
gr_cube_htst.columns

In [62]:
# Per-well totals of `net` and of every cutoff-based net flag, scaled by 0.1
# (presumably the sample spacing of the logs — TODO confirm).
net_cols = ['net'] + [f'net_cutoff_{pct / 100}' for pct in range(30, 60)]
gr_cube_htst_sum = (
    gr_cube_htst
    .groupby('well')[net_cols]
    .apply(lambda per_well: per_well.sum() * 0.1)
    .reset_index()
)

In [None]:
# Score every candidate cutoff against the log-derived `net`, at well level.
net_cutoff_cols = [c for c in gr_cube_htst.columns if str(c).startswith('net_cutoff_')]

# Per-well sums (scaled by 0.1) are the same for every cutoff, so compute them once.
# Selecting the value columns explicitly keeps the grouping key out of the aggregation.
per_well_net = (
    gr_cube_htst
    .groupby('well')[['net'] + net_cutoff_cols]
    .sum()
    .mul(0.1)
    .reset_index()
)

records = []
for col in net_cutoff_cols:
    final_look = per_well_net[['well', 'net', col]]
    # Wells where either total is zero are excluded from the comparison.
    final_look = final_look[~((final_look.net == 0) | (final_look[col] == 0))]
    correlation = final_look[col].corr(final_look['net'])
    # Mean absolute error over wells. The previous version passed the per-well error
    # Series to pd.DataFrame(..., index=[0]), which kept only the value at label 0.
    mae = (final_look[col] - final_look['net']).abs().mean()
    records.append({'vsh_cutoff': col, 'correlation': correlation, 'mae': mae})

# Build the summary once from plain records instead of concatenating one-row frames.
df_corr = pd.DataFrame(records, columns=['vsh_cutoff', 'correlation', 'mae'])
df_corr.sort_values(by='mae', ascending=True).head(2)

In [None]:
# One point per candidate cutoff: correlation with `net` against its error metric.
sns.scatterplot(x='correlation', y='mae', data=df_corr)

In [None]:
# Compare per-well net totals from three cutoffs (0.3, 0.52, 0.59) with the log-derived `net`.
shown_cutoffs = ['net_cutoff_0.3', 'net_cutoff_0.52', 'net_cutoff_0.59']
final_look = gr_cube_htst[
    ['well', 'formation', 'md', 'tst', 'formation_up', 'vsh', 'net', 'vsh_cube'] + shown_cutoffs
]
final_look_v2 = (
    final_look
    .groupby('well')[['net'] + shown_cutoffs]
    .apply(lambda per_well: per_well.sum() * 0.1)
    .reset_index()
)
# Same drawing order as before: 0.52, then 0.59, then 0.3.
for cutoff_label in ('0.52', '0.59', '0.3'):
    sns.scatterplot(data=final_look_v2, x='net', y='net_cutoff_' + cutoff_label,
                    label=cutoff_label, alpha=0.5, ec='black')
# 1:1 reference line.
sns.lineplot(x=[0, 90], y=[0, 90], color='r', ls='--')
plt.legend()

# Phit pred based on net_cube

In [66]:
# Per-well net thickness from the logs (`htst_v2`) and from the cube at a 0.52 Vsh cutoff (`htst_cube`).
logs8_ntd_v5 = pd.read_csv(r"C:\jupyter\SPP\inputoutput\general_logs\logs8_ntd_v5.csv")
# .copy() so the new columns are added to an independent frame, not to a slice of logs8_ntd_v5.
htst_cube = logs8_ntd_v5[['well', 'tst', 'net','vsh_cube']].copy()
htst_cube['cutoff'] = 0.52
# 1 where the cube Vsh is at or below the cutoff, else 0 (NaN compares False, so it maps to 0).
htst_cube['net_cube'] = np.where(htst_cube.vsh_cube <= htst_cube.cutoff, 1, 0)
# Sum the flags per well and scale by 0.1 (presumably the sample spacing — TODO confirm).
htst_cube_sum = (
    htst_cube
    .groupby('well')[['net', 'net_cube']]
    .sum()
    .mul(0.1)
    .reset_index()
    .rename(columns={'net':'htst_v2', 'net_cube':'htst_cube'})
)

In [None]:
# Thickness-weighted average porosity per well, joined with well coordinates and cube thickness.
# The helper column is written onto ntd_top_phi_bot8_bp_v4 itself (kept as in the original).
ntd_top_phi_bot8_bp_v4['htst*phit_avg'] = (
    ntd_top_phi_bot8_bp_v4['htst'] * ntd_top_phi_bot8_bp_v4['phit_avg']
)

well_aggs = {'phit_avg': 'mean', 'htst': 'sum', 'htst*phit_avg': 'sum'}
ntd8 = ntd_top_phi_bot8_bp_v4.groupby('well')[list(well_aggs)].agg(well_aggs).reset_index()

# First xmean/ymean per well, rounded to whole units after the merge.
xy = df_bal8_v4_flag.groupby('well')[['xmean', 'ymean']].first().reset_index()
ntd8 = (
    ntd8
    .merge(xy, on='well')
    .round({'xmean': 0, 'ymean': 0})
    .assign(phit_w_avg=lambda d: d['htst*phit_avg'] / d['htst'])
    [['well', 'phit_avg', 'htst', 'phit_w_avg', 'xmean', 'ymean']]
)

# Left join on well: wells missing from htst_cube_sum get NaN in htst_v2 / htst_cube.
ntd8_v2 = ntd8.set_index('well').join(htst_cube_sum.set_index('well')).reset_index()
ntd8_v2

In [68]:
# sns.scatterplot(data=ntd8_v2, x='htst', y='htst_v2', alpha=0.5, ec='black')
# sns.lineplot(x=[0, 90], y=[0, 90], color='r', ls='--')

## ml+kriging

In [None]:
def ml_kriging_prediction_bal8_cube(models, random_state_value, test_size_value,
                                    data=None, nn_range=range(2, 16)):
    """Grid-search regression kriging of `phit_w_avg` from `htst_cube` and predict with the best setup.

    Every regressor in `models` is wrapped in a RegressionKriging and fitted once per
    `n_closest_points` value. The (model, n_closest_points) pair with the highest
    regression-kriging score on the test split is refitted on the train split and used
    to predict the test split.

    NOTE(review): the winning pair is both selected and evaluated on the same test
    split, so the resulting qc share is likely optimistic — consider a validation split.

    Parameters
    ----------
    models : iterable
        Regressors passed as `regression_model` to RegressionKriging.
    random_state_value, test_size_value
        Forwarded to train_test_split and recorded in both returned frames.
    data : pandas.DataFrame, optional
        Frame with columns `htst_cube`, `xmean`, `ymean` and `phit_w_avg`.
        Defaults to the notebook-level `ntd8_v2`.
    nn_range : iterable of int, optional
        Candidate `n_closest_points` values; defaults to 2..15.

    Returns
    -------
    (prediction_df, ml_kriging)
        `prediction_df` has one row per test sample with the predicted and true
        `phit_w_avg`, the run settings, a +/-0.0115 band around the truth
        (`up_1.15pu` / `down_1.15pu`) and `qc` (1 inside the band, else 0).
        `ml_kriging` has one row per (model, n_closest_points) with `reg_score`
        and `rk_score`.

    Raises
    ------
    ValueError
        If `models` or `nn_range` is empty.
    """
    models = list(models)
    nn_values = list(nn_range)
    if not models:
        raise ValueError("models must contain at least one regressor")
    if not nn_values:
        raise ValueError("nn_range must contain at least one value")
    if data is None:
        data = ntd8_v2  # notebook-level frame; default kept for existing callers

    feature_bal8 = data[['htst_cube']].values  # already 2-D: (n_samples, 1)
    coord_bal8 = data[['xmean', 'ymean']].values
    target_bal8 = data['phit_w_avg'].values

    f_train, f_test, xy_train, xy_test, target_train, target_test = train_test_split(
        feature_bal8, coord_bal8, target_bal8, test_size=test_size_value, random_state=random_state_value
    )

    # --- 1. score every (model, n_closest_points) combination ---------------------
    score_frames = []
    for m in models:
        reg_score_lst, rk_score_lst, nn_lst, m_lst = [], [], [], []
        for nn in nn_values:
            print("n_closest_points:", nn)
            m_rk = RegressionKriging(regression_model=m, n_closest_points=nn, verbose=False)
            m_rk.fit(f_train, xy_train, target_train)
            reg_score_lst.append(m_rk.regression_model.score(f_test, target_test))
            rk_score_lst.append(m_rk.score(f_test, xy_test, target_test))
            nn_lst.append(nn)
            m_lst.append(m)
        result = pd.DataFrame({'model': m_lst, 'n_closest_points': nn_lst,
                               'reg_score': reg_score_lst, 'rk_score': rk_score_lst})
        result['random_state'] = random_state_value
        result['test_size'] = test_size_value
        score_frames.append(result)
    # Concatenate once, after the loop (index is left as-is, as before).
    ml_kriging = pd.concat(score_frames)

    # --- 2. refit the best combination and predict the test split ------------------
    best_row = ml_kriging.sort_values(by='rk_score', ascending=False).iloc[0]
    best_model = best_row['model']
    # The row is object-typed because it mixes estimators and numbers; hand the
    # neighbour count on as a plain int.
    nn_points = int(best_row['n_closest_points'])
    m_rk = RegressionKriging(regression_model=best_model, n_closest_points=nn_points)
    m_rk.fit(f_train, xy_train, target_train)

    pred = m_rk.predict(f_test, xy_test)
    pred_df = pd.DataFrame(list(zip(pred, target_test)), columns=['phit_w_avg_pred', 'phit_w_avg_true'])
    # Fill the column element by element: assigning the estimator directly fails for
    # estimators that are iterable (pandas would treat them as a list of values).
    model_col = np.empty(len(pred_df), dtype=object)
    for i in range(len(pred_df)):
        model_col[i] = best_model
    pred_df['model'] = model_col
    pred_df['n_closest_points'] = nn_points
    pred_df['random_state'] = random_state_value
    pred_df['test_size'] = test_size_value

    qc_half_width = 0.0115  # half-width of the acceptance band around the true value
    pred_df['up_1.15pu'] = pred_df.phit_w_avg_true + qc_half_width
    pred_df['down_1.15pu'] = pred_df.phit_w_avg_true - qc_half_width
    inside_band = (
        (pred_df.phit_w_avg_pred >= pred_df['down_1.15pu'])
        & (pred_df.phit_w_avg_pred <= pred_df['up_1.15pu'])
    )
    pred_df['qc'] = np.where(inside_band, 1, 0)

    return pred_df, ml_kriging

# Candidate regressors for the regression-kriging grid search.
svr_model = SVR()
# Seeded so that repeated runs (and the refit of the winning model) give the same forest;
# the split below already uses seed 42.
rf_model = RandomForestRegressor(random_state=42)
lr_model = LinearRegression()

models_test = [svr_model, rf_model, lr_model]
prediction_df, ml_kriging = ml_kriging_prediction_bal8_cube(models_test, 42, 0.5)

# Share of each qc class (kept for inspection).
qc_calc = prediction_df.qc.value_counts(normalize=True).reset_index()
custom_palette = {0: 'red', 1: 'green'}
sns.scatterplot(data=prediction_df, x='phit_w_avg_true', y='phit_w_avg_pred', s=30, hue='qc', alpha=0.5, ec='black', palette=custom_palette)
sns.lineplot(x=[0.16,0.27], y=[0.16,0.27], color='blue', ls='--')
# qc is 0/1, so its mean is the share of predictions inside the band. Unlike the
# value_counts lookup, this still works when no prediction passes qc and does not
# depend on the name pandas gives the value_counts column.
qc_share = round(float(prediction_df.qc.mean()), 2)
plt.title('qc = 1 ' + str(qc_share));

In [None]:
# Two best (model, n_closest_points) combinations by regression-kriging score.
ml_kriging.sort_values('rk_score', ascending=False).iloc[:2]

## ml

In [None]:
# Plain ML baseline (no kriging): predict phit_w_avg from htst_cube only.
# NOTE(review): despite its name, `svr_model` holds a RandomForestRegressor here; the
# variable names are kept because other cells may refer to them.
# Seeded so the forest is reproducible, matching the seeded split below.
svr_model = RandomForestRegressor(random_state=42)

feature_bal8 = ntd8_v2[['htst_cube']].values  # already 2-D: (n_samples, 1)
coord_bal8 = ntd8_v2[['xmean', 'ymean']].values
target_bal8 = ntd8_v2['phit_w_avg'].values

f_train, f_test, xy_train, xy_test, target_train, target_test = train_test_split(
    feature_bal8, coord_bal8, target_bal8, test_size=0.5, random_state=42)
svr_model.fit(f_train, target_train)

pred_svr = svr_model.predict(f_test)
pred_svr_df = pd.DataFrame(list(zip(pred_svr, target_test)), columns=['phit_w_avg_pred', 'phit_w_avg_true'])
pred_svr_df['model'] = svr_model.__class__.__name__
pred_svr_df['random_state'] = 42
pred_svr_df['test_size'] = 0.5
# Acceptance band of +/-0.0115 around the true value; qc = 1 inside the band, else 0.
pred_svr_df['up_1.15pu'] = pred_svr_df.phit_w_avg_true+0.0115
pred_svr_df['down_1.15pu'] = pred_svr_df.phit_w_avg_true-0.0115
pred_svr_df['qc'] = np.where((pred_svr_df.phit_w_avg_pred >= pred_svr_df['down_1.15pu']) & (pred_svr_df.phit_w_avg_pred <= pred_svr_df['up_1.15pu']), 1, 0)

# Share of each qc class (kept for inspection).
qc_calc = pred_svr_df.qc.value_counts(normalize=True).reset_index()
custom_palette = {0: 'red', 1: 'green'}
sns.scatterplot(data=pred_svr_df, x='phit_w_avg_true', y='phit_w_avg_pred', s=30, hue='qc', alpha=0.5, ec='black', palette=custom_palette)
sns.lineplot(x=[0.16,0.27], y=[0.16,0.27], color='blue', ls='--')
# Mean of the 0/1 flag = share of qc == 1; still defined when no prediction passes qc.
qc_share = round(float(pred_svr_df.qc.mean()), 2)
plt.title('qc = 1 ' + str(qc_share));

# KNeighborsRegressor

In [None]:
# K-nearest-neighbours baseline: predict phit_w_avg from htst_cube only.
# NOTE(review): this import would normally live in the notebook's import cell.
from sklearn.neighbors import KNeighborsRegressor

knr_model = KNeighborsRegressor(n_neighbors=2)

feature_bal8 = ntd8_v2[['htst_cube']].values  # already 2-D: (n_samples, 1)
coord_bal8 = ntd8_v2[['xmean', 'ymean']].values
target_bal8 = ntd8_v2['phit_w_avg'].values

f_train, f_test, xy_train, xy_test, target_train, target_test = train_test_split(
    feature_bal8, coord_bal8, target_bal8, test_size=0.5, random_state=42)

# Only the KNN model is fitted here; the earlier refit of an unrelated model
# (a copy-paste leftover) has been removed.
knr_model.fit(f_train, target_train)
pred_knr = knr_model.predict(f_test)

pred_knr_df = pd.DataFrame(list(zip(pred_knr, target_test)), columns=['phit_w_avg_pred', 'phit_w_avg_true'])
# Label the rows with the model that produced them (previously taken from another model).
pred_knr_df['model'] = knr_model.__class__.__name__
pred_knr_df['random_state'] = 42
pred_knr_df['test_size'] = 0.5
# Acceptance band of +/-0.0115 around the true value; qc = 1 inside the band, else 0.
pred_knr_df['up_1.15pu'] = pred_knr_df.phit_w_avg_true+0.0115
pred_knr_df['down_1.15pu'] = pred_knr_df.phit_w_avg_true-0.0115
pred_knr_df['qc'] = np.where((pred_knr_df.phit_w_avg_pred >= pred_knr_df['down_1.15pu']) &
                   (pred_knr_df.phit_w_avg_pred <= pred_knr_df['up_1.15pu']), 1, 0)

# Share of each qc class (kept for inspection).
qc_calc_knr = pred_knr_df.qc.value_counts(normalize=True).reset_index()
custom_palette = {0: 'red', 1: 'green'}
sns.scatterplot(data=pred_knr_df, x='phit_w_avg_true', y='phit_w_avg_pred', s=30, hue='qc', alpha=0.5, ec='black', palette=custom_palette)
sns.lineplot(x=[0.16,0.27], y=[0.16,0.27], color='blue', ls='--')
# Mean of the 0/1 flag = share of qc == 1; still defined when no prediction passes qc.
qc_share_knr = round(float(pred_knr_df.qc.mean()), 2)
plt.title('qc = 1 ' + str(qc_share_knr));