## Start

### Import libs

In [2]:
#Import libs 
import pandas as pd
import numpy as np
import statistics as st
from scipy.ndimage import gaussian_filter
from scipy.interpolate import interp1d
from scipy.stats import gmean
from scipy import stats
from scipy.stats import boxcox
from scipy.special import inv_boxcox
from IPython.display import display, HTML
import math
from catboost import CatBoostRegressor
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.lines as mlines
import plotly.graph_objects as go
import plotly.offline as go_offline
from plotly.subplots import make_subplots
import plotly.express as px
from tqdm import tqdm
import textwrap
import seaborn as sns
from statistics import mean
import geopandas as gpd
from shapely.geometry import Point, Polygon, mapping
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor 
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score as r2 
from sklearn.metrics import mean_absolute_error as mae 
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import make_scorer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.inspection import permutation_importance
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from datetime import datetime
import random
# Notebook-wide pandas display settings.
pd.set_option("display.precision", 3)
# Render every float with five decimals.
pd.set_option('display.float_format', lambda x: '%.5f' % x)
# Print at most 15 columns of a DataFrame.
pd.set_option('display.max_columns', 15)
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any raised by pandas or scikit-learn in the functions below.
warnings.filterwarnings('ignore')

### Upload main data

In [3]:
# Load the well metadata (status, field/platform assignment, wellhead coordinates) and the joint well-log dataset.
def metadata_parquet_loading(path='C:\\jupyter\\SPP\\input\\'):
    """
    Load the well metadata and the joint log dataset and return the
    Balakhany VIII / Balakhany X samples.

    Steps:
      1. Read 'ACG_wells_metadata.csv', normalise the Status spelling,
         patch known wrong wellhead coordinates and field names.
      2. Read 'ACG_wells_JOINT_BEST_v10.parquet.gzip', join the metadata
         by well and drop the wells listed in bad_well_list.
      3. Add FLUIDS_int (1 where FLUIDS > 0, otherwise 0).
      4. Keep the rows whose FORMATION contains 'Balakhany VIII' or
         'Balakhany X', label them in FORMATION_up, rename the trajectory
         coordinates to X_traj / Y_traj and attach the mean coordinates
         per well/formation as X_mean / Y_mean.

    path = folder with both input files, including the trailing separator
           (the file name is appended as-is)

    Returns the resulting dataframe.
    """
    # Misspelled / alternative status labels -> canonical label.
    status_fixes = {
        'oil': 'production oil',
        'oil producer': 'production oil',
        'production': 'production oil',
        'produiction oil': 'production oil',
        'production_oil': 'production oil',
        'abandoned production oil': 'abandoned oil',
        'abandoned  oil': 'abandoned oil',
        'abandoned oi': 'abandoned oil',
        'injector  - water': 'injector - water',
        'injector water': 'injector - water',
        'injetor  - water': 'injector - water',
        'abandoned injector - water per b': 'abandoned injector - water',
        'plugged and abandoned': 'p&a',
    }
    field_renames = {'West Azeri': 'WEST AZERI', 'COP': 'WEST CHIRAG'}
    # Per-well overrides, applied after the field renames.
    field_by_well = {
        'AZERI2': 'WEST AZERI',
        'AZERI3': 'WEST AZERI',
        'B31': 'CENTRAL AZERI',
        'J28_bpQIP': 'WEST CHIRAG',
    }

    metadata = pd.read_csv(path + 'ACG_wells_metadata.csv', sep=',')
    metadata = metadata.rename(columns={'X':'X_wellhead', 'Y':'Y_wellhead'})
    metadata['Status'] = metadata['Status'].str.strip().str.lower().replace(status_fixes)

    # Patch wrong wellhead coordinates (by value first, then explicitly for C39).
    metadata.loc[metadata.X_wellhead==118.270, 'X_wellhead'] = 526258.84
    metadata.loc[metadata.Y_wellhead==526261.510, 'Y_wellhead'] = 4435802.01
    metadata.loc[metadata.well=='C39', 'X_wellhead'] = 526258.840
    metadata.loc[metadata.well=='C39', 'Y_wellhead'] = 4435802.010

    metadata['field'] = metadata['field'].replace(field_renames)
    for well_name, field_name in field_by_well.items():
        metadata.loc[metadata.well==well_name, 'field'] = field_name

    # Read data from parquet
    df_prq = pd.read_parquet(path + 'ACG_wells_JOINT_BEST_v10.parquet.gzip')
    df_prq = df_prq.rename(columns={'wellName':'well'})
    df_prq = df_prq.set_index('well').join(metadata.set_index('well')).reset_index()
    # Filter data with bad_well_list
    bad_well_list = ['E10Z','Predrill_J01Z', 'Predrill_J08', 'J28_bpQIP', 'A01W_2']
    df_prq = df_prq[~df_prq.well.isin(bad_well_list)].copy()
    # FLUIDS_int = 1 where FLUIDS > 0, otherwise 0. A missing FLUIDS value
    # counts as 0 (the previous two-step assignment left NaN behind and the
    # integer cast then failed).
    df_prq['FLUIDS_int'] = (df_prq.FLUIDS > 0).astype('int')

    # Unite the sub-units of each formation under one FORMATION_up label.
    # na=False: rows without a FORMATION are simply not Balakhany rows.
    # .copy(): the new column is written to an independent frame, not a slice.
    df_bal = df_prq[df_prq.FORMATION.str.contains('Balakhany', na=False)].copy()
    df_bal.loc[df_bal.FORMATION.str.contains('Balakhany VIII'), 'FORMATION_up'] = 'Balakhany VIII'
    df_bal.loc[df_bal.FORMATION.str.contains('Balakhany X'), 'FORMATION_up'] = 'Balakhany X'
    df_bal = df_bal[df_bal.FORMATION_up.notna()]

    # Mean XY coordinates per well and formation
    xy_coord_mean = (df_bal[['well', 'FORMATION_up', 'X', 'Y']]
                     .groupby(['well', 'FORMATION_up'])
                     .agg({'X': 'mean', 'Y':'mean'})
                     .reset_index()
                     .rename(columns={'X':'X_mean', 'Y':'Y_mean'}))
    xy_coord_mean = xy_coord_mean[xy_coord_mean.FORMATION_up.str.contains('Balakhany') & (xy_coord_mean.X_mean>0) & (xy_coord_mean.Y_mean>0)]

    df_bal = df_bal.rename(columns={'X':'X_traj', 'Y':'Y_traj'})
    df_bal = df_bal.set_index(['well', 'FORMATION_up']).join(xy_coord_mean.set_index(['well', 'FORMATION_up'])).reset_index()
    return df_bal
df_bal = metadata_parquet_loading()

### List of functions

In [4]:
# Display a well-log plot (logging curves per track) against MD or TST
def well_display_khtst( dataset, wellname, fmname, net_var, comments, 
                        ref_depth, fm_flag, depth_step, kh_include, print):
    """
    Draw a four-track log plot for one well and formation.

    Tracks, left to right: GR_N with VSH overlay, RHOB with NPSS overlay,
    PHIT with the net flag overlay, LPERM with the KHtst overlay.
    Formation tops are drawn as horizontal lines on tracks 1, 2 and 4.

    dataset = df_bal or something else with the same log columns
    wellname = well to plot
    fmname = formation name to select (also used to pick the tops)
    net_var = NET or FLUIDS_int
    comments = free text appended to the title
    ref_depth = MD or TST
    fm_flag = 1 to select by FORMATION_up, 0 to select by FORMATION
    depth_step = step for ticks on the diagram
    kh_include = 1 if KHtst is in dataset, anything else plots a zero curve
    print = 1 to save the plot as a PNG (NOTE: shadows the builtin print
            inside this function; kept because callers pass it by name)

    Raises ValueError if fm_flag is not 0/1 or the selection is empty.
    """
    if fm_flag == 0:
        fm_column = 'FORMATION'
    elif fm_flag == 1:
        fm_column = 'FORMATION_up'
    else:
        raise ValueError('fm_flag must be 0 (FORMATION) or 1 (FORMATION_up), got ' + repr(fm_flag))
    data = dataset[(dataset.well==wellname) & (dataset[fm_column] == fmname)]
    if data.empty:
        raise ValueError('no samples for well ' + repr(wellname) + ' and formation ' + repr(fmname))
    depth = data[ref_depth]
    grn = data['GR_N']
    vsh = data['VSH']
    rhob = data['RHOB'] 
    npss = data['NPSS']
    phit = data['PHIT'] 
    net = data[net_var]
    perm = data['LPERM']
    if kh_include == 1:
        kh = data['KHtst']
    else:
        # Zero placeholder curve; built separately so the selected slice is not modified.
        kh = pd.Series(0, index=data.index)
    kh_valid = kh.dropna()
    kh_max = max(kh_valid) if len(kh_valid) > 0 else float('nan')

    # NOTE(review): the tops come from the module-level df_bal, not from `dataset`.
    well_bal_tops = df_bal[(df_bal.well == wellname)].groupby('FORMATION')[ref_depth].apply(lambda x: x.iloc[0]).reset_index()
    fm_tops = well_bal_tops[well_bal_tops.FORMATION.str.contains(fmname)]
    # (sub-unit name, first depth sample of that sub-unit)
    tops = list(zip(fm_tops.FORMATION, fm_tops[ref_depth]))

    fig, ax = plt.subplots(1,4, figsize=(7,7), sharey=True)
    # NOTE(review): the y axis is shared between the tracks, so the final
    # orientation depends on the order of the invert_yaxis() / set_ylim()
    # calls below - keep that order unchanged.

    # Track 1: GR_N + VSH
    ax[0].yaxis.set_ticks(np.arange(min(depth), max(depth), depth_step))
    ax[0].plot(grn, depth, color='lightgreen', lw=3, zorder=10)
    ax[0].invert_yaxis() 
    ax[0].set_xlim(0, 150) 
    ax[0].grid(axis='y')
    for _, top_depth in tops:
        ax[0].hlines(top_depth, xmin=0, xmax=1000, linewidth=2, color='black', alpha=0.33)
    twin0 = ax[0].twiny()
    twin0.plot(vsh, depth, color='black', alpha=0.5, zorder=5)
    twin0.set_xlim(0, 1.5)

    # Track 2: RHOB + NPSS (reversed scale), with the top labels
    ax[1].plot(rhob, depth, color='red') 
    ax[1].invert_yaxis() 
    ax[1].xaxis.set_ticks(np.arange(1.65, 2.65, 0.3))
    ax[1].set_xlim(1.65, 2.65)
    ax[1].grid(axis='y')
    ax[1].grid(axis='x')
    for top_name, top_depth in tops:
        ax[1].hlines(top_depth, xmin=0, xmax=150, linewidth=2, color='black', alpha=0.33)
        ax[1].text(1.67, top_depth+0.5*depth_step, top_name, fontsize = 7, color ="black")
    twin1 = ax[1].twiny()
    twin1.plot(npss, depth, color='blue')
    twin1.set_xlim(0.6, 0)

    # Track 3: PHIT (reversed scale, dashed line at 0.13) + net flag.
    # (A log-scaled RDEEP track used to be drawn here instead.)
    ax[2].plot(phit, depth, color='green', linestyle='dashed')
    ax[2].set_xlim(0.3, 0)
    ax[2].grid(axis='x') 
    ax[2].invert_yaxis()
    ax[2].grid(axis='y')
    ax[2].vlines(0.13, ymin=min(depth), ymax=max(depth), color='black', linestyle='dashed')
    twin2 = ax[2].twiny()
    twin2.plot(net, depth, color='orange', linewidth=0.5)
    twin2.fill_betweenx(depth,net, color='orange', alpha=0.33)
    twin2.set_xlim(0, 1)
    twin2.set_ylim(min(depth), max(depth))

    # Track 4: LPERM (log scale) + KHtst
    ax[3].plot(perm, depth, color='purple', alpha=0.66)
    ax[3].set_xscale('log')
    ax[3].set_xlim(0.1, 1000)
    ax[3].invert_yaxis()
    ax[3].grid(axis='y')
    for _, top_depth in tops:
        ax[3].hlines(top_depth, xmin=0, xmax=1000, linewidth=2, color='black', alpha=0.66)
    twin4 = ax[3].twiny()
    twin4.plot(kh, depth, color='black', alpha=1)

    fig.suptitle(wellname + ' ' + fmname + ' ' + ref_depth + ' ' + str(round(kh_max,0)) + ' ' + str(comments), fontsize=14)
    fig.tight_layout()
    if print == 1:
        path = 'C:\\jupyter\\SPP\\inputoutput\\wellplots\\'
        fig.savefig(path + fmname.replace(' ','') + '_' + wellname + '.png')
# Draw a map
def map_value_2plots(metadata, dataset, formation, value, color, multi_chr = 0.001, multi_azr = 0.001):
    """
    Show a two-panel bubble map of the wells of one formation: the Chirag
    group of fields in the upper panel, the Azeri group in the lower one.

    metadata = well metadata with field, X_wellhead, Y_wellhead
    dataset = dataset with X & Y, well, field, FORMATION_up
    formation = for example 'Balakhany VIII'
    value = column that drives the marker size, e.g. 'KHtst' or 'tst_interv'
    color = column that drives the marker colour
    multi_chr, multi_azr = marker-size multipliers for the two panels

    Each panel also shows the mean wellhead position of every field of the
    group as a labelled black square. Returns the result of fig.show().
    """
    azeri_fields = ['CENTRAL AZERI', 'WEST AZERI', 'EAST AZERI']
    chirag_fields = ['CHIRAG', 'DWG', 'DDGG', 'WEST CHIRAG']
    well_hover = "well:%{customdata[0]}, value:%{customdata[1]}, color:%{customdata[2]}<extra></extra>"
    field_hover = "%{customdata[0]}<extra></extra>"

    figure = make_subplots(rows=2, cols=1,
                           subplot_titles=(f'crg: {multi_chr!s}', f'azr: {multi_azr!s}'),
                           vertical_spacing=0.025)
    field_centres = metadata.groupby('field')[['X_wellhead','Y_wellhead']].mean().reset_index()
    in_formation = dataset.FORMATION_up == formation

    # (subplot row, fields of the group, size multiplier, show the colour bar)
    panels = [
        (1, chirag_fields, multi_chr, True),
        (2, azeri_fields, multi_azr, False),
    ]
    for panel_row, group_fields, size_factor, with_colorbar in panels:
        wells = dataset[in_formation & (dataset.field.isin(group_fields))]
        centres = field_centres[field_centres.field.isin(group_fields)]
        # Wells: size by `value`, colour by `color`.
        well_marker = dict(color=wells[color], size=wells[value]*size_factor,
                           colorscale='Viridis_r', showscale=with_colorbar,
                           line=dict(color='rgb(47, 57, 61)', width=0.5))
        figure.add_trace(
            go.Scatter(x=wells.X, y=wells.Y, customdata=wells[['well', value, color]],
                       marker=well_marker, mode='markers', hovertemplate=well_hover),
            row=panel_row, col=1)
        # Field centres: labelled black squares.
        figure.add_trace(
            go.Scatter(x=centres.X_wellhead, y=centres.Y_wellhead, customdata=centres[['field']],
                       text=centres['field'], textposition="middle right",
                       marker=dict(color='rgb(0, 0,0)', size=12),
                       mode='markers+text', marker_symbol='square',
                       hovertemplate=field_hover),
            row=panel_row, col=1)

    figure.update_layout(title_text=f'formation: {formation!s} value: {value!s} color: {color!s}',
                         autosize=True, width=1300, height=1400,
                         margin=dict(l=10,r=10,b=10,t=50), showlegend=False)
    return figure.show()
# Calculation NTD
def ntd_calculation_big(dataset, desired_fm, net_var='NET'):
    """
    Build the table of net layers (continuous runs of net_var == 1) for
    every well in dataset, with the average properties of each layer.

    dataset = samples with well, TST, LPERM, PHIT, VSH and the net_var
              column; it is NOT filtered by formation here
    desired_fm = label written to the FORMATION_up column of the result
    net_var = name of the 0/1 net flag column

    For each well the first and last samples are treated as non-net so
    every layer has both a top and a bottom. A layer whose top and bottom,
    rounded to 0.1, coincide or are 0.1 apart is skipped.

    Returns a dataframe with well, h_tst, top_tst, bot_tst, md_perm_avg,
    md_phit_avg, md_vsh_avg, FORMATION_up. Wells without any reported layer
    contribute no rows.
    """
    out_cols = ['well', 'h_tst', 'top_tst', 'bot_tst', 'md_perm_avg', 'md_phit_avg', 'md_vsh_avg']
    df_lst = []
    for well_in_loop in tqdm(dataset.well.unique()):
        # Independent copy: the edge samples are overwritten below.
        data = dataset[(dataset.well==well_in_loop)].copy()
        if data.empty:
            continue
        # Close the layers at both ends of the well. The flag column is
        # addressed by name (net_var), not by a fixed column position.
        net_pos = data.columns.get_loc(net_var)
        data.iloc[0, net_pos] = 0
        data.iloc[-1, net_pos] = 0

        net = data[net_var].to_numpy()
        tst = data['TST'].to_numpy()
        tst_rounded = tst.round(1)
        perm = data['LPERM'].to_numpy()
        phit = data['PHIT'].to_numpy()
        vsh = data['VSH'].to_numpy()

        is_net = net == 1
        is_gap = net == 0
        # Top: net sample preceded by a non-net one; bottom: net sample followed by a non-net one.
        tst_top = tst[1:][is_net[1:] & is_gap[:-1]]
        tst_bot = tst[:-1][is_net[:-1] & is_gap[1:]]

        well_lst = []
        for top, bot in zip(tst_top, tst_bot):
            top_r = round(top, 1)
            bot_r = round(bot, 1)
            if bot_r == top_r or bot_r == round(top+0.1, 1):
                # One- or two-sample layer: not reported.
                continue
            h_tst = round((bot - top), 1)
            inside = (tst_rounded >= top_r) & (tst_rounded <= bot_r)
            # Fall back to a single 0 so mean() never sees an empty list.
            md_perm = list(perm[inside]) or [0]
            md_phit = list(phit[inside]) or [0]
            md_vsh = list(vsh[inside]) or [0]
            well_lst.append([well_in_loop, h_tst, top, bot,
                             round(mean(md_perm),0), round(mean(md_phit),2), round(mean(md_vsh),2)])
        # One table per well, built after the layer loop. It used to be built
        # inside the loop, so a well without layers re-appended the previous
        # well's table.
        if well_lst:
            df_lst.append(pd.DataFrame(well_lst, columns=out_cols))
    if df_lst:
        ntd_bal = pd.concat(df_lst)
    else:
        ntd_bal = pd.DataFrame(columns=out_cols)
    ntd_bal['FORMATION_up'] = desired_fm
    return ntd_bal
def ntd_calculation_brief(dataset,well,desired_fm, net_var='NET'):
    """
    Return the tops, bottoms and thicknesses (in TST) of the net layers
    (continuous runs of net_var == 1) of one well and formation.

    dataset = samples with well, FORMATION_up, TST and the net_var column
    well, desired_fm = well name and FORMATION_up value to select
    net_var = name of the 0/1 net flag column

    The first and last selected samples are treated as non-net so every
    layer has both a top and a bottom. The input dataframe is not modified.

    Returns a dataframe with well, FORMATION_up, tst_top, tst_bot, h_tst
    (empty if the selection has no samples or no net layer).
    """
    out_cols = ['well','FORMATION_up','tst_top','tst_bot','h_tst']
    data = dataset[(dataset.well==well) & (dataset.FORMATION_up==desired_fm)].copy()
    if data.empty:
        return pd.DataFrame(columns=out_cols)
    # Address the flag column by name (net_var), not by a fixed position.
    net_pos = data.columns.get_loc(net_var)
    data.iloc[0, net_pos] = 0
    data.iloc[-1, net_pos] = 0

    net = data[net_var].to_numpy()
    tst = data['TST'].to_numpy()
    is_net = net == 1
    is_gap = net == 0
    # Top: net sample preceded by a non-net one; bottom: net sample followed by a non-net one.
    tst_top = tst[1:][is_net[1:] & is_gap[:-1]]
    tst_bot = tst[:-1][is_net[:-1] & is_gap[1:]]

    df_htst = pd.DataFrame(list(zip(tst_top, tst_bot)), columns=['tst_top', 'tst_bot'])
    df_htst['FORMATION_up'] = desired_fm
    df_htst['well'] = well
    df_htst['h_tst'] = df_htst.tst_bot - df_htst.tst_top
    return df_htst[out_cols]
# Calculation NTD zero
def ntd_calculation_zero(dataset,well,formation, net_var='NET'):
    """
    Return the tops, bottoms and thicknesses (in TST) of the non-net gaps
    (continuous runs of net_var == 0) of one well and formation.

    dataset = samples with well, FORMATION_up, TST and the net_var column
    well, formation = well name and FORMATION_up value to select
    net_var = name of the 0/1 net flag column

    The first and last selected samples are treated as net, so only gaps
    enclosed by net samples are reported. Tops and bottoms are rounded to
    3 decimals. The input dataframe is not modified.

    Returns a dataframe with well, FORMATION_up, tst_zero_top,
    tst_zero_bot, h_tst_zero (empty if the selection has no samples or no
    gap).
    """
    out_cols = ['well','FORMATION_up','tst_zero_top','tst_zero_bot','h_tst_zero']
    data = dataset[(dataset.well==well) & (dataset.FORMATION_up==formation)].copy()
    if data.empty:
        return pd.DataFrame(columns=out_cols)
    # Address the flag column by name (net_var), not by a fixed position.
    net_pos = data.columns.get_loc(net_var)
    data.iloc[0, net_pos] = 1
    data.iloc[-1, net_pos] = 1

    net = data[net_var].to_numpy()
    tst = data['TST'].to_numpy()
    is_net = net == 1
    is_gap = net == 0
    # Gap top: non-net sample preceded by a net one; gap bottom: non-net sample followed by a net one.
    tst_zero_top = tst[1:][is_gap[1:] & is_net[:-1]].round(3)
    tst_zero_bot = tst[:-1][is_gap[:-1] & is_net[1:]].round(3)

    df_zero_htst = pd.DataFrame(list(zip(tst_zero_top, tst_zero_bot)), columns=['tst_zero_top', 'tst_zero_bot'])
    df_zero_htst['FORMATION_up'] = formation
    df_zero_htst['well'] = well
    df_zero_htst['h_tst_zero'] = df_zero_htst.tst_zero_bot - df_zero_htst.tst_zero_top
    return df_zero_htst[out_cols]
# Print numerical table with layers
def ntd_numerical(dataset, wellname, fmname):
    """
    Return the layer table of one well and formation with a q50 column.

    dataset = ntd_final (needs well, FORMATION_up, h_tst, top_tst, bot_tst)
    q50 = median of h_tst of the selected layers, taken as the nearest
          existing value; the same number is repeated on every row
    """
    wanted_cols = ['well','h_tst','top_tst', 'bot_tst','FORMATION_up']
    same_well = dataset.well == wellname
    same_fm = dataset.FORMATION_up == fmname
    layers = dataset.loc[same_well & same_fm, wanted_cols]
    median_thickness = layers['h_tst'].quantile(q=0.5, interpolation='nearest')
    return layers.assign(q50=median_thickness)
#Cleaning NET variable and making up NET_clp with clipped data
def ntd_htst_cleaning(dataset, cutoff):
    """
    Build the clipped net flag NET_clp: 1 for the samples that fall inside
    a layer at least `cutoff` thick, 0 everywhere else.

    dataset - layer table with well, h_tst and the layer top/bottom in its
              3rd and 4th columns (read by position)
    cutoff - value in TST; layers with h_tst below it are ignored

    The samples are taken from the module-level df_bal (all samples of the
    well, not filtered by formation). Sample TST and layer limits are
    compared after rounding to 3 decimals, limits inclusive.

    Returns the df_bal samples of the listed wells (well, FORMATION_up, MD,
    TST, GR_N, NET, FORMATION) with the NET_clp column added.
    """
    # Loop-invariant: the same column subset serves every well.
    well_short = df_bal[['well', 'FORMATION_up', 'MD', 'TST', 'GR_N', 'NET', 'FORMATION']]
    df_list_ntd = []
    for well in tqdm(dataset.well.unique()):
        ntd_well = dataset[(dataset.well ==well)]
        ntd_well_cutoff = ntd_well[ntd_well.h_tst >= cutoff]
        layer_tops = ntd_well_cutoff.iloc[:, 2].to_numpy()
        layer_bots = ntd_well_cutoff.iloc[:, 3].to_numpy()

        net_well = well_short[(well_short.well==well)].copy()
        well_tst = net_well['TST'].to_numpy().round(3)
        inside_layer = np.zeros(len(net_well), dtype=bool)
        for ntd_top, ntd_bot in zip(layer_tops, layer_bots):
            inside_layer |= (well_tst >= np.round(ntd_top, 3)) & (well_tst <= np.round(ntd_bot, 3))
        # Whole-column assignment on an independent copy. The former
        # net_well['NET_clp'].iloc[i] = 1 was chained assignment, which does
        # not update the frame under pandas Copy-on-Write.
        net_well['NET_clp'] = inside_layer.astype('int64')
        df_list_ntd.append(net_well)
    net_clp = pd.concat(df_list_ntd)
    return net_clp
# Fill the zero (non-net) gaps of the NET_clp flag whose thickness is <= cutoff
def ntd_htst_zero_cleaning(dataset_zero, dataset, cutoff, net_var1, net_var2):
    """
    Build net_var2 as a copy of net_var1 in which every sample inside a
    thin non-net gap is set to 1.

    dataset_zero - gap table with well, h_tst_zero and the gap top/bottom
                   in its 3rd and 4th columns (read by position)
    dataset - samples with well, FORMATION_up, MD, TST, GR_N, NET,
              FORMATION and the net_var1 column
    cutoff - value in TST; only gaps with h_tst_zero <= cutoff are filled
    net_var1 - name of the existing net flag column
    net_var2 - name of the column that receives the filled flag

    Sample TST and gap limits are compared after rounding to 3 decimals,
    limits inclusive. Only the wells listed in dataset_zero are returned.
    """
    # Loop-invariant; .copy() so the new column is added to an independent frame.
    well_zero_short = dataset[['well','FORMATION_up','MD','TST', net_var1, 'GR_N', 'NET', 'FORMATION']].copy()
    well_zero_short[net_var2] = well_zero_short[net_var1]
    df_list_ntd_zero = []
    for well in tqdm(dataset_zero.well.unique()):
        ntd_well_zero = dataset_zero[(dataset_zero.well ==well)]
        ntd_well_zero_sel = ntd_well_zero[ntd_well_zero.h_tst_zero <= cutoff]
        gap_tops = ntd_well_zero_sel.iloc[:, 2].to_numpy()
        gap_bots = ntd_well_zero_sel.iloc[:, 3].to_numpy()

        well_zero_sel = well_zero_short[(well_zero_short.well==well)].copy()
        well_zero_tst = well_zero_sel['TST'].to_numpy().round(3)
        inside_gap = np.zeros(len(well_zero_sel), dtype=bool)
        for ntd_zero_top, ntd_zero_bot in zip(gap_tops, gap_bots):
            inside_gap |= (well_zero_tst >= np.round(ntd_zero_top, 3)) & (well_zero_tst <= np.round(ntd_zero_bot, 3))
        # Single .loc write on the copy. The former
        # well_zero_sel[net_var2].iloc[i] = 1 was chained assignment, which
        # does not update the frame under pandas Copy-on-Write.
        well_zero_sel.loc[inside_gap, net_var2] = 1
        df_list_ntd_zero.append(well_zero_sel)
    result = pd.concat(df_list_ntd_zero)
    return result
# View desired TST-interval
def net_view1(dataset, well, top, bot):
    """
    Return up to the first 50 samples of `well` whose TST lies between
    top and bot (both inclusive), limited to the columns
    well, TST, GR_N, RHOB, NET, NET_clp.
    """
    shown_cols = ['well','TST','GR_N', 'RHOB', 'NET','NET_clp']
    well_rows = dataset.loc[dataset.well == well, shown_cols]
    in_window = well_rows.TST.between(top, bot)
    return well_rows[in_window].head(50)
#TST sampling & TST KH curve calculation per formation/well
def proph_calculation(dataset, net_var):
    """
    Compute the TST sampling step and the cumulative KHtst, PHITHtst and
    VSHHtst curves per well for 'Balakhany VIII' and 'Balakhany X'.

    dataset = samples with well, FORMATION_up, MD, TST, LPERM, PHIT, VSH
              and the net_var column
    net_var = name of the net flag column; where it equals 0 the LPERM,
              PHIT and VSH values are replaced by 0 before accumulation

    TST_smpl is the difference between consecutive TST samples of a well.
    Each curve is the running sum of property * TST_smpl taken with the
    samples ordered by decreasing MD; the rows are returned in increasing MD.

    Returns well, FORMATION_up, MD, TST, TST_smpl, KHtst, PHITHtst, VSHHtst.
    """
    # (log curve, per-sample product column, cumulative column)
    curve_columns = (
        ('LPERM', 'khtst', 'KHtst'),
        ('PHIT', 'phithtst', 'PHITHtst'),
        ('VSH', 'vshhtst', 'VSHHtst'),
    )

    print('TST sampling calculation')
    sampled_wells = []
    for well_name in tqdm(dataset.well.unique()):
        one_well = dataset[dataset.well == well_name]
        # The joined diff column collides with TST, hence the name TST_smpl.
        sampled_wells.append(one_well.join(one_well['TST'].diff(), rsuffix='_smpl'))
    sampled = pd.concat(sampled_wells)

    print('KHtst calculation')
    formation_frames = []
    for formation in ('Balakhany VIII', 'Balakhany X'):
        well_frames = []
        for well_name in tqdm(dataset.well.unique()):
            selected = (sampled.well == well_name) & (sampled.FORMATION_up == formation)
            bottom_up = sampled[selected].sort_values(by='MD', ascending=False)
            for curve, _, _ in curve_columns:
                bottom_up.loc[bottom_up[net_var] == 0, curve] = 0
            for curve, product_col, _ in curve_columns:
                bottom_up[product_col] = bottom_up[curve] * bottom_up.TST_smpl
            for _, product_col, cumulative_col in curve_columns:
                bottom_up[cumulative_col] = bottom_up[product_col].cumsum()
            well_frames.append(bottom_up.sort_values(by='MD'))
        formation_frames.append(pd.concat(well_frames))
    combined = pd.concat(formation_frames)
    return combined[['well', 'FORMATION_up', 'MD', 'TST', 'TST_smpl','KHtst','PHITHtst','VSHHtst']]
# Comparison NET_clp and NET_clp2
def well_display_net(dataset, well, formation, net1='NET_clp', net2_flag=0, net2='NET_clp_v2'):
    """
    Plot GR_N next to the net flags of one well and formation against TST.

    dataset = samples with well, FORMATION_up, FORMATION, TST, GR_N, NET
              and the net1 (and, if requested, net2) columns
    well, formation = well name and FORMATION_up value to select
    net1 = first cleaned net flag to compare with NET
    net2_flag = 0 to draw GR_N, NET, net1 (3 tracks);
                1 to draw GR_N, NET, net1, net2 (4 tracks, thin lines)
    net2 = second cleaned net flag, used only when net2_flag == 1

    The tops of the FORMATION sub-units that contain `formation` are drawn
    and labelled on the GR_N track (the filter used to be hard-coded to
    'Balakhany VIII', so other formations got no tops).

    Raises ValueError if net2_flag is not 0 or 1.
    Returns the result of fig.show().
    """
    if net2_flag not in (0, 1):
        raise ValueError('net2_flag must be 0 or 1, got ' + repr(net2_flag))
    well_sel = dataset[(dataset.well == well) & (dataset.FORMATION_up == formation)]
    depth = well_sel['TST']
    grn = well_sel['GR_N']
    net_curves = [well_sel['NET'], well_sel[net1]]
    if net2_flag == 1:
        net_curves.append(well_sel[net2])
        figsize, line_style = (6,8), {'lw': 0.25}
    else:
        figsize, line_style = (4.5,8), {}

    fig, ax = plt.subplots(1, len(net_curves) + 1, figsize=figsize, sharey=True)
    # Track 1: GR_N with the formation tops
    ax[0].yaxis.set_ticks(np.arange(min(depth), max(depth), 5))
    ax[0].plot(grn, depth, color='green')
    ax[0].invert_yaxis()
    ax[0].set_xlim(0, 150)
    ax[0].grid(axis='y')
    well_bal_tops = well_sel.groupby('FORMATION')['TST'].apply(lambda x: x.iloc[0]).reset_index()
    fm_tops = well_bal_tops[well_bal_tops.FORMATION.str.contains(formation, regex=False)]
    for top_name, top_tst in zip(fm_tops.FORMATION, fm_tops['TST']):
        ax[0].hlines(top_tst, xmin=0, xmax=150, color='black', lw=2, alpha=0.66)
        ax[0].text(10, top_tst+3, top_name, fontsize = 7, color ="black")
    # Remaining tracks: one filled net flag each
    for track, flag in zip(ax[1:], net_curves):
        track.plot(flag, depth, color='orange', **line_style)
        track.set_xlim(0, 1)
        track.grid(axis='y')
        track.fill_betweenx(depth, flag, color='orange', alpha=0.33)
    fig.suptitle(well_sel.well.unique()[0], fontsize=14)
    fig.tight_layout()
    return fig.show()
# Run RFR model with train/test split
def _rfr_qc_frame(part, actual, predicted, label, rng, margin):
    # Actual vs predicted table of one split with the tolerance band and the in/out flag.
    frame = pd.DataFrame({'well': part['well'].to_numpy(),
                          'FORMATION_up': part['FORMATION_up'].to_numpy(),
                          'actual': actual,
                          'predict': predicted})
    frame['l_limit'] = frame.actual*(1 - rng) - margin
    frame['h_limit'] = frame.actual*(rng + 1) + margin
    frame['qc'] = 'out'
    frame['dataset'] = label
    frame.loc[(frame.predict >= frame.l_limit) & (frame.predict <= frame.h_limit), 'qc'] = 'in'
    return frame


def rfr_train_test_split(train_dataset, gs_set, scorer, target='KHtst', rng=0.25, margin=0.005, random_state=None):
    """
    Tune and fit a RandomForestRegressor on a 70/30 train/test split and
    report how many predictions fall inside the tolerance band.

    train_dataset = dataframe with well, FORMATION_up, the target column
                    and the feature columns (every other column)
    gs_set = parameter grid for GridSearchCV (5-fold)
    scorer = make_scorer(mse, greater_is_better=False) <- format scorer like this
    target = name of the column to predict
    rng, margin = a prediction is 'in' when it lies within
                  [actual*(1-rng) - margin, actual*(1+rng) + margin]
    random_state = seed of the train/test split; None draws a random seed
                   in 0..100 (the seed used is returned)

    Returns a dict with
    'train_ds' (feature names),
    'metrics' (r2_train, r2_test, mae_train, mae_test, train_in, test_in),
    'grid_search' (list with the best parameters),
    'result_df' (train + test rows with the diff column),
    'train_df', 'test_df',
    'feature_imp' (importances, ascending),
    'random_state' (seed of the split).
    """
    # Features are selected by name, so the column order of train_dataset does not matter.
    feature_cols = [col for col in train_dataset.columns
                    if col not in ('well', 'FORMATION_up', target)]
    split_seed = random.randint(0,100) if random_state is None else random_state
    train_part, test_part = train_test_split(train_dataset, test_size=0.3, random_state=split_seed)
    x_train = train_part[feature_cols].to_numpy()
    x_test = test_part[feature_cols].to_numpy()
    y_train = train_part[target].to_numpy(dtype=float)
    y_test = test_part[target].to_numpy(dtype=float)

    # GridSearch for ML-model
    grid_rfr = RandomForestRegressor(n_jobs=-1, random_state=42)
    grid_calc_rfr = GridSearchCV(estimator = grid_rfr, param_grid = gs_set, scoring=scorer, cv = 5)
    grid_calc_rfr.fit(x_train, y_train)
    gd_sr_setting = grid_calc_rfr.best_params_
    # Report the selected parameters (the unfitted base estimator was printed before).
    print('Grid_search: ', gd_sr_setting)

    # Applying Pipeline for ML-model
    rfr = Pipeline([("scaler",StandardScaler()),("rfr",RandomForestRegressor(**gd_sr_setting, n_jobs=-1, random_state=42))])
    rfr.fit(x_train, y_train)
    y_pred_train = rfr.predict(x_train)
    y_pred_test = rfr.predict(x_test)

    # Reporting
    print('Pipeline: ', rfr.steps[1][1])
    df_rfr_train = _rfr_qc_frame(train_part, y_train, y_pred_train, 'train', rng, margin)
    df_rfr_test = _rfr_qc_frame(test_part, y_test, y_pred_test, 'test', rng, margin)
    df_rfr_result = pd.concat([df_rfr_train,df_rfr_test])
    df_rfr_result['diff'] = (df_rfr_result.actual - df_rfr_result.predict).round(3)
    # round(float(...)) works whether the metric comes back as a numpy scalar
    # or a plain float; the 'in' share is 0.0 (not a KeyError) when nothing is in range.
    metrics_dict = {    'r2_train':     round(float(r2(y_train, y_pred_train)), 2), 
                        'r2_test':      round(float(r2(y_test, y_pred_test)), 2),
                        'mae_train':    round(float(mae(y_train, y_pred_train)), 2), 
                        'mae_test':     round(float(mae(y_test, y_pred_test)), 2),
                        'train_in':     round(float((df_rfr_train['qc'] == 'in').mean()), 2),
                        'test_in':      round(float((df_rfr_test['qc'] == 'in').mean()), 2)}
    feature_imp = pd.Series(rfr.steps[1][1].feature_importances_, index=feature_cols).sort_values(ascending=True)
    return {'train_ds':feature_cols, 
            'metrics':metrics_dict, 
            'grid_search' : [gd_sr_setting], 
            'result_df' : df_rfr_result,
            'train_df' : df_rfr_train,
            'test_df' : df_rfr_test,
            'feature_imp' : feature_imp,
            'random_state' : split_seed}
# Run RFR model with loop
def rfr_loop(dataset, fmname, target, hyperdict, rng, margin):
    """
    Leave-one-out run of a RandomForestRegressor: every row is predicted
    by a model fitted on all the other rows.

    dataset = dataframe with well, the fmname column, the target column
              and the feature columns (every other column)
    fmname = name of the formation column
    target = name of the column to predict
    hyperdict = keyword arguments for RandomForestRegressor
    rng, margin = a prediction is 'in' when it lies within
                  [actual*(1-rng) - margin, actual*(1+rng) + margin]

    The held-out row is selected by position, so the index of dataset does
    not have to be the default 0..n-1 range.

    Returns a dict with 'train_ds', 'train_ftrs', 'result_df',
    'grid_search', 'metrics' and 'feature_imp' (importances of the last
    fitted model). Raises ValueError for an empty dataset.
    """
    n_rows = len(dataset)
    if n_rows == 0:
        raise ValueError('rfr_loop needs at least one row in dataset')
    feature_cols = dataset.drop(['well', fmname, target], axis=1).columns.tolist()
    features = dataset[feature_cols].to_numpy()
    targets = dataset[target].to_numpy()
    well_exclude_lst = dataset['well'].tolist()
    fm_exclude_lst = dataset[fmname].tolist()
    row_positions = np.arange(n_rows)

    y_test_lst = []
    y_pred_test_lst = []
    metrics_r2_lst = []
    metrics_mae_lst = []
    ftr_imp_lst = []
    for i in tqdm(range(n_rows)):
        # Making up the feature and target datasets (row i is held out)
        held_out = row_positions == i
        x_train = features[~held_out]
        y_train = targets[~held_out]
        x_test = features[held_out]
        y_test_lst.append(targets[i])
        # Statement of ML-model
        rfr = Pipeline([("scaler",StandardScaler()),("rfr",RandomForestRegressor(**hyperdict, n_jobs=-1, random_state=42))])
        # Fitting the ML-model
        rfr.fit(x_train, y_train)
        y_pred_train = rfr.predict(x_train)
        y_pred_test_lst.append(rfr.predict(x_test)[0])
        # Metrics computation for the ML-model (on the training rows)
        metrics_r2_lst.append(round(float(r2(y_train, y_pred_train)), 5))
        metrics_mae_lst.append(round(float(mae(y_train, y_pred_train)), 5))
        feature_imp = pd.Series(rfr.steps[1][1].feature_importances_, index=feature_cols).sort_values(ascending=True)
        ftr_imp_lst.append(feature_imp)

    # Building up of dataframe
    print(rfr.steps[1][1])
    res_rfr_sha = pd.DataFrame( zip(y_test_lst, y_pred_test_lst, well_exclude_lst, fm_exclude_lst, metrics_r2_lst, metrics_mae_lst, ftr_imp_lst), 
                            columns = ['actual','predict','well', 'FORMATION_up','metrics_r2', 'metrics_mae','features_imp'])
    res_rfr_sha['l_range'] = res_rfr_sha.actual*(1-rng) - margin 
    res_rfr_sha['h_range'] = res_rfr_sha.actual*(1+rng) + margin
    res_rfr_sha['qc'] = 'out'
    res_rfr_sha.loc[(res_rfr_sha.predict >= res_rfr_sha.l_range) & (res_rfr_sha.predict <= res_rfr_sha.h_range), 'qc'] = 'in'
    # Counted with boolean sums: no KeyError when one of the classes is absent.
    wells_tot = res_rfr_sha.shape[0]
    wells_pred = int((res_rfr_sha['qc'] == 'in').sum())
    wells_unpred = wells_tot - wells_pred
    wells_pred_vv = round(wells_pred/wells_tot, 3)
    wells_unpred_vv = round(wells_unpred/wells_tot, 3)
    res_rfr_sha['diff'] = res_rfr_sha.actual - res_rfr_sha.predict
    res_rfr_sha = res_rfr_sha[['well','FORMATION_up','actual','predict', 'diff', 'l_range', 'h_range', 'qc', 'metrics_r2', 'metrics_mae', 'features_imp']]
    types_dict = {'actual': 'float64', 'predict': 'float64', 'diff': 'float64', 'l_range': 'float64', 'h_range': 'float64'}
    res_rfr_sha = res_rfr_sha.astype(types_dict)
    res_rfr_sha = res_rfr_sha.round({'actual': 3, 'predict': 3, 'diff': 3})
    metrics_dict = {    'wells_total':          wells_tot, 
                        'wells_unpred':         wells_unpred,
                        'wells_unpred_v/v':     wells_unpred_vv,
                        'wells_pred':           wells_pred,
                        'wells_pred_v/v':       wells_pred_vv
                    }
    return {    'train_ds': dataset.columns.tolist(),
                'train_ftrs': feature_cols,
                'result_df': res_rfr_sha,
                'grid_search' : hyperdict,
                'metrics':metrics_dict,
                'feature_imp' : feature_imp
            }
# Run XGBR model with a leave-one-out loop
def xgbr_loop(dataset, fmname, target, hyperdict, rng, margin):
    """
    Leave-one-out evaluation of a StandardScaler + XGBRegressor pipeline.

    Each row of `dataset` is held out in turn; the pipeline is fitted on the
    remaining rows and used to predict the held-out row. A prediction is
    flagged 'in' when it lies inside
    [actual*(1-rng) - margin, actual*(1+rng) + margin], otherwise 'out'.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain 'well', `fmname` and `target`; every other column is
        used as a model feature.
    fmname : str
        Name of the formation column (stored as 'FORMATION_up' in the result).
    target : str
        Name of the column to predict.
    hyperdict : dict
        Keyword arguments forwarded to XGBRegressor.
    rng : float
        Relative half-width of the acceptance band.
    margin : float
        Absolute widening of the acceptance band.

    Returns
    -------
    dict
        'train_ds'    : list of all dataset columns,
        'train_ftrs'  : list of feature columns,
        'result_df'   : per-row results (actual, predict, diff, band, qc,
                        training-set R2/MAE, feature importances),
        'grid_search' : `hyperdict` as passed in,
        'metrics'     : counts and fractions of 'in'/'out' rows,
        'feature_imp' : feature importances of the last fitted model.

    Raises
    ------
    ValueError
        If `dataset` has fewer than two rows (nothing to train on).
    """
    if len(dataset) < 2:
        raise ValueError('xgbr_loop needs at least two rows for leave-one-out')

    service_cols = ['well', fmname, target]
    features = dataset.drop(service_cols, axis=1)
    feature_names = features.columns.tolist()

    y_test_lst = []
    y_pred_test_lst = []
    well_exclude_lst = []
    fm_exclude_lst = []
    metrics_r2_lst = []
    metrics_mae_lst = []
    ftr_imp_lst = []
    for i in tqdm(range(len(dataset))):
        held_out = dataset.iloc[i]
        # Drop the label found at position i, so the held-out row is removed
        # from training even when the index is not a default RangeIndex.
        df_wo_well = dataset.drop(dataset.index[i])
        well_exclude_lst.append(held_out['well'])
        fm_exclude_lst.append(held_out[fmname])
        y_train = np.array(df_wo_well[target])
        x_train = np.array(df_wo_well.drop(service_cols, axis=1))
        y_test_lst.append(np.array(held_out[target]))
        x_test = np.array(features.iloc[i])
        xgbr = Pipeline([("scaler", StandardScaler()),
                         ("xgbr", XGBRegressor(**hyperdict, n_jobs=-1, random_state=42))])
        # Fitting the ML-model
        xgbr.fit(x_train, y_train)
        y_pred_train = xgbr.predict(x_train)
        y_pred_test = xgbr.predict([x_test])
        y_pred_test_lst.append(y_pred_test[0])
        # Training-set metrics; builtin round() works for both numpy and
        # plain Python floats returned by the metric functions.
        metrics_r2_lst.append(round(r2(y_train, y_pred_train), 5))
        metrics_mae_lst.append(round(mae(y_train, y_pred_train), 5))
        feature_imp = pd.Series(xgbr.steps[1][1].feature_importances_,
                                index=feature_names).sort_values(ascending=True)
        ftr_imp_lst.append(feature_imp)

    # Building up of dataframe
    print(xgbr.steps[1][1])
    res_rfr_sha = pd.DataFrame(
        zip(y_test_lst, y_pred_test_lst, well_exclude_lst, fm_exclude_lst,
            metrics_r2_lst, metrics_mae_lst, ftr_imp_lst),
        columns=['actual', 'predict', 'well', 'FORMATION_up', 'metrics_r2', 'metrics_mae', 'features_imp'])
    res_rfr_sha['l_range'] = res_rfr_sha.actual*(1-rng) - margin
    res_rfr_sha['h_range'] = res_rfr_sha.actual*(1+rng) + margin
    res_rfr_sha['qc'] = 'out'
    inside = (res_rfr_sha.predict >= res_rfr_sha.l_range) & (res_rfr_sha.predict <= res_rfr_sha.h_range)
    res_rfr_sha.loc[inside, 'qc'] = 'in'

    # Count by comparison instead of value_counts()[...] so that a missing
    # category ('in' or 'out') yields 0 rather than a KeyError.
    wells_tot = res_rfr_sha.shape[0]
    wells_unpred = (res_rfr_sha['qc'] == 'out').sum()
    wells_pred = (res_rfr_sha['qc'] == 'in').sum()
    wells_unpred_vv = (wells_unpred / wells_tot).round(3)
    wells_pred_vv = (wells_pred / wells_tot).round(3)

    res_rfr_sha['diff'] = res_rfr_sha.actual - res_rfr_sha.predict
    res_rfr_sha = res_rfr_sha[['well', 'FORMATION_up', 'actual', 'predict', 'diff', 'l_range', 'h_range',
                               'qc', 'metrics_r2', 'metrics_mae', 'features_imp']]
    types_dict = {'actual': 'float64', 'predict': 'float64', 'diff': 'float64',
                  'l_range': 'float64', 'h_range': 'float64'}
    res_rfr_sha = res_rfr_sha.astype(types_dict)
    res_rfr_sha = res_rfr_sha.round({'actual': 0, 'predict': 0, 'diff': 0})
    metrics_dict = {'wells_total':      wells_tot,
                    'wells_unpred':     wells_unpred,
                    'wells_unpred_v/v': wells_unpred_vv,
                    'wells_pred':       wells_pred,
                    'wells_pred_v/v':   wells_pred_vv
                    }
    return {'train_ds': dataset.columns.tolist(),
            'train_ftrs': feature_names,
            'result_df': res_rfr_sha,
            'grid_search': hyperdict,
            'metrics': metrics_dict,
            'feature_imp': feature_imp
            }
# Display results of ML-modeling
def xplot_qc(dataset, dataframe, max_val, rng=0.25):
    """
    Show an actual-vs-predict scatter coloured by the 'qc' column.

    `dataset[dataframe]` is the frame that is plotted (it needs 'actual',
    'predict', 'qc' and 'well'); `dataset['metrics']` supplies the
    'train_in' / 'test_in' values printed in the title. The 1:1 line and the
    two dashed (1 +/- rng) guide lines run from 0 to `max_val`.
    """
    def guide(y_end, **line_style):
        # Blue straight line from the origin to (max_val, y_end).
        segment = px.line(x=[0, max_val], y=[0, y_end])
        segment.update_traces(line=dict(color='blue', **line_style))
        return segment

    points = px.scatter(
        dataset[dataframe], x='actual', y='predict',
        color='qc',
        hover_data=['well'],
        width=400, height=400,
    )
    points.update_traces(
        marker=dict(size=10, opacity=0.75, line=dict(color='rgb(47, 57, 61)', width=1)))

    up_range = rng + 1
    dwn_range = 1 - rng
    unity = guide(max_val)
    upper = guide(max_val*up_range, dash='dash')
    lower = guide(max_val*dwn_range, dash='dash')

    combined = go.Figure(data=points.data + unity.data + upper.data + lower.data)
    qc_train = str(dataset['metrics']['train_in'])
    qc_test = str(dataset['metrics']['test_in'])
    combined.update_layout(
        title='Comparison Actual vs Pred' + ' QC_train: ' + qc_train + ' QC_test: ' + qc_test,
        width=600, height=400, xaxis_title='actual', yaxis_title='predict',
        margin=dict(l=10, r=10, b=10, t=40))
    return combined.show()
# Calculate thickness-weighted average properties
def avg_prop_calculation(dataset_ntd, dataset, formation):
    """
    Build one row of thickness-weighted properties per well.

    For every well in `dataset_ntd`, each of its rows is treated as a layer
    whose top, bottom and thickness are read from the columns at positions
    2, 3 and 4 (rounded to 3 decimals). Samples of `dataset` for the same
    well whose rounded 'TST' lies in [top, bottom] provide the layer's
    PHIT / VSH statistics (mean, P10, P50, P90) and the geometric mean of
    LPERM. Each statistic is weighted by layer thickness and divided by the
    total thickness of the well.

    Returns a DataFrame with columns 'well', 'FORMATION_up' (filled with
    `formation`), 'htst_max', 'htst_p50', 'htst_count', 'htst_sum',
    'phit_wavg', 'phit10_wavg', 'phit50_wavg', 'phit90_wavg', 'vsh_wavg',
    'vsh10_wavg', 'vsh50_wavg', 'vsh90_wavg', 'perm_wavg'.
    """
    stat_keys = ('phit', 'phit10', 'phit50', 'phit90',
                 'vsh', 'vsh10', 'vsh50', 'vsh90',
                 'perm')
    rows = []
    for well in tqdm(dataset_ntd.well.unique()):
        layers = dataset_ntd[dataset_ntd.well == well]
        logs = dataset[dataset.well == well]
        # Rounded once per well; the same rounding is applied to the bounds.
        tst = logs['TST'].round(3).to_numpy()
        phit_all = logs['PHIT'].to_numpy()
        vsh_all = logs['VSH'].to_numpy()
        perm_all = logs['LPERM'].to_numpy()

        weighted = {key: [] for key in stat_keys}
        thickness = []
        for k in range(len(layers.well)):
            top = layers.iloc[k, 2].round(3)
            bottom = layers.iloc[k, 3].round(3)
            h = layers.iloc[k, 4].round(3)
            inside = (tst >= top) & (tst <= bottom)
            phit = list(phit_all[inside])
            vsh = list(vsh_all[inside])
            perm = list(perm_all[inside])

            weighted['phit'].append(mean(phit)*h)
            weighted['phit10'].append(np.quantile(phit, 0.1)*h)
            weighted['phit50'].append(np.quantile(phit, 0.5)*h)
            weighted['phit90'].append(np.quantile(phit, 0.9)*h)
            weighted['vsh'].append(mean(vsh)*h)
            weighted['vsh10'].append(np.quantile(vsh, 0.1)*h)
            weighted['vsh50'].append(np.quantile(vsh, 0.5)*h)
            weighted['vsh90'].append(np.quantile(vsh, 0.9)*h)
            weighted['perm'].append(gmean(perm)*h)
            thickness.append(h)

        total_h = sum(thickness)
        # stat_keys order matches the output column order below.
        averages = [sum(weighted[key])/total_h for key in stat_keys]
        rows.append([well, formation,
                     max(thickness), np.quantile(thickness, 0.5), len(thickness), total_h]
                    + averages)

    return pd.DataFrame(rows, columns=['well', 'FORMATION_up',
                                       'htst_max', 'htst_p50', 'htst_count', 'htst_sum',
                                       'phit_wavg', 'phit10_wavg', 'phit50_wavg', 'phit90_wavg',
                                       'vsh_wavg', 'vsh10_wavg', 'vsh50_wavg', 'vsh90_wavg',
                                       'perm_wavg'])
# Euclidean distance calculation with properties of the nearest wells
def dist_prop_calc(dataset, dist_formation, dist_cutoff, value):
    """
    For every well of one formation, collect its three closest wells.

    `dataset` has to contain 'well', 'FORMATION_up', 'X_mean', 'Y_mean',
    'TVD_SCS' and the column named by `value` (e.g. 'KHtst').

    dist_formation: formation used to filter `dataset` (FORMATION_up)
    dist_cutoff:    only neighbours farther than this distance are kept
    value:          name of the property column copied from the neighbours

    Returns a DataFrame with one row per well and the columns
    dist1..dist3, <value>_1..<value>_3, x1..x3, y1..y3, well1..well3,
    'well' and 'FORMATION_up'.

    NOTE(review): the fixed three-name column assignments look like they
    require at least three neighbours beyond `dist_cutoff` for every well
    — confirm against the input data.
    """
    data = dataset[(dataset.FORMATION_up == dist_formation)]
    # Well names with a fresh 0..n-1 index, used to label the matrix rows.
    row_name = data.well.reset_index().drop(['index'], axis=1)
    # Pairwise distances in (X_mean, Y_mean, TVD_SCS); columns are well names.
    distance_fm = pd.DataFrame(euclidean_distances(data[['X_mean', 'Y_mean', 'TVD_SCS']]), columns=list(data.well))
    distance_fm_well = distance_fm.join(row_name).set_index('well')
    # 'well' becomes the first column again; the row index stays 0..n-1.
    distance_fm_well.reset_index(inplace=True)
    def well_kh_accum(wells, dataset, kh_formation):
        # Gather `value`, X_mean and Y_mean of the given wells in that formation.
        # Note: the local list below shadows the function's own name.
        well_kh_accum = []
        well_x_accum = []
        well_y_accum = []
        for i in wells:
            well_kh_accum.append(dataset[(dataset.well==i)&(dataset.FORMATION_up == kh_formation)][value].reset_index())    
            well_x_accum.append(dataset[(dataset.well==i)&(dataset.FORMATION_up == kh_formation)]['X_mean'].reset_index())
            well_y_accum.append(dataset[(dataset.well==i)&(dataset.FORMATION_up == kh_formation)]['Y_mean'].reset_index())
        # Transpose and drop the first row (the old 'index' column) so the
        # values of the wells end up side by side in a single row.
        well_kh3 = pd.concat(well_kh_accum).T[1:]
        well_kh3.columns = [value + '_1',value + '_2', value + '_3']
        well_x3 = pd.concat(well_x_accum).T[1:]
        well_x3.columns = ['x1','x2','x3']
        well_y3 = pd.concat(well_y_accum).T[1:]
        well_y3.columns = ['y1','y2','y3']
        final = pd.concat([ well_kh3.reset_index().drop('index',axis=1), 
                            well_x3.reset_index().drop('index',axis=1), 
                            well_y3.reset_index().drop('index',axis=1)], axis=1)
        return final
    df_collect = []
    for num, well_name in enumerate(distance_fm_well.well[:]):
        # Distances from this well to all wells, sorted ascending; [1:] drops
        # the transposed 'well' row and `num` is the label of the single column.
        well_dist3 = distance_fm_well[distance_fm_well.well == well_name].T[1:].sort_values(by=num)
        # Keep the three nearest entries strictly beyond the cutoff.
        well_dist3_s2 = well_dist3[well_dist3[num] > dist_cutoff][:3].reset_index()
        well_dist3_tuple = tuple(well_dist3_s2['index'])
        # Second transposed row holds the distances, first row the well names.
        well_dist3_res = well_dist3_s2.T[1:].reset_index().drop('index', axis=1)   
        well_name3_res = well_dist3_s2.T[:1].reset_index().drop('index', axis=1)
        well_kh3_res = well_kh_accum(well_dist3_tuple,dataset, dist_formation)
        well_dist3_res.columns =['dist1', 'dist2', 'dist3']
        well_name3_res.columns =['well1', 'well2', 'well3']
        concat_df = pd.concat([well_dist3_res, well_kh3_res, well_name3_res], axis=1)
        result = concat_df.join(pd.DataFrame([well_name], columns=['well']))
        df_collect.append(result)     
    df_well_kh_dist = pd.concat(df_collect).reset_index().drop('index', axis=1)
    df_well_kh_dist['FORMATION_up'] = dist_formation
    return df_well_kh_dist
# Feature importance bar chart for 1-to-all algorithm
def feature_imp_loop(dataset, wellname, fmname, xsize, ysize):
    """
    Draw a horizontal bar chart of the feature importances stored for one
    well / formation pair.

    `dataset` needs the columns 'well', 'FORMATION_up' and 'features_imp';
    the first matching row's 'features_imp' entry is plotted on a new
    figure of size (xsize, ysize).
    """
    is_requested = (dataset.well == wellname) & (dataset.FORMATION_up == fmname)
    importances = dataset[is_requested]['features_imp'].values[0]
    figure, axis = plt.subplots(figsize=(xsize, ysize))
    # Drawn on the current axes, i.e. the ones created just above.
    importances.plot.barh()
    chart_title = 'RFR feature imp  ' + wellname + ' ' + fmname
    axis.set_title(chart_title)
    axis.tick_params(axis='y', labelsize=8, rotation=0)
    return figure.show()
# Save dataframe to csv
def save_tocsv(dataframe, filename, flag, path='C:\\jupyter\\SPP\\inputoutput\\'):
    """
    Write `dataframe` to `path + filename` as CSV when `flag` equals 1.

    dataframe: object with a `to_csv` method (e.g. pandas.DataFrame)
    filename:  file name appended to `path`
    flag:      1 -> write the file, anything else -> do nothing
    path:      directory prefix, including its trailing separator; defaults
               to the folder that used to be hard-coded here
    """
    if flag != 1:
        return
    dataframe.to_csv(path + filename)
# Feature importance bar chart for split dataframe
def feature_imp_split(dataset, xsize, ysize):
    """
    Plot `dataset` (anything exposing `.plot.barh()`, e.g. a Series of
    feature importances) as a horizontal bar chart on a new figure of size
    (xsize, ysize) and show that figure.
    """
    figure, _ = plt.subplots(figsize=(xsize, ysize))
    # pandas returns the axes it drew on; style those.
    bars_ax = dataset.plot.barh()
    bars_ax.set_title("RFR Feature Importances")
    bars_ax.tick_params(axis='y', labelsize=9, rotation=0)
    bars_ax.figure.tight_layout()
    return figure.show()
# Logging results of ml
def write_res_file(finename, comments, target, trainds, metrics, gridsearch):
    """
    Append one timestamped record of a modelling run to a text log.

    finename:   path of the log file (opened in append mode)
    comments:   free-text comment written next to the target name
    target:     name of the predicted variable
    trainds:    training dataset description (written via str formatting)
    metrics:    metrics object; written wrapped in a one-element list
    gridsearch: hyper-parameter description
    """
    # Imported locally so the function does not depend on a file-level
    # `from datetime import datetime` being present.
    from datetime import datetime

    current_datetime = datetime.now()
    # The context manager closes the file; no explicit close() is needed.
    with open(finename, 'a') as file:
        file.write(f'\n{current_datetime} \n {comments} target: {target}')
        file.write(f'\n training_ds_{trainds} \n metrics_{[metrics]} \n grid_search_{gridsearch}')
# Find categorical (string) columns of a dataset
def cat_finder(dataset):
    """
    Return `(cat_list, gm_list)` for `dataset`.

    cat_list: columns whose dtype compares equal to 'string'
              (categorical columns to drop out)
    gm_list:  the same columns without 'well'
              (categorical columns to run via pd.get_dummies)
    """
    cat_list = [name for name in dataset.columns if dataset[name].dtype == 'string']
    gm_list = [name for name in cat_list if name != 'well']
    return cat_list, gm_list
# Display results of ML-modeling ver2
def xplot_qc2(data, max_val, rng, margin, round=3):
    """
    Show actual-vs-predict scatters for the train and test subsets side by
    side, coloured by QC flag, together with the acceptance band.

    data:    DataFrame with 'dataset' ('train'/'test'), 'qc' ('in'/'out'),
             'well', 'actual', 'predict', 'diff' and 'FORMATION_up'
    max_val: right end of the band lines (they start at x = 0)
    rng:     relative half-width of the band
    margin:  absolute widening of the band
    round:   number of decimals shown for actual / predict / diff; defaults
             to 3 so that four-argument calls keep working
    """
    data = data.round({'actual': round, 'predict': round, 'diff': round})
    up_range = rng + 1
    dwn_range = 1 - rng
    colors = {'in': 'green', 'out': 'red'}
    hover = ("w:%{customdata[0]},a:%{customdata[1]}, p:%{customdata[2]},"
             "d:%{customdata[3]}, f:%{customdata[4]}<extra></extra>")

    def _qc_scatter(subset):
        # One marker trace for a subset, green for 'in' and red for 'out'.
        return go.Scatter(
            x=subset.actual, y=subset.predict,
            mode='markers',
            marker=dict(color=[colors[qc] for qc in subset.qc], size=7, opacity=0.75,
                        line=dict(color='rgb(47, 57, 61)', width=0.5)),
            customdata=subset[['well', 'actual', 'predict', 'diff', 'FORMATION_up']],
            hovertemplate=hover,
        )

    line_trace_up = go.Scatter(x=[0, max_val], y=[0 + margin, max_val*up_range + margin],
                               mode='lines+markers', line=dict(color='blue'))
    line_trace_dw = go.Scatter(x=[0, max_val], y=[0 - margin, max_val*dwn_range - margin],
                               mode='lines+markers', marker=dict(color='blue'))

    fig = make_subplots(rows=1, cols=2, subplot_titles=('train ds', 'test ds'))
    for col, label in enumerate(('train', 'test'), start=1):
        fig.add_trace(_qc_scatter(data[data.dataset == label]), row=1, col=col)
        fig.add_trace(line_trace_up, row=1, col=col)
        fig.add_trace(line_trace_dw, row=1, col=col)
        fig.update_xaxes(title_text='actual', row=1, col=col)
        fig.update_yaxes(title_text='predict', row=1, col=col)
    fig.update_layout(title_text=('rfr_train_test_split'), width=900, height=450,
                      margin=dict(l=10, r=10, b=10, t=50), showlegend=False)
    return fig.show()
# Display results of ML-modeling ver2 via loop
def xplot_qc2_loop(data, max_val, rng, margin=0.005):
    """
    Show an actual-vs-predict scatter of leave-one-out results, coloured by
    QC flag, together with the acceptance band.

    data:    DataFrame with 'qc' ('in'/'out'), 'well', 'actual', 'predict',
             'diff' and 'FORMATION_up'
    max_val: right end of the band lines (they start at x = 0)
    rng:     relative half-width of the band
    margin:  absolute widening of the band
    """
    # Key is 'diff' (no trailing space) so the column is actually rounded;
    # DataFrame.round ignores keys that are not column names.
    data = data.round({'actual': 3, 'predict': 3, 'diff': 3})
    up_range = rng + 1
    dwn_range = 1 - rng
    colors = {'in': 'green', 'out': 'red'}
    qc_colors = [colors[qc] for qc in data.qc]
    scatter = go.Scatter(
        x=data.actual, y=data.predict,
        mode='markers',
        marker=dict(color=qc_colors, size=7, opacity=0.75, line=dict(color='rgb(47, 57, 61)', width=0.5)),
        customdata=data[['well', 'actual', 'predict', 'diff', 'FORMATION_up']],
        hovertemplate="w:%{customdata[0]},a:%{customdata[1]}, p:%{customdata[2]}, d:%{customdata[3]}, f:%{customdata[4]}<extra></extra>",
    )
    line_up = go.Scatter(x=[0, max_val], y=[0 + margin, max_val*up_range + margin],
                         mode='lines+markers', line=dict(color='blue'))
    line_dw = go.Scatter(x=[0, max_val], y=[0 - margin, max_val*dwn_range - margin],
                         mode='lines+markers', marker=dict(color='blue'))
    fig = go.Figure()
    for trace in (scatter, line_up, line_dw):
        fig.add_trace(trace)
    fig.update_xaxes(title_text='actual')
    fig.update_yaxes(title_text='predict')
    fig.update_layout(title_text=('rfr_loop'), width=450, height=450,
                      margin=dict(l=10, r=10, b=10, t=50), showlegend=False)
    return fig.show()
# Display results of ML-modeling on map
def map_qc(metadata, data, fmname, scale):
    """
    Show the QC results of one formation on a map.

    metadata: DataFrame with 'field', 'X_wellhead', 'Y_wellhead'; the mean
              wellhead coordinate per field is drawn as a labelled square
    data:     DataFrame with 'FORMATION_up', 'qc', 'X', 'Y', 'well',
              'actual' and 'diff'; it is not modified
    fmname:   formation to display
    scale:    multiplier turning values into marker sizes

    'in' wells are green circles sized by actual*scale, 'out' wells are red
    diamonds sized by |diff|*scale.
    """
    # Work on a filtered copy so the caller's 'diff' column keeps its sign.
    data = data[data.FORMATION_up == fmname].copy()
    data['diff'] = data['diff'].abs()
    data_in = data[data.qc == 'in']
    data_out = data[data.qc == 'out']
    field_avg_coord = metadata.groupby('field')[['X_wellhead', 'Y_wellhead']].mean().reset_index()
    outline = dict(color='rgb(47, 57, 61)', width=0.5)
    well_hover = "well:%{customdata[0]}, diff:%{customdata[1]}<extra></extra>"
    platform = go.Scatter(
        x=field_avg_coord.X_wellhead, y=field_avg_coord.Y_wellhead,
        customdata=field_avg_coord[['field']],
        text=field_avg_coord['field'], textposition="middle right",
        marker=dict(color='rgb(0, 0,0)', size=12),
        mode='markers+text',
        marker_symbol='square', hovertemplate="%{customdata[0]}<extra></extra>",
    )
    scatter_data_in = go.Scatter(
        x=data_in.X, y=data_in.Y,
        mode='markers',
        marker=dict(symbol='circle', color='green', size=data_in['actual']*scale,
                    opacity=0.75, line=outline),
        customdata=data_in[['well', 'diff']],
        hovertemplate=well_hover,
    )
    scatter_data_out = go.Scatter(
        x=data_out.X, y=data_out.Y,
        mode='markers',
        marker=dict(symbol='diamond', color='red', size=data_out['diff']*scale,
                    opacity=0.75, line=outline),
        customdata=data_out[['well', 'diff']],
        hovertemplate=well_hover,
    )
    fig = go.Figure()
    fig.add_trace(platform)
    fig.add_trace(scatter_data_in)
    fig.add_trace(scatter_data_out)
    fig.update_layout(title_text=('rfr_train_test_split'), autosize=True, width=1000, height=600,
                      margin=dict(l=10, r=10, b=10, t=50), showlegend=False)
    return fig.show()
# Pairplot new version
def pairplot_special(dataset, xsize, ysize, flag=1):
    """
    Draw a seaborn PairGrid of `dataset` sized (xsize, ysize) when `flag`
    is 1; do nothing otherwise.

    Upper triangle: scatter plots. Diagonal: histograms (distplot without
    KDE). Lower triangle: KDE contours annotated with the Pearson r.
    """
    if flag != 1:
        return

    def annotate_pearson(x, y, **kws):
        # Write the Pearson correlation into the axes currently being drawn.
        coefficient, _ = stats.pearsonr(x, y)
        plt.gca().annotate(f"r = {coefficient:.2f}",
                           xy=(.1, .9), xycoords=plt.gca().transAxes)

    sns.set_context(rc={'axes.labelsize': 10, 'lines.linewidth': 0.75})
    grid = sns.PairGrid(dataset)
    grid.fig.set_size_inches(xsize, ysize)
    grid.set(xticklabels=[], yticklabels=[])
    grid.map_upper(plt.scatter, s=10, alpha=0.5)
    grid.map_diag(sns.distplot, kde=False)
    grid.map_lower(sns.kdeplot, cmap="Blues_d")
    grid.map_lower(annotate_pearson)
# Columns reorder for better display of variables
def columns_reorder(dataset, selected_column):
    """
    Return `dataset` with `selected_column` moved to the last position;
    the relative order of the other columns is kept.
    """
    ordered = []
    for name in dataset.columns:
        if name != selected_column:
            ordered.append(name)
    ordered.append(selected_column)
    return dataset[ordered]
# Just simple x-plot for 1 dataframe
def log_map_plot(dataframe, x_var, y_var, min_val, max_val):
    """
    Show a scatter of `dataframe[y_var]` against `dataframe[x_var]` with a
    blue reference line from (min_val, min_val) to (max_val, max_val).

    `dataframe` also needs a 'well' column, which is shown on hover.
    """
    marker_style = dict(color='orange', size=10, opacity=0.75,
                        line=dict(color='rgb(47, 57, 61)', width=0.5))
    points = go.Scatter(
        x=dataframe[x_var], y=dataframe[y_var],
        mode='markers',
        marker=marker_style,
        customdata=dataframe[['well', x_var, y_var]],
        hovertemplate="w:%{customdata[0]},x:%{customdata[1]}, y:%{customdata[2]}<extra></extra>",
    )
    diagonal = go.Scatter(x=[min_val, max_val], y=[min_val, max_val],
                          mode='lines', line=dict(color='blue'))
    figure = go.Figure()
    for trace in (points, diagonal):
        figure.add_trace(trace)
    figure.update_layout(title_text=('scatter plot'), width=600, height=600,
                         margin=dict(l=10, r=10, b=10, t=50), showlegend=False)
    return figure.show()
# Joining main and additional dataframes for predictions
def join_add_df_prediction(base_dataframe, add_dataframe, target_var):
    """
    Join two frames on ('well', 'FORMATION_up') and prepare them for
    correlation plots and modelling.

    Both dataframes have to contain 'well' and 'FORMATION_up'.

    Returns `(df_corr, join_dataframe_gm)`:
    df_corr:           the joined frame without the string columns reported
                       by `cat_finder`, with `target_var` moved last
    join_dataframe_gm: the joined frame plus dummy columns for the string
                       columns other than 'well'; the dummy column
                       'FORMATION_up_Balakhany X' is renamed to
                       'FORMATION_up_gm'
    """
    keys = ['well', 'FORMATION_up']
    left = base_dataframe.set_index(keys)
    right = add_dataframe.set_index(keys)
    joined = left.join(right).reset_index()

    string_cols, dummy_cols = cat_finder(joined)
    df_corr = columns_reorder(joined.drop(string_cols, axis=1), target_var)

    dummies = pd.get_dummies(joined[dummy_cols], columns=dummy_cols, drop_first=True)
    dummies = dummies.rename(columns={'FORMATION_up_Balakhany X': 'FORMATION_up_gm'})
    return df_corr, pd.concat([joined, dummies], axis=1)
# Preparation dataframes for pairplot and for predictions
def join_df_prediction(base_dataframe, target_var):
    """
    Prepare one frame for correlation plots and for modelling.

    Returns `(df_corr, dataframe)`:
    df_corr:   `base_dataframe` without its 'string'-dtype columns, with
               `target_var` moved to the last position
    dataframe: `base_dataframe` plus dummy columns (first level dropped) for
               the string columns other than 'well'; the dummy column
               'FORMATION_up_Balakhany X' is renamed to 'FORMATION_up_gm'
    """
    def _move_last(frame, column):
        # Same columns, with `column` placed at the end.
        others = [name for name in frame.columns if name != column]
        return frame[others + [column]]

    def _string_columns(frame):
        # (all string columns, string columns except 'well')
        found = [name for name in frame.columns if frame[name].dtype == 'string']
        return found, [name for name in found if name != 'well']

    string_cols, dummy_cols = _string_columns(base_dataframe)
    df_corr = _move_last(base_dataframe.drop(string_cols, axis=1), target_var)

    dummies = pd.get_dummies(base_dataframe[dummy_cols], columns=dummy_cols, drop_first=True)
    dummies = dummies.rename(columns={'FORMATION_up_Balakhany X': 'FORMATION_up_gm'})
    return df_corr, pd.concat([base_dataframe, dummies], axis=1)
# Function to calculate grid_search via train_split
def run_rfr_train_test_split(dataset, gs_set, scorer, target, rng, margin, logtxt_name, comment, xplot_flag, ftr_imp_flag):
    """
    Run `rfr_train_test_split`, log and print its outcome, optionally plot
    it, and return the first entry of its 'grid_search' result.

    dataset, gs_set, scorer, target, rng, margin: forwarded unchanged to
        `rfr_train_test_split`
    logtxt_name:  log file passed to `write_res_file`
    comment:      free-text comment stored in the log
    xplot_flag:   1 -> show the train/test cross-plot of the model results
    ftr_imp_flag: 1 -> show the feature-importance bar chart
    """
    model_res = rfr_train_test_split(dataset, gs_set, scorer, target, rng, margin)
    write_res_file(logtxt_name, comment, target,
                   model_res['train_ds'], model_res['metrics'], model_res['grid_search'])
    print('train_ds: ', model_res['train_ds'])
    print('metrics: ', model_res['metrics'])
    print('grid_search: ', model_res['grid_search'])
    model_res_hyper_par = model_res['grid_search'][0]
    if xplot_flag == 1:
        # Plot the model output (not the input dataset). The band uses the
        # same rng/margin that were handed to the model run, and the
        # rounding argument required by xplot_qc2 is passed explicitly.
        xplot_qc2(model_res['result_df'], 0.3, rng, margin, 3)
    if ftr_imp_flag == 1:
        feature_imp_split(model_res['feature_imp'], 6, 4)
    return model_res_hyper_par
# Function to calculate target via 1-to-all
def run_rfr_1_to_all(dataset, hyperdict, target, rng, margin, logtxt_name, comment, xplot_flag, max_val, ftr_imp_flag):
    """
    Run `rfr_loop` with 'FORMATION_up' as formation column, log and print
    its outcome, optionally plot it, and return its 'result_df'.

    logtxt_name:  log file passed to `write_res_file`
    comment:      free-text comment stored in the log
    xplot_flag:   1 -> show the cross-plot via `xplot_qc2_loop`
    max_val:      right end of the band lines in that cross-plot
    ftr_imp_flag: 1 -> show the feature-importance bar chart
    """
    outcome = rfr_loop(dataset, 'FORMATION_up', target, hyperdict, rng, margin)
    write_res_file(logtxt_name, comment, target,
                   outcome['train_ds'], outcome['metrics'], outcome['grid_search'])
    predictions = outcome['result_df']
    print('train_ftrs: ', outcome['train_ftrs'])
    print('metrics: ', outcome['metrics'])
    if xplot_flag == 1:
        xplot_qc2_loop(outcome['result_df'], max_val, rng, margin)
    if ftr_imp_flag == 1:
        feature_imp_split(outcome['feature_imp'], 6, 4)
    return predictions
# Just display 2 df side by side
def display_2df_side_side(df1, df2):
    """Concatenate `df1` and `df2` column-wise and render the result as an HTML table."""
    side_by_side = pd.concat([df1, df2], axis=1)
    markup = side_by_side.to_html(index=True)
    display(HTML(markup))

## Data preparation

### NetThicknessDistribution upload

In [5]:
# Load the two input tables from CSV. The 'Unnamed: 0' column is dropped
# (presumably an index written by an earlier to_csv call — verify).
df_bal_net2_kh = pd.read_csv(r'C:\jupyter\SPP\inputoutput\df_bal_net2_kh.csv').drop('Unnamed: 0', axis=1)
df_dist_kh_bal_fin = pd.read_csv(r'C:\jupyter\SPP\inputoutput\df_dist_kh_bal_fin.csv').drop('Unnamed: 0', axis=1)

In [6]:
# Display a target well together with its 3 offset wells
def display_3offset_wells(well, formation, dataset_dist=df_dist_kh_bal_fin, dataset_logs=df_bal_net2_kh):
    """
    Plot log tracks of a target well and of its three offset wells.

    Pay attention: the defaults dataset_dist=df_dist_kh_bal_fin and
    dataset_logs=df_bal_net2_kh are bound when this function is defined.

    well:         target well name
    formation:    formation name (matched against FORMATION_up)
    dataset_dist: table with 'well', 'FORMATION_up', 'well1'..'well3',
                  'dist1'..'dist3', 'KHtst', 'KHtst_1'..'KHtst_3'
    dataset_logs: log table with 'well', 'FORMATION_up', 'TST', 'GR_N',
                  'VSH', 'RHOB', 'NPSS', 'RDEEP', 'PHIT', 'NET_clp2',
                  'LPERM', 'KHtst'

    The figure has 2 x 8 axes: four tracks per well, two wells per row.
    Errors raised while selecting or plotting are caught and printed.
    NOTE(review): `display_tracks` also reads a global `df_bal` that is not
    defined in the visible part of the file — confirm it exists.
    """
    def well_offset_selection(dataset_dist, fmname, well_target):
        # Read the offset wells, their distances and KH values (cast to int)
        # from the first row matching the target well and formation.
        try:
            well_df = dataset_dist[(dataset_dist.well == well_target) & (dataset_dist.FORMATION_up == fmname)][['well', 'well1', 'well2', 'well3',
                                                                                                                        'dist1', 'dist2', 'dist3',
                                                                                                                'KHtst','KHtst_1', 'KHtst_2', 'KHtst_3']]
            well1 = well_df['well1'].iloc[0]
            well2 = well_df['well2'].iloc[0]
            well3 = well_df['well3'].iloc[0]
            dist1 = well_df['dist1'].astype('int').iloc[0]
            dist2 = well_df['dist2'].astype('int').iloc[0]
            dist3 = well_df['dist3'].astype('int').iloc[0]
            kh = well_df['KHtst'].astype('int').iloc[0]
            kh1 = well_df['KHtst_1'].astype('int').iloc[0]
            kh2 = well_df['KHtst_2'].astype('int').iloc[0]
            kh3 = well_df['KHtst_3'].astype('int').iloc[0]
        except Exception as e:
            print(f'It looks like the desired formation is absent. The error is "{e}"')
        # If the try block failed, the names below are unbound and this return
        # raises; that error is caught by the try in display_subplots.
        return {'target': well_target, 'w1':well1, 'w2':well2, 'w3':well3, 
                'dist': 0,'d1':dist1, 'd2':dist2,'d3':dist3,
                'kh':kh,'kh1':kh1, 'kh2':kh2, 'kh3':kh3}
    def display_tracks(dataset, wellname, fmname, ref_depth, depth_step, r, c, kh_value, dist):
        # Draw four tracks for one well into ax[r, c] .. ax[r, c+3]; `ax` and
        # `fig` come from the enclosing function.
        try:
            data = dataset[(dataset.well==wellname) & (dataset.FORMATION_up == fmname)]
            depth = data[ref_depth]
            grn = data['GR_N']
            # vsh and rdeep are read but not plotted below.
            vsh = data['VSH']
            rhob = data['RHOB'] 
            npss = data['NPSS']
            rdeep = data['RDEEP']
            phit = data['PHIT'] 
            net = data['NET_clp2']
            perm = data['LPERM']
            kh = data['KHtst']
            # First ref_depth value per FORMATION of this well, taken from the global df_bal.
            well_bal_tops = df_bal[(df_bal.well == wellname)].groupby('FORMATION')[ref_depth].apply(lambda x: x.iloc[0]).reset_index()
            # Track 1: GR_N, with the well / formation / kh / distance title.
            ax[r,c].plot(grn, depth, color='lightgreen', lw=2, zorder=10)
            ax[r,c].set_xlim(0, 150) 
            ax[r,c].grid(axis='y')
            ax[r,c].invert_yaxis()
            ax[r,c].yaxis.set_ticks(np.arange(min(depth), max(depth), depth_step))
            ax[r,c].set_xticks([])
            ax[r,c].tick_params(axis='y', labelsize=8)
            ax[r,c].set_title(wellname + ' ' + fmname + ' kh:' + str(kh_value) + ' dist:' + str(dist), fontsize=12) 
            # Horizontal markers at the tops whose FORMATION name contains fmname.
            for i in well_bal_tops[well_bal_tops.FORMATION.str.contains(fmname)].FORMATION:
                ax[r,c].hlines(    well_bal_tops[well_bal_tops.FORMATION==i][ref_depth].iloc[0], 
                                    xmin=0, xmax=1000, linewidth=2, color='black', lw=2, alpha=0.33)
                # ax[r,c].text(10, well_bal_tops[well_bal_tops.FORMATION==i][ref_depth].iloc[0]+0.5*depth_step, i, fontsize = 7, color ="black")
            # Track 2: RHOB, with NPSS on a twin x-axis and labelled tops.
            ax[r,c+1].plot(rhob, depth, color='red')
            ax[r,c+1].xaxis.set_ticks(np.arange(1.65, 2.65, 0.3))
            ax[r,c+1].set_xlim(1.65, 2.65)
            ax[r,c+1].grid(axis='y')
            ax[r,c+1].grid(axis='x')
            ax[r,c+1].invert_yaxis()
            ax[r,c+1].yaxis.set_ticks(np.arange(min(depth), max(depth), depth_step))
            ax[r,c+1].set_xticks([])
            ax[r,c+1].set_yticks([])
            for i in well_bal_tops[well_bal_tops.FORMATION.str.contains(fmname)].FORMATION:
                ax[r,c+1].hlines( well_bal_tops[well_bal_tops.FORMATION==i][ref_depth].iloc[0], 
                                xmin=0, xmax=150, linewidth=2, color='black', lw=2, alpha=0.33)
                ax[r,c+1].text(1.67, well_bal_tops[well_bal_tops.FORMATION==i][ref_depth].iloc[0]+0.5*depth_step, i, fontsize = 7, color ="black")
            twin1 = ax[r,c+1].twiny()
            twin1.plot(npss, depth, color='blue')
            # Reversed x-axis: 0.6 on the left, 0 on the right.
            twin1.set_xlim(0.6, 0)
            twin1.set_xticks([])
            # Track 3: PHIT (reversed axis) with a dashed vertical line at 0.13
            # and NET_clp2 filled on a twin x-axis.
            ax[r,c+2].plot(phit, depth, color='green', linestyle='dashed')
            ax[r,c+2].set_xlim(0.3, 0)
            ax[r,c+2].grid(axis='x')
            ax[r,c+2].grid(axis='y')
            ax[r,c+2].invert_yaxis()
            ax[r,c+2].yaxis.set_ticks(np.arange(min(depth), max(depth), depth_step))
            ax[r,c+2].set_xticks([])
            ax[r,c+2].set_yticks([])
            ax[r,c+2].vlines(0.13, ymin=min(depth), ymax=max(depth), color='black', linestyle='dashed')
            for i in well_bal_tops[well_bal_tops.FORMATION.str.contains(fmname)].FORMATION:
                ax[r,c+2].hlines(    well_bal_tops[well_bal_tops.FORMATION==i][ref_depth].iloc[0], 
                                    xmin=0, xmax=1000, linewidth=2, color='black', lw=2, alpha=0.33)
            twin2 = ax[r,c+2].twiny()
            twin2.plot(net, depth, color='orange', linewidth=0.5)
            twin2.fill_betweenx(depth,net, color='orange', alpha=0.33)
            twin2.set_xlim(0, 1)
            twin2.set_xticks([])
            # Track 4: LPERM on a log x-axis, with KHtst on a twin x-axis.
            ax[r,c+3].plot(perm, depth, color='purple', alpha=0.66)
            ax[r,c+3].set_xscale('log')
            ax[r,c+3].set_xlim(0.1, 1000)
            ax[r,c+3].grid(axis='y')
            ax[r,c+3].grid(axis='x')
            ax[r,c+3].invert_yaxis()
            ax[r,c+3].yaxis.set_ticks(np.arange(min(depth), max(depth), depth_step))
            ax[r,c+3].set_xticks([])
            ax[r,c+3].set_yticks([])
            for i in well_bal_tops[well_bal_tops.FORMATION.str.contains(fmname)].FORMATION:
                ax[r,c+3].hlines(well_bal_tops[well_bal_tops.FORMATION==i][ref_depth].iloc[0], xmin=0, xmax=1000, linewidth=2, color='black', lw=2, alpha=0.5)
            twin4 = ax[r,c+3].twiny()
            twin4.plot(kh, depth, color='black', alpha=1)
            twin4.set_xticks([])
        except Exception as e:
            print(f'It looks like the desired formation is absent. The error is "{e}"')
        return fig.show()
    def display_subplots():
        # Target well at the top-left, offsets 1-3 in the remaining quadrants;
        # all tracks use 'TST' as reference depth with a tick step of 10.
        try:
            well_dist_dict = well_offset_selection(dataset_dist, fmname, well_target)
            display_tracks(dataset_logs, well_dist_dict['target'], fmname,'TST', 10, 0,0,well_dist_dict['kh'], well_dist_dict['dist'])
            display_tracks(dataset_logs, well_dist_dict['w1'], fmname,'TST', 10 ,0,4, well_dist_dict['kh1'], well_dist_dict['d1'])  
            display_tracks(dataset_logs, well_dist_dict['w2'], fmname,'TST', 10,1,0, well_dist_dict['kh2'], well_dist_dict['d2'])      
            display_tracks(dataset_logs, well_dist_dict['w3'], fmname,'TST', 10,1,4, well_dist_dict['kh3'], well_dist_dict['d3'])
        except Exception as e:
            print(f'It looks like the desired formation is absent. The error is "{e}"')
    # Names read by the nested helpers through the enclosing scope.
    well_target = well
    fmname = formation
    fig, ax = plt.subplots(2,8, figsize=(9,8), constrained_layout=True)
    return display_subplots()

### Maps & 3D view

In [None]:
# Drawing maps of well trajectories
def well_traj_dataprep(dataset):
    """
    Prepare the tables used for trajectory maps.

    Rows containing NaN are dropped first. Per ('well', 'FORMATION_up')
    group the function then takes:
    - every 100th (X_traj, Y_traj) sample, excluding the last 100 rows,
      plus the last sample of the group -> trajectory table;
    - the first row of X_mean, Y_mean, KHtst, TVD_SCS, Status -> summary
      table.

    Returns `(map_trajectory_display, map_data_middle)`.
    """
    keys = ['well', 'FORMATION_up']
    clean = dataset.dropna()

    xy_groups = clean.groupby(keys)[['X_traj', 'Y_traj']]
    sampled = xy_groups.apply(lambda grp: grp.iloc[0:-100:100]).reset_index()
    last_point = xy_groups.apply(lambda grp: grp.iloc[-1]).reset_index()

    summary_cols = ['X_mean', 'Y_mean', 'KHtst', 'TVD_SCS', 'Status']
    map_data_middle = clean.groupby(keys)[summary_cols].apply(lambda grp: grp.iloc[0]).reset_index()

    # 'level_2' is the original row index exposed by reset_index on `sampled`.
    trajectory = pd.concat([sampled, last_point]).sort_values(by=keys)
    map_trajectory_display = trajectory.drop('level_2', axis=1)
    return map_trajectory_display, map_data_middle
# Trajectory points and per-well summary rows for the maps, built from df_bal_net2_kh.
map_trajectory_display, map_data_middle = well_traj_dataprep(df_bal_net2_kh)

# Space-separated file without a header row; the three columns are named X, Y and geobody here.
# NOTE(review): absolute Windows path - this cell only runs on a machine with that folder layout.
bal8_1510 = pd.read_csv(r'C:\jupyter\SPP\input\surfaces\petrel\bal8_1510_base.csv', sep=' ', names=['X','Y','geobody'])
def display_well_traj(trajectory, map_data_middle, petrel, fmname, mult, path, comment, print_flag):
    """
    Draw an interactive plotly map for one formation and show it.

    Three marker layers are stacked: the `petrel` points coloured by their
    'geobody' column, the well trajectory points, and one bubble per well at
    (X_mean, Y_mean) coloured and sized by the rounded KHtst.

    Parameters:
        trajectory      - frame with FORMATION_up, X_traj, Y_traj, well
        map_data_middle - frame with FORMATION_up, X_mean, Y_mean, KHtst, well
        petrel          - frame with X, Y, geobody
        fmname          - FORMATION_up value to display
        mult            - multiplier turning KHtst into a marker size
        path, comment   - concatenated into the output file name
        print_flag      - 'print' writes the figure with plotly.offline; any other value skips it

    Returns whatever fig.show() returns.
    """
    trajectory = trajectory[trajectory.FORMATION_up == fmname]
    # .copy(): the rounding below must not write into a slice of the caller's frame
    # (that triggers SettingWithCopyWarning and is ambiguous across pandas versions).
    map_data_middle = map_data_middle[map_data_middle.FORMATION_up == fmname].copy()
    map_data_middle['KHtst'] = map_data_middle['KHtst'].round(0)

    traj = go.Scatter(
        x=trajectory.X_traj, y=trajectory.Y_traj,
        mode='markers',
        marker=dict(color='black', size=1),
        customdata=trajectory[['well']],
        hovertemplate="well:%{customdata[0]}<extra></extra>",
    )
    wells = go.Scatter(
        x=map_data_middle.X_mean, y=map_data_middle.Y_mean,
        mode='markers',
        marker=dict(color=map_data_middle.KHtst, size=map_data_middle.KHtst*mult, colorscale='RdYlGn', showscale=True,
                    line=dict(color='rgb(47, 57, 61)', width=0.5)),
        customdata=map_data_middle[['well', 'KHtst']],
        hovertemplate="well:%{customdata[0]},kh:%{customdata[1]}<extra></extra>",
    )
    geobody_map = go.Scatter(
        x=petrel['X'], y=petrel['Y'],
        mode='markers',
        marker=dict(size=5, color=petrel['geobody'], colorscale='Viridis', opacity=0.5),
    )

    fig = go.Figure()
    # Draw order matters: geobody points first so trajectories and wells stay on top.
    fig.add_trace(geobody_map)
    fig.add_trace(traj)
    fig.add_trace(wells)
    fig.update_layout(
        title_text=f'Map of traj and well mean points with {fmname} 1510 polygons. Size of bubbles is KHtst.',
        autosize=True, width=1000, height=700, margin=dict(l=10,r=10,b=10,t=50), showlegend=False,
    )
    if print_flag == 'print':
        go_offline.plot(fig, filename=path + comment, validate=True, auto_open=False)
    return fig.show()
# Show the Balakhany VIII map; 'dont_print' is not 'print', so no file is written.
display_well_traj(map_trajectory_display, map_data_middle, bal8_1510, 'Balakhany VIII', 0.00125, 'plots/', 'Balakhany8_KHtst', 'dont_print')
# NOTE(review): the disabled call below has no `petrel` argument, so it no longer
# matches the display_well_traj signature and must be updated before re-enabling.
# display_well_traj(map_trajectory_display, map_data_middle, 'Balakhany X', 0.003, 'plots/', 'Balakhany10_KHtst', 'dont_print')

### Data interpolation

In [None]:
def interpolate_by_depth_fm(dataset_logs, formation_name, step):
    """
    Resample the logs of every well in `dataset_logs` onto a regular TST grid.

    Parameters:
        dataset_logs   - logs of ONE formation (columns well, TST, X_traj, Y_traj,
                         PHIT, TVD_SCS, NET_clp2, LPERM, KHtst)
        formation_name - value written into the FORMATION_up column of the result
        step           - TST spacing of the new grid

    Returns one frame with the per-well results concatenated; each well keeps
    its own 0-based row index and a matching 'tst_index' column.
    """
    log_columns = ['X_traj', 'Y_traj', 'PHIT', 'TVD_SCS', 'NET_clp2', 'LPERM', 'KHtst']

    def interpolate_by_depth(one_well, formation_name, step):
        one_well = one_well.sort_values(by='TST')
        well_name = one_well["well"].iloc[0]
        data_range = np.floor((one_well["TST"].max() - one_well["TST"].min())/step)
        starting_tst = one_well["TST"].iloc[0]
        # The grid spacing now follows `step`; it used to be a hard-coded 0.1, so any
        # other step changed the number of points but not their spacing.
        # The grid starts one step below the first sample and stops before the last one.
        new_TST_values = [starting_tst + i*step for i in range(1, int(data_range))]
        n_points = len(new_TST_values)
        # New frame with the values linearly interpolated onto the new TST grid
        new_data = {
            'well': [well_name] * n_points,
            'FORMATION_up': [formation_name] * n_points,
            'tst_index': list(range(n_points)),
            'TST': new_TST_values,
        }
        for column in log_columns:
            interpolator = interp1d(one_well['TST'], one_well[column], kind='linear', fill_value="extrapolate")
            new_data[column] = interpolator(new_TST_values)
        return pd.DataFrame(new_data)

    df_lst = []
    print(f'Start interpolation of {formation_name}')
    for wellnames in tqdm(dataset_logs.well.unique()):
        well_sel = dataset_logs[dataset_logs.well == wellnames]
        df_lst.append(interpolate_by_depth(well_sel, formation_name, step))
    result = pd.concat(df_lst)
    return result
# Split the logs by upper formation and resample each one with a TST step of 0.1.
well_bal8 = df_bal_net2_kh[(df_bal_net2_kh.FORMATION_up == 'Balakhany VIII')]
well_bal10 = df_bal_net2_kh[(df_bal_net2_kh.FORMATION_up == 'Balakhany X')]
well_bal8_interp = interpolate_by_depth_fm(well_bal8, 'Balakhany VIII', 0.1)
well_bal10_interp = interpolate_by_depth_fm(well_bal10, 'Balakhany X', 0.1)

### PHIT_clipped

In [None]:
# Keep the interpolated porosity under 'PHIT_orig'; the rolling average below
# writes its result into a new 'PHIT' column.
well_bal8_interp_rn = well_bal8_interp.rename(columns={'PHIT':'PHIT_orig'})
well_bal10_interp_rn = well_bal10_interp.rename(columns={'PHIT':'PHIT_orig'})

def phit_rolling_averaging(input_dataset, samples_per_window):
    """
    Smooth PHIT_orig per well with a centred rolling mean and store it as PHIT.

    The window length is int(number of rows of the well / samples_per_window),
    so it scales with the length of each well. Rows where the rolling mean is
    NaN are dropped.

    Returns (result, avg_report_df): the concatenated smoothed frames and a
    report with one row per well (well, 'lenght_ds' = rows left after the drop,
    window_size, samples_per_window).
    """
    df_lst = []
    avg_report = []
    fmname = input_dataset['FORMATION_up'].iloc[0]
    print(f'Start rolling averaging of {fmname}')
    for wellname in tqdm(input_dataset.well.unique()):
        # .copy(): the new PHIT column must not be written into a slice of the input
        # frame (avoids SettingWithCopyWarning / version-dependent behaviour).
        dataset = input_dataset[input_dataset.well == wellname].copy()
        # NOTE(review): a well with fewer rows than samples_per_window gets window_size 0 -
        # confirm that this is the intended handling of very short wells.
        window_size = int(len(dataset) / samples_per_window)
        dataset['PHIT'] = dataset['PHIT_orig'].rolling(window=window_size, center=True).mean()
        dataset = dataset.dropna(subset=['PHIT'])
        df_lst.append(dataset)
        avg_report.append((wellname, len(dataset), window_size, samples_per_window))
    result = pd.concat(df_lst)
    avg_report_df = pd.DataFrame(avg_report, columns=['well','lenght_ds','window_size','samples_per_window'])
    return result, avg_report_df
# Divisor used to derive each well's rolling window length (window = rows / samples_per_window).
samples_per_window = 100
well_bal8_interp_phavg, avg_report_df8 = phit_rolling_averaging(well_bal8_interp_rn, samples_per_window)
well_bal10_interp_phavg, avg_report_df10 = phit_rolling_averaging(well_bal10_interp_rn, samples_per_window)
# Clipped copies of porosity and permeability ...
well_bal8_interp_phavg['PHIT_clp'] = well_bal8_interp_phavg['PHIT']
well_bal10_interp_phavg['PHIT_clp'] = well_bal10_interp_phavg['PHIT']
well_bal8_interp_phavg['LPERM_clp'] = well_bal8_interp_phavg['LPERM']
well_bal10_interp_phavg['LPERM_clp'] = well_bal10_interp_phavg['LPERM']
# ... where rows with NET_clp2 == 0 are replaced by constants: PHIT_clp = 0.12, LPERM_clp = 0.1.
well_bal8_interp_phavg.loc[well_bal8_interp_phavg.NET_clp2 == 0, 'PHIT_clp'] = 0.12
well_bal10_interp_phavg.loc[well_bal10_interp_phavg.NET_clp2 == 0, 'PHIT_clp'] = 0.12
well_bal8_interp_phavg.loc[well_bal8_interp_phavg.NET_clp2 == 0, 'LPERM_clp'] = 0.1
well_bal10_interp_phavg.loc[well_bal10_interp_phavg.NET_clp2 == 0, 'LPERM_clp'] = 0.1

In [None]:
def cutting_block_lenght(dataset, block_lenght):
    """
    Renumber 'tst_index' per well from 0 and trim the tail of every well.

    For each well only the rows whose new tst_index is below the start of the
    last block are kept, where blocks start at 0, block_lenght, 2*block_lenght, ...

    NOTE(review): when a well has an exact multiple of block_lenght rows, the
    last complete block is removed as well (e.g. 200 rows with block 100 keep
    100 rows) - confirm this is intended.

    Returns the trimmed per-well frames concatenated.
    """
    df_lst = []
    fmname = dataset['FORMATION_up'].iloc[0]
    print(f'Start processing of dataset for {fmname} with block lenght {block_lenght}')
    for wellname in tqdm(dataset.well.unique()):
        # .copy(): tst_index is rewritten below and must not touch a slice of the input frame
        data = dataset[dataset.well == wellname].copy()
        data['tst_index'] = range(len(data))
        # Start index of the last block, i.e. the largest multiple of block_lenght below len(data)
        last_block_start = ((len(data) - 1) // block_lenght) * block_lenght
        df_lst.append(data[data.tst_index < last_block_start])
    result = pd.concat(df_lst)
    return result
# Block size, in rows, used to trim every well.
block_lenght = 100
well_bal8_interp_phavg_cut = cutting_block_lenght(well_bal8_interp_phavg, block_lenght)
well_bal10_interp_phavg_cut = cutting_block_lenght(well_bal10_interp_phavg, block_lenght)

In [None]:
def exercise_phit_avg_kh(dataset, wellname):
    """
    Compare original and averaged PHIT / permeability / KH for one well.

    Only rows of `wellname` with FORMATION_up == 'Balakhany VIII' are used.
    Permeability is recomputed from the averaged PHIT with a piecewise
    transform (PHIT < 0.16, 0.16 <= PHIT < 0.2, PHIT >= 0.2), multiplied by 0.1
    and accumulated from the last row upwards into KHtst_avg.

    Prints the first KHtst and KHtst_avg values (rounded) and their relative
    difference, then shows a four-track plot against TST. Returns None.
    """
    # .copy(): several derived columns are written below; they must go into an
    # independent frame, not into a slice of `dataset`.
    well_a01w = dataset[(dataset.well==wellname) & (dataset.FORMATION_up=='Balakhany VIII')].copy()

    # Averaged porosity with non-net rows set to zero (for display only)
    well_a01w['PHIT_clipped'] = well_a01w['PHIT']
    well_a01w.loc[well_a01w.NET_clp2 == 0, 'PHIT_clipped'] = 0
    # Default branch first, then the high- and low-porosity branches overwrite it
    well_a01w['LPERM_avg'] = 0.00000002*(np.exp(well_a01w.PHIT*105.56))
    well_a01w.loc[well_a01w['PHIT'] >= 0.2, 'LPERM_avg'] = (7.7925*((well_a01w.PHIT*100)**2))-(29881.0*well_a01w.PHIT)+2891.8
    well_a01w.loc[well_a01w['PHIT'] < 0.16, 'LPERM_avg'] = 0.0159*(np.exp(well_a01w.PHIT*21.27))
    well_a01w['khtst'] = well_a01w.LPERM_avg*0.1
    # Reverse cumulative sum: every row holds the sum from itself down to the last row
    well_a01w['KHtst_avg'] = well_a01w.loc[::-1, 'khtst'].cumsum()[::-1]

    y = well_a01w.TST
    phit_orig = well_a01w.PHIT_orig
    phit_avg = well_a01w.PHIT
    phit_cliped = well_a01w.PHIT_clipped
    net = well_a01w.NET_clp2
    perm = well_a01w.LPERM
    perm_avg = well_a01w.LPERM_avg
    kh = well_a01w.KHtst
    kh_avg = well_a01w.KHtst_avg

    kh_orig_first = kh.iloc[0].round(0)
    kh_avg_first = kh_avg.iloc[0].round(0)
    # The printed value is a relative difference, not a ratio; the label used to say 'KHavg/KHorig:'.
    print(  'KH orig:', kh_orig_first,
            'KH avg:', kh_avg_first,
            '(KHorig-KHavg)/KHorig:', ((kh_orig_first - kh_avg_first)/kh_orig_first).round(2))

    fig, ax = plt.subplots(1, 4, figsize=(6, 7))
    # Track 1: original vs averaged porosity
    ax[0].plot(phit_orig, y, color='green')
    ax[0].plot(phit_avg, y, color='red')
    ax[0].set_xlim(0, 0.3)
    ax[0].invert_yaxis()
    ax[0].set_title(wellname)
    # Track 2: clipped porosity over the net flag
    ax[1].plot(phit_cliped, y, color='red', zorder=1)
    ax[1].plot(net, y, color='orange', zorder=0)
    ax[1].set_xlim(0, 0.3)
    ax[1].invert_yaxis()
    # Track 3: original vs recomputed permeability, log scale
    ax[2].plot(perm, y, color='purple', lw=3)
    ax[2].plot(perm_avg, y, color='yellow')
    ax[2].invert_yaxis()
    ax[2].set_xscale('log')
    # Track 4: original vs recomputed KH
    ax[3].plot(kh, y, color='black')
    ax[3].plot(kh_avg, y, color='gray')
    ax[3].invert_yaxis()
    fig.show()
# Quality-check plot for well A01W on the trimmed Balakhany VIII data.
exercise_phit_avg_kh(well_bal8_interp_phavg_cut, 'A01W')

###  PHIT & GRcube - matrix plot

In [13]:
def gr_cube_upload():
    """
    Load the GR-cube VSH log table and reduce it to the two Balakhany units.

    Reads 'ACG_GRcube_VSH_v3.parquet.gzip' from the hard-coded input folder,
    turns -9999.0 into NaN, drops every row containing NaN, labels rows whose
    FORMATION contains 'Balakhany VIII' / 'Balakhany X' in FORMATION_up and
    keeps only those rows.

    Returns a frame with columns well, MD, VSH_GRcube, FORMATION_up.
    """
    input_dir = 'C:\\jupyter\\SPP\\input\\'
    cube = pd.read_parquet(input_dir + 'ACG_GRcube_VSH_v3.parquet.gzip')
    cube = cube.replace(-9999.000, np.nan)
    cube = cube.dropna()
    # Same order as before: VIII is labelled first, then X
    for upper_name in ('Balakhany VIII', 'Balakhany X'):
        belongs_to_unit = cube.FORMATION.str.contains(upper_name)
        cube.loc[belongs_to_unit, 'FORMATION_up'] = upper_name
    in_target_units = cube.FORMATION_up.isin(['Balakhany VIII', 'Balakhany X'])
    cube = cube[in_target_units]
    selected = cube[['wellName', 'DEPT','VSH_GRcube', 'FORMATION_up']]
    return selected.rename(columns={'wellName':'well', 'DEPT':'MD'})
vsh_grcube = gr_cube_upload()
# MD is rounded to one decimal before it is used as a join key below.
# NOTE(review): presumably this matches the depth sampling of the GR cube - confirm.
df_bal_net2_kh['MD'] = df_bal_net2_kh.MD.round(1)
# Attach VSH_GRcube to the logs on (well, MD, FORMATION_up). DataFrame.join is a left
# join by default, so log rows without a cube sample are kept with NaN.
df_bal_net2_kh_cube = df_bal_net2_kh.set_index(['well','MD', 'FORMATION_up']).join(vsh_grcube.set_index(['well','MD', 'FORMATION_up'])).reset_index()

In [None]:
def vsh_gr_cube_recalc(dataset):
    """
    Rebuild the interpolated and smoothed Balakhany VIII / X frames, including VSH_GRcube.

    For each of the two formations the logs are resampled per well onto a
    regular TST grid (step 0.1), PHIT is renamed to PHIT_orig and smoothed with
    a centred rolling mean into PHIT, and clipped copies PHIT_clp / LPERM_clp
    are added in which rows with NET_clp2 == 0 are set to 0.12 / 0.1.

    Parameters:
        dataset - logs with columns well, FORMATION_up, TST, X_traj, Y_traj, PHIT,
                  TVD_SCS, NET_clp2, LPERM, KHtst, VSH_GRcube

    Returns (frame for 'Balakhany VIII', frame for 'Balakhany X').
    """
    formations = ['Balakhany VIII', 'Balakhany X']
    log_columns = ['X_traj', 'Y_traj', 'PHIT', 'TVD_SCS', 'NET_clp2', 'LPERM', 'KHtst', 'VSH_GRcube']
    tst_step = 0.1
    samples_per_window = 100

    def interpolate_by_depth(one_well, formation_name, step):
        # Resample one well onto a regular TST grid with linear interpolation
        one_well = one_well.sort_values(by='TST')
        well_name = one_well["well"].iloc[0]
        data_range = np.floor((one_well["TST"].max() - one_well["TST"].min())/step)
        starting_tst = one_well["TST"].iloc[0]
        # The spacing follows `step` (it used to be a hard-coded 0.1 regardless of `step`)
        new_TST_values = [starting_tst + i*step for i in range(1, int(data_range))]
        n_points = len(new_TST_values)
        new_data = {
            'well': [well_name] * n_points,
            'FORMATION_up': [formation_name] * n_points,
            'tst_index': list(range(n_points)),
            'TST': new_TST_values,
        }
        for column in log_columns:
            interpolator = interp1d(one_well['TST'], one_well[column], kind='linear', fill_value="extrapolate")
            new_data[column] = interpolator(new_TST_values)
        return pd.DataFrame(new_data)

    def interpolate_by_depth_fm(dataset_logs, formation_name, step):
        # Resample every well of one formation and stack the results
        df_lst = []
        print(f'Start interpolation of {formation_name}')
        for wellnames in tqdm(dataset_logs.well.unique()):
            well_sel = dataset_logs[dataset_logs.well == wellnames]
            df_lst.append(interpolate_by_depth(well_sel, formation_name, step))
        return pd.concat(df_lst)

    def phit_rolling_averaging(input_dataset, samples_per_window):
        # Per-well centred rolling mean of PHIT_orig; rows without a mean are dropped
        df_lst = []
        fmname = input_dataset['FORMATION_up'].iloc[0]
        print(f'Start rolling averaging of {fmname}')
        for wellname in tqdm(input_dataset.well.unique()):
            # .copy(): PHIT is added below and must not be written into a slice
            well_rows = input_dataset[input_dataset.well == wellname].copy()
            window_size = int(len(well_rows) / samples_per_window)
            well_rows['PHIT'] = well_rows['PHIT_orig'].rolling(window=window_size, center=True).mean()
            df_lst.append(well_rows.dropna(subset=['PHIT']))
        return pd.concat(df_lst)

    # Both interpolations run first, then both averaging passes.
    interpolated = [
        interpolate_by_depth_fm(dataset[dataset.FORMATION_up == fm], fm, tst_step).rename(columns={'PHIT':'PHIT_orig'})
        for fm in formations
    ]
    averaged = [phit_rolling_averaging(frame, samples_per_window) for frame in interpolated]
    for frame in averaged:
        frame['PHIT_clp'] = frame['PHIT']
        frame['LPERM_clp'] = frame['LPERM']
        non_net = frame.NET_clp2 == 0
        frame.loc[non_net, 'PHIT_clp'] = 0.12
        frame.loc[non_net, 'LPERM_clp'] = 0.1
    well_bal8_interp_phavg, well_bal10_interp_phavg = averaged
    return well_bal8_interp_phavg, well_bal10_interp_phavg
# Rebinds both *_interp_phavg names to the versions that also carry VSH_GRcube.
well_bal8_interp_phavg, well_bal10_interp_phavg = vsh_gr_cube_recalc(df_bal_net2_kh_cube)

In [None]:
def well_letter_def(dataset):
    """Return the set of first characters of the unique names in dataset.well."""
    return {name[0] for name in dataset.well.unique()}
# Displays the first letters of the well names present in the Balakhany VIII frame.
well_letter_def(well_bal8_interp_phavg)

In [None]:
def well_plots_phit_vsh_matrix(dataset, platform, variable, flag, max_var, comment):
    """
    Plot one log per well in a 4 x 9 grid for all wells whose name starts with `platform`.

    Every curve is plotted against its sample number and padded at the bottom
    with the constant 0.12 up to the length of the longest plotted well, so all
    panels share the same vertical extent.

    Parameters:
        dataset  - frame with a 'well' column and the column named by `variable`
        platform - prefix of the well names to plot
        variable - column to plot
        flag     - 'phit' (green, x from 0.1 to 0.35) or 'perm' (purple, log x from 0.1 to max_var)
        max_var  - upper x limit, used only for flag == 'perm'
        comment  - text appended to every panel title

    Returns the result of plt.tight_layout(), or None without drawing anything
    when no well matches `platform`.
    """
    rows = 4
    columns = 9
    wells_letter = [wellname for wellname in dataset.well.unique() if wellname.startswith(platform)]
    if not wells_letter:
        # max() over an empty list used to raise a ValueError here
        print(f'No wells starting with "{platform}" found, nothing to plot')
        return None
    # Only as many wells as there are panels can be shown; the rest are ignored as before
    shown_wells = wells_letter[:rows*columns]
    fig, ax = plt.subplots(rows,columns, figsize=(16,rows*3))
    max_ind = max(len(dataset[dataset.well == wellname]) for wellname in shown_wells)
    y_desired = list(range(max_ind))
    # ax.flat walks the grid row by row, the same order as nested row/column loops
    for axis, wellname in zip(ax.flat, shown_wells):
        well_data = dataset[dataset.well == wellname]
        # explicit float dtype so an empty padding does not turn the curve into object dtype
        padding = pd.Series([0.12] * (max_ind - len(well_data)), dtype=float)
        x_new = pd.concat([well_data[variable], padding])
        if flag == 'phit':
            axis.plot(x_new, y_desired, color='green', lw=1.5, alpha=1, zorder=1)
            axis.set_xlim(0.1, 0.35)
            # Optional VSH_GRcube overlay (disabled):
            # x_gr_new = pd.concat([well_data['VSH_GRcube'], padding])
            # twin = axis.twiny()
            # twin.plot(x_gr_new, y_desired, color='green', lw=2, alpha=0.5, zorder=0)
            # twin.set_xlim(0, 1)
        elif flag == 'perm':
            axis.plot(x_new, y_desired, color='purple', lw=2, alpha=0.75)
            axis.set_xscale('log')
            axis.set_xlim(0.1, max_var)
        axis.set_title(wellname + comment)
        axis.invert_yaxis()
        axis.grid()

    return plt.tight_layout()
# Full list of platform letters (disabled); only 'A' and 'B' are plotted below.
# for letter in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J']:
for letter in ['A','B']:
    # max_var (0.35) is only used by the 'perm' branch, so it has no effect with flag 'phit'
    well_plots_phit_vsh_matrix(well_bal8_interp_phavg, letter, 'PHIT_clp', 'phit', 0.35, ' bal8')

## Clustering top_phi_bot layering v2 

#### Data preparation

In [None]:
def clustering_data_calculation(dataset):
    """
    Derive net-layer intervals and their average properties for Balakhany VIII and X.

    Steps, done for each of the two formations:
    1. find contiguous runs where NET_clp2 == 1 in every well (top and bottom TST of each run);
    2. for every run compute mean PHIT, mean VSH, geometric-mean LPERM and the
       KHtst difference between the first and last sample of the run;
    3. add 'not_htst', the gap between the bottom of a layer and the top of the
       next layer of the same well (NaN for the last layer).

    Returns one frame for both formations with columns well, FORMATION_up,
    phit_avg, vsh_avg, perm_avg, khtst, htst, not_htst, top_tst, bot_tst,
    fm_top_tst, fm_bot_tst.
    """
    log_columns = [ 'well', 'MD', 'TST', 'TVD_SCS','NET_clp2', 'FORMATION_up', 'FORMATION',
                    'LPERM', 'PHIT', 'VSH', 'KHtst','PHITHtst', 'VSHHtst', 'X_mean','Y_mean','field']
    df_net2_bal8 = dataset[log_columns]
    df_net2_bal8 = df_net2_bal8[df_net2_bal8.FORMATION_up=='Balakhany VIII']
    df_net2_bal10 = dataset[log_columns]
    df_net2_bal10 = df_net2_bal10[df_net2_bal10.FORMATION_up=='Balakhany X']

    # Calculation NTD for Bal8 and Bal10 based on NET_clp2
    print('Calculation NTD for Bal8 and Bal10 based on NET_clp2')
    def ntd_calculation_brief(dataset, well, desired_fm, net_var):
        # Top / bottom TST and thickness of every run of net_var == 1 in one well
        data = dataset[(dataset.well==well) & (dataset.FORMATION_up==desired_fm)].copy()
        # Force the first and last sample to non-net so that every run has both a top
        # and a bottom. The previous code zeroed column 3 (TVD_SCS) instead of the net
        # flag, which left runs touching the formation boundaries open and could pair
        # tops with the wrong bottoms.
        net_col = data.columns.get_loc(net_var)
        data.iloc[0, net_col] = 0
        data.iloc[-1, net_col] = 0
        net = data[net_var].to_numpy()
        tst = data['TST'].to_numpy()
        inner = range(len(data)-1)
        tst_top = [tst[i] for i in inner if net[i] == 1 and net[i-1] == 0]
        tst_bot = [tst[i] for i in inner if net[i] == 1 and net[i+1] == 0]
        df_htst = pd.DataFrame(zip(tst_top, tst_bot), columns=['tst_top', 'tst_bot'])
        df_htst['FORMATION_up'] = desired_fm
        df_htst['well'] = well
        df_htst['h_tst'] = df_htst.tst_bot - df_htst.tst_top
        df_htst = df_htst[['well','FORMATION_up','tst_top','tst_bot','h_tst']]
        return df_htst

    def ntd_for_formation(df_logs, desired_fm):
        # Net-layer table of all wells of one formation, duplicates removed
        per_well = [ntd_calculation_brief(df_logs, well, desired_fm, 'NET_clp2')
                    for well in tqdm(df_logs.well.unique())]
        ntd = pd.concat(per_well)
        ntd.drop_duplicates(inplace=True)
        return ntd

    ntd_net2_8 = ntd_for_formation(df_net2_bal8, 'Balakhany VIII')
    ntd_net2_10 = ntd_for_formation(df_net2_bal10, 'Balakhany X')

    print('Calculation values for NTD Bal8 and Bal10')
    def ntd_properties_dataframe(dataset_ntd, dataset_logs, fmname):
        # Average log properties inside every net layer listed in dataset_ntd
        layer_columns = ['well', 'FORMATION_up', 'phit_avg', 'vsh_avg', 'perm_avg', 'khtst',
                         'htst', 'top_tst', 'bot_tst', 'fm_top_tst', 'fm_bot_tst']
        df_lst = []
        for well in tqdm(dataset_ntd.well.unique()):
            ntd_well = dataset_ntd[dataset_ntd.well == well]
            well_logs = dataset_logs[dataset_logs.well == well]
            fm_top = well_logs['TST'].iloc[0]
            fm_bot = well_logs['TST'].iloc[-1]
            # Depths are compared after rounding to 3 decimals
            tst_rounded = well_logs['TST'].round(3).to_numpy()
            layer_tops = ntd_well['tst_top'].round(3).to_numpy()
            layer_bots = ntd_well['tst_bot'].round(3).to_numpy()
            layer_h = ntd_well['h_tst'].round(3).to_numpy()
            records = []
            for ntd_top, ntd_bot, ntd_h in zip(layer_tops, layer_bots, layer_h):
                # One boolean mask per layer replaces the per-sample Python loop
                in_layer = (tst_rounded >= ntd_top) & (tst_rounded <= ntd_bot)
                layer_logs = well_logs[in_layer]
                khtst = layer_logs['KHtst'].to_numpy()
                records.append((
                    well,
                    fmname,
                    mean(layer_logs['PHIT'].tolist()),
                    mean(layer_logs['VSH'].tolist()),
                    gmean(layer_logs['LPERM'].to_numpy()),
                    # NOTE(review): presumably KHtst is cumulative along the formation,
                    # so first minus last is the KH of the layer - confirm.
                    khtst[0] - khtst[-1],
                    ntd_h, ntd_top, ntd_bot, fm_top, fm_bot,
                ))
            # Built once per well (it used to be rebuilt after every layer)
            well_df = pd.DataFrame(records, columns=layer_columns)
            # Gap down to the next layer of the same well; NaN for the last layer
            well_df['not_htst'] = well_df['top_tst'].shift(-1)-well_df['bot_tst']
            well_df = well_df[['well', 'FORMATION_up', 'phit_avg', 'vsh_avg', 'perm_avg', 'khtst','htst', 'not_htst','top_tst', 'bot_tst', 'fm_top_tst', 'fm_bot_tst']]
            df_lst.append(well_df)
        result = pd.concat(df_lst)
        return result

    ntd_val_bal8 = ntd_properties_dataframe(ntd_net2_8, df_net2_bal8, 'Balakhany VIII')
    ntd_val_bal10 = ntd_properties_dataframe(ntd_net2_10, df_net2_bal10, 'Balakhany X')
    ntd_val_final = pd.concat([ntd_val_bal8, ntd_val_bal10])
    return ntd_val_final
# Net-layer property table for both formations, then one view per formation.
ntd_val_final = clustering_data_calculation(df_bal_net2_kh)
ntd_val_final8 = ntd_val_final[ntd_val_final.FORMATION_up == 'Balakhany VIII']
ntd_val_final10 = ntd_val_final[ntd_val_final.FORMATION_up == 'Balakhany X']

def nothtst_nan_fill(dataset_ntd, fmname):
    """
    Fill missing 'not_htst' values with the distance to the formation bottom.

    For rows of formation `fmname` whose not_htst is NaN, not_htst becomes
    fm_bot_tst - bot_tst. The result is sorted by well and top_tst, and rows
    whose not_htst is still NaN (other formations, or a NaN difference) are
    dropped. The input frame is not modified.
    """
    result = dataset_ntd.copy()
    needs_fill = ((result['FORMATION_up'] == fmname) & result['not_htst'].isna()).to_numpy()
    # Work on plain arrays: row labels repeat across wells, so label-aligned
    # assignment is avoided and everything is matched by position.
    to_formation_bottom = result['fm_bot_tst'].to_numpy() - result['bot_tst'].to_numpy()
    result['not_htst'] = np.where(needs_fill, to_formation_bottom, result['not_htst'].to_numpy())
    result = result.sort_values(by=['well','top_tst'])
    result_final = result.dropna(subset=['not_htst'], axis=0)
    return result_final
# Per-formation layer tables with the missing not_htst values filled.
ntd_val_final8_clean = nothtst_nan_fill(ntd_val_final8, 'Balakhany VIII')
ntd_val_final10_clean = nothtst_nan_fill(ntd_val_final10, 'Balakhany X')

def top_phit_bot_clustering(dataset):
    """
    Add the non-net thickness above and below every net layer, plus a net-to-gross ratio.

    Per well:
    - top_htst: for the first layer top_tst - fm_top_tst, for every other layer
      the not_htst of the layer above it;
    - bot_htst: the layer's own not_htst;
    - ntg: htst / (bot_htst + htst) for the first layer and
      htst / (bot_htst + htst + top_htst) for all other layers.

    Returns a frame with a fresh 0-based index and columns well, FORMATION_up,
    phit_avg, vsh_avg, khtst, top_htst, htst, bot_htst, fm_top_tst, fm_bot_tst, ntg.
    """
    print('Top & bot calculation')
    def top_phit_bot_collection_run(dataset):
        def top_phit_bot_collection(dataset, wellname):
            data = dataset[dataset.well == wellname].copy()
            # Built as a separate Series and assigned once. The previous chained form
            # data['top_htst'].iloc[1:] = ... writes into a temporary under pandas
            # Copy-on-Write and silently leaves the column unchanged.
            top_htst = data['not_htst'].shift(1)
            top_htst.iloc[0] = data['top_tst'].iloc[0] - data['fm_top_tst'].iloc[0]
            data['top_htst'] = top_htst.to_numpy()
            data['bot_htst'] = data['not_htst']
            data = data[['well', 'FORMATION_up', 'phit_avg', 'vsh_avg', 'khtst',
                         'top_htst','htst','bot_htst', 'fm_top_tst', 'fm_bot_tst']]
            return data
        df_lst = []
        for wellname in tqdm(dataset.well.unique()):
            df_lst.append(top_phit_bot_collection(dataset, wellname))
        top_phi_bot_cluster = pd.concat(df_lst).reset_index(drop=True)
        return top_phi_bot_cluster
    top_phi_bot_cluster = top_phit_bot_collection_run(dataset)

    def top_phit_bot_ntg_run(dataset):
        def top_phit_bot_ntg(dataset, wellname):
            data = dataset[dataset.well == wellname].reset_index(drop=True)
            # The old per-row loop also had a branch for `ind == len(data)`, which can
            # never be true, so the last layer has always used the general formula.
            interval = data['bot_htst'] + data['htst'] + data['top_htst']
            # First layer: the interval above it is left out
            interval.iloc[0] = data['bot_htst'].iloc[0] + data['htst'].iloc[0]
            data['ntg'] = data['htst'] / interval
            return data
        df_lst = []
        for wellname in dataset.well.unique():
            df_lst.append(top_phit_bot_ntg(dataset, wellname))
        top_phi_bot_cluster_ntg = pd.concat(df_lst).reset_index(drop=True)
        return top_phi_bot_cluster_ntg
    top_phi_bot_cluster_ntg = top_phit_bot_ntg_run(top_phi_bot_cluster)

    return top_phi_bot_cluster_ntg
# Layer tables with top/bottom non-net thickness and ntg, one per formation.
top_phi_bot_cluster8 = top_phit_bot_clustering(ntd_val_final8_clean)
top_phi_bot_cluster10 = top_phit_bot_clustering(ntd_val_final10_clean)

In [None]:
def data_clustering(dataset, feature_list, scaler, cluster_num):
    """
    Cluster the rows of `dataset` with three algorithms on scaled features.

    Parameters:
        dataset      - frame holding at least the columns in feature_list
        feature_list - names of the columns used as clustering features
        scaler       - object with fit_transform, e.g. MinMaxScaler() or StandardScaler()
        cluster_num  - number of clusters / mixture components

    Returns a copy of `dataset` with three added label columns:
    'kmeans', 'gmm' and 'agglomer'.
    """
    data = dataset[feature_list]
    normalized_data = scaler.fit_transform(data)

    kmeans = KMeans(n_clusters=cluster_num, random_state=42)
    kmeans_labels = kmeans.fit_predict(normalized_data)

    gmm = GaussianMixture(n_components=cluster_num, random_state=42)
    gmm.fit(normalized_data)
    gmm_labels = gmm.predict(normalized_data)

    agglomerative = AgglomerativeClustering(n_clusters=cluster_num)
    agglomerative_labels = agglomerative.fit_predict(normalized_data)

    # The labels are attached to the frame that was actually clustered. The previous
    # version concatenated them to the global top_phi_bot_cluster8, whatever
    # `dataset` was passed in. Arrays are assigned by position, so the index of
    # `dataset` does not need to be a clean 0..n-1 range.
    result = dataset.copy()
    result['kmeans'] = kmeans_labels
    result['gmm'] = gmm_labels
    result['agglomer'] = agglomerative_labels
    return result
# Three clusters on standardised phit_avg and htst for Balakhany VIII; preview the first rows.
data_clustered8 = data_clustering(top_phi_bot_cluster8, ['phit_avg', 'htst'], StandardScaler(), 3)
data_clustered8.head(3)

In [None]:
def histo_clustering(dataset, clustering, comment):
    """
    Draw four histograms (phit_avg, htst, ntg, vsh_avg) coloured by cluster label.

    Rows with phit_avg == 0 are left out; the htst panel additionally keeps
    only htst < 30. `clustering` is the name of the label column used as hue,
    `comment` becomes the figure title. Colours are defined for labels 0, 1 and 2.
    """
    layers = dataset[dataset.phit_avg !=0]
    fig, ax = plt.subplots(1, 4, figsize=(18, 4))
    cluster_colors = {2: 'red', 1: 'green', 0: 'blue'}
    # (column to plot, rows to use, x tick positions) for each panel, left to right
    panels = [
        ('phit_avg', layers, np.arange(0.12, 0.32, 0.02)),
        ('htst', layers[layers.htst < 30], np.arange(0, 30, 3)),
        ('ntg', layers, np.arange(0, 1, 0.1)),
        ('vsh_avg', layers, np.arange(0, 0.6, 0.1)),
    ]
    for axis, (column, rows, ticks) in zip(ax, panels):
        sns.histplot(data=rows, x=column, hue=clustering, ax=axis, kde=True,  palette=cluster_colors)
        axis.grid(True, axis='x')
        axis.set_xticks(ticks)
        axis.tick_params(axis='both', which='major', labelsize=8)
    fig.suptitle(comment)
# Histograms of the Balakhany VIII layers coloured by the k-means labels.
histo_clustering(data_clustered8, 'kmeans', 'Kmeans Bal VIII')

In [None]:
def well_collecting_clusters_top_phi_bot_v2(dataset, clustering, fm):
    """
    Unroll the layer table into an alternating non-net / net column per well.

    Every input layer produces two output rows: the non-net interval above it
    (phit, ntg, vsh = 0, cluster = NaN, thickness = top_htst) followed by the
    layer itself (phit_avg, ntg, vsh_avg, its label from column `clustering`,
    thickness = htst). One closing non-net row with the bot_htst of the last
    layer ends each well. 'depth' is the running sum of the thicknesses within
    the well, and FORMATION_up is set to `fm` on every row.
    """
    out_columns = ['well','phit', 'htst', 'ntg', 'vsh', 'cluster']
    per_well = []
    for wellname in dataset.well.unique():
        layers = dataset[dataset.well == wellname]
        stacked_rows = []
        for _, layer in layers.iterrows():
            # non-net interval above the layer, then the layer itself
            stacked_rows.append((wellname, 0, layer['top_htst'], 0, 0, np.nan))
            stacked_rows.append((wellname, layer['phit_avg'], layer['htst'],
                                 layer['ntg'], layer['vsh_avg'], layer[clustering]))
        body = pd.DataFrame(stacked_rows, columns=out_columns)
        # closing non-net interval below the last layer of the well
        closing = pd.DataFrame({'well':[wellname], 'phit':[0], 'htst': [layers['bot_htst'].iloc[-1]],
                                'ntg':[0], 'vsh':[0], 'cluster':[np.nan]})
        column_profile = pd.concat([body, closing]).reset_index(drop=True)
        column_profile['depth'] = column_profile['htst'].cumsum()
        per_well.append(column_profile)
    result = pd.concat(per_well)
    result['FORMATION_up'] = fm
    return result
# Alternating non-net / net profile per well for Balakhany VIII, labelled with the k-means clusters.
tpb8_kmeans_v3 = well_collecting_clusters_top_phi_bot_v2(data_clustered8, 'kmeans', 'Balakhany VIII')

def coloring_clusters_matrix_tpb3(dataset, letters_list, rows, columns, clustering, output_flag):
    """
    Plot porosity step profiles of wells on a grid and shade every layer by its cluster.

    One figure is created per entry of `letters_list`; it shows the wells whose
    name starts with that prefix, one well per subplot, filled row by row.
    Wells beyond rows*columns are not drawn.

    Parameters
    ----------
    dataset : DataFrame with columns 'well', 'phit', 'htst', 'cluster', 'depth'.
    letters_list : well-name prefixes, e.g.
        ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J']
    rows, columns : shape of the subplot grid (1 is allowed for either).
    clustering : text used as figure title and as the first part of the file name.
    output_flag : 'print' saves each figure as
        .\\plots\\clustering_wells_tpb\\<clustering>_<letter>.png;
        any other value only draws it.
    """
    # cluster label -> fill colour; any other label (including NaN) is left unshaded
    cluster_colors = {0: 'blue', 1: 'green', 2: 'red'}

    def clusters_rectangle(axis, data, k, color):
        # shade the depth interval between record k-1 and record k over the x-range 0..1
        cluster_xy = data['depth'].iloc[k-1]
        cluster_h = data['depth'].iloc[k] - cluster_xy
        rectangle = patches.Rectangle((0, cluster_xy), 1, cluster_h,
                                      edgecolor=color, facecolor=color, alpha=0.25)
        axis.add_patch(rectangle)

    for letter in letters_list:
        wells_letter = [wellname for wellname in dataset.well.unique() if wellname.startswith(letter)]
        # squeeze=False keeps `ax` two-dimensional even for a single row or column
        fig, ax = plt.subplots(rows, columns, figsize=(16, rows*2.5), squeeze=False)
        # ax.flat walks the grid row by row; zip stops at the shorter of wells / subplots
        for axis, wellname in zip(ax.flat, wells_letter):
            welldata = dataset[dataset.well == wellname]
            # zero-depth record so the step line starts at the top of the profile
            df_top = pd.DataFrame({'well': [wellname], 'phit': [0], 'htst': [0],
                                   'cluster': welldata['cluster'].iloc[0], 'depth': [0]})
            welldata = pd.concat([df_top, welldata]).reset_index(drop=True)
            axis.plot(welldata['phit'], welldata['depth'], drawstyle='steps-post',
                      color='black', alpha=1, lw=0.75)
            axis.set_xlim(0, 0.35)
            axis.invert_yaxis()
            axis.set_title(wellname)
            axis.tick_params(axis='both', which='major', labelsize=10)
            axis.grid()
            # record 0 is the synthetic top (phit == 0), so shading starts at record 1
            for k in range(1, len(welldata)):
                color = cluster_colors.get(welldata['cluster'].iloc[k])
                if color is not None and welldata['phit'].iloc[k] > 0:
                    clusters_rectangle(axis, welldata, k, color)
        if wells_letter:
            fig.suptitle(clustering)
            fig.tight_layout()
        if output_flag == 'print':
            fig.savefig('.\\plots\\clustering_wells_tpb\\' + clustering + '_' + str(letter) + '.png')
# Draw the 'B*' wells on a 4 x 9 grid; any flag other than 'print' skips saving the figure.
coloring_clusters_matrix_tpb3(tpb8_kmeans_v3, ['B'], 4, 9, 'kmeans bal8', 'dontprint')

### Kmeans map

In [None]:
def cluster_data_to_map(dataset, dataset_coord, comment, fm='Balakhany VIII'):
    """
    Draw, on a map, one pie chart per well with its cumulative thickness per cluster.

    Parameters
    ----------
    dataset : DataFrame with columns 'well', 'cluster', 'htst'. Rows whose
        cluster is NaN are ignored (groupby drops them).
    dataset_coord : DataFrame with columns 'well', 'FORMATION_up', 'X_mean', 'Y_mean'.
    comment : plot title.
    fm : formation whose coordinates are used; the default is the value that
        used to be hard-coded here.

    Returns
    -------
    DataFrame with columns 'well', 'cluster_0', 'cluster_1', 'cluster_2',
    'X_mean', 'Y_mean' (one row per well, sorted by well name).

    Notes
    -----
    The background scatter reads the notebook-level frame `bal8_1510`
    (columns 'X', 'Y', 'geobody'); it must exist before this function is called.
    """
    cluster_ids = (0, 1, 2)
    cluster_cols = [f'cluster_{cluster_id}' for cluster_id in cluster_ids]

    def cluster_thickness_per_well(dataset):
        # total thickness per (well, cluster) ...
        thickness = dataset.groupby(['well', 'cluster'])['htst'].sum().reset_index()
        # ... spread into one column per cluster (0 where the row belongs to another cluster)
        for cluster_id, col in zip(cluster_ids, cluster_cols):
            thickness[col] = thickness['htst'].where(thickness['cluster'] == cluster_id, 0.0)
        return thickness.groupby('well')[cluster_cols].sum().reset_index()
    data_transpose = cluster_thickness_per_well(dataset)

    def coordinates_calc(dataset_coord, fm):
        # first (X_mean, Y_mean) pair of every well inside the requested formation
        dataset_coord = dataset_coord[dataset_coord.FORMATION_up == fm]
        result = dataset_coord.groupby('well')[['X_mean', 'Y_mean']].apply(lambda x: x.iloc[0]).reset_index()
        return result
    coord = coordinates_calc(dataset_coord, fm)
    data_transpose_coord = data_transpose.set_index('well').join(coord.set_index('well'), rsuffix='_coord').reset_index()

    def piechart_map(dataset_map):
        fig, ax = plt.subplots(figsize=(13, 10))
        # geobody points as background; coordinates are divided by 1000 like the pie centres
        ax.scatter(bal8_1510['X']/1000, bal8_1510['Y']/1000, c=bal8_1510['geobody'])
        for _, row in dataset_map.iterrows():
            ax.pie([row['cluster_0'], row['cluster_1'], row['cluster_2']],
                   radius=0.3, center=(row['X_mean']/1000, row['Y_mean']/1000),
                   wedgeprops={"linewidth": 0.5, "edgecolor": "gray", "alpha": 0.75},
                   colors=['blue', 'green', 'red'], frame=True)
        plt.title(comment)
    piechart_map(data_transpose_coord)
    return data_transpose_coord
# Pie-chart map of per-well cluster thickness for the k-means profile; the returned table is kept.
cluster_kmeans = cluster_data_to_map(tpb8_kmeans_v3, df_bal_net2_kh, 'Well Bal8 clustering by Kmeans & Bal8 1510 geobodies')

### Normalization of clusters thickness

In [124]:
def clustering_normalization(dataset, dataset_coord, fm):
    """
    Compute, per well, the share of total thickness taken by clusters 0, 1 and 2.

    The thickness of a cluster is the sum of 'htst' over the well's rows with
    that label; it is divided by the last 'depth' value of the well.

    Parameters
    ----------
    dataset : DataFrame with columns 'well', 'cluster', 'htst', 'depth'.
    dataset_coord : DataFrame with columns 'well', 'FORMATION_up', 'X_mean', 'Y_mean'.
    fm : formation name; written to 'FORMATION_up' and used to pick the coordinates.

    Returns
    -------
    DataFrame with columns 'well', 'FORMATION_up', 'cluster_0', 'cluster_1',
    'cluster_2', 'X_mean', 'Y_mean'; one row per well in order of first
    appearance. Wells without coordinates in `fm` get NaN in X_mean / Y_mean.
    """
    cluster_ids = (0, 1, 2)
    columns = ['well', 'FORMATION_up'] + [f'cluster_{cluster_id}' for cluster_id in cluster_ids]

    records = []
    for wellname in dataset.well.unique():
        data = dataset[dataset.well == wellname]
        total_thick = data.depth.iloc[-1]
        record = {'well': wellname, 'FORMATION_up': fm}
        for cluster_id in cluster_ids:
            record[f'cluster_{cluster_id}'] = data.loc[data.cluster == cluster_id, 'htst'].sum() / total_thick
        records.append(record)
    # explicit columns keep the layout stable even when `dataset` has no wells
    well_df = pd.DataFrame(records, columns=columns)

    # First coordinate row of every well inside the formation. drop_duplicates
    # replaces groupby(...).apply(lambda x: x.iloc[0]), which operated on the
    # grouping columns (deprecated in pandas 2.2).
    coord_xy = (dataset_coord.loc[dataset_coord['FORMATION_up'] == fm, ['well', 'X_mean', 'Y_mean']]
                .drop_duplicates(subset='well'))
    result = well_df.set_index('well').join(coord_xy.set_index('well')).reset_index()
    return result
# Normalised cluster shares per well for 'Balakhany VIII'; the constant FORMATION_up column is dropped.
cluster_norm = clustering_normalization(tpb8_kmeans_v3, df_bal_net2_kh, 'Balakhany VIII').drop('FORMATION_up', axis=1)
# cluster_norm.to_csv(r'C:\jupyter\SPP\output\petrel\kmeans_normalize.txt', sep=' ', index=False)

### Gmm map

In [None]:
# The helper takes the formation label as a required third argument (it is written to
# FORMATION_up); without it the call raises TypeError. 'Balakhany VIII' matches the
# formation used for the coordinates in this cell.
tpb_gmm_v3 = well_collecting_clusters_top_phi_bot_v2(result, 'gmm', 'Balakhany VIII')

def cluster_data_to_map(dataset, dataset_coord, comment, fm='Balakhany VIII'):
    """
    Draw, on a map, one pie chart per well with its cumulative thickness per cluster.

    This definition replaces any earlier `cluster_data_to_map` in the notebook
    namespace; it draws the pies on a plain grid without a background scatter.

    Parameters
    ----------
    dataset : DataFrame with columns 'well', 'cluster', 'htst'. Rows whose
        cluster is NaN are ignored (groupby drops them).
    dataset_coord : DataFrame with columns 'well', 'FORMATION_up', 'X_mean', 'Y_mean'.
    comment : plot title.
    fm : formation whose coordinates are used; the default is the value that
        used to be hard-coded here.

    Returns
    -------
    DataFrame with columns 'well', 'cluster_0', 'cluster_1', 'cluster_2',
    'X_mean', 'Y_mean' (one row per well, sorted by well name).
    """
    cluster_ids = (0, 1, 2)
    cluster_cols = [f'cluster_{cluster_id}' for cluster_id in cluster_ids]

    def cluster_thickness_per_well(dataset):
        # total thickness per (well, cluster) ...
        thickness = dataset.groupby(['well', 'cluster'])['htst'].sum().reset_index()
        # ... spread into one column per cluster (0 where the row belongs to another cluster)
        for cluster_id, col in zip(cluster_ids, cluster_cols):
            thickness[col] = thickness['htst'].where(thickness['cluster'] == cluster_id, 0.0)
        return thickness.groupby('well')[cluster_cols].sum().reset_index()
    data_transpose = cluster_thickness_per_well(dataset)

    def coordinates_calc(dataset_coord, fm):
        # first (X_mean, Y_mean) pair of every well inside the requested formation
        dataset_coord = dataset_coord[dataset_coord.FORMATION_up == fm]
        result = dataset_coord.groupby('well')[['X_mean', 'Y_mean']].apply(lambda x: x.iloc[0]).reset_index()
        return result
    coord = coordinates_calc(dataset_coord, fm)
    data_transpose_coord = data_transpose.set_index('well').join(coord.set_index('well'), rsuffix='_coord').reset_index()

    def piechart_map(dataset_map):
        fig, ax = plt.subplots(figsize=(13, 13))
        for _, row in dataset_map.iterrows():
            ax.pie([row['cluster_0'], row['cluster_1'], row['cluster_2']],
                   radius=0.3, center=(row['X_mean']/1000, row['Y_mean']/1000),
                   wedgeprops={"linewidth": 0.5, "edgecolor": "gray", "alpha": 0.75},
                   colors=['blue', 'green', 'red'], frame=True)
        plt.grid()
        plt.title(comment)
    piechart_map(data_transpose_coord)

    return data_transpose_coord
# NOTE(review): the 'gmm' labels are passed together with the text 'Kmeans';
# if the third argument is the figure title this looks like a copy-paste leftover - confirm.
histo_clustering(result, 'gmm', 'Kmeans')
# Pie-chart map of per-well cluster thickness for the GMM profile; the returned table is kept.
cluster_gmm = cluster_data_to_map(tpb_gmm_v3, df_bal_net2_kh, 'Gmm')

### Workflow for prediction khtst from phit_pred

In [None]:
## Archived models
# model3 = model_preiction_split(dataset8, SVR(), 0.05, 'dont_display')
# model6 = model_preiction_split(dataset8, SGDRegressor(random_state=42), 0.05, 'dont_display')
# model7 = model_preiction_split(dataset8, GaussianProcessRegressor(random_state=42), 0.05, 'dont_display')
# model8 = model_preiction_split(dataset8, DecisionTreeRegressor(random_state=42), 0.05, 'dont_display')
# model9 = model_preiction_split(dataset8, GradientBoostingRegressor(random_state=42), 0.05, 'dont_display')
# model11 = model_preiction_split(dataset8, MLPRegressor(random_state=1, max_iter=1000), 0.05, 'dont_display')

In [None]:
# Run histo_clustering on the 'kmeans' labels of data_clustered8.
# NOTE(review): histo_clustering is defined outside this view; the third argument is presumably the figure title - confirm.
histo_clustering(data_clustered8, 'kmeans', 'Kmeans')

#### Experiments with offset wells on the map

In [None]:
def display_well_traj_map(dataset, fmname, mult, wellname):
    """
    Show a Plotly map of well trajectories and KHtst bubbles for one formation,
    circling the target well (green) and its nearest offset wells (red).

    Parameters
    ----------
    dataset : DataFrame with columns 'well', 'FORMATION_up', 'KHtst', 'X_traj',
        'Y_traj', 'X_mean', 'Y_mean', 'TVD_SCS', 'Status'.
    fmname : formation name used to filter what is drawn.
    mult : factor applied to KHtst to get the bubble size.
    wellname : target well to highlight.

    Returns
    -------
    (map_trajectory_display, map_data_middle) : the trajectory samples and the
    one-row-per-(well, formation) table, both for all formations.

    Notes
    -----
    The offset-well search does not use `dataset`: it reads the notebook-level
    frames `df_bal_net2_kh` and `data_clustered8`, always uses the 'kmeans'
    labels with 3 offsets, and always builds the distance table for
    'Balakhany VIII'.
    """
    def well_traj_data_calculation(dataset):
        """Build the trajectory sample table and the per-(well, formation) summary table."""
        # only rows that have a KHtst value take part in the map
        map_data = dataset.dropna(subset=['KHtst'])
        # every 100th trajectory row of each group, excluding the last 100 rows
        map_data_top = map_data.groupby(['well','FORMATION_up'])[['X_traj','Y_traj']].apply(lambda x: x.iloc[0:-100:100]).reset_index()
        # last trajectory row of each group
        map_data_bot = map_data.groupby(['well','FORMATION_up'])[['X_traj','Y_traj']].apply(lambda x: x.iloc[-1]).reset_index()
        # first row of each group supplies the mean coordinates and KHtst
        map_data_middle = map_data.groupby(['well','FORMATION_up'])[['X_mean', 'Y_mean', 'KHtst', 'TVD_SCS', 'Status']].apply(lambda x: x.iloc[0]).reset_index()
        # 'level_2' is presumably the inner row index exposed by reset_index() on the multi-row apply above
        map_trajectory_display = pd.concat([map_data_top, map_data_bot]).sort_values(by=['well','FORMATION_up']).drop('level_2', axis=1)
        return map_trajectory_display, map_data_middle
    map_trajectory_display, map_data_middle = well_traj_data_calculation(dataset)
    def well_offset_coord(wellname, fm):
        """Return the coordinate rows (formation `fm`) of the offset wells found for `wellname`."""
        def dataset_for_spatial_prediction(dataset_full, dataset_cluster, offset_qty, cluster_algo, cluster_list):
                """Return, per well, the names of and distances to its `offset_qty` nearest wells."""
                
                def joining_coordinates(dataset_full, dataset_cluster, cluster_algo, cluster_list):
                    """Attach X_mean/Y_mean to the cluster rows whose label is in `cluster_list`."""
                    coordinates = dataset_full.groupby(['well','FORMATION_up'])[['X_mean','Y_mean']].apply(lambda x: x.iloc[0]).reset_index()
                    dataset_cluster = dataset_cluster[(dataset_cluster[cluster_algo].isin(cluster_list))]
                    result = dataset_cluster.set_index(['well','FORMATION_up']).join(coordinates.set_index(['well','FORMATION_up'])).reset_index()
                    # one coordinate row per (well, formation) among the wells that survived the cluster filter
                    coordinates = result[['well','FORMATION_up', 'X_mean', 'Y_mean']].groupby(['well','FORMATION_up']).apply(lambda x: x.iloc[0]).reset_index(drop=True)
                    return coordinates, result
                coordinates, dataset_cluster_xy = joining_coordinates(dataset_full, dataset_cluster, cluster_algo, cluster_list)
                # well 'A14Y' is excluded from the distance table (reason not visible here)
                coordinates = coordinates[~coordinates.well.isin(['A14Y'])]

                def well_distance_calculation(coordinates, fm):
                    """Square well-to-well distance table of one formation, with a leading 'well' column."""
                    coordinates_fm = coordinates[coordinates.FORMATION_up == fm]
                    df_distance_fm = pd.DataFrame(euclidean_distances(coordinates_fm[['X_mean', 'Y_mean']]), columns=list(coordinates_fm.well))
                    well_name_rows = coordinates_fm.well.reset_index().drop(['index'], axis=1)
                    # set_index + reset_index moves 'well' to the first column
                    result = df_distance_fm.join(well_name_rows).set_index('well').reset_index()
                    return result
                # the formation is fixed here, independent of the `fm` passed to well_offset_coord
                well_dist_crosstable_8 = well_distance_calculation(coordinates, 'Balakhany VIII')

                def offset_well_names_dist(dataset, offset_qty):
                    """Turn the distance table into rows: well_1..N, dist_1..N, well."""
                    df_lst = []
                    for ind in range(len(dataset.well.unique())):
                        off_well_series = dataset.iloc[ind]
                        # drop the leading 'well' entry, sort the distances ascending and keep the
                        # offset_qty+1 smallest (the first one is the well itself at distance 0)
                        off_well_selected = pd.DataFrame(off_well_series)[1:].sort_values(by=ind)[:offset_qty+1].T
                        # the closest column is the well itself: remember its name, then drop that column
                        off_well_selected['well'] = off_well_selected.columns[0]
                        off_well_selected = off_well_selected.drop(columns= off_well_selected.well, axis=1)

                        dist_titles = ['dist_' + str(num+1) for num in range(offset_qty)]
                        well_titles = ['well_' + str(num+1) for num in range(offset_qty)]

                        # rename the distance columns to dist_1..N while collecting the offset well names
                        col_names = []
                        for i in range(len(off_well_selected.columns[:-1])):
                            col = off_well_selected.columns[i]
                            col_names.append(col)
                            off_well_selected = off_well_selected.rename(columns={col:dist_titles[i]})

                        # one-row frame of the collected names, columns renamed to well_1..N
                        off_well_names = pd.DataFrame(col_names).T
                        col_names = []
                        for i in range(len(off_well_names.columns)):
                            col = off_well_names.columns[i]
                            col_names.append(col)
                            off_well_names = off_well_names.rename(columns={col:well_titles[i]})
                        
                        concat_well_data = pd.concat([off_well_names.reset_index(drop=True), off_well_selected.reset_index(drop=True)], axis=1)
                        df_lst.append(concat_well_data)
                    result = pd.concat(df_lst).reset_index(drop=True)
                    return result
                well_dist_data8 = offset_well_names_dist(well_dist_crosstable_8, offset_qty)

                return well_dist_data8
        # reads notebook-level frames, not the `dataset` argument of display_well_traj_map
        well_offset_df = dataset_for_spatial_prediction(df_bal_net2_kh, data_clustered8, 3, 'kmeans', cluster_list = [0,1,2])

        coord_xy = df_bal_net2_kh[['well','FORMATION_up','X_mean','Y_mean']].groupby(['well','FORMATION_up']).apply(lambda x: x.iloc[0]).reset_index(drop=True)
        coord_xy_fm = coord_xy[coord_xy['FORMATION_up'] == fm].reset_index(drop=True)

        # look up the coordinates of every well_1..N entry of the target well
        data = well_offset_df[well_offset_df.well == wellname]
        df_coord = []
        for col in data.columns:
            if 'well_' in col:
                coord = coord_xy_fm[coord_xy_fm.well == data[col].iloc[0]]
                df_coord.append(coord)
        result = pd.concat(df_coord)
        return result
    well_offset_df = well_offset_coord(wellname, fmname)   
    def display_map(trajectory, map_data_middle, well_offset, fmname, mult, path, comment, print_flag):
        """Build and show the figure; with print_flag == 'print' also write it to path + comment."""
        # `wellname` comes from the enclosing display_well_traj_map call
        target_wellname = map_data_middle[(map_data_middle.well == wellname) & (map_data_middle.FORMATION_up == fmname)]
        trajectory = trajectory[trajectory.FORMATION_up == fmname]
        map_data_middle = map_data_middle[map_data_middle.FORMATION_up == fmname]
        map_data_middle['KHtst'] = map_data_middle['KHtst'].round(0)
        # trajectory samples as small black dots
        traj = go.Scatter(  x=trajectory.X_traj, y=trajectory.Y_traj, 
                            mode='markers',
                            marker=dict(color='black', size=1),
                            customdata = trajectory[['well']],
                            hovertemplate="".join(["well:%{customdata[0]}<extra></extra>"])
                            )
        # one bubble per well: colour and size both follow KHtst
        wells = go.Scatter( x=map_data_middle.X_mean, y=map_data_middle.Y_mean, 
                            mode='markers',
                            # marker=dict(symbol='diamond', color='red', size=7, opacity=0.75, line=dict(color='rgb(47, 57, 61)', width=0.5)),
                            marker=dict(color=map_data_middle.KHtst, size=map_data_middle.KHtst*mult, colorscale='RdYlGn',  showscale=True,
                                        line=dict(color='rgb(47, 57, 61)', width=0.5)),
                            customdata = map_data_middle[['well', 'KHtst']],
                            hovertemplate="".join(["well:%{customdata[0]},kh:%{customdata[1]}<extra></extra>"]))
        
        # transparent markers with a coloured outline: red for offset wells, green for the target well
        offset = go.Scatter( x=well_offset.X_mean, y=well_offset.Y_mean, mode='markers',
                            marker=dict(color='rgba(0,0,0,0)', size=10, line=dict(color='red', width=1.5)))
        target_well = go.Scatter( x=target_wellname.X_mean, y=target_wellname.Y_mean, mode='markers',
                    marker=dict(color='rgba(0,0,0,0)', size=10, line=dict(color='green', width=1.5)))

        fig = go.Figure()
        fig.add_trace(traj)
        fig.add_trace(wells)
        fig.add_trace(offset)
        fig.add_trace(target_well)
        fig.update_layout(  title_text= ('Map of traj and well mean points'+ ' ' + fmname + ' Size of bubbles is KHtst.'),
                            autosize=True, width=700, height=400, margin=dict(l=10,r=10,b=10,t=50), showlegend=False)
        if print_flag == 'print':
            go_offline.plot(fig, filename=path + comment, validate=True, auto_open=False)
        else:
            pass
        return fig.show()
    # 'dont_print' is not 'print', so the figure is only shown and no file is written
    display_map(map_trajectory_display, map_data_middle, well_offset_df, fmname, mult, 'plots/', 'Balakhany8_KHtst', 'dont_print')
    
    return map_trajectory_display, map_data_middle

def numerical_data_display(dataset_offset, wellname):
    """Return the target porosity, offset distances and offset porosities of one well."""
    shown_columns = ['well', 'FORMATION_up',
                     'phit_wavg_target',
                     'dist_1', 'dist_2', 'dist_3', 'phit_wavg_1', 'phit_wavg_2', 'phit_wavg_3']
    return dataset_offset[dataset_offset.well == wellname][shown_columns]


# Wells to inspect: for each one show the offset-well map and print its numeric summary.
wellname_list = ['B34', 'E16Y', 'A01W', 'E05Z', 'D06', 'B13ST2', 'C18', 'A07Z', 'A11Z', 'A04']
for wellname in wellname_list:
    map_trajectory_display, map_data_middle = display_well_traj_map(df_bal_net2_kh, 'Balakhany VIII', 0.00125, wellname)
    print(numerical_data_display(test_full['khtst_data'], wellname))

#### Clusters 0, 1, 2

In [None]:
# test = test_full['phit_pred']
# test['diff'] = abs(test.y_orig - test.y_pred)*100
# test.sort_values(by='diff', ascending=False)

In [None]:
def khtst_workflow(cluster_list):

    def dataset_for_spatial_prediction(dataset_full, dataset_cluster, offset_qty, cluster_algo, cluster_list):
            
            def joining_coordinates(dataset_full, dataset_cluster, cluster_algo, cluster_list):
                coordinates = dataset_full.groupby(['well','FORMATION_up'])[['X_mean','Y_mean']].apply(lambda x: x.iloc[0]).reset_index()
                dataset_cluster = dataset_cluster[(dataset_cluster[cluster_algo].isin(cluster_list))]
                result = dataset_cluster.set_index(['well','FORMATION_up']).join(coordinates.set_index(['well','FORMATION_up'])).reset_index()
                coordinates = result[['well','FORMATION_up', 'X_mean', 'Y_mean']].groupby(['well','FORMATION_up']).apply(lambda x: x.iloc[0]).reset_index(drop=True)
                return coordinates, result
            coordinates, dataset_cluster_xy = joining_coordinates(dataset_full, dataset_cluster, cluster_algo, cluster_list)
            coordinates = coordinates[~coordinates.well.isin(['A14Y'])]

            def well_distance_calculation(coordinates, fm):
                coordinates_fm = coordinates[coordinates.FORMATION_up == fm]
                df_distance_fm = pd.DataFrame(euclidean_distances(coordinates_fm[['X_mean', 'Y_mean']]), columns=list(coordinates_fm.well))
                well_name_rows = coordinates_fm.well.reset_index().drop(['index'], axis=1)
                result = df_distance_fm.join(well_name_rows).set_index('well').reset_index()
                return result
            well_dist_crosstable_8 = well_distance_calculation(coordinates, 'Balakhany VIII')

            def offset_well_names_dist(dataset, offset_qty):
                df_lst = []
                for ind in range(len(dataset.well.unique())):
                    off_well_series = dataset.iloc[ind]
                    off_well_selected = pd.DataFrame(off_well_series)[1:].sort_values(by=ind)[:offset_qty+1].T
                    off_well_selected['well'] = off_well_selected.columns[0]
                    off_well_selected = off_well_selected.drop(columns= off_well_selected.well, axis=1)

                    dist_titles = ['dist_' + str(num+1) for num in range(offset_qty)]
                    well_titles = ['well_' + str(num+1) for num in range(offset_qty)]

                    col_names = []
                    for i in range(len(off_well_selected.columns[:-1])):
                        col = off_well_selected.columns[i]
                        col_names.append(col)
                        off_well_selected = off_well_selected.rename(columns={col:dist_titles[i]})

                    off_well_names = pd.DataFrame(col_names).T
                    col_names = []
                    for i in range(len(off_well_names.columns)):
                        col = off_well_names.columns[i]
                        col_names.append(col)
                        off_well_names = off_well_names.rename(columns={col:well_titles[i]})
                    
                    concat_well_data = pd.concat([off_well_names.reset_index(drop=True), off_well_selected.reset_index(drop=True)], axis=1)
                    df_lst.append(concat_well_data)
                result = pd.concat(df_lst).reset_index(drop=True)
                return result
            well_dist_data8 = offset_well_names_dist(well_dist_crosstable_8, offset_qty)

            def offset_wells_features_calculation(dataset_dist, dataset_clusters, cluster_algo, cluster_list, fm):
                df_lst = []
                for wellname in dataset_dist.well.unique():
                    data = dataset_dist[dataset_dist.well == wellname]
                    cc = 0
                    for j in data.columns:
                        if 'well_' in j:
                            cc += 1
                            offset_wellname = data[j].values[0]
                            data_cluster = dataset_clusters[(dataset_clusters.well == offset_wellname) & 
                                                                (dataset_clusters[cluster_algo].isin(cluster_list))]
                            var_name = 'phit_wavg_' + str(cc)
                            data[var_name] = ((data_cluster['phit_avg'] * data_cluster['htst']).sum()) / (data_cluster['htst'].sum())
                            var_name = 'vsh_wavg_' + str(cc)
                            data[var_name] = ((data_cluster['vsh_avg'] * data_cluster['htst']).sum()) / (data_cluster['htst'].sum())
                            var_name = 'htst_sum_' + str(cc)
                            data[var_name] = data_cluster['htst'].sum()                
                    df_lst.append(data)
                result = pd.concat(df_lst).reset_index(drop=True)
                result['FORMATION_up'] = fm
                return result
            well_features8 = offset_wells_features_calculation(well_dist_data8, dataset_cluster, cluster_algo, cluster_list, 'Balakhany VIII')

            def target_wells_variable_calculation(dataset_dist, dataset_clusters, cluster_algo, cluster_list, fm):
                df_lst = []
                for wellname in dataset_dist.well.unique():
                    df = pd.DataFrame({'well': [wellname], 'FORMATION_up': [fm], 'phit_wavg_target': [0]})
                    data = dataset_clusters[(dataset_clusters.well == wellname) & 
                                            (dataset_clusters[cluster_algo].isin(cluster_list))]
                    df['phit_wavg_target'] = ((data['phit_avg'] * data['htst']).sum()) / (data['htst'].sum())
                    df_lst.append(df)
                result = pd.concat(df_lst).reset_index(drop=True)
                return result
            well_target8 = target_wells_variable_calculation(well_dist_data8, dataset_cluster, cluster_algo, cluster_list, 'Balakhany VIII')
            
            dataset8 = well_target8.set_index(['well','FORMATION_up']).join(well_features8.set_index(['well','FORMATION_up'])).reset_index()

            result = {'dataset8':dataset8, 'cluster_xy':dataset_cluster_xy, 'well_dist8':well_dist_data8, 'coordinates':coordinates,
                    'target8':well_target8, 'feature8':well_features8, 'dist_crosstable8':well_dist_crosstable_8}
            return result
    input_ph8 = dataset_for_spatial_prediction(df_bal_net2_kh, data_clustered8, 3, 'kmeans', cluster_list)['dataset8']
    print(f'Dataset features {textwrap.fill(str(list(input_ph8.columns)), width=150)}')

    def run_phit_pred_split(dataset, cluster_list, tolerance):
        def model_prediction_split(dataset, selected_model, target, tolerance, model_name, display_flag='display'):
            drop_lst_X = ['well','FORMATION_up', 'well_1', 'well_2', 'well_3']
            drop_lst_y = ['well','FORMATION_up']

            X = dataset.drop(target, axis=1)
            y = dataset[['well','FORMATION_up', target]]

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=55)

            y_train_wnames = y_train[['well','FORMATION_up']].reset_index(drop=True)
            y_test_wnames = y_test[['well','FORMATION_up']].reset_index(drop=True)

            X_train = X_train.drop(drop_lst_X, axis=1)
            X_test = X_test.drop(drop_lst_X, axis=1)
            y_train = y_train.drop(drop_lst_y, axis=1)
            y_test = y_test.drop(drop_lst_y, axis=1)

            model = Pipeline([("scaler",StandardScaler()),("model", selected_model)])
            # model = selected_model
            model.fit(X_train, y_train)
            y_pred_train = model.predict(X_train)
            y_pred_test = model.predict(X_test)
            y_train = np.array(y_train).flatten()
            y_test = np.array(y_test).flatten()
            train = pd.DataFrame(zip(y_train,y_pred_train), columns=['y_orig', 'y_pred'])
            train = pd.concat([y_train_wnames, train], axis=1)
            test = pd.DataFrame(zip(y_test,y_pred_test), columns=['y_orig', 'y_pred'])
            test = pd.concat([y_test_wnames, test], axis=1)

            train['up'] = train['y_orig']*(1 + tolerance)
            train['down'] = train['y_orig']*(1 - tolerance)
            train['qc'] = 'out'
            train['dataset'] = 'train'
            train.loc[(train['y_pred'] <= train.up) & (train['y_pred'] >= train.down), 'qc'] = 'in'
            trainqc = train.qc.value_counts(normalize=True)

            test['up'] = test['y_orig']*(1 + tolerance)
            test['down'] = test['y_orig']*(1 - tolerance)
            test['qc'] = 'out'
            test['dataset'] = 'test'
            test.loc[(test['y_pred'] <= test.up) & (test['y_pred'] >= test.down), 'qc'] = 'in'
            testqc = test.qc.value_counts(normalize=True)
            df = pd.concat([train, test])
            df['y_pred'] = df['y_pred'].astype('float')

            result = {'result':df, 'testqc':testqc['in'].round(2), 'trainqc':trainqc['in'].round(2), 'train_df':X_train.columns, 'model': model}
            if display_flag == 'display':
                print('features dataset: \n', list(X_train.columns))
                print('train "in":', '{:.2f}'.format(result['trainqc'].round(2)),
                    'test "in":', '{:.2f}'.format(result['testqc'].round(2)), 
                    '\t', model_name)
            else: 
                print('test "in":', '{:.2f}'.format(result['testqc'].round(2)),'\t', model_name)
            return result
        def xplot_qc2(data, qc_train, qc_test, y_orig, y_pred, max_val, rng, margin, round, comment):
            """
            Show actual-vs-predicted cross-plots for the train and test rows side by side.

            Points are green when their 'qc' value is 'in' and red when it is 'out'.
            Both panels also get the two tolerance lines
            y = x*(1 + rng) + margin and y = x*(1 - rng) - margin, drawn from 0 to max_val.

            Parameters
            ----------
            data : DataFrame with columns 'dataset' ('train' / 'test'), 'qc'
                ('in' / 'out'), 'well', 'FORMATION_up' and the two value columns.
            qc_train, qc_test : values appended to the subplot titles.
            y_orig, y_pred : names of the actual and predicted value columns.
            max_val : x extent of the tolerance lines.
            rng : relative tolerance of the lines.
            margin : absolute offset added to / subtracted from the lines.
            round : number of decimals the two value columns are rounded to.
            comment : figure title.

            Returns
            -------
            The result of fig.show().
            """
            data = data.round({y_orig: round, y_pred: round})
            up_range = rng + 1
            dwn_range = 1 - rng
            colors = {'in': 'green', 'out': 'red'}
            hover = "".join(
                ["w:%{customdata[0]},a:%{customdata[1]}, p:%{customdata[2]}, f:%{customdata[3]}<extra></extra>"])

            def qc_scatter(subset):
                # x, y, colours and hover data all come from the same subset, so the
                # hover labels always describe the point under the cursor
                return go.Scatter(x=subset[y_orig], y=subset[y_pred],
                                  mode='markers',
                                  marker=dict(color=[colors[qc] for qc in subset.qc], size=6, opacity=0.75,
                                              line=dict(color='rgb(47, 57, 61)', width=0.5)),
                                  customdata=subset[['well', y_orig, y_pred, 'FORMATION_up']],
                                  hovertemplate=hover)

            scatter_train = qc_scatter(data[data.dataset == 'train'])
            scatter_test = qc_scatter(data[data.dataset == 'test'])
            line_trace_up = go.Scatter(x=[0, max_val], y=[0 + margin, max_val*up_range + margin], mode='lines+markers', line=dict(color='blue'))
            line_trace_dw = go.Scatter(x=[0, max_val], y=[0 - margin, max_val*dwn_range - margin], mode='lines+markers', marker=dict(color='blue'))
            fig = make_subplots(rows=1, cols=2, subplot_titles=(f'train ds {qc_train}', f'test ds {qc_test}'))
            for col, scatter in enumerate((scatter_train, scatter_test), start=1):
                fig.add_trace(scatter, row=1, col=col)
                fig.add_trace(line_trace_up, row=1, col=col)
                fig.add_trace(line_trace_dw, row=1, col=col)
                fig.update_xaxes(title_text='actual', row=1, col=col)
                fig.update_yaxes(title_text='predict', row=1, col=col)
            fig.update_layout(  title_text= (comment), width=700, height=350, 
                                margin=dict(l=10,r=10,b=10,t=50), showlegend=False)
            return fig.show()

        print(f'Cluster list is: {cluster_list}')   
        target = 'phit_wavg_target'
        model1_ph = model_prediction_split(dataset, RandomForestRegressor(n_jobs=-1, random_state=42), target, tolerance, 'RandomForestRegressor','display')
        model2_ph = model_prediction_split(dataset, BayesianRidge(), target, tolerance, 'BayesianRidge', 'dont_display')
        model3_ph = model_prediction_split(dataset, XGBRegressor(n_jobs=-1, random_state=42, verbosity=0), target, tolerance, 'XGBRegressor', 'dont_display')
        model4_ph = model_prediction_split(dataset, CatBoostRegressor(random_state=42, verbose=False), target, tolerance, 'CatBoostRegressor', 'dont_display')
        model5_ph = model_prediction_split(dataset, AdaBoostRegressor(random_state=42), target, tolerance, 'AdaBoostRegressor', 'dont_display')
        model6_ph = model_prediction_split(dataset, LGBMRegressor(n_jobs=-1, random_state=42, verbose=0, verbosity=-1), target, tolerance, 'LGBMRegressor', 'dont_display')

        xplot_qc2(model1_ph['result'], model1_ph['trainqc'], model1_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'RandomForestRegressor {cluster_list}')
        # xplot_qc2(model2_ph['result'], model2_ph['trainqc'], model2_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'BayesianRidge {cluster_list}')
        # xplot_qc2(model3_ph['result'], model3_ph['trainqc'], model3_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'XGBRegressor {cluster_list}')
        # xplot_qc2(model4_ph['result'], model4_ph['trainqc'], model4_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'CatBoostRegressor {cluster_list}')
        # xplot_qc2(model5_ph['result'], model5_ph['trainqc'], model5_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'AdaBoostRegressor {cluster_list}')
        # xplot_qc2(model6_ph['result'], model6_ph['trainqc'], model6_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'LGBMRegressor {cluster_list}')
        return model1_ph
    model_split = run_phit_pred_split(input_ph8, cluster_list, tolerance=0.05)['result']

    def run_phit_pred_1_to_all(dataset, cluster_list, tolerance):
        """Predict 'phit_wavg_target' for every well with a leave-one-well-out loop.

        Each well is held out in turn, a scaler + RandomForest pipeline is
        fitted on the remaining wells, and the held-out rows are predicted.
        The graded predictions are cross-plotted and returned.

        Parameters
        ----------
        dataset : pandas.DataFrame
            Needs 'well', 'FORMATION_up', 'well_1', 'well_2', 'well_3' and
            'phit_wavg_target'; every other column is used as a feature.
        cluster_list : list
            Only used here to label the plot title.
        tolerance : float
            Relative half-width of the acceptance band around the actual value.

        Returns
        -------
        dict
            'result' (graded predictions), 'res_full' (``dataset`` joined with
            the predictions as 'phit_pred'), 'testqc' (share of rows inside
            the band), 'train_df' (feature names) and 'model' (the pipeline
            fitted in the last fold).
        """

        def model_prediction_1_to_all(dataset, selected_model, target, tolerance, model_name):
            """Run the leave-one-well-out loop for one estimator and grade the output."""
            id_cols = ['well', 'FORMATION_up']
            # Identifier columns and neighbour well names are not model inputs.
            drop_lst_X = id_cols + ['well_1', 'well_2', 'well_3']
            print(model_name)
            df_lst = []
            for wellname in tqdm(dataset.well.unique()):
                held_out = dataset.well == wellname

                train = dataset[~held_out]
                X_train = train.drop(columns=[target] + drop_lst_X)
                # 1-D target avoids sklearn's column-vector conversion warning.
                y_train = train[target].to_numpy()
                model = Pipeline([("scaler", StandardScaler()), ("model", selected_model)])
                model.fit(X_train, y_train)

                test = dataset[held_out]
                X_test = test.drop(columns=[target] + drop_lst_X)
                fold = test[id_cols].reset_index(drop=True)
                # Keep every held-out row; taking only the first value would
                # leave NaN for the remaining rows of a multi-row well.
                fold['y_orig'] = test[target].to_numpy()
                fold['y_pred'] = np.ravel(model.predict(X_test))
                df_lst.append(fold)

            result = pd.concat(df_lst).reset_index(drop=True)
            result['up'] = result['y_orig'] * (1 + tolerance)
            result['down'] = result['y_orig'] * (1 - tolerance)
            result['qc'] = 'out'
            inside = (result['y_pred'] <= result.up) & (result['y_pred'] >= result.down)
            result.loc[inside, 'qc'] = 'in'
            # .get() keeps the workflow alive when no prediction is inside the band.
            share_in = result.qc.value_counts(normalize=True).get('in', 0.0)

            phit_pred = result[id_cols + ['y_pred']].rename(columns={'y_pred': 'phit_pred'})
            dataset_pred = dataset.set_index(id_cols).join(phit_pred.set_index(id_cols), how='inner').reset_index()

            return {'result': result, 'res_full': dataset_pred, 'testqc': np.float64(share_in).round(2),
                    'train_df': X_train.columns, 'model': model}

        def xplot_qc_1_to_all(data, qc_test, y_orig, y_pred, max_val, rng, margin, round, comment):
            """Cross-plot actual vs predicted values with the +/- ``rng`` band lines.

            ``round`` is the number of decimals shown in the hover labels.
            """
            data = data.round({y_orig: round, y_pred: round})
            colors = {'in': 'green', 'out': 'red'}
            qc_colors = [colors[qc] for qc in data.qc]
            scatter_test = go.Scatter(
                x=data[y_orig], y=data[y_pred],
                mode='markers',
                marker=dict(color=qc_colors, size=6, opacity=0.75, line=dict(color='rgb(47, 57, 61)', width=0.5)),
                customdata=data[['well', y_orig, y_pred, 'FORMATION_up']],
                hovertemplate="w:%{customdata[0]},a:%{customdata[1]}, p:%{customdata[2]}, f:%{customdata[3]}<extra></extra>",
            )
            line_trace_up = go.Scatter(x=[0, max_val], y=[0 + margin, max_val * (1 + rng) + margin],
                                       mode='lines+markers', line=dict(color='blue'))
            line_trace_dw = go.Scatter(x=[0, max_val], y=[0 - margin, max_val * (1 - rng) - margin],
                                       mode='lines+markers', marker=dict(color='blue'))
            fig = make_subplots(rows=1, cols=1, subplot_titles=(f'test qc {qc_test}',))
            for trace in (scatter_test, line_trace_up, line_trace_dw):
                fig.add_trace(trace, row=1, col=1)
            fig.update_xaxes(title_text='actual', row=1, col=1)
            fig.update_yaxes(title_text='predict', row=1, col=1)
            fig.update_layout(title_text=(comment), width=350, height=350,
                              margin=dict(l=10, r=10, b=10, t=50), showlegend=False)
            return fig.show()

        target = 'phit_wavg_target'
        # Other estimators previously tried here (swap in as needed):
        # BayesianRidge, XGBRegressor, CatBoostRegressor, AdaBoostRegressor, LGBMRegressor.
        model1_ph = model_prediction_1_to_all(dataset, RandomForestRegressor(n_jobs=-1, random_state=42),
                                              target, tolerance, 'RandomForestRegressor')

        xplot_qc_1_to_all(model1_ph['result'], model1_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3,
                          f'RandomForestRegressor {cluster_list}')
        return model1_ph
    model = run_phit_pred_1_to_all(input_ph8, cluster_list, tolerance=0.05)

    def concat_prediction_to_khtst_df(data_pred, data_khtst, data_main, cluster_algo):
        """Join summed 'khtst', the predicted porosity and the feature table per well/formation.

        Parameters
        ----------
        data_pred : dict
            Its 'result' frame must hold 'well', 'FORMATION_up' and 'y_pred'.
        data_khtst : pandas.DataFrame
            Rows with a 'khtst' value and a cluster label in ``cluster_algo``;
            only rows whose label is in the enclosing ``cluster_list`` are summed.
        data_main : pandas.DataFrame
            Feature table keyed by 'well' and 'FORMATION_up'; it must contain
            'phit_wavg_target'.
        cluster_algo : str
            Name of the cluster-label column in ``data_khtst``.

        Returns
        -------
        pandas.DataFrame
            One row per well/formation present in both the khtst sums and
            ``data_main``, with 'phit_pred', 'phit_wavg_target' and 'khtst' as
            the last three columns (in that order).
        """
        keys = ['well', 'FORMATION_up']
        phit_pred8 = data_pred['result'][keys + ['y_pred']].rename(columns={'y_pred': 'phit_pred'})
        # cluster_list comes from the enclosing workflow scope.
        in_clusters = data_khtst[cluster_algo].isin(cluster_list)
        khtst8 = data_khtst[in_clusters].groupby(keys)['khtst'].sum().reset_index()

        khtst8_phit_pred8 = khtst8.set_index(keys).join(phit_pred8.set_index(keys)).reset_index()
        phitpred_khtst = khtst8_phit_pred8.set_index(keys).join(data_main.set_index(keys), how='inner').reset_index()

        # Move the prediction/target columns to the end whatever the frame
        # width is; a fixed insert position only works for one column count.
        tail = ['phit_pred', 'phit_wavg_target', 'khtst']
        lead = [col for col in phitpred_khtst.columns if col not in tail]
        return phitpred_khtst[lead + tail]
    phitpred_khtst = concat_prediction_to_khtst_df(model, data_clustered8, input_ph8, 'kmeans')
    print(f'Concat dataset features {textwrap.fill(str(list(phitpred_khtst.columns)), width=150)}')

    print('\nPrediction KHtst: ')
    def run_khtst_pred_split(dataset, cluster_list, tolerance):
        """Fit six regressors on 'khtst' with a random split and return the CatBoost run.

        Every estimator is fitted on the same 67/33 split and its in-band
        shares are printed; only the CatBoost result is plotted and returned.

        Parameters
        ----------
        dataset : pandas.DataFrame
            Frame with the identifier, neighbour, 'phit_pred',
            'phit_wavg_target' and 'khtst' columns.
        cluster_list : list
            Printed and used in the plot title only.
        tolerance : float
            Relative half-width of the acceptance band around the actual value.

        Returns
        -------
        dict
            'result', 'testqc', 'trainqc', 'train_df' and 'model' of the
            CatBoostRegressor run.
        """

        def model_prediction_split(dataset, selected_model, target, tolerance, model_name, display_flag='display'):
            """Fit ``selected_model`` on a 67/33 split and grade train/test predictions.

            Columns expected in ``dataset``:
            'well', 'FORMATION_up', 'well_1', 'well_2', 'well_3', 'dist_1',
            'dist_2', 'dist_3', 'phit_wavg_1', 'vsh_wavg_1', 'htst_sum_1',
            'phit_wavg_2', 'vsh_wavg_2', 'htst_sum_2', 'phit_wavg_3', 'vsh_wavg_3',
            'htst_sum_3', 'phit_pred', 'phit_wavg_target', 'khtst'
            """
            id_cols = ['well', 'FORMATION_up']
            # Identifiers, neighbour names/distances and the true porosity are not features.
            drop_lst_X = id_cols + ['well_1', 'well_2', 'well_3', 'dist_1', 'dist_2', 'dist_3', 'phit_wavg_target']

            X = dataset.drop(target, axis=1)
            y = dataset[id_cols + [target]]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=55)

            X_train = X_train.drop(drop_lst_X, axis=1)
            X_test = X_test.drop(drop_lst_X, axis=1)

            model = selected_model
            # 1-D target avoids sklearn's column-vector conversion warning.
            model.fit(X_train, y_train[target].to_numpy())

            def graded(labelled, features, label):
                """Return (graded frame, share of rows inside the tolerance band)."""
                frame = labelled[id_cols].reset_index(drop=True)
                frame['y_orig'] = labelled[target].to_numpy()
                frame['y_pred'] = np.ravel(model.predict(features))
                frame['up'] = frame['y_orig'] * (1 + tolerance)
                frame['down'] = frame['y_orig'] * (1 - tolerance)
                frame['qc'] = 'out'
                frame['dataset'] = label
                inside = (frame['y_pred'] <= frame.up) & (frame['y_pred'] >= frame.down)
                frame.loc[inside, 'qc'] = 'in'
                # .get() keeps the run alive when nothing falls inside the band.
                share_in = frame.qc.value_counts(normalize=True).get('in', 0.0)
                return frame, np.float64(share_in).round(2)

            train, trainqc = graded(y_train, X_train, 'train')
            test, testqc = graded(y_test, X_test, 'test')
            df = pd.concat([train, test])
            df['y_pred'] = df['y_pred'].astype('float')

            result = {'result': df, 'testqc': testqc, 'trainqc': trainqc, 'train_df': X_train.columns, 'model': model}
            if display_flag == 'display':
                print(f'features dataset: {list(X_train.columns)}')
            print('train "in":', '{:.2f}'.format(result['trainqc'].round(2)),
                  'test "in":', '{:.2f}'.format(result['testqc'].round(2)),
                  '\t', model_name)
            return result

        def xplot_qc2(data, qc_train, qc_test, y_orig, y_pred, max_val, rng, margin, round, comment):
            """Side-by-side train/test cross-plots with the +/- ``rng`` band lines.

            ``round`` is the number of decimals shown in the hover labels.
            """
            data = data.round({y_orig: round, y_pred: round})
            colors = {'in': 'green', 'out': 'red'}
            hover = "w:%{customdata[0]},a:%{customdata[1]}, p:%{customdata[2]}, f:%{customdata[3]}<extra></extra>"

            def scatter_for(subset):
                # customdata must come from the same subset as x/y, otherwise
                # the hover labels of the test points show rows of the train set.
                return go.Scatter(
                    x=subset[y_orig], y=subset[y_pred],
                    mode='markers',
                    marker=dict(color=[colors[qc] for qc in subset.qc], size=6, opacity=0.75,
                                line=dict(color='rgb(47, 57, 61)', width=0.5)),
                    customdata=subset[['well', y_orig, y_pred, 'FORMATION_up']],
                    hovertemplate=hover,
                )

            scatter_train = scatter_for(data[data.dataset == 'train'])
            scatter_test = scatter_for(data[data.dataset == 'test'])
            line_trace_up = go.Scatter(x=[0, max_val], y=[0 + margin, max_val * (1 + rng) + margin],
                                       mode='lines+markers', line=dict(color='blue'))
            line_trace_dw = go.Scatter(x=[0, max_val], y=[0 - margin, max_val * (1 - rng) - margin],
                                       mode='lines+markers', marker=dict(color='blue'))
            fig = make_subplots(rows=1, cols=2, subplot_titles=(f'train ds {qc_train}', f'test ds {qc_test}'))
            for col, scatter in enumerate((scatter_train, scatter_test), start=1):
                fig.add_trace(scatter, row=1, col=col)
                fig.add_trace(line_trace_up, row=1, col=col)
                fig.add_trace(line_trace_dw, row=1, col=col)
                fig.update_xaxes(title_text='actual', row=1, col=col)
                fig.update_yaxes(title_text='predict', row=1, col=col)
            fig.update_layout(title_text=(comment), width=700, height=350,
                              margin=dict(l=10, r=10, b=10, t=50), showlegend=False)
            return fig.show()

        print(f'Cluster list is: {cluster_list}')
        target = 'khtst'
        # (name, estimator, display flag); all are fitted so their QC lines get printed.
        candidates = [
            ('RandomForestRegressor', RandomForestRegressor(n_jobs=-1, random_state=42), 'display'),
            ('BayesianRidge', BayesianRidge(), 'dont_display'),
            ('XGBRegressor', XGBRegressor(n_jobs=-1, random_state=42, verbosity=0), 'dont_display'),
            ('CatBoostRegressor', CatBoostRegressor(random_state=42, verbose=False), 'dont_display'),
            ('AdaBoostRegressor', AdaBoostRegressor(random_state=42), 'dont_display'),
            ('LGBMRegressor', LGBMRegressor(n_jobs=-1, random_state=42, verbose=0, verbosity=-1), 'dont_display'),
        ]
        fitted = {}
        for name, estimator, flag in candidates:
            fitted[name] = model_prediction_split(dataset, estimator, target, tolerance, name, flag)

        # Model that is plotted and handed back to the caller.
        chosen = 'CatBoostRegressor'
        model4_kh = fitted[chosen]
        xplot_qc2(model4_kh['result'], model4_kh['trainqc'], model4_kh['testqc'], 'y_orig', 'y_pred',
                  27000, tolerance, 0, 3, f'{chosen} {cluster_list}')
        return model4_kh
    # KHtst models are graded with a 25 % relative tolerance band.
    model_khtst = run_khtst_pred_split(phitpred_khtst, cluster_list, 0.25)
    # Workflow output: graded KHtst predictions, the joined modelling frame and
    # the graded leave-one-well-out porosity predictions.
    result = {'khtst_pred':model_khtst['result'], 'khtst_data':phitpred_khtst, 'phit_pred':model['result']}
    return result
test_full = khtst_workflow(cluster_list = [0,1,2])

In [None]:
def pairplot_special(dataset, xsize, ysize, flag=1):
    """Draw a seaborn PairGrid of ``dataset`` annotated with Pearson r.

    Upper triangle: scatter plots. Diagonal: histograms. Lower triangle:
    KDE contours plus an "r = ..." label. Nothing is drawn unless
    ``flag == 1``. ``xsize`` and ``ysize`` are the figure size in inches.
    """
    if flag != 1:
        return

    def annotate_pearson(x, y, **kws):
        # Write the correlation coefficient into the current axes.
        coef, _ = stats.pearsonr(x, y)
        axes = plt.gca()
        axes.annotate(f"r = {coef:.2f}", xy=(.1, .9), xycoords=axes.transAxes)

    sns.set_context(rc={'axes.labelsize': 10, 'lines.linewidth': 0.75})
    grid = sns.PairGrid(dataset)
    grid.fig.set_size_inches(xsize, ysize)
    grid.set(xticklabels=[], yticklabels=[])
    grid.map_upper(plt.scatter, s=10, alpha=0.5)
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # confirm the installed version before upgrading.
    grid.map_diag(sns.distplot, kde=False)
    grid.map_lower(sns.kdeplot, cmap="Blues_d")
    grid.map_lower(annotate_pearson)
# Drop identifier, neighbour-name, distance, vsh and htst columns before plotting;
# the pair plot is drawn on whatever columns remain in test_full['khtst_data'].
drop_lst_X = ['well','FORMATION_up',    'well_1', 'well_2', 'well_3',
                                        'dist_1', 'dist_2', 'dist_3', 'vsh_wavg_1', 'htst_sum_1', 'vsh_wavg_2', 'htst_sum_2', 'vsh_wavg_3', 'htst_sum_3',]
pairplot_special(test_full['khtst_data'].drop(drop_lst_X, axis=1), 7, 7, 1)

#### Cluster 0

In [None]:
def khtst_workflow(cluster_list):

    def dataset_for_spatial_prediction(dataset_full, dataset_cluster, offset_qty, cluster_algo, cluster_list,
                                       formation='Balakhany VIII', excluded_wells=('A14Y',)):
        """Build the per-well modelling table from the nearest offset wells.

        For each well of ``formation`` the ``offset_qty`` nearest wells (by
        Euclidean distance on 'X_mean'/'Y_mean') are found, and htst-weighted
        averages of 'phit_avg' and 'vsh_avg' plus the 'htst' sum are computed
        for each neighbour and, as target, for the well itself.

        Parameters
        ----------
        dataset_full : pandas.DataFrame
            Source of the 'X_mean'/'Y_mean' coordinates per well and formation.
        dataset_cluster : pandas.DataFrame
            Rows with 'well', 'FORMATION_up', 'phit_avg', 'vsh_avg', 'htst'
            and the cluster-label column ``cluster_algo``.
        offset_qty : int
            Number of neighbouring wells per target well.
        cluster_algo : str
            Name of the cluster-label column.
        cluster_list : list
            Cluster labels that take part in the averages.
        formation : str, optional
            Formation to process (default keeps the previous hard-coded value).
        excluded_wells : iterable of str, optional
            Wells removed from the coordinate table before distances are
            computed (default keeps the previous hard-coded value).

        Returns
        -------
        dict
            'dataset8' (target joined with features), 'cluster_xy',
            'well_dist8', 'coordinates', 'target8', 'feature8',
            'dist_crosstable8'.
        """

        def joining_coordinates(dataset_full, dataset_cluster, cluster_algo, cluster_list):
            """Attach the first X/Y pair of each well/formation to the selected cluster rows."""
            coordinates = dataset_full.groupby(['well','FORMATION_up'])[['X_mean','Y_mean']].apply(lambda x: x.iloc[0]).reset_index()
            dataset_cluster = dataset_cluster[(dataset_cluster[cluster_algo].isin(cluster_list))]
            result = dataset_cluster.set_index(['well','FORMATION_up']).join(coordinates.set_index(['well','FORMATION_up'])).reset_index()
            # One coordinate row per well/formation that survived the cluster filter.
            coordinates = result[['well','FORMATION_up', 'X_mean', 'Y_mean']].groupby(['well','FORMATION_up']).apply(lambda x: x.iloc[0]).reset_index(drop=True)
            return coordinates, result
        coordinates, dataset_cluster_xy = joining_coordinates(dataset_full, dataset_cluster, cluster_algo, cluster_list)
        coordinates = coordinates[~coordinates.well.isin(list(excluded_wells))]

        def well_distance_calculation(coordinates, fm):
            """Return the well-to-well distance cross-table of one formation ('well' first)."""
            coordinates_fm = coordinates[coordinates.FORMATION_up == fm]
            df_distance_fm = pd.DataFrame(euclidean_distances(coordinates_fm[['X_mean', 'Y_mean']]), columns=list(coordinates_fm.well))
            well_name_rows = coordinates_fm.well.reset_index().drop(['index'], axis=1)
            # set_index/reset_index moves 'well' to the first column.
            result = df_distance_fm.join(well_name_rows).set_index('well').reset_index()
            return result
        well_dist_crosstable_8 = well_distance_calculation(coordinates, formation)

        def offset_well_names_dist(dataset, offset_qty):
            """For every cross-table row keep the nearest wells as well_i / dist_i columns."""
            df_lst = []
            for ind in range(len(dataset.well.unique())):
                off_well_series = dataset.iloc[ind]
                # Skip the leading 'well' entry, sort by distance and keep the well
                # itself (distance 0) plus its offset_qty nearest neighbours.
                off_well_selected = pd.DataFrame(off_well_series)[1:].sort_values(by=ind)[:offset_qty+1].T
                # The closest entry is taken as the target well's own name ...
                off_well_selected['well'] = off_well_selected.columns[0]
                # ... and its zero-distance column is removed.
                off_well_selected = off_well_selected.drop(columns= off_well_selected.well, axis=1)

                dist_titles = ['dist_' + str(num+1) for num in range(offset_qty)]
                well_titles = ['well_' + str(num+1) for num in range(offset_qty)]

                # Rename neighbour columns to dist_1..dist_n, remembering their well names.
                col_names = []
                for i in range(len(off_well_selected.columns[:-1])):
                    col = off_well_selected.columns[i]
                    col_names.append(col)
                    off_well_selected = off_well_selected.rename(columns={col:dist_titles[i]})

                # One-row frame with the neighbour names as well_1..well_n.
                off_well_names = pd.DataFrame(col_names).T
                col_names = []
                for i in range(len(off_well_names.columns)):
                    col = off_well_names.columns[i]
                    col_names.append(col)
                    off_well_names = off_well_names.rename(columns={col:well_titles[i]})

                concat_well_data = pd.concat([off_well_names.reset_index(drop=True), off_well_selected.reset_index(drop=True)], axis=1)
                df_lst.append(concat_well_data)
            result = pd.concat(df_lst).reset_index(drop=True)
            return result
        well_dist_data8 = offset_well_names_dist(well_dist_crosstable_8, offset_qty)

        def offset_wells_features_calculation(dataset_dist, dataset_clusters, cluster_algo, cluster_list, fm):
            """Add htst-weighted phit/vsh averages and the htst sum of every neighbour well."""
            df_lst = []
            for wellname in dataset_dist.well.unique():
                # Explicit copy: new columns are assigned below, which on a
                # filtered slice triggers pandas' chained-assignment warning.
                data = dataset_dist[dataset_dist.well == wellname].copy()
                neighbour_cols = [col for col in data.columns if 'well_' in col]
                for cc, j in enumerate(neighbour_cols, start=1):
                    offset_wellname = data[j].values[0]
                    # NOTE(review): rows are not filtered by formation here;
                    # presumably dataset_clusters holds a single formation - confirm.
                    data_cluster = dataset_clusters[(dataset_clusters.well == offset_wellname) &
                                                    (dataset_clusters[cluster_algo].isin(cluster_list))]
                    htst_total = data_cluster['htst'].sum()
                    data['phit_wavg_' + str(cc)] = ((data_cluster['phit_avg'] * data_cluster['htst']).sum()) / htst_total
                    data['vsh_wavg_' + str(cc)] = ((data_cluster['vsh_avg'] * data_cluster['htst']).sum()) / htst_total
                    data['htst_sum_' + str(cc)] = htst_total
                df_lst.append(data)
            result = pd.concat(df_lst).reset_index(drop=True)
            result['FORMATION_up'] = fm
            return result
        well_features8 = offset_wells_features_calculation(well_dist_data8, dataset_cluster, cluster_algo, cluster_list, formation)

        def target_wells_variable_calculation(dataset_dist, dataset_clusters, cluster_algo, cluster_list, fm):
            """Compute the htst-weighted phit average of every target well itself."""
            df_lst = []
            for wellname in dataset_dist.well.unique():
                df = pd.DataFrame({'well': [wellname], 'FORMATION_up': [fm], 'phit_wavg_target': [0]})
                data = dataset_clusters[(dataset_clusters.well == wellname) &
                                        (dataset_clusters[cluster_algo].isin(cluster_list))]
                df['phit_wavg_target'] = ((data['phit_avg'] * data['htst']).sum()) / (data['htst'].sum())
                df_lst.append(df)
            result = pd.concat(df_lst).reset_index(drop=True)
            return result
        well_target8 = target_wells_variable_calculation(well_dist_data8, dataset_cluster, cluster_algo, cluster_list, formation)

        dataset8 = well_target8.set_index(['well','FORMATION_up']).join(well_features8.set_index(['well','FORMATION_up'])).reset_index()

        result = {'dataset8':dataset8, 'cluster_xy':dataset_cluster_xy, 'well_dist8':well_dist_data8, 'coordinates':coordinates,
                  'target8':well_target8, 'feature8':well_features8, 'dist_crosstable8':well_dist_crosstable_8}
        return result
    input_ph8 = dataset_for_spatial_prediction(df_bal_net2_kh, data_clustered8, 3, 'kmeans', cluster_list)['dataset8']
    print(f'Dataset features {textwrap.fill(str(list(input_ph8.columns)), width=150)}')

    def run_phit_pred_split(dataset, cluster_list, tolerance):
        """Fit six regressors on 'phit_wavg_target' with a random split and return the RandomForest run.

        Every estimator is wrapped in a StandardScaler pipeline and fitted on
        the same 67/33 split; its in-band share is printed. Only the
        RandomForest result is plotted and returned.

        Parameters
        ----------
        dataset : pandas.DataFrame
            Needs 'well', 'FORMATION_up', 'well_1', 'well_2', 'well_3' and
            'phit_wavg_target'; every other column is used as a feature.
        cluster_list : list
            Printed and used in the plot title only.
        tolerance : float
            Relative half-width of the acceptance band around the actual value.

        Returns
        -------
        dict
            'result', 'testqc', 'trainqc', 'train_df' and 'model' of the
            RandomForestRegressor run.
        """

        def model_prediction_split(dataset, selected_model, target, tolerance, model_name, display_flag='display'):
            """Fit a scaler + ``selected_model`` pipeline on a 67/33 split and grade the predictions."""
            id_cols = ['well', 'FORMATION_up']
            # Identifier columns and neighbour well names are not model inputs.
            drop_lst_X = id_cols + ['well_1', 'well_2', 'well_3']

            X = dataset.drop(target, axis=1)
            y = dataset[id_cols + [target]]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=55)

            X_train = X_train.drop(drop_lst_X, axis=1)
            X_test = X_test.drop(drop_lst_X, axis=1)

            model = Pipeline([("scaler", StandardScaler()), ("model", selected_model)])
            # 1-D target avoids sklearn's column-vector conversion warning.
            model.fit(X_train, y_train[target].to_numpy())

            def graded(labelled, features, label):
                """Return (graded frame, share of rows inside the tolerance band)."""
                frame = labelled[id_cols].reset_index(drop=True)
                frame['y_orig'] = labelled[target].to_numpy()
                frame['y_pred'] = np.ravel(model.predict(features))
                frame['up'] = frame['y_orig'] * (1 + tolerance)
                frame['down'] = frame['y_orig'] * (1 - tolerance)
                frame['qc'] = 'out'
                frame['dataset'] = label
                inside = (frame['y_pred'] <= frame.up) & (frame['y_pred'] >= frame.down)
                frame.loc[inside, 'qc'] = 'in'
                # .get() keeps the run alive when nothing falls inside the band.
                share_in = frame.qc.value_counts(normalize=True).get('in', 0.0)
                return frame, np.float64(share_in).round(2)

            train, trainqc = graded(y_train, X_train, 'train')
            test, testqc = graded(y_test, X_test, 'test')
            df = pd.concat([train, test])
            df['y_pred'] = df['y_pred'].astype('float')

            result = {'result': df, 'testqc': testqc, 'trainqc': trainqc, 'train_df': X_train.columns, 'model': model}
            if display_flag == 'display':
                print('features dataset: \n', list(X_train.columns))
                print('train "in":', '{:.2f}'.format(result['trainqc'].round(2)),
                      'test "in":', '{:.2f}'.format(result['testqc'].round(2)),
                      '\t', model_name)
            else:
                print('test "in":', '{:.2f}'.format(result['testqc'].round(2)), '\t', model_name)
            return result

        def xplot_qc2(data, qc_train, qc_test, y_orig, y_pred, max_val, rng, margin, round, comment):
            """Side-by-side train/test cross-plots with the +/- ``rng`` band lines.

            ``round`` is the number of decimals shown in the hover labels.
            """
            data = data.round({y_orig: round, y_pred: round})
            colors = {'in': 'green', 'out': 'red'}
            hover = "w:%{customdata[0]},a:%{customdata[1]}, p:%{customdata[2]}, f:%{customdata[3]}<extra></extra>"

            def scatter_for(subset):
                # customdata must come from the same subset as x/y, otherwise
                # the hover labels of the test points show rows of the train set.
                return go.Scatter(
                    x=subset[y_orig], y=subset[y_pred],
                    mode='markers',
                    marker=dict(color=[colors[qc] for qc in subset.qc], size=6, opacity=0.75,
                                line=dict(color='rgb(47, 57, 61)', width=0.5)),
                    customdata=subset[['well', y_orig, y_pred, 'FORMATION_up']],
                    hovertemplate=hover,
                )

            scatter_train = scatter_for(data[data.dataset == 'train'])
            scatter_test = scatter_for(data[data.dataset == 'test'])
            line_trace_up = go.Scatter(x=[0, max_val], y=[0 + margin, max_val * (1 + rng) + margin],
                                       mode='lines+markers', line=dict(color='blue'))
            line_trace_dw = go.Scatter(x=[0, max_val], y=[0 - margin, max_val * (1 - rng) - margin],
                                       mode='lines+markers', marker=dict(color='blue'))
            fig = make_subplots(rows=1, cols=2, subplot_titles=(f'train ds {qc_train}', f'test ds {qc_test}'))
            for col, scatter in enumerate((scatter_train, scatter_test), start=1):
                fig.add_trace(scatter, row=1, col=col)
                fig.add_trace(line_trace_up, row=1, col=col)
                fig.add_trace(line_trace_dw, row=1, col=col)
                fig.update_xaxes(title_text='actual', row=1, col=col)
                fig.update_yaxes(title_text='predict', row=1, col=col)
            fig.update_layout(title_text=(comment), width=700, height=350,
                              margin=dict(l=10, r=10, b=10, t=50), showlegend=False)
            return fig.show()

        print(f'Cluster list is: {cluster_list}')
        target = 'phit_wavg_target'
        # (name, estimator, display flag); all are fitted so their QC lines get printed.
        candidates = [
            ('RandomForestRegressor', RandomForestRegressor(n_jobs=-1, random_state=42), 'display'),
            ('BayesianRidge', BayesianRidge(), 'dont_display'),
            ('XGBRegressor', XGBRegressor(n_jobs=-1, random_state=42, verbosity=0), 'dont_display'),
            ('CatBoostRegressor', CatBoostRegressor(random_state=42, verbose=False), 'dont_display'),
            ('AdaBoostRegressor', AdaBoostRegressor(random_state=42), 'dont_display'),
            ('LGBMRegressor', LGBMRegressor(n_jobs=-1, random_state=42, verbose=0, verbosity=-1), 'dont_display'),
        ]
        fitted = {}
        for name, estimator, flag in candidates:
            fitted[name] = model_prediction_split(dataset, estimator, target, tolerance, name, flag)

        # Model that is plotted and handed back to the caller.
        chosen = 'RandomForestRegressor'
        model1_ph = fitted[chosen]
        xplot_qc2(model1_ph['result'], model1_ph['trainqc'], model1_ph['testqc'], 'y_orig', 'y_pred',
                  0.3, tolerance, 0, 3, f'{chosen} {cluster_list}')
        return model1_ph
    model_split = run_phit_pred_split(input_ph8, cluster_list, tolerance=0.05)['result']

    def run_phit_pred_1_to_all(dataset, cluster_list, tolerance):
        def model_prediction_1_to_all(dataset, selected_model, target, tolerance, model_name):
            """Predict `target` for every well with a leave-one-well-out loop.

            For each unique well, a StandardScaler + `selected_model` pipeline is
            refit on all other wells and used to predict every row of the
            held-out well.

            Parameters
            ----------
            dataset : DataFrame holding 'well', 'FORMATION_up', 'well_1',
                'well_2', 'well_3', the `target` column and the feature columns.
            selected_model : estimator placed after the scaler in the pipeline;
                the same instance is refit on every fold.
            target : name of the column to predict.
            tolerance : relative band; a prediction is tagged 'in' when it lies
                between y_orig * (1 - tolerance) and y_orig * (1 + tolerance).
            model_name : label printed before the loop starts.

            Returns
            -------
            dict with keys 'result' (per-row predictions with qc tags),
            'res_full' (`dataset` joined with the predictions as 'phit_pred'),
            'testqc' (share of 'in' rows, rounded to 2 decimals), 'train_df'
            (feature columns of the last fold) and 'model' (pipeline fitted on
            the last fold).
            """
            id_cols = ['well', 'FORMATION_up']
            non_feature_cols = [target] + id_cols + ['well_1', 'well_2', 'well_3']
            print(model_name)
            df_lst = []
            for wellname in tqdm(dataset.well.unique()):
                train = dataset[dataset.well != wellname]
                test = dataset[dataset.well == wellname]
                X_train = train.drop(columns=non_feature_cols)
                X_test = test.drop(columns=non_feature_cols)

                model = Pipeline([("scaler", StandardScaler()), ("model", selected_model)])
                # A 1-D target avoids the column-vector conversion inside the estimator.
                model.fit(X_train, train[target])

                # Keep every row of the held-out well, not only the first one,
                # so actual and predicted values stay aligned row by row.
                fold = test[id_cols].reset_index(drop=True).assign(
                    y_orig=test[target].to_numpy(),
                    y_pred=np.asarray(model.predict(X_test)).ravel(),
                )
                df_lst.append(fold)

            result = pd.concat(df_lst).reset_index(drop=True)
            result['up'] = result['y_orig']*(1 + tolerance)
            result['down'] = result['y_orig']*(1 - tolerance)
            result['qc'] = 'out'
            result.loc[(result['y_pred'] <= result.up) & (result['y_pred'] >= result.down), 'qc'] = 'in'
            resultqc = result.qc.value_counts(normalize=True)
            # value_counts has no 'in' entry when nothing falls inside the band.
            share_in = np.float64(resultqc.get('in', 0.0)).round(2)

            phit_pred = result[['well','FORMATION_up','y_pred']].rename(columns={'y_pred':'phit_pred'})
            dataset_pred = dataset.set_index(id_cols).join(phit_pred.set_index(id_cols), how='inner').reset_index()

            result_dict = {'result':result, 'res_full':dataset_pred, 'testqc':share_in, 'train_df':X_train.columns, 'model': model}
            return result_dict
        def xplot_qc_1_to_all(data, qc_test, y_orig, y_pred, max_val, rng, margin, round, comment):
            """Show an actual-vs-predicted scatter with the tolerance guide lines.

            Parameters
            ----------
            data : DataFrame with 'well', 'FORMATION_up', 'qc' and the columns
                named by `y_orig` and `y_pred`.
            qc_test : value shown in the subplot title.
            y_orig, y_pred : names of the actual / predicted columns.
            max_val : x extent of the two guide lines.
            rng : relative half-width of the band between the guide lines.
            margin : absolute offset added above / below the guide lines.
            round : number of decimals applied to both value columns
                (the name shadows the builtin; kept because it is part of the
                signature).
            comment : figure title.

            Returns the result of `fig.show()`.
            """
            ds_test = data.round({y_orig: round, y_pred: round})
            up_range = rng + 1
            dwn_range = 1 - rng
            colors = {'in': 'green', 'out': 'red'}
            qc_colors = [colors[qc] for qc in ds_test.qc]
            # Plot the columns named by the parameters so the points always
            # agree with the hover data built from the same names.
            scatter_test = go.Scatter(
                x=ds_test[y_orig], y=ds_test[y_pred],
                mode='markers',
                marker=dict(color=qc_colors, size=6, opacity=0.75, line=dict(color='rgb(47, 57, 61)', width=0.5)),
                customdata=ds_test[['well', y_orig, y_pred, 'FORMATION_up']],
                hovertemplate="w:%{customdata[0]},a:%{customdata[1]}, p:%{customdata[2]}, f:%{customdata[3]}<extra></extra>",
            )
            line_trace_up = go.Scatter(x=[0, max_val], y=[0 + margin, max_val*up_range + margin], mode='lines+markers', line=dict(color='blue'))
            line_trace_dw = go.Scatter(x=[0, max_val], y=[0 - margin, max_val*dwn_range - margin], mode='lines+markers', marker=dict(color='blue'))
            fig = make_subplots(rows=1, cols=1, subplot_titles=(f'test qc {qc_test}',))
            for trace in (scatter_test, line_trace_up, line_trace_dw):
                fig.add_trace(trace, row=1, col=1)
            fig.update_xaxes(title_text='actual', row=1, col=1)
            fig.update_yaxes(title_text='predict', row=1, col=1)
            fig.update_layout(title_text=(comment), width=350, height=350,
                              margin=dict(l=10,r=10,b=10,t=50), showlegend=False)
            return fig.show()

        target = 'phit_wavg_target'
        model1_ph = model_prediction_1_to_all(dataset, RandomForestRegressor(n_jobs=-1, random_state=42), target, tolerance,'RandomForestRegressor')
        # model2_ph = model_prediction_1_to_all(dataset, BayesianRidge(), target, 0.05, 'BayesianRidge')
        # model3_ph = model_prediction_1_to_all(dataset, XGBRegressor(n_jobs=-1, random_state=42, verbosity=0), target, 0.05, 'XGBRegressor')
        # model4_ph = model_prediction_1_to_all(dataset, CatBoostRegressor(random_state=42, verbose=False), target, 0.05,'CatBoostRegressor')
        # model5_ph = model_prediction_1_to_all(dataset, AdaBoostRegressor(random_state=42), target, 0.05, 'AdaBoostRegressor')
        # model6_ph = model_prediction_1_to_all(dataset, LGBMRegressor(n_jobs=-1, random_state=42, verbose=0, verbosity=-1), target, 0.05, 'LGBMRegressor')

        xplot_qc_1_to_all(model1_ph['result'], model1_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'RandomForestRegressor {cluster_list}')
        # xplot_qc_1_to_all(model2_ph['result'], model2_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'BayesianRidge {cluster_list}')
        # xplot_qc_1_to_all(model3_ph['result'], model3_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'XGBRegressor {cluster_list}')
        # xplot_qc_1_to_all(model4_ph['result'], model4_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'CatBoostRegressor {cluster_list}')
        # xplot_qc_1_to_all(model5_ph['result'], model5_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'AdaBoostRegressor {cluster_list}')
        # xplot_qc_1_to_all(model6_ph['result'], model6_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'LGBMRegressor {cluster_list}')
        return model1_ph
    model = run_phit_pred_1_to_all(input_ph8, cluster_list, tolerance=0.05)

    def concat_prediction_to_khtst_df(data_pred, data_khtst, data_main, cluster_algo, clusters=None):
        """Join predicted porosity and summed khtst onto the main feature table.

        Parameters
        ----------
        data_pred : dict whose 'result' frame has 'well', 'FORMATION_up', 'y_pred'.
        data_khtst : frame with 'well', 'FORMATION_up', 'khtst' and the
            cluster-label column named by `cluster_algo`.
        data_main : frame with 'well', 'FORMATION_up', 'phit_wavg_target' and
            the remaining feature columns.
        cluster_algo : name of the cluster-label column in `data_khtst`.
        clusters : cluster labels to keep; when None the enclosing scope's
            `cluster_list` is used, as before.

        Returns
        -------
        DataFrame, inner-joined on ('well', 'FORMATION_up'), with 'phit_pred',
        'phit_wavg_target' and 'khtst' moved to position 19, or to the end
        when the frame is narrower than that.
        """
        keys = ['well', 'FORMATION_up']
        selected_clusters = cluster_list if clusters is None else clusters

        phit_pred8 = data_pred['result'][keys + ['y_pred']].rename(columns={'y_pred':'phit_pred'})
        in_clusters = data_khtst[data_khtst[cluster_algo].isin(selected_clusters)]
        khtst8 = in_clusters.groupby(keys)['khtst'].sum().reset_index()

        khtst8_phit_pred8 = khtst8.set_index(keys).join(phit_pred8.set_index(keys)).reset_index()
        phitpred_khtst = khtst8_phit_pred8.set_index(keys).join(data_main.set_index(keys), how='inner').reset_index()

        for column in ('phit_pred', 'phit_wavg_target', 'khtst'):
            values = phitpred_khtst.pop(column)
            # DataFrame.insert rejects a position past the last column, so
            # clamp the historical slot 19 to the current width.
            phitpred_khtst.insert(min(19, phitpred_khtst.shape[1]), column, values)
        return phitpred_khtst
    phitpred_khtst = concat_prediction_to_khtst_df(model, data_clustered8, input_ph8, 'kmeans')
    print(f'Concat dataset features {textwrap.fill(str(list(phitpred_khtst.columns)), width=150)}')

    print('\nPrediction KHtst: ')
    def run_khtst_pred_split(dataset, cluster_list, tolerance):

        def model_prediction_split(dataset, selected_model, target, tolerance, model_name, display_flag='display'):
            """
            Fit `selected_model` on a fixed random train/test split and score it
            by the share of predictions inside a relative tolerance band.

            Columns expected in `dataset`:
            'well', 'FORMATION_up', 'well_1', 'well_2', 'well_3', 'dist_1',
            'dist_2', 'dist_3', 'phit_wavg_1', 'vsh_wavg_1', 'htst_sum_1',
            'phit_wavg_2', 'vsh_wavg_2', 'htst_sum_2', 'phit_wavg_3', 'vsh_wavg_3',
            'htst_sum_3', 'phit_pred', 'phit_wavg_target', 'khtst'

            Parameters
            ----------
            dataset : DataFrame with the columns listed above.
            selected_model : estimator with fit / predict; fitted in place,
                without scaling.
            target : name of the column to predict.
            tolerance : a prediction is tagged 'in' when it lies between
                y_orig * (1 - tolerance) and y_orig * (1 + tolerance).
            model_name : label appended to the printed score line.
            display_flag : 'display' additionally prints the feature list.

            Returns
            -------
            dict with 'result' (train and test rows with qc tags), 'testqc' and
            'trainqc' (share of 'in' rows, rounded to 2 decimals), 'train_df'
            (feature columns) and 'model' (the fitted estimator).
            """
            drop_lst_X = ['well','FORMATION_up',    'well_1', 'well_2', 'well_3', 'dist_1', 'dist_2','dist_3', 'phit_wavg_target']
            drop_lst_y = ['well','FORMATION_up']

            X = dataset.drop(target, axis=1)
            y = dataset[['well','FORMATION_up', target]]

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=55)

            y_train_wnames = y_train[['well','FORMATION_up']].reset_index(drop=True)
            y_test_wnames = y_test[['well','FORMATION_up']].reset_index(drop=True)

            X_train = X_train.drop(drop_lst_X, axis=1)
            X_test = X_test.drop(drop_lst_X, axis=1)
            # Flatten the targets once: estimators expect a 1-D y and the same
            # arrays are reused as the 'y_orig' column below.
            y_train = y_train.drop(drop_lst_y, axis=1).to_numpy().ravel()
            y_test = y_test.drop(drop_lst_y, axis=1).to_numpy().ravel()

            # The estimator is used as-is here (no scaling pipeline).
            model = selected_model
            model.fit(X_train, y_train)

            def _qc_frame(names, y_true, y_hat, label):
                """Build the per-row frame for one subset and its 'in' share."""
                frame = pd.DataFrame({'y_orig': y_true, 'y_pred': np.asarray(y_hat).ravel()})
                frame = pd.concat([names, frame], axis=1)
                frame['up'] = frame['y_orig']*(1 + tolerance)
                frame['down'] = frame['y_orig']*(1 - tolerance)
                frame['qc'] = 'out'
                frame['dataset'] = label
                frame.loc[(frame['y_pred'] <= frame.up) & (frame['y_pred'] >= frame.down), 'qc'] = 'in'
                shares = frame.qc.value_counts(normalize=True)
                # No 'in' entry exists when every prediction misses the band.
                return frame, np.float64(shares.get('in', 0.0)).round(2)

            train, trainqc = _qc_frame(y_train_wnames, y_train, model.predict(X_train), 'train')
            test, testqc = _qc_frame(y_test_wnames, y_test, model.predict(X_test), 'test')
            df = pd.concat([train, test])
            df['y_pred'] = df['y_pred'].astype('float')

            result = {'result':df, 'testqc':testqc, 'trainqc':trainqc, 'train_df':X_train.columns, 'model': model}
            if display_flag == 'display':
                print(f'features dataset: {list(X_train.columns)}')
            # The score line is printed for every display_flag value.
            print('train "in":', '{:.2f}'.format(result['trainqc']),
                'test "in":', '{:.2f}'.format(result['testqc']),
                '\t', model_name)
            return result

        print(f'Cluster list is: {cluster_list}')
        target = 'khtst'
        model1_kh = model_prediction_split(dataset, RandomForestRegressor(n_jobs=-1, random_state=42), target, tolerance, 'RandomForestRegressor','display')
        model2_kh = model_prediction_split(dataset, BayesianRidge(), target, tolerance, 'BayesianRidge', 'dont_display')
        model3_kh = model_prediction_split(dataset, XGBRegressor(n_jobs=-1, random_state=42, verbosity=0), target, tolerance, 'XGBRegressor', 'dont_display')
        model4_kh = model_prediction_split(dataset, CatBoostRegressor(random_state=42, verbose=False), target, tolerance, 'CatBoostRegressor', 'dont_display')
        model5_kh = model_prediction_split(dataset, AdaBoostRegressor(random_state=42), target, tolerance, 'AdaBoostRegressor', 'dont_display')
        model6_kh = model_prediction_split(dataset, LGBMRegressor(n_jobs=-1, random_state=42, verbose=0, verbosity=-1), target, tolerance, 'LGBMRegressor', 'dont_display')

        # xplot_qc2(model1_kh['result'], model1_kh['trainqc'], model1_kh['testqc'], 'y_orig', 'y_pred', 27000, tolerance, 0, 3, f'RandomForestRegressor {cluster_list}')
        # xplot_qc2(model2_kh['result'], model2_kh['trainqc'], model2_kh['testqc'], 'y_orig', 'y_pred', 27000, tolerance, 0, 3, f'BayesianRidge {cluster_list}')
        # xplot_qc2(model3_kh['result'], model3_kh['trainqc'], model3_kh['testqc'], 'y_orig', 'y_pred', 27000, tolerance, 0, 3, f'XGBRegressor {cluster_list}')
        xplot_qc2(model4_kh['result'], model4_kh['trainqc'], model4_kh['testqc'], 'y_orig', 'y_pred', 27000, tolerance, 0, 3, f'CatBoostRegressor {cluster_list}')
        # xplot_qc2(model5_kh['result'], model5_kh['trainqc'], model5_kh['testqc'], 'y_orig', 'y_pred', 27000, tolerance, 0, 3, f'AdaBoostRegressor {cluster_list}')
        # xplot_qc2(model6_kh['result'], model6_kh['trainqc'], model6_kh['testqc'], 'y_orig', 'y_pred', 27000, tolerance, 0, 3, f'LGBMRegressor {cluster_list}')
        return model4_kh
    model_khtst = run_khtst_pred_split(phitpred_khtst, cluster_list, 0.25)
    result = {'khtst_pred':model_khtst, 'khtst_data':phitpred_khtst, 'phit_pred':model['result']}
    return result
# Run the workflow for cluster 0 only; the returned dict carries the keys
# 'khtst_pred', 'khtst_data' and 'phit_pred'.
test_0 = khtst_workflow(cluster_list = [0])

In [None]:
def pairplot_special(dataset, xsize, ysize, flag=1):
    """Draw a seaborn PairGrid of `dataset` annotated with Pearson r.

    Upper triangle: scatter plots. Diagonal: histograms. Lower triangle:
    KDE contours with the Pearson correlation coefficient written in the
    corner of each panel.

    Parameters
    ----------
    dataset : DataFrame whose columns are plotted against each other.
    xsize, ysize : figure width and height in inches.
    flag : the grid is drawn only when this equals 1; any other value makes
        the call a no-op.
    """
    if flag != 1:
        return

    def corrfunc(x, y, **kws):
        # Annotate the current axes with the correlation of the two columns.
        r, _ = stats.pearsonr(x, y)
        ax = plt.gca()
        ax.annotate("r = {:.2f}".format(r),
                    xy=(.1, .9), xycoords=ax.transAxes)

    sns.set_context(rc={'axes.labelsize':10, 'lines.linewidth': 0.75})
    g = sns.PairGrid(dataset)
    # `PairGrid.fig` is deprecated in favour of `.figure`; fall back to the
    # old attribute on seaborn releases that predate it.
    figure = getattr(g, 'figure', None)
    if figure is None:
        figure = g.fig
    figure.set_size_inches(xsize, ysize)
    g.set(xticklabels=[], yticklabels=[])
    g.map_upper(plt.scatter, s=10, alpha=0.5)
    # `distplot` is deprecated; `histplot` is its replacement for plain
    # histograms. Keep the old call only where `histplot` does not exist.
    if hasattr(sns, 'histplot'):
        g.map_diag(sns.histplot)
    else:
        g.map_diag(sns.distplot, kde=False)
    g.map_lower(sns.kdeplot, cmap="Blues_d")
    g.map_lower(corrfunc)
# Columns removed before plotting: identifiers, offset-well names and
# distances, the vsh / htst offset features and the cluster label.
# Whatever remains in 'khtst_data' is shown in a 7 x 7 inch pair plot.
drop_lst_X = ['well','FORMATION_up',    'well_1', 'well_2', 'well_3',
                                        'dist_1', 'dist_2', 'dist_3', 'vsh_wavg_1', 'htst_sum_1', 'vsh_wavg_2', 'htst_sum_2', 'vsh_wavg_3', 'htst_sum_3', 'cluster']
pairplot_special(test_0['khtst_data'].drop(drop_lst_X, axis=1), 7, 7, 1)

#### Cluster 1

In [None]:
def khtst_workflow(cluster_list):

    def dataset_for_spatial_prediction(dataset_full, dataset_cluster, offset_qty, cluster_algo, cluster_list):
            
            def joining_coordinates(dataset_full, dataset_cluster, cluster_algo, cluster_list):
                coordinates = dataset_full.groupby(['well','FORMATION_up'])[['X_mean','Y_mean']].apply(lambda x: x.iloc[0]).reset_index()
                dataset_cluster = dataset_cluster[(dataset_cluster[cluster_algo].isin(cluster_list))]
                result = dataset_cluster.set_index(['well','FORMATION_up']).join(coordinates.set_index(['well','FORMATION_up'])).reset_index()
                coordinates = result[['well','FORMATION_up', 'X_mean', 'Y_mean']].groupby(['well','FORMATION_up']).apply(lambda x: x.iloc[0]).reset_index(drop=True)
                return coordinates, result
            coordinates, dataset_cluster_xy = joining_coordinates(dataset_full, dataset_cluster, cluster_algo, cluster_list)
            coordinates = coordinates[~coordinates.well.isin(['A14Y'])]

            def well_distance_calculation(coordinates, fm):
                """Return the well-to-well distance cross-table for formation `fm`.

                The output has a leading 'well' column followed by one column
                per well of that formation; each cell is the Euclidean distance
                between the X_mean / Y_mean positions of the row and column wells.
                """
                in_formation = coordinates[coordinates.FORMATION_up == fm]
                well_labels = list(in_formation.well)
                pairwise = euclidean_distances(in_formation[['X_mean', 'Y_mean']])
                crosstable = pd.DataFrame(pairwise, columns=well_labels)
                # Rows follow the same order as the columns, so the labels can
                # be placed positionally in front of the distance columns.
                crosstable.insert(0, 'well', in_formation.well.to_numpy())
                return crosstable
            well_dist_crosstable_8 = well_distance_calculation(coordinates, 'Balakhany VIII')

            def offset_well_names_dist(dataset, offset_qty):
                """List the `offset_qty` nearest wells and their distances per well.

                Parameters
                ----------
                dataset : distance cross-table with a 'well' column followed by
                    one distance column per well (row i describes well i).
                offset_qty : number of neighbouring wells to keep.

                Returns
                -------
                DataFrame with columns 'well_1'..'well_N', 'dist_1'..'dist_N'
                and 'well' (N = offset_qty), one row per well, neighbours
                ordered from nearest to farthest.
                """
                well_titles = ['well_' + str(num+1) for num in range(offset_qty)]
                dist_titles = ['dist_' + str(num+1) for num in range(offset_qty)]

                records = []
                for ind in range(len(dataset.well.unique())):
                    row = dataset.iloc[ind]
                    target_well = row['well']
                    # Drop the target well by name rather than assuming it
                    # sorts first: a co-located well also has distance 0 and
                    # could otherwise be mistaken for the target.
                    distances = pd.to_numeric(row.drop(labels=['well', target_well], errors='ignore'))
                    nearest = distances.sort_values(kind='mergesort').iloc[:offset_qty]

                    record = dict(zip(well_titles, nearest.index))
                    record.update(zip(dist_titles, nearest.values))
                    record['well'] = target_well
                    records.append(record)
                return pd.DataFrame(records, columns=well_titles + dist_titles + ['well'])
            well_dist_data8 = offset_well_names_dist(well_dist_crosstable_8, offset_qty)

            def offset_wells_features_calculation(dataset_dist, dataset_clusters, cluster_algo, cluster_list, fm):
                """Add thickness-weighted features of each offset well.

                For every 'well_<k>' column of `dataset_dist` the rows of that
                offset well inside `cluster_list` are summarised into
                'phit_wavg_<k>' and 'vsh_wavg_<k>' (htst-weighted means of
                'phit_avg' / 'vsh_avg') and 'htst_sum_<k>' (sum of 'htst').

                Parameters
                ----------
                dataset_dist : frame with 'well' and the 'well_<k>' / 'dist_<k>' columns.
                dataset_clusters : frame with 'well', 'phit_avg', 'vsh_avg',
                    'htst' and the cluster-label column named by `cluster_algo`.
                cluster_algo : name of the cluster-label column.
                cluster_list : cluster labels to include.
                fm : value written to the 'FORMATION_up' column of the output.

                Returns
                -------
                `dataset_dist` rows with the feature columns and 'FORMATION_up'
                appended. An offset well without rows in the selected clusters
                yields NaN weighted means (0 / 0).
                """
                # The cluster filter does not depend on the well, so apply it once.
                in_clusters = dataset_clusters[dataset_clusters[cluster_algo].isin(cluster_list)]
                df_lst = []
                for wellname in dataset_dist.well.unique():
                    # Copy so the new columns are written to an independent
                    # frame instead of a slice of `dataset_dist`.
                    data = dataset_dist[dataset_dist.well == wellname].copy()
                    offset_cols = [col for col in data.columns if 'well_' in col]
                    for cc, col in enumerate(offset_cols, start=1):
                        offset_wellname = data[col].values[0]
                        data_cluster = in_clusters[in_clusters.well == offset_wellname]
                        htst_total = data_cluster['htst'].sum()
                        data['phit_wavg_' + str(cc)] = ((data_cluster['phit_avg'] * data_cluster['htst']).sum()) / htst_total
                        data['vsh_wavg_' + str(cc)] = ((data_cluster['vsh_avg'] * data_cluster['htst']).sum()) / htst_total
                        data['htst_sum_' + str(cc)] = htst_total
                    df_lst.append(data)
                result = pd.concat(df_lst).reset_index(drop=True)
                result['FORMATION_up'] = fm
                return result
            well_features8 = offset_wells_features_calculation(well_dist_data8, dataset_cluster, cluster_algo, cluster_list, 'Balakhany VIII')

            def target_wells_variable_calculation(dataset_dist, dataset_clusters, cluster_algo, cluster_list, fm):
                """Compute the htst-weighted mean 'phit_avg' of every target well.

                Returns one row per unique well of `dataset_dist` with the
                columns 'well', 'FORMATION_up' (set to `fm`) and
                'phit_wavg_target', using only the rows of `dataset_clusters`
                whose `cluster_algo` label is in `cluster_list`.
                """
                def _target_row(wellname):
                    # Rows of this well that belong to the selected clusters.
                    well_rows = dataset_clusters[(dataset_clusters.well == wellname) &
                                                 (dataset_clusters[cluster_algo].isin(cluster_list))]
                    weighted_phit = ((well_rows['phit_avg'] * well_rows['htst']).sum()) / (well_rows['htst'].sum())
                    return pd.DataFrame({'well': [wellname], 'FORMATION_up': [fm], 'phit_wavg_target': [weighted_phit]})

                per_well = [_target_row(wellname) for wellname in dataset_dist.well.unique()]
                return pd.concat(per_well).reset_index(drop=True)
            well_target8 = target_wells_variable_calculation(well_dist_data8, dataset_cluster, cluster_algo, cluster_list, 'Balakhany VIII')
            
            dataset8 = well_target8.set_index(['well','FORMATION_up']).join(well_features8.set_index(['well','FORMATION_up'])).reset_index()

            result = {'dataset8':dataset8, 'cluster_xy':dataset_cluster_xy, 'well_dist8':well_dist_data8, 'coordinates':coordinates,
                    'target8':well_target8, 'feature8':well_features8, 'dist_crosstable8':well_dist_crosstable_8}
            return result
    input_ph8 = dataset_for_spatial_prediction(df_bal_net2_kh, data_clustered8, 3, 'kmeans', cluster_list)['dataset8']
    print(f'Dataset features {textwrap.fill(str(list(input_ph8.columns)), width=150)}')

    def run_phit_pred_split(dataset, cluster_list, tolerance):
        def model_prediction_split(dataset, selected_model, target, tolerance, model_name, display_flag='display'):
            """Fit a scaled model on a fixed random split and score it by tolerance.

            Parameters
            ----------
            dataset : DataFrame with 'well', 'FORMATION_up', 'well_1',
                'well_2', 'well_3', the `target` column and the feature columns.
            selected_model : estimator placed after StandardScaler in a pipeline.
            target : name of the column to predict.
            tolerance : a prediction is tagged 'in' when it lies between
                y_orig * (1 - tolerance) and y_orig * (1 + tolerance).
            model_name : label appended to the printed score line.
            display_flag : 'display' prints the feature list and both scores;
                any other value prints the test score only.

            Returns
            -------
            dict with 'result' (train and test rows with qc tags), 'testqc' and
            'trainqc' (share of 'in' rows, rounded to 2 decimals), 'train_df'
            (feature columns) and 'model' (the fitted pipeline).
            """
            drop_lst_X = ['well','FORMATION_up', 'well_1', 'well_2', 'well_3']
            drop_lst_y = ['well','FORMATION_up']

            X = dataset.drop(target, axis=1)
            y = dataset[['well','FORMATION_up', target]]

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=55)

            y_train_wnames = y_train[['well','FORMATION_up']].reset_index(drop=True)
            y_test_wnames = y_test[['well','FORMATION_up']].reset_index(drop=True)

            X_train = X_train.drop(drop_lst_X, axis=1)
            X_test = X_test.drop(drop_lst_X, axis=1)
            # Flatten the targets once: estimators expect a 1-D y and the same
            # arrays are reused as the 'y_orig' column below.
            y_train = y_train.drop(drop_lst_y, axis=1).to_numpy().ravel()
            y_test = y_test.drop(drop_lst_y, axis=1).to_numpy().ravel()

            model = Pipeline([("scaler",StandardScaler()),("model", selected_model)])
            model.fit(X_train, y_train)

            def _qc_frame(names, y_true, y_hat, label):
                """Build the per-row frame for one subset and its 'in' share."""
                frame = pd.DataFrame({'y_orig': y_true, 'y_pred': np.asarray(y_hat).ravel()})
                frame = pd.concat([names, frame], axis=1)
                frame['up'] = frame['y_orig']*(1 + tolerance)
                frame['down'] = frame['y_orig']*(1 - tolerance)
                frame['qc'] = 'out'
                frame['dataset'] = label
                frame.loc[(frame['y_pred'] <= frame.up) & (frame['y_pred'] >= frame.down), 'qc'] = 'in'
                shares = frame.qc.value_counts(normalize=True)
                # No 'in' entry exists when every prediction misses the band.
                return frame, np.float64(shares.get('in', 0.0)).round(2)

            train, trainqc = _qc_frame(y_train_wnames, y_train, model.predict(X_train), 'train')
            test, testqc = _qc_frame(y_test_wnames, y_test, model.predict(X_test), 'test')
            df = pd.concat([train, test])
            df['y_pred'] = df['y_pred'].astype('float')

            result = {'result':df, 'testqc':testqc, 'trainqc':trainqc, 'train_df':X_train.columns, 'model': model}
            if display_flag == 'display':
                print('features dataset: \n', list(X_train.columns))
                print('train "in":', '{:.2f}'.format(result['trainqc']),
                    'test "in":', '{:.2f}'.format(result['testqc']),
                    '\t', model_name)
            else:
                print('test "in":', '{:.2f}'.format(result['testqc']),'\t', model_name)
            return result
        def xplot_qc2(data, qc_train, qc_test, y_orig, y_pred, max_val, rng, margin, round, comment):
            """Show train and test actual-vs-predicted scatters side by side.

            Parameters
            ----------
            data : DataFrame with 'well', 'FORMATION_up', 'qc', 'dataset'
                ('train' / 'test') and the columns named by `y_orig`, `y_pred`.
            qc_train, qc_test : values shown in the two subplot titles.
            y_orig, y_pred : names of the actual / predicted columns.
            max_val : x extent of the two guide lines.
            rng : relative half-width of the band between the guide lines.
            margin : absolute offset added above / below the guide lines.
            round : number of decimals applied to both value columns
                (the name shadows the builtin; kept because it is part of the
                signature).
            comment : figure title.

            Returns the result of `fig.show()`.
            """
            data = data.round({y_orig: round, y_pred: round})
            up_range = rng + 1
            dwn_range = 1 - rng
            colors = {'in': 'green', 'out': 'red'}

            def _points(subset):
                # Coordinates, colours and hover data all come from the same
                # subset so every marker shows its own row on hover.
                return go.Scatter(
                    x=subset[y_orig], y=subset[y_pred],
                    mode='markers',
                    marker=dict(color=[colors[qc] for qc in subset.qc], size=6, opacity=0.75,
                                line=dict(color='rgb(47, 57, 61)', width=0.5)),
                    customdata=subset[['well', y_orig, y_pred, 'FORMATION_up']],
                    hovertemplate="w:%{customdata[0]},a:%{customdata[1]}, p:%{customdata[2]}, f:%{customdata[3]}<extra></extra>",
                )

            scatter_train = _points(data[data.dataset == 'train'])
            scatter_test = _points(data[data.dataset == 'test'])
            line_trace_up = go.Scatter(x=[0, max_val], y=[0 + margin, max_val*up_range + margin], mode='lines+markers', line=dict(color='blue'))
            line_trace_dw = go.Scatter(x=[0, max_val], y=[0 - margin, max_val*dwn_range - margin], mode='lines+markers', marker=dict(color='blue'))

            fig = make_subplots(rows=1, cols=2, subplot_titles=(f'train ds {qc_train}', f'test ds {qc_test}'))
            for col, scatter in ((1, scatter_train), (2, scatter_test)):
                fig.add_trace(scatter, row=1, col=col)
                fig.add_trace(line_trace_up, row=1, col=col)
                fig.add_trace(line_trace_dw, row=1, col=col)
                fig.update_xaxes(title_text='actual', row=1, col=col)
                fig.update_yaxes(title_text='predict', row=1, col=col)
            fig.update_layout(title_text=(comment), width=700, height=350,
                              margin=dict(l=10,r=10,b=10,t=50), showlegend=False)
            return fig.show()

        print(f'Cluster list is: {cluster_list}')   
        target = 'phit_wavg_target'
        model1_ph = model_prediction_split(dataset, RandomForestRegressor(n_jobs=-1, random_state=42), target, tolerance, 'RandomForestRegressor','display')
        model2_ph = model_prediction_split(dataset, BayesianRidge(), target, tolerance, 'BayesianRidge', 'dont_display')
        model3_ph = model_prediction_split(dataset, XGBRegressor(n_jobs=-1, random_state=42, verbosity=0), target, tolerance, 'XGBRegressor', 'dont_display')
        model4_ph = model_prediction_split(dataset, CatBoostRegressor(random_state=42, verbose=False), target, tolerance, 'CatBoostRegressor', 'dont_display')
        model5_ph = model_prediction_split(dataset, AdaBoostRegressor(random_state=42), target, tolerance, 'AdaBoostRegressor', 'dont_display')
        model6_ph = model_prediction_split(dataset, LGBMRegressor(n_jobs=-1, random_state=42, verbose=0, verbosity=-1), target, tolerance, 'LGBMRegressor', 'dont_display')

        xplot_qc2(model1_ph['result'], model1_ph['trainqc'], model1_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'RandomForestRegressor {cluster_list}')
        # xplot_qc2(model2_ph['result'], model2_ph['trainqc'], model2_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'BayesianRidge {cluster_list}')
        # xplot_qc2(model3_ph['result'], model3_ph['trainqc'], model3_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'XGBRegressor {cluster_list}')
        # xplot_qc2(model4_ph['result'], model4_ph['trainqc'], model4_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'CatBoostRegressor {cluster_list}')
        # xplot_qc2(model5_ph['result'], model5_ph['trainqc'], model5_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'AdaBoostRegressor {cluster_list}')
        # xplot_qc2(model6_ph['result'], model6_ph['trainqc'], model6_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'LGBMRegressor {cluster_list}')
        return model1_ph
    model_split = run_phit_pred_split(input_ph8, cluster_list, tolerance=0.05)['result']

    def run_phit_pred_1_to_all(dataset, cluster_list, tolerance):
        def model_prediction_1_to_all(dataset, selected_model, target, tolerance, model_name):
            """Predict `target` for every well with a leave-one-well-out loop.

            For each unique well, a StandardScaler + `selected_model` pipeline is
            refit on all other wells and used to predict every row of the
            held-out well.

            Parameters
            ----------
            dataset : DataFrame holding 'well', 'FORMATION_up', 'well_1',
                'well_2', 'well_3', the `target` column and the feature columns.
            selected_model : estimator placed after the scaler in the pipeline;
                the same instance is refit on every fold.
            target : name of the column to predict.
            tolerance : relative band; a prediction is tagged 'in' when it lies
                between y_orig * (1 - tolerance) and y_orig * (1 + tolerance).
            model_name : label printed before the loop starts.

            Returns
            -------
            dict with keys 'result' (per-row predictions with qc tags),
            'res_full' (`dataset` joined with the predictions as 'phit_pred'),
            'testqc' (share of 'in' rows, rounded to 2 decimals), 'train_df'
            (feature columns of the last fold) and 'model' (pipeline fitted on
            the last fold).
            """
            id_cols = ['well', 'FORMATION_up']
            non_feature_cols = [target] + id_cols + ['well_1', 'well_2', 'well_3']
            print(model_name)
            df_lst = []
            for wellname in tqdm(dataset.well.unique()):
                train = dataset[dataset.well != wellname]
                test = dataset[dataset.well == wellname]
                X_train = train.drop(columns=non_feature_cols)
                X_test = test.drop(columns=non_feature_cols)

                model = Pipeline([("scaler", StandardScaler()), ("model", selected_model)])
                # A 1-D target avoids the column-vector conversion inside the estimator.
                model.fit(X_train, train[target])

                # Keep every row of the held-out well, not only the first one,
                # so actual and predicted values stay aligned row by row.
                fold = test[id_cols].reset_index(drop=True).assign(
                    y_orig=test[target].to_numpy(),
                    y_pred=np.asarray(model.predict(X_test)).ravel(),
                )
                df_lst.append(fold)

            result = pd.concat(df_lst).reset_index(drop=True)
            result['up'] = result['y_orig']*(1 + tolerance)
            result['down'] = result['y_orig']*(1 - tolerance)
            result['qc'] = 'out'
            result.loc[(result['y_pred'] <= result.up) & (result['y_pred'] >= result.down), 'qc'] = 'in'
            resultqc = result.qc.value_counts(normalize=True)
            # value_counts has no 'in' entry when nothing falls inside the band.
            share_in = np.float64(resultqc.get('in', 0.0)).round(2)

            phit_pred = result[['well','FORMATION_up','y_pred']].rename(columns={'y_pred':'phit_pred'})
            dataset_pred = dataset.set_index(id_cols).join(phit_pred.set_index(id_cols), how='inner').reset_index()

            result_dict = {'result':result, 'res_full':dataset_pred, 'testqc':share_in, 'train_df':X_train.columns, 'model': model}
            return result_dict
        def xplot_qc_1_to_all(data, qc_test, y_orig, y_pred, max_val, rng, margin, round, comment):
            """Show an actual-vs-predicted scatter with the tolerance guide lines.

            Parameters
            ----------
            data : DataFrame with 'well', 'FORMATION_up', 'qc' and the columns
                named by `y_orig` and `y_pred`.
            qc_test : value shown in the subplot title.
            y_orig, y_pred : names of the actual / predicted columns.
            max_val : x extent of the two guide lines.
            rng : relative half-width of the band between the guide lines.
            margin : absolute offset added above / below the guide lines.
            round : number of decimals applied to both value columns
                (the name shadows the builtin; kept because it is part of the
                signature).
            comment : figure title.

            Returns the result of `fig.show()`.
            """
            ds_test = data.round({y_orig: round, y_pred: round})
            up_range = rng + 1
            dwn_range = 1 - rng
            colors = {'in': 'green', 'out': 'red'}
            qc_colors = [colors[qc] for qc in ds_test.qc]
            # Plot the columns named by the parameters so the points always
            # agree with the hover data built from the same names.
            scatter_test = go.Scatter(
                x=ds_test[y_orig], y=ds_test[y_pred],
                mode='markers',
                marker=dict(color=qc_colors, size=6, opacity=0.75, line=dict(color='rgb(47, 57, 61)', width=0.5)),
                customdata=ds_test[['well', y_orig, y_pred, 'FORMATION_up']],
                hovertemplate="w:%{customdata[0]},a:%{customdata[1]}, p:%{customdata[2]}, f:%{customdata[3]}<extra></extra>",
            )
            line_trace_up = go.Scatter(x=[0, max_val], y=[0 + margin, max_val*up_range + margin], mode='lines+markers', line=dict(color='blue'))
            line_trace_dw = go.Scatter(x=[0, max_val], y=[0 - margin, max_val*dwn_range - margin], mode='lines+markers', marker=dict(color='blue'))
            fig = make_subplots(rows=1, cols=1, subplot_titles=(f'test qc {qc_test}',))
            for trace in (scatter_test, line_trace_up, line_trace_dw):
                fig.add_trace(trace, row=1, col=1)
            fig.update_xaxes(title_text='actual', row=1, col=1)
            fig.update_yaxes(title_text='predict', row=1, col=1)
            fig.update_layout(title_text=(comment), width=350, height=350,
                              margin=dict(l=10,r=10,b=10,t=50), showlegend=False)
            return fig.show()

        target = 'phit_wavg_target'
        model1_ph = model_prediction_1_to_all(dataset, RandomForestRegressor(n_jobs=-1, random_state=42), target, tolerance,'RandomForestRegressor')
        # model2_ph = model_prediction_1_to_all(dataset, BayesianRidge(), target, 0.05, 'BayesianRidge')
        # model3_ph = model_prediction_1_to_all(dataset, XGBRegressor(n_jobs=-1, random_state=42, verbosity=0), target, 0.05, 'XGBRegressor')
        # model4_ph = model_prediction_1_to_all(dataset, CatBoostRegressor(random_state=42, verbose=False), target, 0.05,'CatBoostRegressor')
        # model5_ph = model_prediction_1_to_all(dataset, AdaBoostRegressor(random_state=42), target, 0.05, 'AdaBoostRegressor')
        # model6_ph = model_prediction_1_to_all(dataset, LGBMRegressor(n_jobs=-1, random_state=42, verbose=0, verbosity=-1), target, 0.05, 'LGBMRegressor')

        xplot_qc_1_to_all(model1_ph['result'], model1_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'RandomForestRegressor {cluster_list}')
        # xplot_qc_1_to_all(model2_ph['result'], model2_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'BayesianRidge {cluster_list}')
        # xplot_qc_1_to_all(model3_ph['result'], model3_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'XGBRegressor {cluster_list}')
        # xplot_qc_1_to_all(model4_ph['result'], model4_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'CatBoostRegressor {cluster_list}')
        # xplot_qc_1_to_all(model5_ph['result'], model5_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'AdaBoostRegressor {cluster_list}')
        # xplot_qc_1_to_all(model6_ph['result'], model6_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'LGBMRegressor {cluster_list}')
        return model1_ph
    model = run_phit_pred_1_to_all(input_ph8, cluster_list, tolerance=0.05)

    def concat_prediction_to_khtst_df(data_pred, data_khtst, data_main, cluster_algo):
        """
        Combine the summed khtst per well/formation with the predicted porosity and the main feature table.

        data_pred    : dict whose 'result' frame holds 'well', 'FORMATION_up' and 'y_pred'
        data_khtst   : frame with a 'khtst' column and the cluster label column `cluster_algo`
        data_main    : feature table keyed by 'well' and 'FORMATION_up' (inner-joined)
        cluster_algo : name of the cluster label column in data_khtst
        Uses `cluster_list` from the enclosing scope to select the clusters.
        'phit_pred', 'phit_wavg_target' and 'khtst' are re-inserted at position 19, in that order.
        """
        keys = ['well','FORMATION_up']
        predicted = data_pred['result'][keys + ['y_pred']].rename(columns={'y_pred':'phit_pred'})
        in_cluster = data_khtst[data_khtst[cluster_algo].isin(cluster_list)]
        khtst_total = in_cluster.groupby(keys)['khtst'].sum().reset_index()

        with_prediction = khtst_total.set_index(keys).join(predicted.set_index(keys)).reset_index()
        merged = with_prediction.set_index(keys).join(data_main.set_index(keys), how='inner').reset_index()

        for column in ('phit_pred', 'phit_wavg_target', 'khtst'):
            merged.insert(19, column, merged.pop(column))
        return merged
    phitpred_khtst = concat_prediction_to_khtst_df(model, data_clustered8, input_ph8, 'kmeans')
    print(f'Concat dataset features {textwrap.fill(str(list(phitpred_khtst.columns)), width=150)}')

    print('\nPrediction KHtst: ')
    def run_khtst_pred_split(dataset, cluster_list, tolerance):

        def model_prediction_split(dataset, selected_model, target, tolerance, model_name, display_flag='display'):
            """
            Fit `selected_model` on a random 67/33 split of `dataset` and QC its predictions of `target`.

            Column layout of `dataset` noted by the original author:
            'well', 'FORMATION_up', 'well_1', 'well_2', 'well_3', 'dist_1',
            'dist_2', 'dist_3', 'phit_wavg_1', 'vsh_wavg_1', 'htst_sum_1',
            'phit_wavg_2', 'vsh_wavg_2', 'htst_sum_2', 'phit_wavg_3', 'vsh_wavg_3',
            'htst_sum_3', 'phit_pred', 'phit_wavg_target', 'khtst'

            A prediction is flagged 'in' when it lies within y_orig*(1 - tolerance) .. y_orig*(1 + tolerance).

            Returns a dict with
                'result'   : train and test rows stacked (well, FORMATION_up, y_orig, y_pred, up, down, qc, dataset)
                'testqc'   : share of 'in' rows in the test part, rounded to 2 decimals (0.0 when none is 'in')
                'trainqc'  : same for the train part
                'train_df' : feature column names
                'model'    : the fitted model
            Prints the feature list when display_flag == 'display'; the QC shares are printed in any case.
            """
            id_cols = ['well','FORMATION_up']
            # identifiers, offset-well names/distances and 'phit_wavg_target' are not used as features
            drop_lst_X = id_cols + ['well_1', 'well_2', 'well_3', 'dist_1', 'dist_2','dist_3', 'phit_wavg_target']

            X = dataset.drop(target, axis=1)
            y = dataset[id_cols + [target]]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=55)
            X_train = X_train.drop(drop_lst_X, axis=1)
            X_test = X_test.drop(drop_lst_X, axis=1)

            # model = Pipeline([("scaler",StandardScaler()),("model", selected_model)])
            model = selected_model
            # pass a 1-D target; a single-column frame makes scikit-learn estimators warn about a column vector
            model.fit(X_train, y_train[target].to_numpy())

            def qc_table(y_part, X_part, label):
                # One QC table (identifiers, actual, predicted, band, flag) for one part of the split.
                table = y_part[id_cols].reset_index(drop=True)
                table['y_orig'] = y_part[target].to_numpy()
                table['y_pred'] = np.asarray(model.predict(X_part), dtype=float).ravel()
                table['up'] = table['y_orig']*(1 + tolerance)
                table['down'] = table['y_orig']*(1 - tolerance)
                table['qc'] = 'out'
                table['dataset'] = label
                table.loc[(table['y_pred'] <= table.up) & (table['y_pred'] >= table.down), 'qc'] = 'in'
                return table

            def share_in(table):
                # Fraction of rows flagged 'in'; unlike value_counts()['in'] this is 0.0, not a KeyError, when none is.
                return np.float64((table.qc == 'in').mean()).round(2)

            train = qc_table(y_train, X_train, 'train')
            test = qc_table(y_test, X_test, 'test')
            df = pd.concat([train, test])
            df['y_pred'] = df['y_pred'].astype('float')

            result = {'result':df, 'testqc':share_in(test), 'trainqc':share_in(train), 'train_df':X_train.columns, 'model': model}
            if display_flag == 'display':
                print(f'features dataset: {list(X_train.columns)}')
            # both display modes print the same QC line
            print('train "in":', '{:.2f}'.format(result['trainqc']),
                  'test "in":', '{:.2f}'.format(result['testqc']),
                  '\t', model_name)
            return result

        print(f'Cluster list is: {cluster_list}')
        target = 'khtst'
        model1_kh = model_prediction_split(dataset, RandomForestRegressor(n_jobs=-1, random_state=42), target, tolerance, 'RandomForestRegressor','display')
        model2_kh = model_prediction_split(dataset, BayesianRidge(), target, tolerance, 'BayesianRidge', 'dont_display')
        model3_kh = model_prediction_split(dataset, XGBRegressor(n_jobs=-1, random_state=42, verbosity=0), target, tolerance, 'XGBRegressor', 'dont_display')
        model4_kh = model_prediction_split(dataset, CatBoostRegressor(random_state=42, verbose=False), target, tolerance, 'CatBoostRegressor', 'dont_display')
        model5_kh = model_prediction_split(dataset, AdaBoostRegressor(random_state=42), target, tolerance, 'AdaBoostRegressor', 'dont_display')
        model6_kh = model_prediction_split(dataset, LGBMRegressor(n_jobs=-1, random_state=42, verbose=0, verbosity=-1), target, tolerance, 'LGBMRegressor', 'dont_display')

        # xplot_qc2(model1_kh['result'], model1_kh['trainqc'], model1_kh['testqc'], 'y_orig', 'y_pred', 27000, tolerance, 0, 3, f'RandomForestRegressor {cluster_list}')
        # xplot_qc2(model2_kh['result'], model2_kh['trainqc'], model2_kh['testqc'], 'y_orig', 'y_pred', 27000, tolerance, 0, 3, f'BayesianRidge {cluster_list}')
        # xplot_qc2(model3_kh['result'], model3_kh['trainqc'], model3_kh['testqc'], 'y_orig', 'y_pred', 27000, tolerance, 0, 3, f'XGBRegressor {cluster_list}')
        xplot_qc2(model4_kh['result'], model4_kh['trainqc'], model4_kh['testqc'], 'y_orig', 'y_pred', 27000, tolerance, 0, 3, f'CatBoostRegressor {cluster_list}')
        # xplot_qc2(model5_kh['result'], model5_kh['trainqc'], model5_kh['testqc'], 'y_orig', 'y_pred', 27000, tolerance, 0, 3, f'AdaBoostRegressor {cluster_list}')
        # xplot_qc2(model6_kh['result'], model6_kh['trainqc'], model6_kh['testqc'], 'y_orig', 'y_pred', 27000, tolerance, 0, 3, f'LGBMRegressor {cluster_list}')
        return model4_kh
    model_khtst = run_khtst_pred_split(phitpred_khtst, cluster_list, 0.25)
    result = {'khtst_pred':model_khtst, 'khtst_data':phitpred_khtst, 'phit_pred':model['result']}
    return result
# Run the workflow for cluster 1; the returned dict holds 'khtst_pred', 'khtst_data' and 'phit_pred'.
test_1 = khtst_workflow(cluster_list = [1])

In [None]:
def pairplot_special(dataset, xsize, ysize, flag=1):
    """
    Draw a seaborn PairGrid of `dataset`.

    Upper triangle: scatter plots; diagonal: histograms; lower triangle: KDE contours
    annotated with the Pearson correlation coefficient.

    dataset : DataFrame whose columns are plotted against each other
    xsize   : figure width in inches
    ysize   : figure height in inches
    flag    : the plot is drawn only when flag == 1; any other value does nothing
    Returns None. Note that sns.set_context changes the plotting context globally.
    """
    if flag != 1:
        return

    def corrfunc(x, y, **kws):
        # Write the Pearson r of the two plotted variables into the current axes.
        r, _ = stats.pearsonr(x, y)
        ax = plt.gca()
        ax.annotate("r = {:.2f}".format(r),
                    xy=(.1, .9), xycoords=ax.transAxes)

    sns.set_context(rc={'axes.labelsize':10, 'lines.linewidth': 0.75})
    g = sns.PairGrid(dataset)
    g.fig.set_size_inches(xsize,ysize)
    g.set(xticklabels=[], yticklabels=[])
    g.map_upper(plt.scatter, s=10, alpha=0.5)
    # sns.distplot is deprecated; histplot is its replacement for a plain histogram.
    # Fall back to distplot only on seaborn versions that do not have histplot yet.
    if hasattr(sns, 'histplot'):
        g.map_diag(sns.histplot)
    else:
        g.map_diag(sns.distplot, kde=False)
    g.map_lower(sns.kdeplot, cmap="Blues_d")
    g.map_lower(corrfunc)
# Columns excluded from the pair plot: identifiers, offset-well names and distances,
# the vsh/htst features of the offset wells and the 'cluster' column.
drop_lst_X = ['well','FORMATION_up',    'well_1', 'well_2', 'well_3',
                                        'dist_1', 'dist_2', 'dist_3', 'vsh_wavg_1', 'htst_sum_1', 'vsh_wavg_2', 'htst_sum_2', 'vsh_wavg_3', 'htst_sum_3', 'cluster']
# Pair plot (7 x 7 inches) of the remaining columns of the cluster-1 khtst dataset.
pairplot_special(test_1['khtst_data'].drop(drop_lst_X, axis=1), 7, 7, 1)

#### Cluster 2

In [None]:
def khtst_workflow(cluster_list):

    def dataset_for_spatial_prediction(dataset_full, dataset_cluster, offset_qty, cluster_algo, cluster_list):
            
            def joining_coordinates(dataset_full, dataset_cluster, cluster_algo, cluster_list):
                coordinates = dataset_full.groupby(['well','FORMATION_up'])[['X_mean','Y_mean']].apply(lambda x: x.iloc[0]).reset_index()
                dataset_cluster = dataset_cluster[(dataset_cluster[cluster_algo].isin(cluster_list))]
                result = dataset_cluster.set_index(['well','FORMATION_up']).join(coordinates.set_index(['well','FORMATION_up'])).reset_index()
                coordinates = result[['well','FORMATION_up', 'X_mean', 'Y_mean']].groupby(['well','FORMATION_up']).apply(lambda x: x.iloc[0]).reset_index(drop=True)
                return coordinates, result
            coordinates, dataset_cluster_xy = joining_coordinates(dataset_full, dataset_cluster, cluster_algo, cluster_list)
            coordinates = coordinates[~coordinates.well.isin(['A14Y'])]

            def well_distance_calculation(coordinates, fm):
                """
                Build the well-to-well distance table for one formation.

                coordinates : frame with 'well', 'FORMATION_up', 'X_mean', 'Y_mean'
                fm          : formation name to select
                Returns a frame whose first column is 'well', followed by one column of
                Euclidean distances per well of that formation.
                """
                fm_rows = coordinates[coordinates.FORMATION_up == fm]
                distances = euclidean_distances(fm_rows[['X_mean', 'Y_mean']])
                crosstable = pd.DataFrame(distances, columns=list(fm_rows.well))
                # well names re-indexed 0..n-1 so that they line up with the matrix rows
                names = fm_rows.well.reset_index().drop(['index'], axis=1)
                return crosstable.join(names).set_index('well').reset_index()
            well_dist_crosstable_8 = well_distance_calculation(coordinates, 'Balakhany VIII')

            def offset_well_names_dist(dataset, offset_qty):
                """
                For every row of a well-to-well distance table pick the closest wells.

                dataset    : distance cross-table; presumably the output of well_distance_calculation
                             ('well' column first, then one distance column per well) - confirm at the caller
                offset_qty : number of neighbouring wells to keep for each well
                Returns one row per processed well: the neighbour names (well_1..), their
                distances (dist_1..) and the 'well' column.
                """
                df_lst = []
                # one iteration per unique well; rows are addressed by position
                for ind in range(len(dataset.well.unique())):
                    off_well_series = dataset.iloc[ind]
                    # Drop the first entry (assumed to be the 'well' label), sort by distance, keep the
                    # offset_qty+1 smallest and transpose into a single row.
                    # NOTE(review): sort_values(by=ind) relies on the row label being equal to its position.
                    off_well_selected = pd.DataFrame(off_well_series)[1:].sort_values(by=ind)[:offset_qty+1].T
                    # the closest column is taken as the well's own name (presumably distance 0 to itself)
                    off_well_selected['well'] = off_well_selected.columns[0]
                    # remove the column named after the well itself
                    off_well_selected = off_well_selected.drop(columns= off_well_selected.well, axis=1)

                    dist_titles = ['dist_' + str(num+1) for num in range(offset_qty)]
                    well_titles = ['well_' + str(num+1) for num in range(offset_qty)]

                    # rename the distance columns to dist_1.. and remember the original well names;
                    # the last column ('well') is skipped
                    col_names = []
                    for i in range(len(off_well_selected.columns[:-1])):
                        col = off_well_selected.columns[i]
                        col_names.append(col)
                        off_well_selected = off_well_selected.rename(columns={col:dist_titles[i]})

                    # one-row frame of the remembered names, renamed to well_1..
                    off_well_names = pd.DataFrame(col_names).T
                    col_names = []
                    for i in range(len(off_well_names.columns)):
                        col = off_well_names.columns[i]
                        col_names.append(col)
                        off_well_names = off_well_names.rename(columns={col:well_titles[i]})

                    # names and distances side by side
                    concat_well_data = pd.concat([off_well_names.reset_index(drop=True), off_well_selected.reset_index(drop=True)], axis=1)
                    df_lst.append(concat_well_data)
                result = pd.concat(df_lst).reset_index(drop=True)
                return result
            well_dist_data8 = offset_well_names_dist(well_dist_crosstable_8, offset_qty)

            def offset_wells_features_calculation(dataset_dist, dataset_clusters, cluster_algo, cluster_list, fm):
                """
                Add htst-weighted features of every offset well to the distance table.

                dataset_dist     : frame with a 'well' column and offset-well name columns containing 'well_'
                dataset_clusters : frame with 'well', 'phit_avg', 'vsh_avg', 'htst' and the label column `cluster_algo`
                cluster_algo     : name of the cluster label column
                cluster_list     : cluster labels taken into account
                fm               : value written into the 'FORMATION_up' column of the result
                For the k-th offset-well column the result gets
                    phit_wavg_k : sum(phit_avg * htst) / sum(htst)
                    vsh_wavg_k  : sum(vsh_avg * htst) / sum(htst)
                    htst_sum_k  : sum(htst)
                computed over the offset well's rows in the selected clusters.
                """
                # the cluster filter does not depend on the well, so apply it once
                in_clusters = dataset_clusters[dataset_clusters[cluster_algo].isin(cluster_list)]
                offset_cols = [col for col in dataset_dist.columns if 'well_' in col]

                df_lst = []
                for wellname in dataset_dist.well.unique():
                    # explicit copy: the new columns are written to this frame, not to a slice of dataset_dist
                    data = dataset_dist[dataset_dist.well == wellname].copy()
                    for cc, col in enumerate(offset_cols, start=1):
                        offset_wellname = data[col].values[0]
                        data_cluster = in_clusters[in_clusters.well == offset_wellname]
                        htst_total = data_cluster['htst'].sum()
                        data['phit_wavg_' + str(cc)] = ((data_cluster['phit_avg'] * data_cluster['htst']).sum()) / htst_total
                        data['vsh_wavg_' + str(cc)] = ((data_cluster['vsh_avg'] * data_cluster['htst']).sum()) / htst_total
                        data['htst_sum_' + str(cc)] = htst_total
                    df_lst.append(data)
                result = pd.concat(df_lst).reset_index(drop=True)
                result['FORMATION_up'] = fm
                return result
            well_features8 = offset_wells_features_calculation(well_dist_data8, dataset_cluster, cluster_algo, cluster_list, 'Balakhany VIII')

            def target_wells_variable_calculation(dataset_dist, dataset_clusters, cluster_algo, cluster_list, fm):
                """
                Compute the htst-weighted phit_avg of every well listed in `dataset_dist`.

                dataset_dist     : frame with a 'well' column
                dataset_clusters : frame with 'well', 'phit_avg', 'htst' and the label column `cluster_algo`
                cluster_algo     : name of the cluster label column
                cluster_list     : cluster labels taken into account
                fm               : value written into the 'FORMATION_up' column
                Returns one row per well with 'well', 'FORMATION_up' and 'phit_wavg_target'.
                """
                selected = dataset_clusters[cluster_algo].isin(cluster_list)
                per_well = []
                for wellname in dataset_dist.well.unique():
                    rows = dataset_clusters[(dataset_clusters.well == wellname) & selected]
                    weighted_phit = ((rows['phit_avg'] * rows['htst']).sum()) / (rows['htst'].sum())
                    per_well.append(pd.DataFrame({'well': [wellname],
                                                  'FORMATION_up': [fm],
                                                  'phit_wavg_target': [weighted_phit]}))
                return pd.concat(per_well).reset_index(drop=True)
            well_target8 = target_wells_variable_calculation(well_dist_data8, dataset_cluster, cluster_algo, cluster_list, 'Balakhany VIII')
            
            dataset8 = well_target8.set_index(['well','FORMATION_up']).join(well_features8.set_index(['well','FORMATION_up'])).reset_index()

            result = {'dataset8':dataset8, 'cluster_xy':dataset_cluster_xy, 'well_dist8':well_dist_data8, 'coordinates':coordinates,
                    'target8':well_target8, 'feature8':well_features8, 'dist_crosstable8':well_dist_crosstable_8}
            return result
    input_ph8 = dataset_for_spatial_prediction(df_bal_net2_kh, data_clustered8, 3, 'kmeans', cluster_list)['dataset8']
    print(f'Dataset features {textwrap.fill(str(list(input_ph8.columns)), width=150)}')

    def run_phit_pred_split(dataset, cluster_list, tolerance):
        def model_prediction_split(dataset, selected_model, target, tolerance, model_name, display_flag='display'):
            """
            Fit a StandardScaler + `selected_model` pipeline on a random 67/33 split and QC the predictions.

            dataset        : frame with 'well', 'FORMATION_up', 'well_1', 'well_2', 'well_3', the feature
                             columns and the `target` column
            selected_model : estimator used as the last pipeline step
            target         : name of the column to predict
            tolerance      : relative half-width of the acceptance band; a prediction is 'in' when it
                             lies within y_orig*(1 - tolerance) .. y_orig*(1 + tolerance)
            model_name     : label printed next to the QC shares
            display_flag   : 'display' prints the feature list and both shares, anything else only the test share

            Returns a dict with
                'result'   : train and test rows stacked (well, FORMATION_up, y_orig, y_pred, up, down, qc, dataset)
                'testqc'   : share of 'in' rows in the test part, rounded to 2 decimals (0.0 when none is 'in')
                'trainqc'  : same for the train part
                'train_df' : feature column names
                'model'    : the fitted pipeline
            """
            id_cols = ['well','FORMATION_up']
            # identifiers and offset-well names are not used as features
            drop_lst_X = id_cols + ['well_1', 'well_2', 'well_3']

            X = dataset.drop(target, axis=1)
            y = dataset[id_cols + [target]]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=55)
            X_train = X_train.drop(drop_lst_X, axis=1)
            X_test = X_test.drop(drop_lst_X, axis=1)

            model = Pipeline([("scaler",StandardScaler()),("model", selected_model)])
            # pass a 1-D target; a single-column frame makes scikit-learn estimators warn about a column vector
            model.fit(X_train, y_train[target].to_numpy())

            def qc_table(y_part, X_part, label):
                # One QC table (identifiers, actual, predicted, band, flag) for one part of the split.
                table = y_part[id_cols].reset_index(drop=True)
                table['y_orig'] = y_part[target].to_numpy()
                table['y_pred'] = np.asarray(model.predict(X_part), dtype=float).ravel()
                table['up'] = table['y_orig']*(1 + tolerance)
                table['down'] = table['y_orig']*(1 - tolerance)
                table['qc'] = 'out'
                table['dataset'] = label
                table.loc[(table['y_pred'] <= table.up) & (table['y_pred'] >= table.down), 'qc'] = 'in'
                return table

            def share_in(table):
                # Fraction of rows flagged 'in'; unlike value_counts()['in'] this is 0.0, not a KeyError, when none is.
                return np.float64((table.qc == 'in').mean()).round(2)

            train = qc_table(y_train, X_train, 'train')
            test = qc_table(y_test, X_test, 'test')
            df = pd.concat([train, test])
            df['y_pred'] = df['y_pred'].astype('float')

            result = {'result':df, 'testqc':share_in(test), 'trainqc':share_in(train), 'train_df':X_train.columns, 'model': model}
            if display_flag == 'display':
                print('features dataset: \n', list(X_train.columns))
                print('train "in":', '{:.2f}'.format(result['trainqc']),
                      'test "in":', '{:.2f}'.format(result['testqc']),
                      '\t', model_name)
            else:
                print('test "in":', '{:.2f}'.format(result['testqc']),'\t', model_name)
            return result
        def xplot_qc2(data, qc_train, qc_test, y_orig, y_pred, max_val, rng, margin, round, comment):
            """
            Show actual-vs-predicted cross-plots for the train and the test part side by side.

            data     : DataFrame with 'well', 'FORMATION_up', 'qc' ('in'/'out'), 'dataset' ('train'/'test')
                       and the two value columns
            qc_train : value written into the title of the train subplot
            qc_test  : value written into the title of the test subplot
            y_orig   : name of the column with the actual values (x axis)
            y_pred   : name of the column with the predicted values (y axis)
            max_val  : x coordinate at which both band lines end
            rng      : relative half-width of the band (line slopes are 1 + rng and 1 - rng)
            margin   : constant added to the upper line and subtracted from the lower one
            round    : number of decimals both value columns are rounded to before plotting
            comment  : figure title
            Returns the result of fig.show().
            """
            data = data.round({y_orig: round, y_pred: round})
            colors = {'in': 'green', 'out': 'red'}

            def qc_scatter(part):
                # Marker trace of one part; x, y, colours and hover data all come from the same rows.
                return go.Scatter(
                    x=part[y_orig], y=part[y_pred],
                    mode='markers',
                    marker=dict(color=[colors[qc] for qc in part.qc], size=6, opacity=0.75,
                                line=dict(color='rgb(47, 57, 61)', width=0.5)),
                    # taking customdata from the whole frame would pair test points with train wells in the hover text
                    customdata=part[['well', y_orig, y_pred, 'FORMATION_up']],
                    hovertemplate="w:%{customdata[0]},a:%{customdata[1]}, p:%{customdata[2]}, f:%{customdata[3]}<extra></extra>",
                )

            up_range = rng + 1
            dwn_range = 1 - rng
            line_trace_up = go.Scatter(x=[0, max_val], y=[0 + margin, max_val*up_range + margin], mode='lines+markers', line=dict(color='blue'))
            line_trace_dw = go.Scatter(x=[0, max_val], y=[0 - margin, max_val*dwn_range - margin], mode='lines+markers', marker=dict(color='blue'))

            fig = make_subplots(rows=1, cols=2, subplot_titles=(f'train ds {qc_train}', f'test ds {qc_test}'))
            parts = (data[data.dataset == 'train'], data[data.dataset == 'test'])
            for col, part in enumerate(parts, start=1):
                fig.add_trace(qc_scatter(part), row=1, col=col)
                fig.add_trace(line_trace_up, row=1, col=col)
                fig.add_trace(line_trace_dw, row=1, col=col)
                fig.update_xaxes(title_text='actual', row=1, col=col)
                fig.update_yaxes(title_text='predict', row=1, col=col)
            fig.update_layout(title_text=(comment), width=700, height=350,
                              margin=dict(l=10,r=10,b=10,t=50), showlegend=False)
            return fig.show()

        print(f'Cluster list is: {cluster_list}')   
        target = 'phit_wavg_target'
        model1_ph = model_prediction_split(dataset, RandomForestRegressor(n_jobs=-1, random_state=42), target, tolerance, 'RandomForestRegressor','display')
        model2_ph = model_prediction_split(dataset, BayesianRidge(), target, tolerance, 'BayesianRidge', 'dont_display')
        model3_ph = model_prediction_split(dataset, XGBRegressor(n_jobs=-1, random_state=42, verbosity=0), target, tolerance, 'XGBRegressor', 'dont_display')
        model4_ph = model_prediction_split(dataset, CatBoostRegressor(random_state=42, verbose=False), target, tolerance, 'CatBoostRegressor', 'dont_display')
        model5_ph = model_prediction_split(dataset, AdaBoostRegressor(random_state=42), target, tolerance, 'AdaBoostRegressor', 'dont_display')
        model6_ph = model_prediction_split(dataset, LGBMRegressor(n_jobs=-1, random_state=42, verbose=0, verbosity=-1), target, tolerance, 'LGBMRegressor', 'dont_display')

        xplot_qc2(model1_ph['result'], model1_ph['trainqc'], model1_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'RandomForestRegressor {cluster_list}')
        # xplot_qc2(model2_ph['result'], model2_ph['trainqc'], model2_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'BayesianRidge {cluster_list}')
        # xplot_qc2(model3_ph['result'], model3_ph['trainqc'], model3_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'XGBRegressor {cluster_list}')
        # xplot_qc2(model4_ph['result'], model4_ph['trainqc'], model4_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'CatBoostRegressor {cluster_list}')
        # xplot_qc2(model5_ph['result'], model5_ph['trainqc'], model5_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'AdaBoostRegressor {cluster_list}')
        # xplot_qc2(model6_ph['result'], model6_ph['trainqc'], model6_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'LGBMRegressor {cluster_list}')
        return model1_ph
    model_split = run_phit_pred_split(input_ph8, cluster_list, tolerance=0.05)['result']

    def run_phit_pred_1_to_all(dataset, cluster_list, tolerance):
        def model_prediction_1_to_all(dataset, selected_model, target, tolerance, model_name):
            """
            Predict `target` for every well with a model trained on all the other wells.

            dataset        : frame with 'well', 'FORMATION_up', 'well_1', 'well_2', 'well_3', the feature
                             columns and the `target` column
            selected_model : estimator used as the last step of a StandardScaler pipeline; it is re-fitted
                             for every held-out well
            target         : name of the column to predict
            tolerance      : relative half-width of the acceptance band; a prediction is 'in' when it
                             lies within y_orig*(1 - tolerance) .. y_orig*(1 + tolerance)
            model_name     : label printed before the loop starts

            Returns a dict with
                'result'   : one row per held-out row (well, FORMATION_up, y_orig, y_pred, up, down, qc)
                'res_full' : `dataset` inner-joined with the predictions as column 'phit_pred'
                'testqc'   : share of 'in' rows, rounded to 2 decimals (0.0 when none is 'in')
                'train_df' : feature column names
                'model'    : the pipeline fitted in the last iteration
            """
            id_cols = ['well','FORMATION_up']
            # the target, identifiers and offset-well names are not used as features
            non_features = [target] + id_cols + ['well_1', 'well_2', 'well_3']
            print(model_name)
            df_lst = []
            for wellname in tqdm(dataset.well.unique()):
                train = dataset[dataset.well != wellname]
                X_train = train.drop(non_features, axis=1)
                model = Pipeline([("scaler",StandardScaler()),("model", selected_model)])
                # pass a 1-D target; a single-column frame makes scikit-learn estimators warn about a column vector
                model.fit(X_train, train[target].to_numpy())

                test = dataset[dataset.well == wellname]
                X_test = test.drop(non_features, axis=1)
                # keep every row of the held-out well; taking only the first target value would drop
                # predictions and leave NaN rows when a well has more than one row
                fold = test[id_cols].reset_index(drop=True)
                fold['y_orig'] = test[target].to_numpy()
                fold['y_pred'] = np.asarray(model.predict(X_test), dtype=float).ravel()
                df_lst.append(fold)

            result = pd.concat(df_lst).reset_index(drop=True)
            result['up'] = result['y_orig']*(1 + tolerance)
            result['down'] = result['y_orig']*(1 - tolerance)
            result['qc'] = 'out'
            result.loc[(result['y_pred'] <= result.up) & (result['y_pred'] >= result.down), 'qc'] = 'in'
            # unlike value_counts()['in'] this is 0.0, not a KeyError, when no prediction is 'in'
            share_in = np.float64((result.qc == 'in').mean()).round(2)

            phit_pred = result[id_cols + ['y_pred']].rename(columns={'y_pred':'phit_pred'})
            dataset_pred = dataset.set_index(id_cols).join(phit_pred.set_index(id_cols), how='inner').reset_index()

            result_dict = {'result':result, 'res_full':dataset_pred, 'testqc':share_in, 'train_df':X_train.columns, 'model': model}
            return result_dict
        def xplot_qc_1_to_all(data, qc_test, y_orig, y_pred, max_val, rng, margin, round, comment):
            """
            Show an actual-vs-predicted cross-plot with the tolerance band drawn on top.

            data    : DataFrame with 'well', 'FORMATION_up', 'qc' ('in'/'out') and the two value columns
            qc_test : value written into the subplot title
            y_orig  : name of the column with the actual values (x axis)
            y_pred  : name of the column with the predicted values (y axis)
            max_val : x coordinate at which both band lines end
            rng     : relative half-width of the band (line slopes are 1 + rng and 1 - rng)
            margin  : constant added to the upper line and subtracted from the lower one
            round   : number of decimals both value columns are rounded to before plotting
            comment : figure title
            Returns the result of fig.show().
            """
            data = data.round({y_orig: round, y_pred: round})
            colors = {'in': 'green', 'out': 'red'}
            qc_colors = [colors[qc] for qc in data.qc]
            scatter_test = go.Scatter(
                # the plotted columns follow the y_orig / y_pred arguments, like the rounding and hover data
                x=data[y_orig], y=data[y_pred],
                mode='markers',
                marker=dict(color=qc_colors, size=6, opacity=0.75, line=dict(color='rgb(47, 57, 61)', width=0.5)),
                customdata=data[['well', y_orig, y_pred, 'FORMATION_up']],
                hovertemplate="w:%{customdata[0]},a:%{customdata[1]}, p:%{customdata[2]}, f:%{customdata[3]}<extra></extra>",
            )
            up_range = rng + 1
            dwn_range = 1 - rng
            line_trace_up = go.Scatter(x=[0, max_val], y=[0 + margin, max_val*up_range + margin], mode='lines+markers', line=dict(color='blue'))
            line_trace_dw = go.Scatter(x=[0, max_val], y=[0 - margin, max_val*dwn_range - margin], mode='lines+markers', marker=dict(color='blue'))
            fig = make_subplots(rows=1, cols=1, subplot_titles=(f'test qc {qc_test}',))
            for trace in (scatter_test, line_trace_up, line_trace_dw):
                fig.add_trace(trace, row=1, col=1)
            fig.update_xaxes(title_text='actual', row=1, col=1)
            fig.update_yaxes(title_text='predict', row=1, col=1)
            fig.update_layout(title_text=(comment), width=350, height=350,
                              margin=dict(l=10,r=10,b=10,t=50), showlegend=False)
            return fig.show()

        target = 'phit_wavg_target'
        model1_ph = model_prediction_1_to_all(dataset, RandomForestRegressor(n_jobs=-1, random_state=42), target, tolerance,'RandomForestRegressor')
        # model2_ph = model_prediction_1_to_all(dataset, BayesianRidge(), target, 0.05, 'BayesianRidge')
        # model3_ph = model_prediction_1_to_all(dataset, XGBRegressor(n_jobs=-1, random_state=42, verbosity=0), target, 0.05, 'XGBRegressor')
        # model4_ph = model_prediction_1_to_all(dataset, CatBoostRegressor(random_state=42, verbose=False), target, 0.05,'CatBoostRegressor')
        # model5_ph = model_prediction_1_to_all(dataset, AdaBoostRegressor(random_state=42), target, 0.05, 'AdaBoostRegressor')
        # model6_ph = model_prediction_1_to_all(dataset, LGBMRegressor(n_jobs=-1, random_state=42, verbose=0, verbosity=-1), target, 0.05, 'LGBMRegressor')

        xplot_qc_1_to_all(model1_ph['result'], model1_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'RandomForestRegressor {cluster_list}')
        # xplot_qc_1_to_all(model2_ph['result'], model2_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'BayesianRidge {cluster_list}')
        # xplot_qc_1_to_all(model3_ph['result'], model3_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'XGBRegressor {cluster_list}')
        # xplot_qc_1_to_all(model4_ph['result'], model4_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'CatBoostRegressor {cluster_list}')
        # xplot_qc_1_to_all(model5_ph['result'], model5_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'AdaBoostRegressor {cluster_list}')
        # xplot_qc_1_to_all(model6_ph['result'], model6_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'LGBMRegressor {cluster_list}')
        return model1_ph
    model = run_phit_pred_1_to_all(input_ph8, cluster_list, tolerance=0.05)

    def concat_prediction_to_khtst_df(data_pred, data_khtst, data_main, cluster_algo):
        """
        Combine the summed khtst per well/formation with the predicted porosity and the main feature table.

        data_pred    : dict whose 'result' frame holds 'well', 'FORMATION_up' and 'y_pred'
        data_khtst   : frame with a 'khtst' column and the cluster label column `cluster_algo`
        data_main    : feature table keyed by 'well' and 'FORMATION_up' (inner-joined)
        cluster_algo : name of the cluster label column in data_khtst
        Uses `cluster_list` from the enclosing scope to select the clusters.
        'phit_pred', 'phit_wavg_target' and 'khtst' are re-inserted at position 19, in that order.
        """
        keys = ['well','FORMATION_up']
        predicted = data_pred['result'][keys + ['y_pred']].rename(columns={'y_pred':'phit_pred'})
        in_cluster = data_khtst[data_khtst[cluster_algo].isin(cluster_list)]
        khtst_total = in_cluster.groupby(keys)['khtst'].sum().reset_index()

        with_prediction = khtst_total.set_index(keys).join(predicted.set_index(keys)).reset_index()
        merged = with_prediction.set_index(keys).join(data_main.set_index(keys), how='inner').reset_index()

        for column in ('phit_pred', 'phit_wavg_target', 'khtst'):
            merged.insert(19, column, merged.pop(column))
        return merged
    phitpred_khtst = concat_prediction_to_khtst_df(model, data_clustered8, input_ph8, 'kmeans')
    print(f'Concat dataset features {textwrap.fill(str(list(phitpred_khtst.columns)), width=150)}')

    print('\nPrediction KHtst: ')
    def run_khtst_pred_split(dataset, cluster_list, tolerance):
        """
        Benchmark six regressors on the 'khtst' target with one random train/test split.

        For every model the share of predictions inside the +/- ``tolerance`` band is
        printed. A train/test cross-plot is drawn for CatBoost only, and the CatBoost
        result dict is returned.

        Parameters:
            dataset: frame with 'well', 'FORMATION_up', the offset-well columns,
                'phit_wavg_target' and the 'khtst' target.
            cluster_list: cluster labels being processed (used for printing and the plot title).
            tolerance: relative half-width of the acceptance band around the actual value.

        Returns:
            dict returned by ``model_prediction_split`` for CatBoostRegressor.
        """

        def model_prediction_split(dataset, selected_model, target, tolerance, model_name, display_flag='display'):
            """
            Fit ``selected_model`` on a 67/33 random split and QC its predictions.

            Columns listed by the original author for the incoming dataset:
            'well', 'FORMATION_up', 'well_1', 'well_2', 'well_3', 'dist_1',
            'dist_2', 'dist_3', 'phit_wavg_1', 'vsh_wavg_1', 'htst_sum_1',
            'phit_wavg_2', 'vsh_wavg_2', 'htst_sum_2', 'phit_wavg_3', 'vsh_wavg_3',
            'htst_sum_3', 'phit_pred', 'phit_wavg_target', 'khtst'

            Returns a dict with keys 'result' (train and test rows with their QC flag),
            'testqc' / 'trainqc' (share of rows flagged 'in', rounded to 2 decimals),
            'train_df' (feature column names) and 'model' (the fitted estimator).
            """
            drop_lst_X = ['well','FORMATION_up',    'well_1', 'well_2', 'well_3', 'dist_1', 'dist_2','dist_3', 'phit_wavg_target']
            drop_lst_y = ['well','FORMATION_up']

            def qc_table(y_true, y_hat, well_names, subset):
                # Actual-vs-predicted table for one subset; a row is 'in' when the
                # prediction lies within +/- tolerance of the actual value.
                table = pd.DataFrame(zip(y_true, y_hat), columns=['y_orig', 'y_pred'])
                table = pd.concat([well_names, table], axis=1)
                table['up'] = table['y_orig']*(1 + tolerance)
                table['down'] = table['y_orig']*(1 - tolerance)
                table['qc'] = 'out'
                table['dataset'] = subset
                table.loc[(table['y_pred'] <= table.up) & (table['y_pred'] >= table.down), 'qc'] = 'in'
                # .get() instead of ['in']: when no row is inside the band the 'in'
                # label is absent from value_counts and ['in'] raised KeyError.
                share_in = table.qc.value_counts(normalize=True).get('in', 0.0)
                return table, np.float64(share_in).round(2)

            X = dataset.drop(target, axis=1)
            y = dataset[['well','FORMATION_up', target]]

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=55)

            # Keep the identifiers so they can be re-attached to the predictions.
            y_train_wnames = y_train[['well','FORMATION_up']].reset_index(drop=True)
            y_test_wnames = y_test[['well','FORMATION_up']].reset_index(drop=True)

            X_train = X_train.drop(drop_lst_X, axis=1)
            X_test = X_test.drop(drop_lst_X, axis=1)
            y_train = y_train.drop(drop_lst_y, axis=1)
            y_test = y_test.drop(drop_lst_y, axis=1)

            # No scaling pipeline in this variant: the estimator is fitted on the raw features.
            model = selected_model
            model.fit(X_train, y_train)
            y_pred_train = model.predict(X_train)
            y_pred_test = model.predict(X_test)
            y_train = np.array(y_train).flatten()
            y_test = np.array(y_test).flatten()

            train, trainqc = qc_table(y_train, y_pred_train, y_train_wnames, 'train')
            test, testqc = qc_table(y_test, y_pred_test, y_test_wnames, 'test')
            df = pd.concat([train, test])
            df['y_pred'] = df['y_pred'].astype('float')

            result = {'result':df, 'testqc':testqc, 'trainqc':trainqc, 'train_df':X_train.columns, 'model': model}
            # Both display modes print the same score line; 'display' adds the feature list.
            if display_flag == 'display':
                print(f'features dataset: {list(X_train.columns)}')
            print('train "in":', '{:.2f}'.format(result['trainqc']),
                'test "in":', '{:.2f}'.format(result['testqc']),
                '\t', model_name)
            return result

        print(f'Cluster list is: {cluster_list}')
        target = 'khtst'
        # Every call prints its own score line, so all six models are fitted even though
        # only the CatBoost result is plotted and returned.
        model1_kh = model_prediction_split(dataset, RandomForestRegressor(n_jobs=-1, random_state=42), target, tolerance, 'RandomForestRegressor','display')
        model2_kh = model_prediction_split(dataset, BayesianRidge(), target, tolerance, 'BayesianRidge', 'dont_display')
        model3_kh = model_prediction_split(dataset, XGBRegressor(n_jobs=-1, random_state=42, verbosity=0), target, tolerance, 'XGBRegressor', 'dont_display')
        model4_kh = model_prediction_split(dataset, CatBoostRegressor(random_state=42, verbose=False), target, tolerance, 'CatBoostRegressor', 'dont_display')
        model5_kh = model_prediction_split(dataset, AdaBoostRegressor(random_state=42), target, tolerance, 'AdaBoostRegressor', 'dont_display')
        model6_kh = model_prediction_split(dataset, LGBMRegressor(n_jobs=-1, random_state=42, verbose=0, verbosity=-1), target, tolerance, 'LGBMRegressor', 'dont_display')

        # xplot_qc2 is not defined in this function; it has to be available from an outer scope.
        # To inspect another model, pass its result dict (model1_kh ... model6_kh) instead.
        xplot_qc2(model4_kh['result'], model4_kh['trainqc'], model4_kh['testqc'], 'y_orig', 'y_pred', 27000, tolerance, 0, 3, f'CatBoostRegressor {cluster_list}')
        return model4_kh
    model_khtst = run_khtst_pred_split(phitpred_khtst, cluster_list, 0.25)
    result = {'khtst_pred':model_khtst, 'khtst_data':phitpred_khtst, 'phit_pred':model['result']}
    return result
# Run the workflow for cluster 2 only; the returned dict has the keys
# 'khtst_pred', 'khtst_data' and 'phit_pred'.
test_2 = khtst_workflow(cluster_list = [2])

In [None]:
def pairplot_special(dataset, xsize, ysize, flag=1):
    """
    Draw a pair grid of ``dataset`` with Pearson r annotated in the lower triangle.

    Upper triangle: scatter plots. Diagonal: histograms. Lower triangle: KDE contours
    plus the correlation coefficient of the two variables.

    Parameters:
        dataset: DataFrame whose columns are plotted against each other.
        xsize, ysize: figure width and height in inches.
        flag: the grid is drawn only when this equals 1; any other value does nothing.

    Returns:
        None. The figure is created as a side effect, and the seaborn plotting
        context is changed through ``sns.set_context``.
    """
    if flag != 1:
        return

    def corrfunc(x, y, **kws):
        # Write Pearson's r into the top-left corner of the current axes.
        r, _ = stats.pearsonr(x, y)
        ax = plt.gca()
        ax.annotate("r = {:.2f}".format(r),
                    xy=(.1, .9), xycoords=ax.transAxes)

    sns.set_context(rc={'axes.labelsize':10, 'lines.linewidth': 0.75})
    g = sns.PairGrid(dataset)
    g.fig.set_size_inches(xsize, ysize)
    g.set(xticklabels=[], yticklabels=[])
    g.map_upper(plt.scatter, s=10, alpha=0.5)
    # histplot replaces the deprecated distplot(kde=False); the file already uses
    # sns.histplot elsewhere.
    g.map_diag(sns.histplot, kde=False)
    g.map_lower(sns.kdeplot, cmap="Blues_d")
    g.map_lower(corrfunc)
# Columns removed before plotting: identifiers, offset-well names and distances,
# the vsh / htst offset features and the cluster label.
# NOTE(review): 'cluster' appears to be added to test_2['khtst_data'] only by a later
# cell (test2['cluster'] = 2); if so, this cell depends on that cell having run first - confirm.
drop_lst_X = ['well','FORMATION_up',    'well_1', 'well_2', 'well_3',
                                        'dist_1', 'dist_2', 'dist_3', 'vsh_wavg_1', 'htst_sum_1', 'vsh_wavg_2', 'htst_sum_2', 'vsh_wavg_3', 'htst_sum_3','cluster']
# 7 x 7 inch pair grid of the remaining columns.
pairplot_special(test_2['khtst_data'].drop(drop_lst_X, axis=1), 7, 7, 1)

#### Box-Cox transform applied to khtst

In [None]:
def khtst_workflow(cluster_list):

    def dataset_for_spatial_prediction(dataset_full, dataset_cluster, offset_qty, cluster_algo, cluster_list):
        """
        Build the per-well modelling table for the 'Balakhany VIII' formation.

        For every well the nearest wells (Euclidean distance on 'X_mean' / 'Y_mean') are
        looked up. htst-weighted averages over the selected clusters are then computed
        for those offset wells (features) and for the well itself (target).

        Parameters:
            dataset_full: frame with 'well', 'FORMATION_up', 'X_mean' and 'Y_mean'.
            dataset_cluster: frame with 'well', 'FORMATION_up', 'phit_avg', 'vsh_avg',
                'htst' and the cluster-label column ``cluster_algo``.
            offset_qty: number of neighbouring wells used as features.
            cluster_algo: name of the cluster-label column.
            cluster_list: cluster labels to keep.

        Returns:
            dict with keys 'dataset8' (target joined with features), 'cluster_xy',
            'well_dist8', 'coordinates', 'target8', 'feature8' and 'dist_crosstable8'.
        """

        def joining_coordinates(dataset_full, dataset_cluster, cluster_algo, cluster_list):
            # First X/Y pair of every (well, formation) in the full dataset.
            coordinates = dataset_full.groupby(['well','FORMATION_up'])[['X_mean','Y_mean']].apply(lambda x: x.iloc[0]).reset_index()
            # Only a local rebinding: the caller's dataset_cluster stays unfiltered.
            dataset_cluster = dataset_cluster[(dataset_cluster[cluster_algo].isin(cluster_list))]
            result = dataset_cluster.set_index(['well','FORMATION_up']).join(coordinates.set_index(['well','FORMATION_up'])).reset_index()
            # One coordinate row per (well, formation) that is present in the selected clusters.
            coordinates = result[['well','FORMATION_up', 'X_mean', 'Y_mean']].groupby(['well','FORMATION_up']).apply(lambda x: x.iloc[0]).reset_index(drop=True)
            return coordinates, result
        coordinates, dataset_cluster_xy = joining_coordinates(dataset_full, dataset_cluster, cluster_algo, cluster_list)
        # Well 'A14Y' is excluded from the distance search (the reason is not recorded here).
        coordinates = coordinates[~coordinates.well.isin(['A14Y'])]

        def well_distance_calculation(coordinates, fm):
            # Square well-to-well distance table for one formation: a 'well' column
            # followed by one distance column per well.
            coordinates_fm = coordinates[coordinates.FORMATION_up == fm]
            df_distance_fm = pd.DataFrame(euclidean_distances(coordinates_fm[['X_mean', 'Y_mean']]), columns=list(coordinates_fm.well))
            well_name_rows = coordinates_fm.well.reset_index().drop(['index'], axis=1)
            result = df_distance_fm.join(well_name_rows).set_index('well').reset_index()
            return result
        well_dist_crosstable_8 = well_distance_calculation(coordinates, 'Balakhany VIII')

        def offset_well_names_dist(dataset, offset_qty):
            # For every row of the cross-table, emit one row holding the names
            # (well_1..n) and distances (dist_1..n) of the closest wells plus 'well'.
            df_lst = []
            for ind in range(len(dataset.well.unique())):
                off_well_series = dataset.iloc[ind]
                # Drop the leading 'well' entry, sort by distance and keep offset_qty + 1
                # entries; the closest one is normally the well itself (distance 0).
                off_well_selected = pd.DataFrame(off_well_series)[1:].sort_values(by=ind)[:offset_qty+1].T
                off_well_selected['well'] = off_well_selected.columns[0]
                # Remove the column of the target well so that only neighbours remain.
                off_well_selected = off_well_selected.drop(columns= off_well_selected.well, axis=1)

                dist_titles = ['dist_' + str(num+1) for num in range(offset_qty)]
                well_titles = ['well_' + str(num+1) for num in range(offset_qty)]

                # Rename the neighbour columns to dist_<k>, remembering the well names.
                col_names = []
                for i in range(len(off_well_selected.columns[:-1])):
                    col = off_well_selected.columns[i]
                    col_names.append(col)
                    off_well_selected = off_well_selected.rename(columns={col:dist_titles[i]})

                # Single-row frame of the neighbour names, with columns renamed to well_<k>.
                off_well_names = pd.DataFrame(col_names).T
                col_names = []
                for i in range(len(off_well_names.columns)):
                    col = off_well_names.columns[i]
                    col_names.append(col)
                    off_well_names = off_well_names.rename(columns={col:well_titles[i]})

                concat_well_data = pd.concat([off_well_names.reset_index(drop=True), off_well_selected.reset_index(drop=True)], axis=1)
                df_lst.append(concat_well_data)
            result = pd.concat(df_lst).reset_index(drop=True)
            return result
        well_dist_data8 = offset_well_names_dist(well_dist_crosstable_8, offset_qty)

        def offset_wells_features_calculation(dataset_dist, dataset_clusters, cluster_algo, cluster_list, fm):
            # Add phit_wavg_<k>, vsh_wavg_<k> and htst_sum_<k> for every offset well <k>.
            df_lst = []
            for wellname in dataset_dist.well.unique():
                # .copy(): new columns are assigned below; without it they would be set
                # on a boolean-filtered slice of dataset_dist (SettingWithCopyWarning).
                data = dataset_dist[dataset_dist.well == wellname].copy()
                cc = 0
                for j in data.columns:
                    if 'well_' in j:
                        cc += 1
                        offset_wellname = data[j].values[0]
                        data_cluster = dataset_clusters[(dataset_clusters.well == offset_wellname) &
                                                        (dataset_clusters[cluster_algo].isin(cluster_list))]
                        # htst-weighted means over the offset well's selected-cluster rows.
                        var_name = 'phit_wavg_' + str(cc)
                        data[var_name] = ((data_cluster['phit_avg'] * data_cluster['htst']).sum()) / (data_cluster['htst'].sum())
                        var_name = 'vsh_wavg_' + str(cc)
                        data[var_name] = ((data_cluster['vsh_avg'] * data_cluster['htst']).sum()) / (data_cluster['htst'].sum())
                        var_name = 'htst_sum_' + str(cc)
                        data[var_name] = data_cluster['htst'].sum()
                df_lst.append(data)
            result = pd.concat(df_lst).reset_index(drop=True)
            result['FORMATION_up'] = fm
            return result
        # The unfiltered dataset_cluster is passed on purpose: the cluster filter is applied per well inside.
        well_features8 = offset_wells_features_calculation(well_dist_data8, dataset_cluster, cluster_algo, cluster_list, 'Balakhany VIII')

        def target_wells_variable_calculation(dataset_dist, dataset_clusters, cluster_algo, cluster_list, fm):
            # htst-weighted mean phit_avg of each target well over the selected clusters.
            df_lst = []
            for wellname in dataset_dist.well.unique():
                df = pd.DataFrame({'well': [wellname], 'FORMATION_up': [fm], 'phit_wavg_target': [0]})
                data = dataset_clusters[(dataset_clusters.well == wellname) &
                                        (dataset_clusters[cluster_algo].isin(cluster_list))]
                df['phit_wavg_target'] = ((data['phit_avg'] * data['htst']).sum()) / (data['htst'].sum())
                df_lst.append(df)
            result = pd.concat(df_lst).reset_index(drop=True)
            return result
        well_target8 = target_wells_variable_calculation(well_dist_data8, dataset_cluster, cluster_algo, cluster_list, 'Balakhany VIII')

        # Target first, then the offset-well features, aligned on (well, formation).
        dataset8 = well_target8.set_index(['well','FORMATION_up']).join(well_features8.set_index(['well','FORMATION_up'])).reset_index()

        result = {'dataset8':dataset8, 'cluster_xy':dataset_cluster_xy, 'well_dist8':well_dist_data8, 'coordinates':coordinates,
                'target8':well_target8, 'feature8':well_features8, 'dist_crosstable8':well_dist_crosstable_8}
        return result
    # Modelling table built with the 3 nearest offset wells, using the 'kmeans' cluster labels.
    input_ph8 = dataset_for_spatial_prediction(df_bal_net2_kh, data_clustered8, 3, 'kmeans', cluster_list)['dataset8']
    # Print the column names wrapped at 150 characters.
    print(f'Dataset features {textwrap.fill(str(list(input_ph8.columns)), width=150)}')

    def run_phit_pred_split(dataset, cluster_list, tolerance):
        """
        Benchmark six regressors on 'phit_wavg_target' with one random train/test split.

        Each model is wrapped in a StandardScaler pipeline. For every model the share of
        predictions inside the +/- ``tolerance`` band is printed; the RandomForest
        result dict is returned.

        Parameters:
            dataset: frame with 'well', 'FORMATION_up', 'well_1'..'well_3', numeric
                feature columns and 'phit_wavg_target'.
            cluster_list: cluster labels being processed (only printed here).
            tolerance: relative half-width of the acceptance band around the actual value.

        Returns:
            dict returned by ``model_prediction_split`` for RandomForestRegressor.
        """

        def model_prediction_split(dataset, selected_model, target, tolerance, model_name, display_flag='display'):
            """
            Fit a scaled ``selected_model`` on a 67/33 random split and QC its predictions.

            Returns a dict with keys 'result' (train and test rows with their QC flag),
            'testqc' / 'trainqc' (share of rows flagged 'in', rounded to 2 decimals),
            'train_df' (feature column names) and 'model' (the fitted pipeline).
            """
            drop_lst_X = ['well','FORMATION_up', 'well_1', 'well_2', 'well_3']
            drop_lst_y = ['well','FORMATION_up']

            def qc_table(y_true, y_hat, well_names, subset):
                # Actual-vs-predicted table for one subset; a row is 'in' when the
                # prediction lies within +/- tolerance of the actual value.
                table = pd.DataFrame(zip(y_true, y_hat), columns=['y_orig', 'y_pred'])
                table = pd.concat([well_names, table], axis=1)
                table['up'] = table['y_orig']*(1 + tolerance)
                table['down'] = table['y_orig']*(1 - tolerance)
                table['qc'] = 'out'
                table['dataset'] = subset
                table.loc[(table['y_pred'] <= table.up) & (table['y_pred'] >= table.down), 'qc'] = 'in'
                # .get() instead of ['in']: when no row is inside the band the 'in'
                # label is absent from value_counts and ['in'] raised KeyError.
                share_in = table.qc.value_counts(normalize=True).get('in', 0.0)
                return table, np.float64(share_in).round(2)

            X = dataset.drop(target, axis=1)
            y = dataset[['well','FORMATION_up', target]]

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=55)

            # Keep the identifiers so they can be re-attached to the predictions.
            y_train_wnames = y_train[['well','FORMATION_up']].reset_index(drop=True)
            y_test_wnames = y_test[['well','FORMATION_up']].reset_index(drop=True)

            X_train = X_train.drop(drop_lst_X, axis=1)
            X_test = X_test.drop(drop_lst_X, axis=1)
            y_train = y_train.drop(drop_lst_y, axis=1)
            y_test = y_test.drop(drop_lst_y, axis=1)

            model = Pipeline([("scaler",StandardScaler()),("model", selected_model)])
            model.fit(X_train, y_train)
            y_pred_train = model.predict(X_train)
            y_pred_test = model.predict(X_test)
            y_train = np.array(y_train).flatten()
            y_test = np.array(y_test).flatten()

            train, trainqc = qc_table(y_train, y_pred_train, y_train_wnames, 'train')
            test, testqc = qc_table(y_test, y_pred_test, y_test_wnames, 'test')
            df = pd.concat([train, test])
            df['y_pred'] = df['y_pred'].astype('float')

            result = {'result':df, 'testqc':testqc, 'trainqc':trainqc, 'train_df':X_train.columns, 'model': model}
            if display_flag == 'display':
                print('features dataset: \n', list(X_train.columns))
                print('train "in":', '{:.2f}'.format(result['trainqc']),
                    'test "in":', '{:.2f}'.format(result['testqc']),
                    '\t', model_name)
            else:
                # Compact mode reports the test score only.
                print('test "in":', '{:.2f}'.format(result['testqc']),'\t', model_name)
            return result

        def xplot_qc2(data, qc_train, qc_test, y_orig, y_pred, max_val, rng, margin, round, comment):
            """
            Show side-by-side actual-vs-predicted scatter plots for the train and test rows.

            Parameters:
                data: frame with 'well', 'FORMATION_up', 'qc', 'dataset' and the two value columns.
                qc_train, qc_test: scores shown in the subplot titles.
                y_orig, y_pred: names of the actual and predicted columns.
                max_val: x-extent of the two tolerance lines.
                rng: relative tolerance that defines the slopes of the upper and lower lines.
                margin: constant offset added to the upper line and subtracted from the lower line.
                round: number of decimals the two value columns are rounded to before plotting.
                comment: figure title.

            Returns:
                The value of ``fig.show()``.
            """
            data = data.round({y_orig: round, y_pred: round})
            ds_train = data[data.dataset == 'train']
            ds_test = data[data.dataset == 'test']
            up_range = rng + 1
            dwn_range = 1 - rng
            colors = {'in': 'green', 'out': 'red'}
            hover = "w:%{customdata[0]},a:%{customdata[1]}, p:%{customdata[2]}, f:%{customdata[3]}<extra></extra>"

            def qc_scatter(subset):
                # Every trace takes x, y and hover data from the same subset. Previously
                # the test trace took customdata from the full frame, so its hover
                # labels did not belong to the plotted points, and the train trace
                # ignored the y_orig / y_pred arguments.
                return go.Scatter(x=subset[y_orig], y=subset[y_pred],
                                  mode='markers',
                                  marker=dict(color=[colors[qc] for qc in subset.qc], size=6, opacity=0.75,
                                              line=dict(color='rgb(47, 57, 61)', width=0.5)),
                                  customdata=subset[['well', y_orig, y_pred, 'FORMATION_up']],
                                  hovertemplate=hover)

            scatter_train = qc_scatter(ds_train)
            scatter_test = qc_scatter(ds_test)
            # Upper and lower limits of the tolerance band.
            line_trace_up = go.Scatter(x=[0, max_val], y=[0 + margin, max_val*up_range + margin], mode='lines+markers', line=dict(color='blue'))
            line_trace_dw = go.Scatter(x=[0, max_val], y=[0 - margin, max_val*dwn_range - margin], mode='lines+markers', marker=dict(color='blue'))
            fig = make_subplots(rows=1, cols=2, subplot_titles=(f'train ds {qc_train}', f'test ds {qc_test}'))
            for col, scatter in ((1, scatter_train), (2, scatter_test)):
                fig.add_trace(scatter,  row=1, col=col)
                fig.add_trace(line_trace_up,  row=1, col=col)
                fig.add_trace(line_trace_dw,  row=1, col=col)
                fig.update_xaxes(title_text='actual', row=1, col=col)
                fig.update_yaxes(title_text='predict', row=1, col=col)
            fig.update_layout(  title_text= (comment), width=700, height=350,
                                margin=dict(l=10,r=10,b=10,t=50), showlegend=False)
            return fig.show()

        print(f'Cluster list is: {cluster_list}')
        target = 'phit_wavg_target'
        # Every call prints its own score line, so all six models are fitted even though
        # only the RandomForest result is returned.
        model1_ph = model_prediction_split(dataset, RandomForestRegressor(n_jobs=-1, random_state=42), target, tolerance, 'RandomForestRegressor','display')
        model2_ph = model_prediction_split(dataset, BayesianRidge(), target, tolerance, 'BayesianRidge', 'dont_display')
        model3_ph = model_prediction_split(dataset, XGBRegressor(n_jobs=-1, random_state=42, verbosity=0), target, tolerance, 'XGBRegressor', 'dont_display')
        model4_ph = model_prediction_split(dataset, CatBoostRegressor(random_state=42, verbose=False), target, tolerance, 'CatBoostRegressor', 'dont_display')
        model5_ph = model_prediction_split(dataset, AdaBoostRegressor(random_state=42), target, tolerance, 'AdaBoostRegressor', 'dont_display')
        model6_ph = model_prediction_split(dataset, LGBMRegressor(n_jobs=-1, random_state=42, verbose=0, verbosity=-1), target, tolerance, 'LGBMRegressor', 'dont_display')

        # Cross-plots are disabled for the porosity models. To inspect one, call e.g.
        # xplot_qc2(model1_ph['result'], model1_ph['trainqc'], model1_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'RandomForestRegressor {cluster_list}')
        return model1_ph
    # Random-split benchmark of the porosity models (5 % tolerance). The call prints the
    # scores; model_split is not referenced again in this function.
    model_split = run_phit_pred_split(input_ph8, cluster_list, tolerance=0.05)['result']

    def run_phit_pred_1_to_all(dataset, cluster_list, tolerance):
        """
        Predict 'phit_wavg_target' for every well with a leave-one-well-out scheme.

        Each well in turn is held out, a scaled RandomForest is fitted on the remaining
        wells, and the held-out well is predicted.

        Parameters:
            dataset: frame with 'well', 'FORMATION_up', 'well_1'..'well_3', numeric
                feature columns and 'phit_wavg_target'.
            cluster_list: cluster labels being processed (not used by the active code).
            tolerance: relative half-width of the acceptance band around the actual value.

        Returns:
            dict returned by ``model_prediction_1_to_all`` for RandomForestRegressor.
        """

        def model_prediction_1_to_all(dataset, selected_model, target, tolerance, model_name):
            """
            Run the leave-one-well-out loop for ``selected_model``.

            Returns a dict with keys 'result' (held-out predictions with QC flag),
            'res_full' (``dataset`` with a 'phit_pred' column joined on), 'testqc'
            (share of rows flagged 'in', rounded to 2 decimals), 'train_df' (feature
            column names) and 'model' (the pipeline fitted in the last iteration).
            """
            drop_lst_X = ['well','FORMATION_up', 'well_1', 'well_2', 'well_3']
            print(model_name)
            df_lst = []
            for wellname in tqdm(dataset.well.unique()):
                # Train on every well except the held-out one.
                train = dataset[dataset.well != wellname]
                X_train = train.drop(target, axis=1).drop(drop_lst_X, axis=1)
                y_train = train[[target]]
                model = Pipeline([("scaler",StandardScaler()),("model", selected_model)])
                model.fit(X_train, y_train)

                test = dataset[dataset.well == wellname]
                y_test_wnames = test[['well','FORMATION_up']].reset_index(drop=True)
                X_test = test.drop(target, axis=1).drop(drop_lst_X, axis=1)
                # All target values of the held-out well. The previous .values[0] kept
                # only the first row, which dropped data for wells with several rows.
                y_test = test[target].to_numpy()
                y_pred = model.predict(X_test)
                test = pd.DataFrame(zip(y_test, y_pred), columns=['y_orig', 'y_pred'])
                test = pd.concat([y_test_wnames, test], axis=1)
                df_lst.append(test)

            result = pd.concat(df_lst).reset_index(drop=True)
            result['up'] = result['y_orig']*(1 + tolerance)
            result['down'] = result['y_orig']*(1 - tolerance)
            result['qc'] = 'out'
            result.loc[(result['y_pred'] <= result.up) & (result['y_pred'] >= result.down), 'qc'] = 'in'
            # .get() instead of ['in']: when no row is inside the band the 'in' label is
            # absent from value_counts and ['in'] raised KeyError.
            share_in = np.float64(result.qc.value_counts(normalize=True).get('in', 0.0)).round(2)

            phit_pred = result[['well','FORMATION_up','y_pred']].rename(columns={'y_pred':'phit_pred'})
            dataset_pred = dataset.set_index(['well','FORMATION_up']).join(phit_pred.set_index(['well','FORMATION_up']), how='inner').reset_index()

            result_dict = {'result':result, 'res_full':dataset_pred, 'testqc':share_in, 'train_df':X_train.columns, 'model': model}
            return result_dict

        def xplot_qc_1_to_all(data, qc_test, y_orig, y_pred, max_val, rng, margin, round, comment):
            """
            Show an actual-vs-predicted scatter plot of the held-out predictions.

            Parameters:
                data: frame with 'well', 'FORMATION_up', 'qc' and the two value columns.
                qc_test: score shown in the subplot title.
                y_orig, y_pred: names of the actual and predicted columns.
                max_val: x-extent of the two tolerance lines.
                rng: relative tolerance that defines the slopes of the upper and lower lines.
                margin: constant offset added to the upper line and subtracted from the lower line.
                round: number of decimals the two value columns are rounded to before plotting.
                comment: figure title.

            Returns:
                The value of ``fig.show()``.
            """
            ds_test = data.round({y_orig: round, y_pred: round})
            up_range = rng + 1
            dwn_range = 1 - rng
            colors = {'in': 'green', 'out': 'red'}
            qc_colors = [colors[qc] for qc in ds_test.qc]
            # x / y now use the y_orig / y_pred arguments, like the rounding and hover
            # data already did, instead of the literal column names.
            scatter_test = go.Scatter(x=ds_test[y_orig], y=ds_test[y_pred],
                                      mode='markers',
                                      marker=dict(color=qc_colors, size=6, opacity=0.75, line=dict(color='rgb(47, 57, 61)', width=0.5)),
                                      customdata=ds_test[['well', y_orig, y_pred, 'FORMATION_up']],
                                      hovertemplate="w:%{customdata[0]},a:%{customdata[1]}, p:%{customdata[2]}, f:%{customdata[3]}<extra></extra>")
            # Upper and lower limits of the tolerance band.
            line_trace_up = go.Scatter(x=[0, max_val], y=[0 + margin, max_val*up_range + margin], mode='lines+markers', line=dict(color='blue'))
            line_trace_dw = go.Scatter(x=[0, max_val], y=[0 - margin, max_val*dwn_range - margin], mode='lines+markers', marker=dict(color='blue'))
            fig = make_subplots(rows=1, cols=1, subplot_titles=(f'test qc {qc_test}',))
            fig.add_trace(scatter_test,  row=1, col=1)
            fig.add_trace(line_trace_up,  row=1, col=1)
            fig.add_trace(line_trace_dw,  row=1, col=1)
            fig.update_xaxes(title_text='actual', row=1, col=1)
            fig.update_yaxes(title_text='predict', row=1, col=1)
            fig.update_layout(  title_text= (comment), width=350, height=350,
                                margin=dict(l=10,r=10,b=10,t=50), showlegend=False)
            return fig.show()

        target = 'phit_wavg_target'
        # Only RandomForest is evaluated here.
        model1_ph = model_prediction_1_to_all(dataset, RandomForestRegressor(n_jobs=-1, random_state=42), target, tolerance,'RandomForestRegressor')

        # The cross-plot is disabled. To inspect the result, call e.g.
        # xplot_qc_1_to_all(model1_ph['result'], model1_ph['testqc'], 'y_orig', 'y_pred', 0.3, tolerance, 0, 3, f'RandomForestRegressor {cluster_list}')
        return model1_ph
    # Leave-one-well-out porosity predictions (5 % tolerance); model['result'] is used below.
    model = run_phit_pred_1_to_all(input_ph8, cluster_list, tolerance=0.05)

    def concat_prediction_to_khtst_df(data_pred, data_khtst, data_main, cluster_algo):
        """
        Attach the predicted porosity and the summed 'khtst' to the feature table.

        Parameters:
            data_pred: dict whose 'result' frame holds 'well', 'FORMATION_up' and 'y_pred'.
            data_khtst: frame with a 'khtst' column and the cluster-label column ``cluster_algo``.
            data_main: feature frame keyed by 'well' and 'FORMATION_up'; it must provide
                'phit_wavg_target'.
            cluster_algo: name of the column in ``data_khtst`` that stores the cluster labels.

        Returns:
            DataFrame restricted to the ('well', 'FORMATION_up') keys found both in the khtst
            sums and in ``data_main``; its last three columns are 'phit_pred',
            'phit_wavg_target' and 'khtst', in that order.

        Note: ``cluster_list`` is not a parameter; it is read from the enclosing scope.
        """
        keys = ['well', 'FORMATION_up']
        phit_pred8 = data_pred['result'][keys + ['y_pred']].rename(columns={'y_pred': 'phit_pred'})

        # Sum khtst over the rows that belong to the selected clusters.
        in_selected_clusters = data_khtst[cluster_algo].isin(cluster_list)
        khtst8 = data_khtst[in_selected_clusters].groupby(keys)['khtst'].sum().reset_index()

        # Left join: every khtst sum is kept even if no prediction exists for that key.
        khtst8_phit_pred8 = khtst8.set_index(keys).join(phit_pred8.set_index(keys)).reset_index()

        # Inner join: only keys that also exist in the feature table survive.
        phitpred_khtst = khtst8_phit_pred8.set_index(keys).join(data_main.set_index(keys), how='inner').reset_index()

        # Move the prediction and both targets to the end of the frame. The previous
        # insert(19, ...) did the same thing only for a frame of exactly 20 columns and
        # failed or misplaced the columns for any other width.
        for column in ('phit_pred', 'phit_wavg_target', 'khtst'):
            phitpred_khtst[column] = phitpred_khtst.pop(column)
        return phitpred_khtst
    # Combine the leave-one-well-out porosity predictions, the khtst sums and the features.
    phitpred_khtst = concat_prediction_to_khtst_df(model, data_clustered8, input_ph8, 'kmeans')
    # Print the column names wrapped at 150 characters.
    print(f'Concat dataset features {textwrap.fill(str(list(phitpred_khtst.columns)), width=150)}')

    def boxcox_transform(dataset, var):
        """
        Add a Box-Cox transformed version of column ``var`` to ``dataset``.

        The transformed values are always written to the 'khtst_boxcox' column of the
        frame that was passed in, so the caller's frame is modified in place.

        Returns:
            (dataset, lam): the same frame object and the lambda returned by ``boxcox``.
        """
        fitted = boxcox(dataset[var])
        dataset['khtst_boxcox'] = fitted[0]
        return dataset, fitted[1]
    # boxcox_transform returns the frame it was given, so phitpred_khtst_boxcox and
    # phitpred_khtst are the same object and both carry the new 'khtst_boxcox' column.
    phitpred_khtst_boxcox, lam = boxcox_transform(phitpred_khtst, 'khtst')

    print('\nPrediction KHtst: ')
    def run_khtst_pred_split(dataset, cluster_list, max_range, tolerance, lam):
        """
        Benchmark six regressors on the Box-Cox transformed khtst target.

        Models are fitted on 'khtst_boxcox'. Predictions and actual values are
        transformed back with ``inv_boxcox`` before the tolerance check, so all scores
        refer to the original khtst scale. A cross-plot is drawn for every model.

        Parameters:
            dataset: frame with 'well', 'FORMATION_up', the offset-well columns,
                'phit_wavg_target', 'khtst' and 'khtst_boxcox'.
            cluster_list: cluster labels being processed (used for printing and plot titles).
            max_range: x-extent handed to ``xplot_qc2`` for the tolerance lines.
            tolerance: relative half-width of the acceptance band around the actual value.
            lam: Box-Cox lambda used for the inverse transform.

        Returns:
            dict returned by ``model_prediction_split`` for RandomForestRegressor.
        """

        def model_prediction_split(dataset, selected_model, target, tolerance, model_name, display_flag='display'):
            """
            Fit a scaled ``selected_model`` on a 67/33 random split and QC it on the original scale.

            Columns listed by the original author for the incoming dataset:
            'well', 'FORMATION_up', 'well_1', 'well_2', 'well_3', 'dist_1',
            'dist_2', 'dist_3', 'phit_wavg_1', 'vsh_wavg_1', 'htst_sum_1',
            'phit_wavg_2', 'vsh_wavg_2', 'htst_sum_2', 'phit_wavg_3', 'vsh_wavg_3',
            'htst_sum_3', 'phit_pred', 'phit_wavg_target', 'khtst'

            Returns a dict with keys 'result' (train and test rows with their QC flag),
            'testqc' / 'trainqc' (share of rows flagged 'in', rounded to 2 decimals),
            'train_df' (feature column names) and 'model' (the fitted pipeline).
            """
            # The untransformed 'khtst' is dropped from the features as well.
            drop_lst_X = ['well','FORMATION_up',    'well_1', 'well_2', 'well_3', 'dist_1', 'dist_2','dist_3', 'phit_wavg_target', 'khtst']
            drop_lst_y = ['well','FORMATION_up']

            def qc_table(y_true, y_hat, well_names, subset):
                # Actual-vs-predicted table for one subset; a row is 'in' when the
                # prediction lies within +/- tolerance of the actual value.
                table = pd.DataFrame(zip(y_true, y_hat), columns=['y_orig', 'y_pred'])
                table = pd.concat([well_names, table], axis=1)
                table['up'] = table['y_orig']*(1 + tolerance)
                table['down'] = table['y_orig']*(1 - tolerance)
                table['qc'] = 'out'
                table['dataset'] = subset
                table.loc[(table['y_pred'] <= table.up) & (table['y_pred'] >= table.down), 'qc'] = 'in'
                # .get() instead of ['in']: when no row is inside the band the 'in'
                # label is absent from value_counts and ['in'] raised KeyError.
                share_in = table.qc.value_counts(normalize=True).get('in', 0.0)
                return table, np.float64(share_in).round(2)

            X = dataset.drop(target, axis=1)
            y = dataset[['well','FORMATION_up', target]]

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=55)

            # Keep the identifiers so they can be re-attached to the predictions.
            y_train_wnames = y_train[['well','FORMATION_up']].reset_index(drop=True)
            y_test_wnames = y_test[['well','FORMATION_up']].reset_index(drop=True)

            X_train = X_train.drop(drop_lst_X, axis=1)
            X_test = X_test.drop(drop_lst_X, axis=1)
            y_train = y_train.drop(drop_lst_y, axis=1)
            y_test = y_test.drop(drop_lst_y, axis=1)

            model = Pipeline([("scaler",StandardScaler()),("model", selected_model)])
            model.fit(X_train, y_train)
            # Back-transform predictions and targets; lam comes from the enclosing function.
            y_pred_train = inv_boxcox(model.predict(X_train), lam)
            y_pred_test = inv_boxcox(model.predict(X_test), lam)
            y_train = np.array(inv_boxcox(y_train, lam)).flatten()
            y_test = np.array(inv_boxcox(y_test, lam)).flatten()

            train, trainqc = qc_table(y_train, y_pred_train, y_train_wnames, 'train')
            test, testqc = qc_table(y_test, y_pred_test, y_test_wnames, 'test')
            df = pd.concat([train, test])
            df['y_pred'] = df['y_pred'].astype('float')

            result = {'result':df, 'testqc':testqc, 'trainqc':trainqc, 'train_df':X_train.columns, 'model': model}
            # Both display modes print the same score line; 'display' adds the feature list.
            if display_flag == 'display':
                print(f'features dataset: {list(X_train.columns)}')
            print('train "in":', '{:.2f}'.format(result['trainqc']),
                'test "in":', '{:.2f}'.format(result['testqc']),
                '\t', model_name)
            return result

        print(f'Cluster list is: {cluster_list}')
        target = 'khtst_boxcox'
        model1_kh = model_prediction_split(dataset, RandomForestRegressor(n_jobs=-1, random_state=42), target, tolerance, 'RandomForestRegressor','display')
        model2_kh = model_prediction_split(dataset, BayesianRidge(), target, tolerance, 'BayesianRidge', 'dont_display')
        model3_kh = model_prediction_split(dataset, XGBRegressor(n_jobs=-1, random_state=42, verbosity=0), target, tolerance, 'XGBRegressor', 'dont_display')
        model4_kh = model_prediction_split(dataset, CatBoostRegressor(random_state=42, verbose=False), target, tolerance, 'CatBoostRegressor', 'dont_display')
        model5_kh = model_prediction_split(dataset, AdaBoostRegressor(random_state=42), target, tolerance, 'AdaBoostRegressor', 'dont_display')
        model6_kh = model_prediction_split(dataset, LGBMRegressor(n_jobs=-1, random_state=42, verbose=0, verbosity=-1), target, tolerance, 'LGBMRegressor', 'dont_display')

        # xplot_qc2 is not defined in this function; it has to be available from an outer scope.
        xplot_qc2(model1_kh['result'], model1_kh['trainqc'], model1_kh['testqc'], 'y_orig', 'y_pred', max_range, tolerance, 0, 3, f'RandomForestRegressor {cluster_list}')
        xplot_qc2(model2_kh['result'], model2_kh['trainqc'], model2_kh['testqc'], 'y_orig', 'y_pred', max_range, tolerance, 0, 3, f'BayesianRidge {cluster_list}')
        xplot_qc2(model3_kh['result'], model3_kh['trainqc'], model3_kh['testqc'], 'y_orig', 'y_pred', max_range, tolerance, 0, 3, f'XGBRegressor {cluster_list}')
        xplot_qc2(model4_kh['result'], model4_kh['trainqc'], model4_kh['testqc'], 'y_orig', 'y_pred', max_range, tolerance, 0, 3, f'CatBoostRegressor {cluster_list}')
        xplot_qc2(model5_kh['result'], model5_kh['trainqc'], model5_kh['testqc'], 'y_orig', 'y_pred', max_range, tolerance, 0, 3, f'AdaBoostRegressor {cluster_list}')
        xplot_qc2(model6_kh['result'], model6_kh['trainqc'], model6_kh['testqc'], 'y_orig', 'y_pred', max_range, tolerance, 0, 3, f'LGBMRegressor {cluster_list}')
        return model1_kh
    # khtst benchmark on the Box-Cox target: plot extent 27000, 25 % tolerance.
    model_khtst = run_khtst_pred_split(phitpred_khtst_boxcox, cluster_list, 27000, 0.25, lam)
    # Workflow output: khtst QC table, the combined dataset, the leave-one-well-out
    # porosity predictions and the Box-Cox lambda.
    result = {'khtst_pred':model_khtst['result'], 'khtst_data':phitpred_khtst, 'phit_pred':model['result'], 'boxcox':lam}

    return result
# Run the Box-Cox variant of the workflow defined in this cell on clusters 0, 1 and 2 together.
test_full = khtst_workflow(cluster_list = [0,1,2])

In [None]:
# Tag the khtst dataset of each single-cluster run with its cluster id (the stored
# frames themselves receive the new 'cluster' column), stack them and compare the
# khtst distributions on a log-scaled axis.
test0, test1, test2 = test_0['khtst_data'], test_1['khtst_data'], test_2['khtst_data']
test0['cluster'], test1['cluster'], test2['cluster'] = 0, 1, 2
result = pd.concat([test0, test1, test2])
custom_palette = dict(zip((0, 1, 2), ('blue', 'green', 'red')))
sns.histplot(data=result, x='khtst', hue='cluster', log_scale=True, kde=True, bins=30, palette=custom_palette)

In [63]:
# input_ph8 = input_ph8.sample(frac=1, random_state=42)
# test = model1_ph['model'].predict(input_ph8.iloc[:,6:])
# test_df = pd.DataFrame({'y_pred':test})
# input_ph8_v2 = pd.concat([input_ph8, test_df], axis=1)
# plt.scatter(input_ph8_v2.phit_wavg_target, input_ph8_v2.y_pred)

In [208]:
# def polynomial_regression(dataset, x_var, y_var, degree):
#     x = np.array(dataset[x_var])
#     y = np.array(dataset[y_var])
    
#     # coefficients = np.polyfit(x, y, degree)
#     # poly_function = np.poly1d(coefficients)
#     # y_pred = poly_function(x)

#     x = x[:, np.newaxis]
#     coefficients = PolynomialFeatures(degree)
#     x_poly = coefficients.fit_transform(x)
#     model = LinearRegression()
#     model.fit(x_poly, y)
#     y_pred = model.predict(x_poly)

#     y_test_wnames = dataset[['well','FORMATION_up']]
#     test = pd.DataFrame(zip(dataset['khtst'],y_pred), columns=['y_orig', 'y_pred'])
#     test = pd.concat([y_test_wnames, test], axis=1)

#     test['up'] = test['y_orig']*(1 + tolerance)
#     test['down'] = test['y_orig']*(1 - tolerance)
#     test['qc'] = 'out'
#     test.loc[(test['y_pred'] <= test.up) & (test['y_pred'] >= test.down), 'qc'] = 'in'
#     testqc = test.qc.value_counts(normalize=True)['in']
#     print(f'precent of "in" {testqc:.2f}')

#     plt.scatter(x, y, color='blue', label='Data')
#     plt.scatter(x, y_pred, color='red', label='Polynomial Fit')
#     plt.xlabel(x_var)
#     plt.ylabel(y_var)
#     plt.title('Polynomial Regression')
#     plt.legend()
#     plt.show()
#     return y_pred, coefficients
# y_pred, coefficients = polynomial_regression(phitpred_khtst, 'phit_wavg_target', 'khtst', 3)

In [182]:
# Checking KHtst based on df_bal_net2_kh:
# keep the first row with a non-null KHtst for every (well, FORMATION_up) pair
with_khtst = df_bal_net2_kh[df_bal_net2_kh.KHtst.notna()]
first_per_group = with_khtst.groupby(['well', 'FORMATION_up']).apply(lambda grp: grp.iloc[0])
khtst_rows = first_per_group.reset_index(drop=True)[['well', 'FORMATION_up', 'KHtst']]
# Same table restricted to Balakhany VIII
is_bal8 = khtst_rows.FORMATION_up == 'Balakhany VIII'
khtst_rows8 = khtst_rows[is_bal8]

In [None]:
# Check gas / oil density & porosity:
# PHIT histograms of the FLUIDS == 1 and FLUIDS == 2 samples of Balakhany VIII
bal8_mask = df_bal_net2_kh.FORMATION_up == 'Balakhany VIII'
gas_wells = df_bal_net2_kh[(df_bal_net2_kh.FLUIDS == 1) & bal8_mask]
oil_wells = df_bal_net2_kh[(df_bal_net2_kh.FLUIDS == 2) & bal8_mask]
test = pd.concat([gas_wells, oil_wells])[['well', 'FORMATION_up', 'FLUIDS', 'PHIT']]
custom_palette = {1: 'red', 2: 'green'}
sns.histplot(data=test, x='PHIT', hue='FLUIDS', palette=custom_palette, bins=35, kde=True)

## Verification dataset for CNN

In [142]:
# Load the abnormal PHIT/VSH samples and drop the 'Unnamed: 0' column
# (presumably the index written out when the CSV was saved — confirm).
abnormal = pd.read_csv(r'C:\jupyter\SPP\inputoutput\Abnormal_PHIT_VSH_samples.csv').drop('Unnamed: 0', axis=1)

In [None]:
# List the distinct wells present in the abnormal-samples table.
abnormal.well.unique()

In [None]:
# Most of the problems come from wells where the gamma-ray (GR) log exists but the density log is missing, i.e. there is no porosity.
# All CHIRAG and GCA wells have to be excluded from the training set.
# The remaining peculiarities are commented below.
# C33 - minor bad GR data in the interval 1626-1630, remove the bad block
# A17ST1 - anomalously low GR and shale volume, remove the well from the set entirely
# CHIRAG6 - density log is bad throughout, remove the well entirely
# C03Z - something unclear in the interval 1000-1010, this block must be removed from the training set; 1055-1065 has bad density/neutron log data
# E39 - small depth mismatch at 2415-2417.5, keep an eye on it
# A20 - bad density log in the interval 320-335, remove the block
# GCA1 - density log is bad throughout, remove the well entirely
# A12W - the reservoir at 300-310 is interrupted by a very thick tight streak (about 7.5 m), remove the block from training
# B22 - bad density log in the interval 1935-1950, remove the block from the set
# GCA6Y - bad density log, remove the well entirely
# A12V - thick tight streaks inside the reservoirs in the interval 42-57, remove the block from the set
# C01 - bad density log in the interval 2310-2330, remove the block from the set
# G01Y - dropouts in the density log make porosity = 0; the dropouts should be filled with mean porosity values

In [None]:
# Re-check the wells up to A17ST1
def abnormnal_display(dataset, wellname):
    """
    Plot PHIT and VSH against TST for one well of the abnormal-samples table.

    dataset = table with well, PHIT, VSH and TST columns
    wellname = well to display (also used as the plot title)
    """
    well_rows = dataset[(dataset.well == wellname)]
    tst = well_rows.TST
    fig, phit_ax = plt.subplots(figsize=(2, 6))
    # PHIT curve with a dashed reference line at PHIT = 0.13
    phit_ax.plot(well_rows.PHIT, tst, c='green', linestyle='dashed', lw=2, zorder=1)
    phit_ax.vlines(0.13, ymin=min(tst), ymax=max(tst), color='green', linestyle='dashed', lw=1)
    # VSH is drawn on a twin x axis with its own scale
    vsh_ax = phit_ax.twiny()
    vsh_ax.plot(well_rows.VSH, tst, color='lightgreen', alpha=0.8, zorder=2)
    vsh_ax.set_xlim(-0.1, 1.1)
    phit_ax.set_xlim(0, 0.3)
    # Flip both axes of the PHIT panel
    phit_ax.invert_yaxis()
    phit_ax.invert_xaxis()
    phit_ax.set_title(wellname)
    phit_ax.grid()


wellname = 'E39'
abnormnal_display(abnormal, wellname)

In [None]:
def well_display_khtst_v2(dataset, wellname, fmname, net_var, comments, ref_depth, fm_flag, depth_step, kh_include, print):
    """
    Draw a four-track plot for one well and formation:
    GR_N with VSH, RHOB with NPSS, PHIT with the net flag, LPERM with KHtst.

    dataset = df_bal or something else
    wellname = well to display
    fmname = formation to display; FORMATION tops whose name contains it are drawn as horizontal lines
    net_var = NET or FLUIDS_int
    comments = put what you want (appended to the figure title)
    ref_depth = MD or TST
    fm_flag = 1 if you need a FORMATION_up, 0 if just a simple FORMATION
    depth_step = step for ticks on the diagram
    kh_include = 1 if we have KHtst in dataset, 0 if there is not KHtst
    print = 1 if we want to save the plot as PNG (the name shadows the builtin inside this function)

    Raises ValueError if fm_flag is not 0 or 1, or if no rows match the well and formation.
    The formation tops are read from the global df_bal, not from `dataset`.
    """
    if fm_flag == 0:
        selection = (dataset.well==wellname) & (dataset.FORMATION == fmname)
    elif fm_flag == 1:
        selection = (dataset.well==wellname) & (dataset.FORMATION_up == fmname)
    else:
        raise ValueError(f'fm_flag must be 0 or 1, got {fm_flag!r}')
    # Work on a copy so the placeholder KHtst column below is never written into `dataset`
    data = dataset[selection].copy()
    if data.empty:
        raise ValueError(f'no rows for well {wellname!r} and formation {fmname!r}')
    depth = data[ref_depth]
    grn = data['GR_N']
    vsh = data['VSH']
    rhob = data['RHOB']
    npss = data['NPSS']
    rdeep = data['RDEEP']  # only needed by the disabled resistivity track below
    phit = data['PHIT']
    net = data[net_var]
    perm = data['LPERM']
    if kh_include != 1:
        data['KHtst'] = 0
    kh = data['KHtst']
    depth_min, depth_max = min(depth), max(depth)

    fig, ax = plt.subplots(1,4, figsize=(7,7), sharey=True)
    # First ref_depth value of every FORMATION of the well, restricted to the names containing fmname
    well_bal_tops = df_bal[(df_bal.well == wellname)].groupby('FORMATION')[ref_depth].apply(lambda x: x.iloc[0]).reset_index()
    tops = well_bal_tops[well_bal_tops.FORMATION.str.contains(fmname)]

    # NOTE(review): the panels share the y axis and each one calls invert_yaxis();
    # the statement order below is kept exactly as in the original on purpose.
    # Track 1: GR_N with VSH on a twin axis
    ax[0].yaxis.set_ticks(np.arange(depth_min, depth_max, depth_step))
    ax[0].plot(grn, depth, color='lightgreen', lw=3, zorder=10)
    ax[0].invert_yaxis()
    ax[0].set_xlim(-5, 150)
    ax[0].grid(axis='y')
    for top_depth in tops[ref_depth]:
        ax[0].hlines(top_depth, xmin=0, xmax=1000, lw=2, color='black', alpha=0.33)
    twin0 = ax[0].twiny()
    twin0.plot(vsh, depth, color='black', alpha=0.5, zorder=5)
    twin0.vlines(0.5, ymin=depth_min, ymax=depth_max, color='black', lw=1, linestyle='dashed')
    twin0.set_xlim(-0.1, 1.25)

    # Track 2: RHOB with NPSS on a reversed twin axis, tops are labelled here
    ax[1].plot(rhob, depth, color='red')
    ax[1].invert_yaxis()
    ax[1].xaxis.set_ticks(np.arange(1.65, 2.65, 0.3))
    ax[1].set_xlim(1.65, 2.65)
    ax[1].grid(axis='y')
    ax[1].grid(axis='x')
    for top_name, top_depth in zip(tops.FORMATION, tops[ref_depth]):
        ax[1].hlines(top_depth, xmin=0, xmax=150, lw=2, color='black', alpha=0.33)
        ax[1].text(1.67, top_depth+0.5*depth_step, top_name, fontsize = 7, color ="black")
    twin1 = ax[1].twiny()
    twin1.plot(npss, depth, color='blue')
    twin1.set_xlim(0.6, 0)

    # Track 3: PHIT (reversed axis, dashed line at 0.13) with the net flag as a filled curve
    # ax[2].plot(rdeep, depth, color='black'), ax[2].set_xscale('log'), ax[2].set_xlim(0.1, 50), ax[2].invert_yaxis(), ax[2].grid(axis='x', which='both')
    ax[2].plot(phit, depth, color='green', linestyle='dashed')
    ax[2].set_xlim(0.3, 0)
    ax[2].grid(axis='x')
    ax[2].set_xticks([0, 0.1, 0.2, 0.3])
    ax[2].invert_yaxis()
    ax[2].grid(axis='y')
    ax[2].vlines(0.13, ymin=depth_min, ymax=depth_max, color='black', linestyle='dashed')
    twin2 = ax[2].twiny()
    twin2.plot(net, depth, color='orange', linewidth=0.5)
    twin2.fill_betweenx(depth,net, color='orange', alpha=0.33)
    twin2.set_xlim(0, 1)
    twin2.set_ylim(depth_min, depth_max)

    # Track 4: LPERM on a log axis with KHtst on a twin axis
    ax[3].plot(perm, depth, color='purple', alpha=0.66)
    ax[3].set_xscale('log')
    ax[3].set_xlim(0.1, 1000)
    ax[3].invert_yaxis()
    ax[3].grid(axis='y')
    for top_depth in tops[ref_depth]:
        ax[3].hlines(top_depth, xmin=0, xmax=1000, lw=2, color='black', alpha=0.66)
    twin4 = ax[3].twiny()
    twin4.plot(kh, depth, color='black', alpha=1)

    # The title shows the largest non-null KHtst; an all-NaN column gives 'nan' instead of crashing
    kh_valid = kh.dropna()
    kh_max = max(kh_valid) if len(kh_valid) else float('nan')
    fig.suptitle(wellname + ' ' + fmname + ' ' + ref_depth + ' ' + str(round(kh_max,0)) + ' ' + str(comments), fontsize=14)
    fig.tight_layout()
    if print == 1:
        path = 'C:\\jupyter\\SPP\\inputoutput\\wellplots\\'
        fig.savefig(path + fmname.replace(' ','') + '_' + wellname + '.png')


wellname = 'C15'
well_display_khtst_v2(df_bal_net2_kh, wellname, 'Balakhany VIII', 'NET_clp2', 'test', 'TST', 1, 10, 1, 0)

In [None]:
def calculation_tst_per_platform(fm):
    """
    Mean TST span of every FORMATION inside formation `fm`, per platform.

    fm = FORMATION_up value to analyse
    The span of a well is the last TST value minus the first one within each FORMATION.
    Reads the global df_bal_net2_kh and returns a table with platform, FORMATION and TST_mean (rounded to 0 decimals).
    """
    in_formation = df_bal_net2_kh.FORMATION_up == fm
    per_platform = []
    for platform in df_bal_net2_kh.field.unique():
        subset = df_bal_net2_kh[(df_bal_net2_kh.field == platform) & in_formation]
        spans = (
            subset.groupby(['well', 'FORMATION'])['TST']
            .apply(lambda tst: tst.iloc[-1] - tst.iloc[0])
            .reset_index()
        )
        mean_span = spans.groupby('FORMATION')['TST'].mean()
        table = pd.DataFrame({'platform': platform, 'TST_mean': mean_span}).reset_index()
        table['TST_mean'] = table['TST_mean'].round(0)
        per_platform.append(table[['platform', 'FORMATION', 'TST_mean']])
    return pd.concat(per_platform).reset_index(drop=True)


calculation_tst_per_platform('Balakhany VIII')
# calculation_tst_per_platform('Balakhany X')

## Geopandas Zone

In [None]:
# surf1 = np.loadtxt('C:\jupyter\SPP\input\surfaces\PW_H10_Dec22_CACI_5176_M400000_QLSKPrSDM_SCF_balVIIIs_ismat4')
# X = surf1[:,0] 
# Y = surf1[:,1] 
# Z = surf1[:,2]
# plt.scatter(X,Y, c=Z)
# plt.colorbar(label='depth')

In [9]:
def convert_linestringz_polygon(dataset):
    """
    Turn every line geometry of `dataset` into a 2D polygon.

    dataset = GeoDataFrame whose geometries expose a flat coordinate sequence via shapely `mapping`
    Only the first two ordinates (x, y) of each vertex are kept.
    Returns a GeoDataFrame with one polygon per input geometry, CRS EPSG:2499 and a fresh 0..n-1 index.
    An empty input gives an empty GeoDataFrame.
    """
    polygons = []
    for line in dataset.geometry:
        coords = mapping(line)['coordinates']
        polygons.append(Polygon([(vertex[0], vertex[1]) for vertex in coords]))
    # Build the frame once instead of concatenating one single-row frame per geometry
    return gpd.GeoDataFrame(geometry=polygons, crs='EPSG:2499')


# Geobody outlines exported from Petrel, converted from lines to polygons
bal8_1510_3 = gpd.read_file(r'C:\jupyter\SPP\input\surfaces\petrel\BalakhanyVIII_1510_base_3.shp').set_crs('EPSG:2499')
bal8_20_3 = gpd.read_file(r'C:\jupyter\SPP\input\surfaces\petrel\BalakhanyVIII_20_base_3.shp').set_crs('EPSG:2499')
bal8_30_3 = gpd.read_file(r'C:\jupyter\SPP\input\surfaces\petrel\BalakhanyVIII_30_base_3.shp').set_crs('EPSG:2499')
bal8_1510_3_polygon = convert_linestringz_polygon(bal8_1510_3)
bal8_20_3_polygon = convert_linestringz_polygon(bal8_20_3)
bal8_30_3_polygon = convert_linestringz_polygon(bal8_30_3)

In [10]:
# Available fields: 'CHIRAG', 'CENTRAL AZERI', 'WEST AZERI', 'EAST AZERI', 'DWG', 'DDGG', 'WEST CHIRAG'
def polygon_by_field(dataset, field, buffer):
    """
    Convex hull around the buffered Balakhany VIII well locations of one field.

    dataset = table with well, field, FORMATION_up, X_mean and Y_mean columns
    field = field name to select
    buffer = radius of the circle drawn around every well before the hull is taken
    Returns a GeoSeries holding the hull polygon.
    """
    wanted = (dataset.FORMATION_up == 'Balakhany VIII') & (dataset.field == field)
    # One mean X/Y location per well
    well_xy = dataset.loc[wanted, ['well', 'X_mean', 'Y_mean']].groupby('well').mean().reset_index()
    points = list(map(Point, zip(well_xy['X_mean'], well_xy['Y_mean'])))
    wells_gdf = gpd.GeoDataFrame(well_xy, geometry=points).drop(['X_mean', 'Y_mean'], axis=1)
    # Buffer circles are joined next to the points as 'geometry_polygon'
    circles = gpd.GeoDataFrame(geometry=wells_gdf.buffer(buffer))
    wells_gdf = wells_gdf.join(circles, rsuffix='_polygon')
    wells_gdf = gpd.GeoDataFrame(wells_gdf, geometry='geometry_polygon').set_crs('EPSG:2499')
    return gpd.GeoSeries(wells_gdf['geometry_polygon'].unary_union.convex_hull)


dwg = polygon_by_field(df_bal_net2_kh, 'DWG', 500)
chirag = polygon_by_field(df_bal_net2_kh, 'CHIRAG', 500)
wchirag = polygon_by_field(df_bal_net2_kh, 'WEST CHIRAG', 500)
cazeri = polygon_by_field(df_bal_net2_kh, 'CENTRAL AZERI', 500)
wazeri = polygon_by_field(df_bal_net2_kh, 'WEST AZERI', 500)
eazeri = polygon_by_field(df_bal_net2_kh, 'EAST AZERI', 500)

In [38]:
def draw_polygons_n_points(dataset, fm):
    """
    Build well points, 250-unit buffer circles and their common convex hull for one formation.

    dataset = table with well, FORMATION_up, X_mean and Y_mean columns
    fm = FORMATION_up value to select
    Returns (buffer GeoDataFrame, point GeoDataFrame, GeoSeries with the hull), all in EPSG:2499.
    """
    well_xy = dataset.loc[dataset.FORMATION_up == fm, ['well', 'X_mean', 'Y_mean']]
    well_xy = well_xy.groupby('well').mean().reset_index()
    points = list(map(Point, zip(well_xy['X_mean'], well_xy['Y_mean'])))
    wells_gdf = gpd.GeoDataFrame(well_xy, geometry=points).drop(['X_mean', 'Y_mean'], axis=1)

    # Circles are joined next to the points as 'geometry_polygon', then split into two frames
    circles = gpd.GeoDataFrame(geometry=wells_gdf.buffer(250))
    wells_gdf = wells_gdf.join(circles, rsuffix='_polygon')
    buff = gpd.GeoDataFrame(wells_gdf, geometry='geometry_polygon').drop('geometry', axis=1).set_crs('EPSG:2499')
    pts = gpd.GeoDataFrame(wells_gdf, geometry='geometry').drop('geometry_polygon', axis=1).set_crs('EPSG:2499')

    hull = gpd.GeoSeries(buff.unary_union.convex_hull)
    return buff, pts, hull


bdl8_xy_buff, bdl8_xy_points, fields_polyg_hull = draw_polygons_n_points(df_bal_net2_kh, 'Balakhany VIII')

In [None]:
def calc_khtst_by_fu():
    """
    Sum the per-well KHtst difference (first minus last sample) for every Balakhany VIII FORMATION.

    Uses the Azeri-field rows of the global df_bal_net2_kh that have a non-null KHtst.
    Returns a table with FORMATION and KHtst, largest total first.
    """
    azeri_fields = ['CENTRAL AZERI', 'WEST AZERI', 'EAST AZERI']
    rows = df_bal_net2_kh[(df_bal_net2_kh.field.isin(azeri_fields)) &
                          (df_bal_net2_kh.FORMATION.str.contains('Balakhany VIII')) &
                          (df_bal_net2_kh.KHtst.notna())]
    per_well = (
        rows.groupby(['well', 'FORMATION'])['KHtst']
        .apply(lambda kh: kh.iloc[0] - kh.iloc[-1])
        .reset_index()
    )
    totals = per_well.groupby('FORMATION')['KHtst'].sum().reset_index()
    return totals.sort_values(by='KHtst', ascending=False)


calc_khtst_by_fu()

In [None]:
# Wells whose location falls inside the bal8_30_3 geobody polygons.
# NOTE(review): the variable name says 25 while the polygon is bal8_30_3 — confirm which flow unit is meant.
# NOTE(review): `op=` appears to be the older spelling of sjoin's `predicate=` keyword — check the installed geopandas before changing.
well_into_bal8_25_3 = gpd.sjoin(bdl8_xy_points, bal8_30_3_polygon, op='within')

def gpd_polygons_wells(geobody_polygons, wells_points_df, title, geobody_color='yellow', geobody_label='bal8_30_3'):
    """
    Map of a geobody, the wells inside it, all wells and the field outlines.

    geobody_polygons = GeoDataFrame with the geobody polygons
    wells_points_df = wells located inside the geobody (needs well and geometry columns)
    title = plot title
    geobody_color = fill colour of the geobody
    geobody_label = legend label of the geobody (was hard-coded to 'bal8_1510_3' for every geobody)
    Also draws the globals bdl8_xy_points, dwg, chirag, wchirag, cazeri, wazeri and eazeri.
    """
    fig, ax = plt.subplots(figsize=(10,8))
    geobody_polygons.plot(ax=ax, color=geobody_color, label=geobody_label, alpha=0.66)

    # Stars mark the wells inside the geobody, small dots mark every well
    wells_points_df[['well','geometry']].plot(ax=ax, color='black', marker='*', markersize = 50, alpha=0.5, ec='black')
    bdl8_xy_points.plot(ax=ax, markersize = 1, color='black', label='wells', alpha=1)

    # West Chirag used to carry the label 'chirag', duplicating the Chirag entry
    field_layers = [
        (dwg, 'orange', 'dwg'),
        (chirag, 'red', 'chirag'),
        (wchirag, 'purple', 'w chirag'),
        (cazeri, 'green', 'c azeri'),
        (wazeri, 'blue', 'w azeri'),
        (eazeri, 'turquoise', 'e azeri'),
    ]
    for outline, color, label in field_layers:
        outline.plot(ax=ax, alpha=0.25, color=color, label=label)
    ax.legend()
    ax.set_title(title)
gpd_polygons_wells(bal8_30_3_polygon, well_into_bal8_25_3, 'Polygons of Balakhany VIII 30 body #3 & wells (buffer 250m)')

In [None]:
def wells_in_out_polygon(dataset, wells_points_df, flow_units, title):
    """
    Compare the KHtst of wells inside and outside a geobody (KDE and box plot).

    dataset = log table with well, field, FORMATION and KHtst columns
    wells_points_df = wells located inside the geobody
    flow_units = list of FORMATION values to use
    title = title of both panels
    KHtst per well is the first sample minus the last one; only the three Azeri fields are used.
    Returns (wells_points_df, table with well, KHtst and the 'in'/'out' geobody tag).
    """
    azeri_fields = ['CENTRAL AZERI', 'WEST AZERI', 'EAST AZERI']
    candidates = dataset[(dataset.field.isin(azeri_fields))
                         & (dataset.FORMATION.isin(flow_units))]
    inside = candidates.well.isin(wells_points_df.well.values)

    def khtst_delta(rows, tag):
        delta = rows.groupby('well')['KHtst'].apply(lambda kh: kh.iloc[0] - kh.iloc[-1]).reset_index()
        delta['geobody'] = tag
        return delta

    concat_df = pd.concat([khtst_delta(candidates[inside], 'in'),
                           khtst_delta(candidates[~inside], 'out')])

    custom_palette = {'in': 'red', 'out': 'lightgreen'}
    fig, (kde_ax, box_ax) = plt.subplots(1, 2, figsize=(11, 4))
    sns.kdeplot(concat_df, x='KHtst', hue='geobody', log_scale=False, palette=custom_palette, ax=kde_ax)
    sns.boxplot(concat_df, x="geobody", y="KHtst", palette=custom_palette, ax=box_ax)
    for panel in (kde_ax, box_ax):
        panel.set_title(title)
    kde_ax.grid(which='both')
    return wells_points_df, concat_df


well_into_bal8_25_3, concat_df_25 = wells_in_out_polygon(df_bal_net2_kh, well_into_bal8_25_3, ['Balakhany VIII 25'],
                                                        'bal8_30_3 polygon')

In [None]:
# Wells whose location falls inside the bal8_20_3 geobody polygons.
# NOTE(review): `op=` appears to be the older spelling of sjoin's `predicate=` keyword — check the installed geopandas before changing.
well_into_bal8_20_3 = gpd.sjoin(bdl8_xy_points, bal8_20_3_polygon, op='within')

def gpd_polygons_wells(geobody_polygons, wells_points_df, title, geobody_color='orange', geobody_label='bal8_20_3'):
    """
    Map of a geobody, the wells inside it, all wells and the field outlines.

    geobody_polygons = GeoDataFrame with the geobody polygons
    wells_points_df = wells located inside the geobody (needs well and geometry columns)
    title = plot title
    geobody_color = fill colour of the geobody
    geobody_label = legend label of the geobody (was hard-coded to 'bal8_1510_3' for every geobody)
    Also draws the globals bdl8_xy_points, dwg, chirag, wchirag, cazeri, wazeri and eazeri.
    """
    fig, ax = plt.subplots(figsize=(10,8))
    geobody_polygons.plot(ax=ax, color=geobody_color, label=geobody_label, alpha=0.66)

    # Stars mark the wells inside the geobody, small dots mark every well
    wells_points_df[['well','geometry']].plot(ax=ax, color='black', marker='*', markersize = 50, alpha=0.5, ec='black')
    bdl8_xy_points.plot(ax=ax, markersize = 1, color='black', label='wells', alpha=1)

    # West Chirag used to carry the label 'chirag', duplicating the Chirag entry
    field_layers = [
        (dwg, 'orange', 'dwg'),
        (chirag, 'red', 'chirag'),
        (wchirag, 'purple', 'w chirag'),
        (cazeri, 'green', 'c azeri'),
        (wazeri, 'blue', 'w azeri'),
        (eazeri, 'turquoise', 'e azeri'),
    ]
    for outline, color, label in field_layers:
        outline.plot(ax=ax, alpha=0.25, color=color, label=label)
    ax.legend()
    ax.set_title(title)
gpd_polygons_wells(bal8_20_3_polygon, well_into_bal8_20_3, 'Polygons of Balakhany VIII 20 body #3 & wells (buffer 250m)')

In [None]:
def wells_in_out_polygon(dataset, wells_points_df, flow_units, title):
    """
    Compare the KHtst of wells inside and outside a geobody (KDE and box plot).

    dataset = log table with well, field, FORMATION and KHtst columns
    wells_points_df = wells located inside the geobody
    flow_units = list of FORMATION values to use
    title = title of both panels
    KHtst per well is the first sample minus the last one; only the three Azeri fields are used.
    Returns wells_points_df unchanged.
    """
    azeri_fields = ['CENTRAL AZERI', 'WEST AZERI', 'EAST AZERI']
    candidates = dataset[(dataset.field.isin(azeri_fields))
                         & (dataset.FORMATION.isin(flow_units))]
    inside = candidates.well.isin(wells_points_df.well.values)

    def khtst_delta(rows, tag):
        delta = rows.groupby('well')['KHtst'].apply(lambda kh: kh.iloc[0] - kh.iloc[-1]).reset_index()
        delta['geobody'] = tag
        return delta

    concat_df = pd.concat([khtst_delta(candidates[inside], 'in'),
                           khtst_delta(candidates[~inside], 'out')])

    custom_palette = {'in': 'red', 'out': 'lightgreen'}
    fig, (kde_ax, box_ax) = plt.subplots(1, 2, figsize=(11, 4))
    sns.kdeplot(concat_df, x='KHtst', hue='geobody', log_scale=False, palette=custom_palette, ax=kde_ax)
    sns.boxplot(concat_df, x="geobody", y="KHtst", palette=custom_palette, ax=box_ax)
    for panel in (kde_ax, box_ax):
        panel.set_title(title)
    kde_ax.grid(which='both')
    return wells_points_df


well_into_bal8_20_3 = wells_in_out_polygon(df_bal_net2_kh, well_into_bal8_20_3, ['Balakhany VIII 20'], 'bal8_20_3 polygon')

In [None]:
# Wells whose location falls inside the bal8_1510_3 geobody polygons.
# NOTE(review): `op=` appears to be the older spelling of sjoin's `predicate=` keyword — check the installed geopandas before changing.
well_into_bal8_1510_3 = gpd.sjoin(bdl8_xy_points, bal8_1510_3_polygon, op='within')

def gpd_polygons_wells(geobody_polygons, wells_points_df, title, geobody_color='red', geobody_label='bal8_1510_3'):
    """
    Map of a geobody, the wells inside it, all wells and the field outlines.

    geobody_polygons = GeoDataFrame with the geobody polygons
    wells_points_df = wells located inside the geobody (needs well and geometry columns)
    title = plot title
    geobody_color = fill colour of the geobody
    geobody_label = legend label of the geobody
    Also draws the globals bdl8_xy_points, dwg, chirag, wchirag, cazeri, wazeri and eazeri.
    """
    fig, ax = plt.subplots(figsize=(10,8))
    geobody_polygons.plot(ax=ax, color=geobody_color, label=geobody_label, alpha=0.66)

    # Stars mark the wells inside the geobody, small dots mark every well
    wells_points_df[['well','geometry']].plot(ax=ax, color='black', marker='*', markersize = 50, alpha=0.5, ec='black')
    bdl8_xy_points.plot(ax=ax, markersize = 1, color='black', label='wells', alpha=1)

    # West Chirag used to carry the label 'chirag', duplicating the Chirag entry
    field_layers = [
        (dwg, 'orange', 'dwg'),
        (chirag, 'red', 'chirag'),
        (wchirag, 'purple', 'w chirag'),
        (cazeri, 'green', 'c azeri'),
        (wazeri, 'blue', 'w azeri'),
        (eazeri, 'turquoise', 'e azeri'),
    ]
    for outline, color, label in field_layers:
        outline.plot(ax=ax, alpha=0.25, color=color, label=label)
    ax.legend()
    ax.set_title(title)
gpd_polygons_wells(bal8_1510_3_polygon, well_into_bal8_1510_3,'Polygons of Balakhany VIII 15 10 body #3 & wells (buffer 250m)')

In [None]:
def wells_in_out_polygon(dataset, wells_points_df, flow_units, title):
    """
    Compare the KHtst of wells inside and outside a geobody (KDE and box plot).

    dataset = log table with well, field, FORMATION and KHtst columns
    wells_points_df = wells located inside the geobody
    flow_units = list of FORMATION values to use
    title = title of both panels
    KHtst per well is the first sample minus the last one; only the three Azeri fields are used.
    Returns wells_points_df unchanged.
    """
    azeri_fields = ['CENTRAL AZERI', 'WEST AZERI', 'EAST AZERI']
    candidates = dataset[(dataset.field.isin(azeri_fields))
                         & (dataset.FORMATION.isin(flow_units))]
    inside = candidates.well.isin(wells_points_df.well.values)

    def khtst_delta(rows, tag):
        delta = rows.groupby('well')['KHtst'].apply(lambda kh: kh.iloc[0] - kh.iloc[-1]).reset_index()
        delta['geobody'] = tag
        return delta

    concat_df = pd.concat([khtst_delta(candidates[inside], 'in'),
                           khtst_delta(candidates[~inside], 'out')])

    custom_palette = {'in': 'red', 'out': 'lightgreen'}
    fig, (kde_ax, box_ax) = plt.subplots(1, 2, figsize=(11, 4))
    sns.kdeplot(concat_df, x='KHtst', hue='geobody', log_scale=False, palette=custom_palette, ax=kde_ax)
    sns.boxplot(concat_df, x="geobody", y="KHtst", palette=custom_palette, ax=box_ax)
    for panel in (kde_ax, box_ax):
        panel.set_title(title)
    kde_ax.grid(which='both')
    return wells_points_df


well_into_bal8_1510_3 = wells_in_out_polygon(df_bal_net2_kh, well_into_bal8_1510_3, ['Balakhany VIII 15', 'Balakhany VIII 10'],
                                             'bal8_1510_3 polygon')

## Checking results of CNN prediction

In [None]:
# Stretch one curve to another one with python

import numpy as np
from scipy.optimize import minimize

# Define two curves (arrays)
curve1 = np.array([1, 2, 3, 4, 5])
curve2 = np.array([1.5, 2.8, 3.3, 4.2, 4.9])

# Normalize curves
curve1_norm = curve1 / curve1.max()
curve2_norm = curve2 / curve2.max()

# Define a function to minimize the difference between the curves
def objective(params):
    """Sum of squared residuals between curve2_norm and the scaled, shifted curve1_norm."""
    scale_norm, shift_norm = params
    return np.sum((curve2_norm - scale_norm * curve1_norm - shift_norm) ** 2)

# Minimize the objective function to find scaling and shifting parameters
initial_guess = [1.0, 0.0]  # Initial guess for scale and shift
result = minimize(objective, initial_guess)

# The optimum is expressed in normalized units. Convert it back to the units of
# the raw curves before applying it to curve1, otherwise the stretched curve is
# off by the ratio of the two maxima.
scale_norm, shift_norm = result.x
scale = scale_norm * curve2.max() / curve1.max()
shift = shift_norm * curve2.max()

# Stretch curve1 to match curve2
stretched_curve1 = scale * curve1 + shift

print("Scaling factor:", scale)
print("Shift factor:", shift)
print("Stretched curve 1:", stretched_curve1)

In [None]:
def interpolate_by_depth_fm(dataset_logs, formation_name, step):
    """
    Resample the logs of every well onto a regular TST grid with linear interpolation.

    dataset_logs = log samples (well, TST, X_traj, Y_traj, PHIT, TVD_SCS, NET_clp2, LPERM, KHtst)
    formation_name = value written into the FORMATION_up column of the result
    step = TST spacing of the new grid
    Returns the concatenated per-well tables: well, FORMATION_up, tst_index, TST and the interpolated curves.
    """
    curves = ['X_traj', 'Y_traj', 'PHIT', 'TVD_SCS', 'NET_clp2', 'LPERM', 'KHtst']

    def interpolate_by_depth(one_well, formation_name, step):
        one_well = one_well.sort_values(by='TST')
        well_name = one_well["well"].iloc[0]
        data_range = np.floor((one_well["TST"].max() - one_well["TST"].min())/step)
        starting_tst = one_well["TST"].iloc[0]
        # The grid starts one step below the first sample; the spacing now follows
        # `step` (it used to be hard-coded to 0.1, whatever `step` was)
        new_TST_values = [starting_tst + i*step for i in range(1, int(data_range))]
        n_samples = len(new_TST_values)
        new_data = {
            'well': [well_name] * n_samples,
            'FORMATION_up': [formation_name] * n_samples,
            'tst_index': list(range(n_samples)),
            'TST': new_TST_values,
        }
        for curve in curves:
            interpolator = interp1d(one_well['TST'], one_well[curve], kind='linear', fill_value="extrapolate")
            new_data[curve] = interpolator(new_TST_values)
        return pd.DataFrame(new_data)

    df_lst = []
    print(f'Start interpolation of {formation_name}')
    for wellnames in tqdm(dataset_logs.well.unique()):
        well_sel = dataset_logs[dataset_logs.well == wellnames]
        df_lst.append(interpolate_by_depth(well_sel, formation_name, step))
    return pd.concat(df_lst)


well_bal8_interp = interpolate_by_depth_fm(df_bal_net2_kh[(df_bal_net2_kh.FORMATION_up == 'Balakhany VIII')], 'Balakhany VIII', 0.1)
well_bal8_interp.columns

In [None]:
# Load the log table that carries the predicted porosity, keep the Balakhany VIII rows
# with a fresh 0..n-1 index and show the available columns.
phit_aecod = pd.read_csv(r'C:\jupyter\SPP\inputoutput\df_bal_net2_kh_with_prediction.csv')
phit_aecod8 = phit_aecod[phit_aecod.FORMATION_up == 'Balakhany VIII'].reset_index(drop=True)
phit_aecod8.columns

In [346]:
# Net samples only (NET_clp2 == 1); TST is rounded to 2 decimals so both tables share join keys
join_keys = ['well', 'TST']
phit_aecod8_v2 = phit_aecod8.loc[phit_aecod8.NET_clp2 == 1,
                                 ['well', 'FORMATION_up', 'TST', 'PHIT_predicted', 'NET_clp2']]
phit_aecod8_v2 = phit_aecod8_v2.assign(TST=phit_aecod8_v2['TST'].round(2))

well_bal8_interp_v2 = well_bal8_interp.loc[well_bal8_interp.NET_clp2 == 1,
                                           ['well', 'FORMATION_up', 'TST', 'PHIT', 'NET_clp2']]
well_bal8_interp_v2 = well_bal8_interp_v2.assign(TST=well_bal8_interp_v2['TST'].round(2))

# Original samples on the left, predicted ones joined with the '_pred' suffix
predicted_indexed = phit_aecod8_v2.set_index(join_keys)
orig_pred = well_bal8_interp_v2.set_index(join_keys).join(predicted_indexed, rsuffix='_pred').reset_index()

# Attach the field of every well (first row per well) and split Azeri from the rest.
# NOTE: `chirag` rebinds the name that also holds the Chirag field polygon built by polygon_by_field.
fields = df_bal_net2_kh[['well', 'field']].groupby('well').apply(lambda grp: grp.iloc[0]).reset_index(drop=True)
orig_pred_fields = orig_pred.set_index('well').join(fields.set_index('well')).reset_index()
is_azeri = orig_pred_fields.field.str.contains('AZERI')
azeri = orig_pred_fields[is_azeri]
chirag = orig_pred_fields[~is_azeri]

In [None]:
# PHIT versus PHIT_predicted density plots for the Chirag and Azeri groups, with the
# 1:1 line in red and lines ending at 95% and 105% of the maximum in blue
min_phit_c = 0
max_phit_c = 0.35
min_phit_a = 0
max_phit_a = 0.35
fig, ax = plt.subplots(1, 2, figsize=(18, 6))
panels = [
    (ax[0], chirag, 'Chirag', min_phit_c, max_phit_c),
    (ax[1], azeri, 'Azeri', min_phit_a, max_phit_a),
]
for panel, frame, label, lo, hi in panels:
    sns.kdeplot(data=frame, x='PHIT', y='PHIT_predicted', ax=panel, alpha=0.5)
    panel.plot([lo, hi], [lo, hi], ls='--', color='red')
    for factor in (0.95, 1.05):
        panel.plot([lo, hi], [lo, hi*factor], ls='--', color='blue')
    panel.set_title(label)
    panel.set_xlim(0.1, 0.35)
    panel.set_ylim(0.1, 0.35)

In [321]:
def calculation_phit_wavg_orig_pred():
    """
    Thickness-weighted average porosity per well, original versus predicted.

    Uses the net samples (NET_clp2 == 1) of the globals well_bal8_interp (PHIT) and
    phit_aecod8 (PHIT_predicted) with a fixed weight of 0.1 per sample, joins both
    per well (predicted columns get the '_pred' suffix) and adds the field from df_bal_net2_kh.
    Returns (wells whose field does not contain 'AZERI', wells whose field contains 'AZERI').
    """
    def phit_wavg_calc(dataset, var):
        weighted_col = var + '_v2'
        net_rows = dataset.loc[dataset.NET_clp2 == 1, ['well', 'FORMATION_up', 'TST', var, 'NET_clp2']]
        net_rows = net_rows.assign(**{weighted_col: net_rows[var] * 0.1})
        # Net thickness and thickness-weighted porosity sum of every well
        net_thickness = net_rows.groupby('well')['NET_clp2'].sum().reset_index()
        net_thickness['NET_clp2'] = net_thickness['NET_clp2']*0.1
        weighted_sum = net_rows.groupby('well')[weighted_col].sum().reset_index()
        summary = weighted_sum.set_index('well').join(net_thickness.set_index('well')).reset_index()
        summary['phit_wavg'] = summary[weighted_col] / summary['NET_clp2']
        return summary

    predicted_net = phit_aecod8.loc[phit_aecod8.NET_clp2 == 1,
                                    ['well', 'FORMATION_up', 'TST', 'PHIT_predicted', 'NET_clp2']]
    original_net = well_bal8_interp.loc[well_bal8_interp.NET_clp2 == 1,
                                        ['well', 'FORMATION_up', 'TST', 'PHIT', 'NET_clp2']]

    pred = phit_wavg_calc(predicted_net, 'PHIT_predicted')
    orig = phit_wavg_calc(original_net, 'PHIT')
    fields = df_bal_net2_kh[['well', 'field']].groupby('well').apply(lambda grp: grp.iloc[0]).reset_index(drop=True)
    orig_pred = orig.set_index('well').join(pred.set_index('well'), rsuffix='_pred').reset_index()
    orig_pred_fields = orig_pred.set_index('well').join(fields.set_index('well')).reset_index()
    is_azeri = orig_pred_fields.field.str.contains('AZERI')
    azeri = orig_pred_fields[is_azeri]
    chirag = orig_pred_fields[~is_azeri]
    return chirag, azeri


chirag, azeri = calculation_phit_wavg_orig_pred()

In [None]:
def phit_wavg_orig_pred_display():
    """
    Scatter the original versus predicted weighted-average porosity of the globals chirag and azeri.

    Every panel gets the 1:1 line in red and lines ending at 95% and 105% of the panel maximum in blue.
    """
    fig, ax = plt.subplots(1, 2, figsize=(18, 6))
    # (axis, data, lower end, upper end of the reference lines)
    panels = [
        (ax[0], chirag, 0.16, 0.26),
        (ax[1], azeri, 0.16, 0.28),
    ]
    for panel, frame, lo, hi in panels:
        sns.scatterplot(data=frame, x='phit_wavg', y='phit_wavg_pred', hue='field', ax=panel)
        panel.plot([lo, hi], [lo, hi], ls='--', color='red', alpha=0.5)
        for factor in (0.95, 1.05):
            panel.plot([lo, hi], [lo, hi*factor], ls='--', color='blue', alpha=0.5)


phit_wavg_orig_pred_display()

In [None]:
def histo_orig_pred(wellname):
    """
    Overlay the histograms of original and predicted PHIT for the net samples of one well.

    wellname = well to display
    Joins the globals well_bal8_interp and phit_aecod8 on well and TST (rounded to 2 decimals)
    and keeps the rows that have a prediction and NET_clp2 == 1.
    """
    join_keys = ['well', 'TST']
    shared_cols = ['NET_clp2', 'LPERM', 'KHtst']
    original = well_bal8_interp.copy()[['well', 'FORMATION_up', 'TST', 'PHIT'] + shared_cols]
    original['TST'] = original['TST'].round(2)
    predicted = phit_aecod8.copy()[['well', 'FORMATION_up', 'TST', 'PHIT_predicted'] + shared_cols]
    predicted['TST'] = predicted['TST'].round(2)

    joined = original.set_index(join_keys).join(predicted.set_index(join_keys), rsuffix='_pred').reset_index()
    keep = joined.PHIT_predicted.notna() & (joined.NET_clp2 == 1) & (joined.well == wellname)
    well_rows = joined[keep]

    sns.histplot(well_rows.PHIT, color='blue', label='PHIT', alpha=0.5, kde=True)
    sns.histplot(well_rows.PHIT_predicted, color='red', label='PHIT_pred', alpha=0.5, kde=True)
    plt.legend()
    plt.title(wellname)


histo_orig_pred('C12')

In [None]:
def well_gaussian_filter_run(dataset, var, percentage):
    """
    Smooth `var` well by well with a Gaussian filter and append the result as a new column.

    dataset = table with a 'well' column and the `var` column
    var = name of the column to smooth
    percentage = sigma of the filter as a percentage of the number of samples of each well
    Returns a copy of dataset with the extra column '<var>_gaus_<percentage>'.

    The smoothed values are written back through a row mask, so they stay on the
    right rows for any index and any well ordering. The previous version re-attached
    them by position, which required a 0..n-1 index and contiguous wells.
    """
    new_var = var + '_gaus_' + str(percentage)
    coeff = percentage/100
    values = dataset[var].to_numpy(dtype=float)
    smoothed = np.full(len(dataset), np.nan)
    for wellname in dataset.well.unique():
        in_well = (dataset.well == wellname).to_numpy()
        n_samples = int(in_well.sum())
        if n_samples == 0:
            # a missing well name compares unequal to everything
            continue
        sigma = int(round(n_samples*coeff, 0))
        smoothed[in_well] = gaussian_filter(values[in_well], sigma=sigma)
    result = dataset.copy()
    result[new_var] = smoothed
    return result
# Smooth the predicted porosity per well; 0.5 means sigma = 0.5% of the well's sample count.
# The new column is named 'PHIT_predicted_gaus_0.5'.
phit_aecod8_v2 = well_gaussian_filter_run(phit_aecod8, 'PHIT_predicted', 0.5)

def well_plots_phit_pred_matrix(dataset, platform, var_selected, lims, comment):
    """
    Grid of PHIT (green) versus `var_selected` (orange) depth plots for the wells of one platform.

    dataset = table with well, TST, PHIT and the `var_selected` column
    platform = prefix of the well names to plot, e.g. one of
    ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J']
    var_selected = column drawn on the twin x axis
    lims = x limits applied to both curves
    comment = text appended to every panel title
    The grid is fixed at 4 x 9 panels; wells beyond the 36th are not drawn.
    """
    rows, columns = 4, 9
    wells_letter = [name for name in dataset.well.unique() if name.startswith(platform)]
    fig, ax = plt.subplots(rows, columns, figsize=(16, rows*3))
    # ax.flat walks the grid row by row, one well per panel
    for panel, well in zip(ax.flat, wells_letter):
        well_data = dataset[dataset.well == well]
        tst = well_data['TST']
        original = well_data['PHIT']
        selected = well_data[var_selected]
        panel.plot(original, tst, color='green', lw=0.75, alpha=1, zorder=1)
        panel.set_xlim(lims)
        twin = panel.twiny()
        twin.plot(selected, tst, color='orange', lw=1.25, alpha=1, zorder=0)
        twin.set_xlim(lims)
        panel.set_title(well + comment)
        panel.invert_yaxis()
        panel.grid()
    return plt.tight_layout()


for letter in ['A']:
    well_plots_phit_pred_matrix(phit_aecod8_v2, letter, 'PHIT_predicted_gaus_0.5', (0.08, 0.3), ' bal8')

## GRcube testing