# Libs

In [341]:
import warnings
warnings.filterwarnings('ignore')

import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from sklearn.metrics.pairwise import euclidean_distances

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from pykrige.rk import RegressionKriging
from sklearn.model_selection import GridSearchCV
from pykrige.rk import Krige
from tqdm.notebook import tqdm

from sklearn.datasets import fetch_california_housing

# Notebook-wide pandas display settings: float formatting plus caps on how many
# columns / rows a rendered frame may show.
pd.set_option('display.precision', 3)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option('display.max_columns', 15)
pd.set_option('display.max_rows', 6)

# Data uploading

In [2]:
def _load_flagged_logs(csv_path, formation_codes):
    """Read a well-log CSV, normalise it and isolate the wells flagged for PHIT.

    Parameters
    ----------
    csv_path : str
        Location of the CSV export.
    formation_codes : dict
        Maps the formation names found in the file to the short codes used
        in the rest of the notebook.

    Returns
    -------
    tuple
        ``(logs, flagged_wells, logs_flagged)``: ``logs`` has lower-cased column
        names and recoded formations, ``flagged_wells`` is the sorted array of wells
        with at least one ``phit_flag == 1`` row, and ``logs_flagged`` keeps every
        row of those wells.
    """
    logs = pd.read_csv(csv_path)
    logs.columns = logs.columns.str.lower()
    logs['formation'] = logs['formation'].replace(formation_codes)
    flagged_wells = logs[logs.phit_flag == 1].groupby('well').size().index.to_numpy()
    return logs, flagged_wells, logs[logs.well.isin(flagged_wells)]


# Raw strings: the paths contain backslash sequences (\j, \S, \i, \g, \d) that are
# invalid escapes in a normal string literal. The resulting path text is unchanged.
df_bal8_v4, well_phit_flag8, df_bal8_v4_flag = _load_flagged_logs(
    r'C:\jupyter\SPP\inputoutput\general_logs\df_bal8_azr_v4.csv',
    {
        'Balakhany VIII sand': '1_bal8_sand',
        'Balakhany VIII 25': '2_bal8_25',
        'Balakhany VIII 20': '3_bal8_20',
        'Balakhany VIII 15': '4_bal8_15',
        'Balakhany VIII 10': '5_bal8_10',
        'Balakhany VIII 5': '6_bal8_5',
    },
)

df_bal10_v4, well_phit_flag10, df_bal10_v4_flag = _load_flagged_logs(
    r'C:\jupyter\SPP\inputoutput\general_logs\df_bal10_vshclp2_v4.csv',
    {
        'Balakhany X sand': '1_bal10_sand',
        # NOTE(review): 'X 50' and 'X 40' share one code, as in the original cell --
        # presumably an intentional merge of the two units; confirm.
        'Balakhany X 50': '2_bal10_40',
        'Balakhany X 40': '2_bal10_40',
        'Balakhany X 20': '3_bal10_20',
    },
)

In [3]:
def _read_layer_table(csv_path):
    """Load a layer CSV, drop the 'Unnamed: 0' column and lower-case the headers."""
    layers = pd.read_csv(csv_path)
    layers = layers.drop(columns=['Unnamed: 0'])
    layers.columns = layers.columns.str.lower()
    return layers


ntd_top_phi_bot8_bp_v4 = _read_layer_table(r'C:\jupyter\SPP\inputoutput\layers\ntd_top_phi_bot8_bp_v4.csv')
ntd_top_phi_bot10_bp_v4 = _read_layer_table(r'C:\jupyter\SPP\inputoutput\layers\ntd_top_phi_bot10_bp_v4.csv')

In [4]:
def well_dist_calc(dataset, fm):
    """Build a long table of pairwise distances between wells.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``well``, ``xmean`` and ``ymean``; the first ``xmean``/``ymean``
        of every well is used as its location. Wells without coordinates are dropped.
    fm : str
        Label written to the ``FORMATION_up`` column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``well``, ``well_offset``, ``dist``, ``FORMATION_up``; one row per
        ordered pair of *different* wells, sorted by ``well`` then ``dist``.
    """
    wells_xy = dataset.groupby('well')[['xmean', 'ymean']].first().reset_index().dropna()
    names = wells_xy['well'].to_numpy()
    n_wells = len(names)

    # Square matrix whose rows and columns both follow the order of `names`.
    dist_matrix = euclidean_distances(wells_xy[['xmean', 'ymean']])

    # Column-major unravel: all rows of the first column, then the second, ...
    # `well` is the column well and `well_offset` the row well.
    dist_melt = pd.DataFrame({
        'well': np.repeat(names, n_wells),
        'well_offset': np.tile(names, n_wells),
        'dist': dist_matrix.ravel(order='F'),
    })

    # Drop self-pairs by name. Filtering on `dist != 0` would also discard two
    # different wells that share the same coordinates.
    dist_melt = dist_melt[dist_melt.well != dist_melt.well_offset]
    dist_melt = dist_melt.sort_values(by=['well', 'dist'])
    dist_melt['FORMATION_up'] = fm
    return dist_melt

dist_bal8 = well_dist_calc(df_bal8_v4_flag, 'Balakhany VIII')
dist_bal10 = well_dist_calc(df_bal10_v4_flag, 'Balakhany X')

# Well pairs

In [None]:
# Nearest offset of every well: the first row of each `well` group in dist_bal8
# (the table is ordered by well, then dist). groupby(...).head(1) picks those rows
# in one pass instead of re-filtering the whole table once per well.
nearest_offset = dist_bal8.groupby('well').head(1)
# A mutual nearest pair appears twice (A->B and B->A) with the same distance; keeping
# one row per distinct distance collapses it.
# NOTE(review): two unrelated pairs with exactly equal distances would be merged too.
well_pairs = nearest_offset.reset_index(drop=True).drop_duplicates(subset=['dist'])
# Pre-filter at 500 m: these are the candidate pairs inspected in the elbow plot.
well_pairs_v2 = well_pairs[well_pairs.dist < 500]
# well_pairs.hist(column='dist', bins=50)
well_pairs_v2.sort_values(by='dist', ascending=True)

In [None]:
# Elbow plot: nearest-offset distance per well, largest first.
fig, ax = plt.subplots(figsize=(20, 5))
pairs_by_dist_desc = well_pairs_v2.sort_values(by='dist', ascending=False)
sns.lineplot(x='well', y='dist', data=pairs_by_dist_desc, ax=ax)
plt.xticks(rotation=90)
ax.grid()

In [None]:
# Same nearest-offset table as in the 500 m cell, rebuilt so this cell does not
# depend on it: first row per well, mutual pairs collapsed via their equal distance.
well_pairs = (
    dist_bal8
    .groupby('well')
    .head(1)
    .reset_index(drop=True)
    .drop_duplicates(subset=['dist'])
)
# Final cut-off for "close" well pairs, taken from the elbow plot of the < 500 m pairs.
# NOTE(review): the original note quoted ~235 m while the code uses 230 -- confirm.
well_pairs_v3 = well_pairs[well_pairs.dist < 230]
# well_pairs.hist(column='dist', bins=50)
well_pairs_v3 = well_pairs_v3.sort_values(by='dist', ascending=True).reset_index(drop=True)
well_pairs_v3

In [None]:
def well_offset_comparison_dashboard_pairs(dataset_wells, dataset_layers, dist_df,  well_target, offset_qty, fm_name, print_flag):
    """Draw a five-panel dashboard comparing a target well with its nearest offset.

    Panels: PHIT histogram (ax1), layer htst vs perm_avg on a log y-axis (ax2),
    khtst against a reversed sample index (ax3), khtst difference per formation (ax4)
    and a location map of the wells (ax5).

    Parameters
    ----------
    dataset_wells : DataFrame with at least well, phit, phit_flag, net, tst, khtst,
        formation, xmean and ymean columns (all are read below).
    dataset_layers : DataFrame with at least well, htst and perm_avg columns.
    dist_df : distance table; rows with ``well == well_target`` supply the offsets.
        The title helper reads columns by position, so the expected column order is
        well, well_offset, dist.
    well_target : name of the well to highlight in red.
    offset_qty : number of offset rows taken from ``dist_df``.
        NOTE(review): the palette, title and map only handle the first offset, so
        values above 1 look unsupported -- confirm.
    fm_name : prefix used in the saved file name.
    print_flag : the figure is written to disk only when this equals ``'print'``.

    Returns
    -------
    None
    """
    offset_well_list = dist_df[dist_df.well == well_target].iloc[:offset_qty]['well_offset'].values.tolist()
    # offset_well_list = dist_df[dist_df.well == well_target].iloc[:offset_qty]
    well_list = [well_target] + offset_well_list
    # Log samples of the selected wells, excluding rows where phit is exactly 0.
    data_logs = dataset_wells[(dataset_wells.well.isin(well_list)) & (dataset_wells.phit != 0)]
    # Layers of the selected wells with htst above 1.
    data_layers = dataset_layers[   (dataset_layers.well.isin(well_list)) & 
                                    (dataset_layers.htst > 1)]
    # Per (well, formation): first khtst value minus the last one.
    khtst_logs = data_logs.groupby(['well','formation'])[['khtst']].apply(lambda x: x.iloc[0] - x.iloc[-1]).reset_index()

    def khtst_layer_calculation(data_logs):
        """Add ``tst_index_rev`` (a descending positional counter) to each well's rows.

        NOTE(review): the ``net == 1`` subset only decides which wells are visited;
        the rows themselves come from the unfiltered ``data_logs`` -- confirm that
        this is intended.
        """
        data = data_logs[data_logs.net == 1]
        df_lst = []
        for wellname in data.well.unique():
            well_data = data_logs[data_logs.well == wellname]
            # Assignment on a filtered slice: pandas may emit SettingWithCopyWarning
            # here (warnings are silenced in the import cell).
            well_data['tst_index_rev'] = [i for i in range(len(well_data['tst']))[::-1]]
            df_lst.append(well_data)
        data_logs_khtst = pd.concat(df_lst)
        return data_logs_khtst
    data_logs_khtst = khtst_layer_calculation(data_logs)

    def well_dist_title(dist_df):
        """Compose the figure title from the first offset row of the target well."""
        offset_well_list = dist_df[dist_df.well == well_target].iloc[:offset_qty]
        well = offset_well_list['well'].iloc[0]
        # Positional access: column 1 is read as the offset name, column 2 as the distance.
        well1 = offset_well_list.iloc[0,1]
        dist1 = offset_well_list.iloc[0,2].round(0).astype(int)
        # well2 = offset_well_list.iloc[1,1]
        # dist2 = offset_well_list.iloc[1,2].round(0).astype(int)
        # well3 = offset_well_list.iloc[2,1]
        # dist3 = offset_well_list.iloc[2,2].round(0).astype(int)
        return f"target well {well} : offsets {well1} - {dist1}m orange;"

    # Layout: four panels on the top row, the map spans the first three columns below.
    fig = plt.figure(figsize=(22, 10))
    gs = gridspec.GridSpec(2, 4, figure=fig)
    ax1 = fig.add_subplot(gs[0, 0])
    ax2 = fig.add_subplot(gs[0, 1])
    ax3 = fig.add_subplot(gs[0, 2])
    ax4 = fig.add_subplot(gs[0, 3])
    ax5 = fig.add_subplot(gs[1, :3])

    # Colours are defined for the target and the first offset only.
    custom_palette = {well_target: 'red', offset_well_list[0]: 'orange'}
    sns.histplot(data=data_logs, x='phit', hue='well', bins=50, kde=True, ax=ax1, palette=custom_palette)
    sns.scatterplot(data=data_layers, x='htst', y='perm_avg', hue='well', s=75, ax=ax2, alpha=0.5, ec='black', palette=custom_palette)
    sns.lineplot(data=data_logs_khtst, x='tst_index_rev', y='khtst', hue='well', ax=ax3, palette=custom_palette)
    sns.barplot(data = khtst_logs, x='formation', y='khtst', hue='well', ax=ax4, palette=custom_palette)
    # Re-applying the current tick labels is used here to rotate them by 90 degrees.
    ax1.set_yticklabels(ax1.get_yticklabels(), rotation=90, va='center')
    ax2.set_yscale('log')
    ax2.grid(True, which='both', linestyle='--', linewidth=0.5)
    ax2.set_yticklabels(ax2.get_yticklabels(), rotation=90, va='center')
    ax3.grid(True, which='both', linestyle='--', linewidth=0.5)
    ax3.set_yticklabels(ax3.get_yticklabels(), rotation=90, va='center')

    # One x tick per formation on the bar chart, labelled with the formation code.
    x = np.arange(len(khtst_logs.formation.unique()))
    fms = khtst_logs.formation.unique()
    ax4.set_xticks(x, fms, rotation=45, fontsize=6)
    ax4.set_yticklabels(ax4.get_yticklabels(), rotation=90, va='center')

    # Location map: wells with phit_flag == 1 in gray, target in red, first offset in orange.
    # `x` is reused from here on for map coordinates.
    offset_well_list = dist_df[dist_df.well == well_target].iloc[:offset_qty]['well_offset'].values.tolist()
    x = dataset_wells[dataset_wells.phit_flag == 1]['xmean']
    y = dataset_wells[dataset_wells.phit_flag == 1]['ymean']
    x_target = dataset_wells[dataset_wells.well == well_target]['xmean'].iloc[0]
    y_target = dataset_wells[dataset_wells.well == well_target]['ymean'].iloc[0]
    x_well1 = dataset_wells[dataset_wells.well == offset_well_list[0]]['xmean'].iloc[0]
    y_well1 = dataset_wells[dataset_wells.well == offset_well_list[0]]['ymean'].iloc[0]
    # x_well2 = dataset_wells[dataset_wells.well == offset_well_list[1]]['xmean'].iloc[0]
    # y_well2 = dataset_wells[dataset_wells.well == offset_well_list[1]]['ymean'].iloc[0]
    # x_well3 = dataset_wells[dataset_wells.well == offset_well_list[2]]['xmean'].iloc[0]
    # y_well3 = dataset_wells[dataset_wells.well == offset_well_list[2]]['ymean'].iloc[0]
    ax5.scatter(x, y, color='gray', s=10)
    ax5.scatter(x_target, y_target, color='red', s=50, ec='black')
    ax5.scatter(x_well1, y_well1, color='orange')
    # ax5.scatter(x_well2, y_well2, color='green')
    # ax5.scatter(x_well3, y_well3, color='#0797eb')

    plt.suptitle(well_dist_title(dist_df), fontsize=16, y=0.92, x=0.32)
    # Persist the figure only on explicit request.
    if print_flag == 'print':
        plt.savefig(f'C:/jupyter/SPP/plots/offset_dashboard/{fm_name}_{well_target}_offset_dashboard.png');

# df_bal8_v4_flag = df_bal8_v4_flag[~df_bal8_v4_flag.well.isin(['E31Z', 'D01Z'])]
# NOTE: this rebinds dist_bal8 with every numeric column rounded to 0 decimals, so
# re-running the well-pair cells afterwards would work on rounded distances.
dist_bal8 = well_dist_calc(df_bal8_v4_flag, 'Balakhany VIII').round(0)
for wellname in well_pairs_v3.well:
    try:
        well_offset_comparison_dashboard_pairs(df_bal8_v4_flag, ntd_top_phi_bot8_bp_v4, well_pairs_v3, wellname, 1, 'bal8','dontprint')
    except Exception as exc:
        # Keep going with the remaining wells, but report why this one failed.
        # A bare `except:` would also swallow KeyboardInterrupt and hide the cause.
        print(f"error in {wellname}: {type(exc).__name__}: {exc}")

# RegressionKriging

In [9]:
# Several statistical metrics are commonly used to describe the distribution of a dataset. 
# These metrics provide insights into the shape, central tendency, and spread of the data. Here are some of the key metrics:

# Mean: The average of all data points. It provides a measure of central tendency.

# Median: The middle value when the data points are arranged in order. It is another measure of central tendency that 
# is less affected by outliers than the mean.

# Mode: The most frequently occurring value(s) in the dataset. It can be used to understand the most common or 
# popular values in a distribution.

# Standard Deviation (SD): Measures the amount of variation or dispersion of a set of values. A low SD indicates that 
# the values tend to be close to the mean, while a high SD indicates that the values are spread out over a wider range.

# Variance: The square of the standard deviation. It measures how far each number in the set is from the mean and thus 
# from every other number in the set.

# Range: The difference between the highest and lowest values in the dataset. It gives a sense of the spread of the data.

# Interquartile Range (IQR): The difference between the 75th percentile (Q3) and the 25th percentile (Q1) in the data. 
# It is a measure of statistical dispersion and is less affected by outliers.

# Skewness: A measure of the asymmetry of the probability distribution of a real-valued random variable. Positive skew indicates 
# a distribution with an asymmetric tail extending towards more positive values, while negative skew indicates a tail extending 
# towards more negative values.

# Kurtosis: A measure of the "tailedness" of the probability distribution. High kurtosis means a distribution has heavy tails 
# and a sharp peak, while low kurtosis means a distribution has light tails and a flat peak.

# Percentiles/Quartiles: Points in the distribution below which a certain percentage of the data falls. Quartiles are specific 
# percentiles: the 25th percentile (Q1), the 50th percentile (median or Q2), and the 75th percentile (Q3).

In [None]:
# Show the layer table loaded above; the rendered rows are capped by the pandas
# display options set in the import cell.
ntd_top_phi_bot8_bp_v4

In [10]:
# The per-well weighted porosity is sum(htst * phit_avg) / sum(htst), so the product
# is first stored as a column on the layer table.
ntd_top_phi_bot8_bp_v4['htst*phit_avg'] = ntd_top_phi_bot8_bp_v4['htst'] * ntd_top_phi_bot8_bp_v4['phit_avg']

# One row per well: mean phit_avg, total htst and total htst * phit_avg.
well_totals = (
    ntd_top_phi_bot8_bp_v4
    .groupby('well')[['phit_avg', 'htst', 'htst*phit_avg']]
    .agg({'phit_avg': 'mean', 'htst': 'sum', 'htst*phit_avg': 'sum'})
    .reset_index()
)
# Well coordinates (flagged wells only) and the field each well belongs to.
xy = df_bal8_v4_flag.groupby('well')[['xmean', 'ymean']].first().reset_index()
field = df_bal8_v4.groupby('well')['field'].first().reset_index()
# Wells listed in the `well` column of the close-pair table are left out.
paired_wells = well_pairs_v3.well.unique()

ntd8 = well_totals.merge(xy, on='well').round({'xmean': 0, 'ymean': 0})
ntd8 = ntd8[~ntd8.well.isin(paired_wells)]
ntd8 = ntd8.assign(phit_w_avg=ntd8['htst*phit_avg'] / ntd8['htst'])
ntd8 = ntd8[['well', 'phit_avg', 'htst', 'phit_w_avg', 'xmean', 'ymean']].merge(field, on='well')
# One indicator column per field value.
ntd8 = pd.get_dummies(ntd8, columns=['field'])

In [None]:
# Show the per-well modelling table built in the previous cell.
ntd8

## Base case big function (random_state 1–100, test_size 0.5)

In [None]:
def ml_kriging_prediction_bal8(models, random_state_value, test_size_value):
    """Score every regressor / neighbourhood size on one split and predict with the best.

    The only feature is ``htst``, the coordinates are ``xmean``/``ymean`` and the
    target is ``phit_w_avg``, all read from the module-level ``ntd8`` table.

    Parameters
    ----------
    models : list
        Regressors handed to ``RegressionKriging`` one at a time.
    random_state_value : int
        Seed of ``train_test_split``.
    test_size_value : float
        Test fraction of ``train_test_split``.

    Returns
    -------
    tuple of DataFrame
        ``(prediction_df, ml_kriging)``: test-set predictions of the best-scoring
        combination with a 0/1 ``qc`` flag (prediction within +/-0.0115 of the true
        value), and the full score table (one row per model and n_closest_points).

    Notes
    -----
    The best combination is chosen by its score on the same test rows that are then
    used for ``qc``, so the ``qc`` rate is optimistic. Regressors given without a
    ``random_state`` may also refit differently from the scored run.
    """
    feature_bal8 = ntd8[['htst']].values.reshape(-1, 1)
    coord_bal8 = ntd8[['xmean', 'ymean']].values
    target_bal8 = ntd8['phit_w_avg'].values

    f_train, f_test, xy_train, xy_test, target_train, target_test = train_test_split(
        feature_bal8, coord_bal8, target_bal8, test_size=test_size_value, random_state=random_state_value
    )

    def model_mlkrige_run(models):
        """Return the score table for every (model, n_closest_points) combination."""
        df_final_lst = []
        for m in models:
            reg_score_lst, rk_score_lst, nn_lst, m_lst = [], [], [], []
            # No per-step print here: the outer tqdm bar reports progress, and
            # 100 seeds x models x 14 sizes would flood the cell output.
            for nn in range(2, 16):
                m_rk = RegressionKriging(regression_model=m, n_closest_points=nn, verbose=False)
                m_rk.fit(f_train, xy_train, target_train)
                reg_score_lst.append(m_rk.regression_model.score(f_test, target_test))
                rk_score_lst.append(m_rk.score(f_test, xy_test, target_test))
                nn_lst.append(nn)
                m_lst.append(m)
            result = pd.DataFrame({'model': m_lst, 'n_closest_points': nn_lst,
                                   'reg_score': reg_score_lst, 'rk_score': rk_score_lst})
            result['random_state'] = random_state_value
            result['test_size'] = test_size_value
            df_final_lst.append(result)
        # Concatenate once, after the loop.
        return pd.concat(df_final_lst)

    def model_mlkrige_best_res(models_df):
        """Refit the top-scoring combination and tabulate its test predictions."""
        model_sorted = models_df.sort_values(by='rk_score', ascending=False)
        model_name = model_sorted.iloc[0]['model']
        nn_points = int(model_sorted.iloc[0]['n_closest_points'])
        m_rk = RegressionKriging(regression_model=model_name, n_closest_points=nn_points)
        m_rk.fit(f_train, xy_train, target_train)

        pred = m_rk.predict(f_test, xy_test)
        pred_df = pd.DataFrame(zip(pred, target_test), columns=['phit_w_avg_pred', 'phit_w_avg_true'])
        # Store the estimator's text form: some estimators are iterable, and pandas
        # treats an iterable as a column of values rather than as a scalar.
        pred_df['model'] = str(model_name)
        pred_df['n_closest_points'] = nn_points
        pred_df['random_state'] = random_state_value
        pred_df['test_size'] = test_size_value
        pred_df['up_1.15pu'] = pred_df.phit_w_avg_true + 0.0115
        pred_df['down_1.15pu'] = pred_df.phit_w_avg_true - 0.0115
        within_band = (pred_df.phit_w_avg_pred >= pred_df['down_1.15pu']) & (pred_df.phit_w_avg_pred <= pred_df['up_1.15pu'])
        pred_df['qc'] = np.where(within_band, 1, 0)
        return pred_df

    ml_kriging = model_mlkrige_run(models)
    prediction_df = model_mlkrige_best_res(ml_kriging)
    return prediction_df, ml_kriging


# The regressors are created in this cell so it runs on a fresh kernel; the original
# referenced svr_model / rf_model / lr_model, which only later cells define.
# Hyper-parameters match those later cells.
models_test = [
    SVR(C=0.1, gamma="auto"),
    RandomForestRegressor(n_estimators=10),
    LinearRegression(copy_X=True, fit_intercept=False),
]

df_lst_pred, df_lst_kriging = [], []
for random_state_value in tqdm(range(1, 101)):
    prediction_df, ml_kriging = ml_kriging_prediction_bal8(models_test, random_state_value, 0.5)
    df_lst_pred.append(prediction_df)
    df_lst_kriging.append(ml_kriging)
df_pred = pd.concat(df_lst_pred)
df_kriging = pd.concat(df_lst_kriging)

In [None]:
# Number of test predictions inside (qc=1) / outside (qc=0) the band, per split seed.
qc_counts = df_pred.groupby(['random_state', 'qc'])['qc'].count()
df_pred_v2 = qc_counts.rename('count').reset_index()
sns.lineplot(x='random_state', y='count', hue='qc', data=df_pred_v2)

## Base case 42

In [None]:
# Candidate regressors for the regression step of RegressionKriging.
# The random forest is created without a random_state.
svr_model, rf_model, lr_model = (
    SVR(C=0.1, gamma="auto"),
    RandomForestRegressor(n_estimators=10),
    LinearRegression(copy_X=True, fit_intercept=False),
)
models = [svr_model, rf_model, lr_model]

# Feature (htst), coordinates and target as numpy arrays.
feature_bal8 = ntd8[['htst']].to_numpy().reshape(-1, 1)
coord_bal8 = ntd8[['xmean', 'ymean']].to_numpy()
target_bal8 = ntd8['phit_w_avg'].to_numpy()

# 70/30 split, seed 42.
(f_train, f_test,
 xy_train, xy_test,
 target_train, target_test) = train_test_split(
    feature_bal8, coord_bal8, target_bal8, test_size=0.3, random_state=42)

def model_mlkrige_run(f_train, f_test, xy_train, xy_test, target_train, target_test, models):
    """Score each regressor inside RegressionKriging for n_closest_points 2..15 and plot.

    Parameters
    ----------
    f_train, f_test : feature arrays for the regression step.
    xy_train, xy_test : coordinate arrays for the kriging step.
    target_train, target_test : target arrays.
    models : list of regressors passed as ``regression_model``.

    Returns
    -------
    pandas.DataFrame
        One row per (model, n_closest_points) with ``reg_score`` (regressor alone) and
        ``rk_score`` (regression + kriging), both evaluated on the test arrays.
        Empty, with the same columns, when ``models`` is empty.
    """
    columns = ['model', 'n_closest_points', 'reg_score', 'rk_score']
    df_final_lst = []
    fig, ax = plt.subplots(1, 2, figsize=(12, 5))
    for m in models:
        reg_score_lst, rk_score_lst, nn_lst, m_lst = [], [], [], []
        for nn in range(2, 16):
            print("n_closest_points:", nn)
            m_rk = RegressionKriging(regression_model=m, n_closest_points=nn, verbose=False)
            m_rk.fit(f_train, xy_train, target_train)
            reg_score_lst.append(m_rk.regression_model.score(f_test, target_test))
            rk_score_lst.append(m_rk.score(f_test, xy_test, target_test))
            nn_lst.append(nn)
            m_lst.append(m)
        result = pd.DataFrame({'model': m_lst, 'n_closest_points': nn_lst,
                               'reg_score': reg_score_lst, 'rk_score': rk_score_lst})
        ax[0].plot(result.n_closest_points, result.rk_score)
        ax[1].plot(result.n_closest_points, result.reg_score)
        df_final_lst.append(result)

    # Decorate each panel once, after all curves are drawn. Calling grid() with no
    # argument toggles visibility, so inside the loop the grid would end up on or off
    # depending on how many models were plotted.
    legend_labels = [str(m) for m in models]
    for axis, score_name in zip(ax, ('rk_score', 'reg_score')):
        axis.set_ylabel(score_name)
        axis.set_xlabel('n_closest_points')
        axis.grid(True)
        axis.legend(legend_labels)
        axis.set_title(f'Model {score_name} comparison')

    if not df_final_lst:
        return pd.DataFrame(columns=columns)
    return pd.concat(df_final_lst)
base_42 = model_mlkrige_run(f_train, f_test, xy_train, xy_test, target_train, target_test, models)

In [None]:
def model_mlkrige_best_res(f_train, f_test, xy_train, xy_test, target_train, target_test, models_df):
    """Refit the best-scoring RegressionKriging setup and report its test predictions.

    Parameters
    ----------
    f_train, f_test, xy_train, xy_test, target_train, target_test :
        The split arrays used to build ``models_df``.
    models_df : pandas.DataFrame
        Score table with ``model``, ``n_closest_points`` and ``rk_score`` columns;
        the row with the highest ``rk_score`` is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``htst``, ``phit_w_avg_pred``, ``phit_w_avg_true``, ``up5%``,
        ``down5%`` and ``qc`` (1 when the prediction lies within +/-0.0115 of the
        true value, else 0). Despite their names, ``up5%``/``down5%`` hold that
        absolute +/-0.0115 band.

    Side effects: prints the chosen setup and its scores, displays the qc shares and
    draws a train/test map plus a true-vs-predicted scatter.
    """
    model_sorted = models_df.sort_values(by='rk_score', ascending=False)
    model_name = model_sorted.iloc[0]['model']
    nn_points = model_sorted.iloc[0]['n_closest_points']
    m_rk = RegressionKriging(regression_model=model_name, n_closest_points=nn_points)
    m_rk.fit(f_train, xy_train, target_train)
    print("Model name:", model_name, "n_closer_points:", nn_points)
    print("Regression Score: ", m_rk.regression_model.score(f_test, target_test))
    print("RK score: ", m_rk.score(f_test, xy_test, target_test))

    pred = m_rk.predict(f_test, xy_test)
    # htst is the first feature column. Flattening the feature matrix only gave the
    # right values when there was a single feature; with more features the zip
    # silently paired predictions with values from other columns.
    htst_test = np.asarray(f_test)[:, 0]
    pred_df = pd.DataFrame(zip(htst_test, pred, target_test), columns=['htst','phit_w_avg_pred', 'phit_w_avg_true'])
    pred_df['up5%'] = pred_df.phit_w_avg_true+0.0115
    pred_df['down5%'] = pred_df.phit_w_avg_true-0.0115
    pred_df['qc'] = np.where((pred_df.phit_w_avg_pred >= pred_df['down5%']) & (pred_df.phit_w_avg_pred <= pred_df['up5%']), 1, 0)
    display(pred_df.value_counts('qc', normalize=True))

    # Left panel: where the train and test wells are located.
    fig, ax = plt.subplots(1,2, figsize=(14, 5))
    xy_train_df = pd.DataFrame(xy_train, columns=['x', 'y'])
    xy_train_df['source'] = 'train'
    xy_test_df = pd.DataFrame(xy_test, columns=['x', 'y'])
    xy_test_df['source'] = 'test'
    xy_data_df = pd.concat([xy_train_df, xy_test_df])
    sns.scatterplot(xy_data_df, x='x', y='y', hue='source', ax=ax[0])

    # Right panel: true vs predicted with the 1:1 line and the +/-0.0115 band.
    custom_palette = {0: 'red', 1: 'green'}
    sns.scatterplot(pred_df, x='phit_w_avg_true', y='phit_w_avg_pred', hue='qc', s=50, alpha=0.5, ec='black', palette=custom_palette, ax=ax[1])
    ax[1].plot([0.15,0.27], [0.15,0.27], color='blue', ls='--')
    ax[1].plot([0.15,0.27], [0.15+0.0115,0.27+0.0115], color='green', ls='--')
    ax[1].plot([0.15,0.27], [0.15-0.0115,0.27-0.0115], color='green', ls='--')
    ax[1].grid()
    ax[1].set_xlim(0.15,0.27)
    ax[1].set_ylim(0.15,0.27);
    return pred_df
pred_base_42 = model_mlkrige_best_res(f_train, f_test, xy_train, xy_test, target_train, target_test, base_42)

## Base case 1

In [None]:
# Same base-case experiment with split seed 1.
svr_model, rf_model, lr_model = (
    SVR(C=0.1, gamma="auto"),
    RandomForestRegressor(n_estimators=10),
    LinearRegression(copy_X=True, fit_intercept=False),
)
models = [svr_model, rf_model, lr_model]

# Single feature (htst), well coordinates and the weighted-porosity target.
feature_bal8 = ntd8[['htst']].to_numpy().reshape(-1, 1)
coord_bal8 = ntd8[['xmean', 'ymean']].to_numpy()
target_bal8 = ntd8['phit_w_avg'].to_numpy()

(f_train, f_test,
 xy_train, xy_test,
 target_train, target_test) = train_test_split(
    feature_bal8, coord_bal8, target_bal8, test_size=0.3, random_state=1)

# Score grid first, then refit and inspect the best-scoring row.
base_1 = model_mlkrige_run(f_train, f_test, xy_train, xy_test, target_train, target_test, models)
pred_base_1 = model_mlkrige_best_res(f_train, f_test, xy_train, xy_test, target_train, target_test, base_1)

## Base case 123

In [None]:
# Same base-case experiment with split seed 123.
svr_model, rf_model, lr_model = (
    SVR(C=0.1, gamma="auto"),
    RandomForestRegressor(n_estimators=10),
    LinearRegression(copy_X=True, fit_intercept=False),
)
models = [svr_model, rf_model, lr_model]

# Single feature (htst), well coordinates and the weighted-porosity target.
feature_bal8 = ntd8[['htst']].to_numpy().reshape(-1, 1)
coord_bal8 = ntd8[['xmean', 'ymean']].to_numpy()
target_bal8 = ntd8['phit_w_avg'].to_numpy()

(f_train, f_test,
 xy_train, xy_test,
 target_train, target_test) = train_test_split(
    feature_bal8, coord_bal8, target_bal8, test_size=0.3, random_state=123)

# Score grid first, then refit and inspect the best-scoring row.
base_123 = model_mlkrige_run(f_train, f_test, xy_train, xy_test, target_train, target_test, models)
pred_base_123 = model_mlkrige_best_res(f_train, f_test, xy_train, xy_test, target_train, target_test, base_123)

In [None]:
# Label every prediction table with the split it came from (written in place).
for split_label, split_preds in (('base_1', pred_base_1), ('base_42', pred_base_42), ('base_123', pred_base_123)):
    split_preds['model'] = split_label
pred_final = pd.concat([pred_base_42, pred_base_1, pred_base_123])

# True vs predicted for the three splits, with the 1:1 line (red) and the
# +/-0.0115 band (green).
sns.scatterplot(x='phit_w_avg_true', y='phit_w_avg_pred', hue='model', style='qc', palette='bright', data=pred_final)
for band_shift, band_colour in ((0, 'red'), (0.0115, 'green'), (-0.0115, 'green')):
    plt.plot([0.15, 0.27], [0.15 + band_shift, 0.27 + band_shift], color=band_colour, ls='--')
plt.grid()
plt.xlim(0.15, 0.27)
plt.ylim(0.15, 0.27)

In [None]:
# display(pred_df.value_counts('qc', normalize=True))
# Share of test predictions inside / outside the band, per split.
qc42, qc1, qc123 = (
    pred_final.loc[pred_final.model == split_label, 'qc'].value_counts(normalize=True)
    for split_label in ('base_42', 'base_1', 'base_123')
)
# f_test / f_train are the arrays of the most recently executed split cell.
n_test, n_train = len(f_test), len(f_train)
print('Quantati of test points:', n_test, 'Quantaty of train points:', n_train)
print("base_42", qc42, "base_1", qc1, "base_123", qc123)

## Feature 'field' 42

In [None]:
# Experiment with the field indicator columns added to htst, split seed 42.
svr_model, rf_model, lr_model = (
    SVR(C=0.1, gamma="auto"),
    RandomForestRegressor(n_estimators=10),
    LinearRegression(copy_X=True, fit_intercept=False),
)
models = [svr_model, rf_model, lr_model]

# Four feature columns: htst followed by the three field indicators.
field_feature_cols = ['htst','field_CENTRAL AZERI', 'field_EAST AZERI', 'field_WEST AZERI']
feature_bal8 = ntd8[field_feature_cols].to_numpy().reshape(-1, 4)
coord_bal8 = ntd8[['xmean', 'ymean']].to_numpy()
target_bal8 = ntd8['phit_w_avg'].to_numpy()

(f_train, f_test,
 xy_train, xy_test,
 target_train, target_test) = train_test_split(
    feature_bal8, coord_bal8, target_bal8, test_size=0.3, random_state=42)

feature_42 = model_mlkrige_run(f_train, f_test, xy_train, xy_test, target_train, target_test, models)
pred_f42 = model_mlkrige_best_res(f_train, f_test, xy_train, xy_test, target_train, target_test, feature_42)

# Output recorded from an earlier run of this cell:
# Model name: SVR(C=0.1, gamma='auto') n_closer_points: 8
# Regression Score:  -0.026579262856369246
# RK score:  0.6251967133689289
# qc
# 1   0.64286
# 0   0.35714

## Feature 'field' 242

In [None]:
# Experiment with the field indicator columns added to htst, split seed 242.
svr_model, rf_model, lr_model = (
    SVR(C=0.1, gamma="auto"),
    RandomForestRegressor(n_estimators=10),
    LinearRegression(copy_X=True, fit_intercept=False),
)
models = [svr_model, rf_model, lr_model]

# Four feature columns: htst followed by the three field indicators.
field_feature_cols = ['htst','field_CENTRAL AZERI', 'field_EAST AZERI', 'field_WEST AZERI']
feature_bal8 = ntd8[field_feature_cols].to_numpy().reshape(-1, 4)
coord_bal8 = ntd8[['xmean', 'ymean']].to_numpy()
target_bal8 = ntd8['phit_w_avg'].to_numpy()

(f_train, f_test,
 xy_train, xy_test,
 target_train, target_test) = train_test_split(
    feature_bal8, coord_bal8, target_bal8, test_size=0.3, random_state=242)

feature_242 = model_mlkrige_run(f_train, f_test, xy_train, xy_test, target_train, target_test, models)
pred_f242 = model_mlkrige_best_res(f_train, f_test, xy_train, xy_test, target_train, target_test, feature_242)

# NOTE(review): the figures below are identical to those recorded in the seed-42
# cell, so they look like a stale copy -- re-run this cell to confirm.
# Model name: SVR(C=0.1, gamma='auto') n_closer_points: 8
# Regression Score:  -0.026579262856369246
# RK score:  0.6251967133689289
# qc
# 1   0.64286
# 0   0.35714

## Feature 'field' 1

In [None]:
# Experiment with the field indicator columns added to htst, split seed 1.
svr_model = SVR(C=0.1, gamma="auto")
rf_model = RandomForestRegressor(n_estimators=10)
lr_model = LinearRegression(copy_X=True, fit_intercept=False)

models = [svr_model, rf_model, lr_model]

# Four feature columns: htst followed by the three field indicators.
feature_bal8 = ntd8[['htst','field_CENTRAL AZERI', 'field_EAST AZERI', 'field_WEST AZERI']].values.reshape(-1,4)
coord_bal8 = ntd8[['xmean', 'ymean']].values
target_bal8 = ntd8['phit_w_avg'].values

f_train, f_test, xy_train, xy_test, target_train, target_test = train_test_split(
    feature_bal8, coord_bal8, target_bal8, test_size=0.3, random_state=1
)

feature_f1 = model_mlkrige_run(f_train, f_test, xy_train, xy_test, target_train, target_test, models)
# Select the best setup from THIS split's score table. The original passed
# feature_242 (the seed-242 table), so feature_f1 was computed but never used.
pred_f1 = model_mlkrige_best_res(f_train, f_test, xy_train, xy_test, target_train, target_test, feature_f1)

In [None]:
# Label every prediction table with the split it came from (written in place).
for split_label, split_preds in (('feature_42', pred_f42), ('feature_242', pred_f242), ('feature_1', pred_f1)):
    split_preds['model'] = split_label
pred_ffinal = pd.concat([pred_f42, pred_f242, pred_f1])

# True vs predicted for the three splits, with the 1:1 line (red) and the
# +/-0.0115 band (green).
sns.scatterplot(x='phit_w_avg_true', y='phit_w_avg_pred', hue='model', style='qc', palette='bright', data=pred_ffinal)
for band_shift, band_colour in ((0, 'red'), (0.0115, 'green'), (-0.0115, 'green')):
    plt.plot([0.15, 0.27], [0.15 + band_shift, 0.27 + band_shift], color=band_colour, ls='--')
plt.grid()
plt.xlim(0.15, 0.27)
plt.ylim(0.15, 0.27)

In [None]:
# 0.64 v/v of 28 = ~18, 0.75 v/v of 28 = 21, variations 3 points is equal 0.11 v/v
# Share of test predictions inside / outside the band, per split.
qc_f42 = pred_f42.loc[pred_f42.model == 'feature_42', 'qc'].value_counts(normalize=True)
qc_f242 = pred_f242.loc[pred_f242.model == 'feature_242', 'qc'].value_counts(normalize=True)
qc_f1 = pred_f1.loc[pred_f1.model == 'feature_1', 'qc'].value_counts(normalize=True)
# f_test / f_train are the arrays of the most recently executed split cell.
n_test, n_train = len(f_test), len(f_train)
print('Quantati of test points:', n_test, 'Quantaty of train points:', n_train)
print("feature_42", qc_f42, "feature_242", qc_f242, "feature_1", qc_f1)

# RegKrig experiments: method + var_model

## Kriging base 42

In [None]:
# Base-case inputs (htst only, split seed 42) for the kriging method / variogram grid.
svr_model, rf_model, lr_model = (
    SVR(C=0.1, gamma="auto"),
    RandomForestRegressor(n_estimators=10),
    LinearRegression(copy_X=True, fit_intercept=False),
)
models = [svr_model, rf_model, lr_model]

feature_bal8 = ntd8[['htst']].to_numpy().reshape(-1, 1)
coord_bal8 = ntd8[['xmean', 'ymean']].to_numpy()
target_bal8 = ntd8['phit_w_avg'].to_numpy()

(f_train, f_test,
 xy_train, xy_test,
 target_train, target_test) = train_test_split(
    feature_bal8, coord_bal8, target_bal8, test_size=0.3, random_state=42)

def model_mlkrige_run_kriging(f_train, f_test, xy_train, xy_test, target_train, target_test, models, rk_method, vmodel):
    """Score each regressor for n_closest_points 2..15 with a given kriging setup.

    Parameters
    ----------
    f_train, f_test, xy_train, xy_test, target_train, target_test :
        Split feature, coordinate and target arrays.
    models : list of regressors passed as ``regression_model``.
    rk_method : value forwarded as ``method`` to ``RegressionKriging``.
    vmodel : value forwarded as ``variogram_model`` to ``RegressionKriging``.

    Returns
    -------
    pandas.DataFrame
        One row per (model, n_closest_points) with ``reg_score``, ``rk_score`` (both
        on the test arrays) and the ``rk_method`` / ``vmodel`` used. Empty, with the
        same columns, when ``models`` is empty.
    """
    columns = ['model', 'n_closest_points', 'reg_score', 'rk_score', 'rk_method', 'vmodel']
    df_final_lst = []
    for m in models:
        reg_score_lst, rk_score_lst, nn_lst, m_lst = [], [], [], []
        for nn in range(2,16):
            print("n_closest_points:", nn)
            m_rk = RegressionKriging(regression_model=m, method=rk_method, n_closest_points=nn, variogram_model=vmodel, verbose=False)
            m_rk.fit(f_train, xy_train, target_train)
            reg_score_lst.append(m_rk.regression_model.score(f_test, target_test))
            rk_score_lst.append(m_rk.score(f_test, xy_test, target_test))
            nn_lst.append(nn)
            m_lst.append(m)
        result = pd.DataFrame({'model':m_lst,
                               'n_closest_points':nn_lst,
                               'reg_score':reg_score_lst,
                               'rk_score':rk_score_lst,
                               'rk_method':rk_method,
                               'vmodel':vmodel})
        df_final_lst.append(result)

    # Concatenate once after the loop; an empty `models` list yields an empty table
    # instead of an unbound-variable error.
    if not df_final_lst:
        return pd.DataFrame(columns=columns)
    return pd.concat(df_final_lst)

# Grid over kriging method x variogram model; each run returns one score table.
rk_methods = ['universal', 'ordinary']
variogram_models = ['linear', 'power', 'gaussian', 'spherical', 'exponential']

kriging_runs_42 = []
for method in rk_methods:
    for vmodel in variogram_models:
        base_42_kriging = model_mlkrige_run_kriging(
            f_train, f_test, xy_train, xy_test, target_train, target_test, models, method, vmodel)
        kriging_runs_42.append(base_42_kriging)
total_result_42 = pd.concat(kriging_runs_42)
# Best-scoring combinations first.
total_result_42.sort_values(by='rk_score', ascending=False)

In [None]:
def model_mlkrige_best_res(f_train, f_test, xy_train, xy_test, target_train, target_test, models_df):
    """Refit the best-scoring RegressionKriging setup and report its test predictions.

    This definition replaces the earlier one of the same name. It accepts score
    tables with or without the ``rk_method`` / ``vmodel`` columns, so cells written
    for the earlier version keep working after this cell has run.

    Parameters
    ----------
    f_train, f_test, xy_train, xy_test, target_train, target_test :
        The split arrays used to build ``models_df``.
    models_df : pandas.DataFrame
        Score table with ``model``, ``n_closest_points`` and ``rk_score`` columns and,
        optionally, ``rk_method`` and ``vmodel``. The row with the highest
        ``rk_score`` is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``htst`` (first feature column), ``phit_w_avg_pred``,
        ``phit_w_avg_true``, ``up5%``, ``down5%`` and ``qc`` (1 when the prediction
        lies within +/-0.0115 of the true value, else 0). Despite their names,
        ``up5%``/``down5%`` hold that absolute +/-0.0115 band.

    Side effects: prints the chosen setup and its scores, displays the qc shares and
    draws a train/test map plus a true-vs-predicted scatter.
    """
    model_sorted = models_df.sort_values(by='rk_score', ascending=False)
    model_name = model_sorted.iloc[0]['model']
    nn_points = model_sorted.iloc[0]['n_closest_points']

    # Forward the kriging method / variogram model only when the table carries them;
    # otherwise RegressionKriging falls back to its own defaults.
    rk_kwargs = {'regression_model': model_name, 'n_closest_points': nn_points}
    summary = ["Model name:", model_name, "n_closer_points:", nn_points]
    if 'rk_method' in models_df.columns:
        rk_method = model_sorted.iloc[0]['rk_method']
        rk_kwargs['method'] = rk_method
        summary += ["rk_method:", rk_method]
    if 'vmodel' in models_df.columns:
        vmodel = model_sorted.iloc[0]['vmodel']
        rk_kwargs['variogram_model'] = vmodel
        summary += ["vmodel:", vmodel]

    m_rk = RegressionKriging(**rk_kwargs)
    m_rk.fit(f_train, xy_train, target_train)
    print(*summary)
    print("Regression Score: ", m_rk.regression_model.score(f_test, target_test))
    print("RK score: ", m_rk.score(f_test, xy_test, target_test))

    pred = m_rk.predict(f_test, xy_test)
    # Keep the htst column the earlier definition returned; it is the first feature.
    htst_test = np.asarray(f_test)[:, 0]
    pred_df = pd.DataFrame(zip(htst_test, pred, target_test), columns=['htst', 'phit_w_avg_pred', 'phit_w_avg_true'])
    pred_df['up5%'] = pred_df.phit_w_avg_true+0.0115
    pred_df['down5%'] = pred_df.phit_w_avg_true-0.0115
    pred_df['qc'] = np.where((pred_df.phit_w_avg_pred >= pred_df['down5%']) & (pred_df.phit_w_avg_pred <= pred_df['up5%']), 1, 0)
    display(pred_df.value_counts('qc', normalize=True))

    # Left panel: where the train and test wells are located.
    fig, ax = plt.subplots(1,2, figsize=(14, 5))
    xy_train_df = pd.DataFrame(xy_train, columns=['x', 'y'])
    xy_train_df['source'] = 'train'
    xy_test_df = pd.DataFrame(xy_test, columns=['x', 'y'])
    xy_test_df['source'] = 'test'
    xy_data_df = pd.concat([xy_train_df, xy_test_df])
    sns.scatterplot(xy_data_df, x='x', y='y', hue='source', ax=ax[0])

    # Right panel: true vs predicted with the 1:1 line and the +/-0.0115 band.
    custom_palette = {0: 'red', 1: 'green'}
    sns.scatterplot(pred_df, x='phit_w_avg_true', y='phit_w_avg_pred', hue='qc', s=50, alpha=0.5, ec='black', palette=custom_palette, ax=ax[1])
    ax[1].plot([0.15,0.27], [0.15,0.27], color='blue', ls='--')
    ax[1].plot([0.15,0.27], [0.15+0.0115,0.27+0.0115], color='green', ls='--')
    ax[1].plot([0.15,0.27], [0.15-0.0115,0.27-0.0115], color='green', ls='--')
    ax[1].grid()
    ax[1].set_xlim(0.15,0.27)
    ax[1].set_ylim(0.15,0.27);
    return pred_df

pred_kbase_42= model_mlkrige_best_res(f_train, f_test, xy_train, xy_test, target_train, target_test, total_result_42)

## Kriging feature f42

In [None]:
# Field-feature inputs (htst + field indicators, split seed 42) for the kriging grid.
svr_model, rf_model, lr_model = (
    SVR(C=0.1, gamma="auto"),
    RandomForestRegressor(n_estimators=10),
    LinearRegression(copy_X=True, fit_intercept=False),
)
models = [svr_model, rf_model, lr_model]

field_feature_cols = ['htst','field_CENTRAL AZERI', 'field_EAST AZERI', 'field_WEST AZERI']
feature_bal8 = ntd8[field_feature_cols].to_numpy().reshape(-1, 4)
coord_bal8 = ntd8[['xmean', 'ymean']].to_numpy()
target_bal8 = ntd8['phit_w_avg'].to_numpy()

(f_train, f_test,
 xy_train, xy_test,
 target_train, target_test) = train_test_split(
    feature_bal8, coord_bal8, target_bal8, test_size=0.3, random_state=42)

def model_mlkrige_run_kriging(f_train, f_test, xy_train, xy_test, target_train, target_test, models, rk_method, vmodel,
                              n_points_range=range(2, 16), verbose=True):
    """Score each regressor / neighbourhood-size pair with regression kriging.

    Every estimator in ``models`` is wrapped in a ``RegressionKriging`` model for
    each ``n_closest_points`` value, fitted on the training split and scored on
    the test split.

    Parameters
    ----------
    f_train, f_test
        Regression features of the training / test split.
    xy_train, xy_test
        Coordinates of the training / test split.
    target_train, target_test
        Target values of the training / test split.
    models
        Iterable of regression estimators passed as ``regression_model``.
        The same estimator object is reused for every fit.
    rk_method
        Kriging method forwarded as ``method``.
    vmodel
        Variogram model name forwarded as ``variogram_model``.
    n_points_range
        Iterable of ``n_closest_points`` values to try. Defaults to 2..15,
        the range that used to be hard-coded.
    verbose
        When true (default) print the current ``n_closest_points`` value.

    Returns
    -------
    pandas.DataFrame
        One row per (model, n_closest_points) pair with the columns ``model``
        (the estimator object itself), ``n_closest_points``, ``reg_score``
        (score of the regression part alone), ``rk_score`` (score of the full
        regression-kriging model), ``rk_method`` and ``vmodel``. The frame is
        empty, but has these columns, when there is nothing to evaluate.
    """
    columns = ['model', 'n_closest_points', 'reg_score', 'rk_score', 'rk_method', 'vmodel']
    # Materialise once so a one-shot iterator is not exhausted by the first model.
    n_points = list(n_points_range)

    per_model_frames = []
    for m in models:
        rows = []
        for nn in n_points:
            if verbose:
                print("n_closest_points:", nn)
            m_rk = RegressionKriging(regression_model=m, method=rk_method, n_closest_points=nn,
                                     variogram_model=vmodel, verbose=False)
            m_rk.fit(f_train, xy_train, target_train)
            rows.append({'model': m,
                         'n_closest_points': nn,
                         'reg_score': m_rk.regression_model.score(f_test, target_test),
                         'rk_score': m_rk.score(f_test, xy_test, target_test),
                         'rk_method': rk_method,
                         'vmodel': vmodel})
        per_model_frames.append(pd.DataFrame(rows, columns=columns))

    # Concatenate once, after the loop; an empty `models` yields an empty frame
    # instead of an UnboundLocalError.
    if not per_model_frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(per_model_frames)

# Sweep every kriging method / variogram model combination on the current split
# and stack the per-combination score tables into one frame.
rk_methods_f42 = ('universal', 'ordinary')
variogram_models_f42 = ('linear', 'power', 'gaussian', 'spherical', 'exponential')

frames_f42 = []
for method in rk_methods_f42:
    for vmodel in variogram_models_f42:
        base_f42_kriging = model_mlkrige_run_kriging(
            f_train, f_test, xy_train, xy_test, target_train, target_test, models, method, vmodel
        )
        frames_f42.append(base_f42_kriging)

total_result_f42 = pd.concat(frames_f42)
# Cell output: all configurations, best regression-kriging score first.
total_result_f42.sort_values(by='rk_score', ascending=False)

In [None]:
# Predict on the random_state=42 feature split with the configuration taken from total_result_f42.
pred_kbase_f42 = model_mlkrige_best_res(f_train, f_test, xy_train, xy_test, target_train, target_test, total_result_f42)

## Kriging feature f33

In [None]:
# Candidate regressors for the regression-kriging search.
svr_model = SVR(C=0.1, gamma="auto")
# The forest is seeded so that re-running the search ranks the models the same way.
rf_model = RandomForestRegressor(n_estimators=10, random_state=33)
lr_model = LinearRegression(copy_X=True, fit_intercept=False)

models = [svr_model, rf_model, lr_model]

# Regression features (`htst` plus the three field_* columns), well coordinates and the target.
# Selecting four columns already yields an (n, 4) array, so no reshape is needed.
feature_bal8 = ntd8[['htst','field_CENTRAL AZERI', 'field_EAST AZERI', 'field_WEST AZERI']].values
coord_bal8 = ntd8[['xmean', 'ymean']].values
target_bal8 = ntd8['phit_w_avg'].values

# 70/30 train/test split; random_state=33 is the "f33" split.
f_train, f_test, xy_train, xy_test, target_train, target_test = train_test_split(
    feature_bal8, coord_bal8, target_bal8, test_size=0.3, random_state=33
)

# NOTE(review): this function is also defined in the "Kriging feature f42" cell;
# consider keeping a single definition.
def model_mlkrige_run_kriging(f_train, f_test, xy_train, xy_test, target_train, target_test, models, rk_method, vmodel,
                              n_points_range=range(2, 16), verbose=True):
    """Score each regressor / neighbourhood-size pair with regression kriging.

    Every estimator in ``models`` is wrapped in a ``RegressionKriging`` model for
    each ``n_closest_points`` value, fitted on the training split and scored on
    the test split.

    Parameters
    ----------
    f_train, f_test
        Regression features of the training / test split.
    xy_train, xy_test
        Coordinates of the training / test split.
    target_train, target_test
        Target values of the training / test split.
    models
        Iterable of regression estimators passed as ``regression_model``.
        The same estimator object is reused for every fit.
    rk_method
        Kriging method forwarded as ``method``.
    vmodel
        Variogram model name forwarded as ``variogram_model``.
    n_points_range
        Iterable of ``n_closest_points`` values to try. Defaults to 2..15,
        the range that used to be hard-coded.
    verbose
        When true (default) print the current ``n_closest_points`` value.

    Returns
    -------
    pandas.DataFrame
        One row per (model, n_closest_points) pair with the columns ``model``
        (the estimator object itself), ``n_closest_points``, ``reg_score``
        (score of the regression part alone), ``rk_score`` (score of the full
        regression-kriging model), ``rk_method`` and ``vmodel``. The frame is
        empty, but has these columns, when there is nothing to evaluate.
    """
    columns = ['model', 'n_closest_points', 'reg_score', 'rk_score', 'rk_method', 'vmodel']
    # Materialise once so a one-shot iterator is not exhausted by the first model.
    n_points = list(n_points_range)

    per_model_frames = []
    for m in models:
        rows = []
        for nn in n_points:
            if verbose:
                print("n_closest_points:", nn)
            m_rk = RegressionKriging(regression_model=m, method=rk_method, n_closest_points=nn,
                                     variogram_model=vmodel, verbose=False)
            m_rk.fit(f_train, xy_train, target_train)
            rows.append({'model': m,
                         'n_closest_points': nn,
                         'reg_score': m_rk.regression_model.score(f_test, target_test),
                         'rk_score': m_rk.score(f_test, xy_test, target_test),
                         'rk_method': rk_method,
                         'vmodel': vmodel})
        per_model_frames.append(pd.DataFrame(rows, columns=columns))

    # Concatenate once, after the loop; an empty `models` yields an empty frame
    # instead of an UnboundLocalError.
    if not per_model_frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(per_model_frames)

# Sweep every kriging method / variogram model combination on the current split
# and stack the per-combination score tables into one frame.
rk_methods_f33 = ('universal', 'ordinary')
variogram_models_f33 = ('linear', 'power', 'gaussian', 'spherical', 'exponential')

frames_f33 = []
for method in rk_methods_f33:
    for vmodel in variogram_models_f33:
        base_f33_kriging = model_mlkrige_run_kriging(
            f_train, f_test, xy_train, xy_test, target_train, target_test, models, method, vmodel
        )
        frames_f33.append(base_f33_kriging)

total_result_f33 = pd.concat(frames_f33)
# Cell output: all configurations, best regression-kriging score first.
total_result_f33.sort_values(by='rk_score', ascending=False)

In [None]:
# Predict on the random_state=33 feature split with the configuration taken from total_result_f33.
pred_kbase_f33 = model_mlkrige_best_res(f_train, f_test, xy_train, xy_test, target_train, target_test, total_result_f33)

In [None]:
# Tag each prediction frame with the run it came from, then stack them for a joint plot.
for kbase_frame, kbase_tag in ((pred_kbase_42, 'feature_42'),
                               (pred_kbase_f42, 'feature_f42'),
                               (pred_kbase_f33, 'feature_f33')):
    kbase_frame['model'] = kbase_tag
pred_kfinal = pd.concat((pred_kbase_42, pred_kbase_f42, pred_kbase_f33))

# Predicted vs. true values: colour = run, marker style = QC flag.
sns.scatterplot(data=pred_kfinal, x='phit_w_avg_true', y='phit_w_avg_pred', hue='model', style='qc', palette='bright')

# 1:1 line (red) and the +/-0.0115 tolerance band (green).
plot_limits = [0.15, 0.27]
for band_shift, band_colour in ((0, 'red'), (0.0115, 'green'), (-0.0115, 'green')):
    plt.plot(plot_limits, [0.15 + band_shift, 0.27 + band_shift], color=band_colour, ls='--')
plt.grid()
plt.xlim(*plot_limits)
plt.ylim(*plot_limits)

In [None]:
# Share of test points inside (qc == 1) and outside (qc == 0) the tolerance band for each run.
qc_k42 = pred_kbase_42[pred_kbase_42.model == 'feature_42'].qc.value_counts(normalize=True)
qc_kf42 = pred_kbase_f42[pred_kbase_f42.model == 'feature_f42'].qc.value_counts(normalize=True)
qc_kf33 = pred_kbase_f33[pred_kbase_f33.model == 'feature_f33'].qc.value_counts(normalize=True)
# Output labels fixed: they were misspelled as "Quantati" / "Quantaty".
# The sizes come from the most recent split held in f_test / f_train.
print('Quantity of test points:', len(f_test), 'Quantity of train points:', len(f_train))
print("feature_k42", qc_k42, "feature_kf42", qc_kf42, "feature_kf33", qc_kf33)

## Anisotropy test 42

In [None]:
# Hyper-parameter grid for the coordinates-only kriging search.
param_dict = {
    "method": ["ordinary", "universal"],
    # Fixed typo: "exponencial" is not a PyKrige variogram model name, so the
    # exponential model was never actually evaluated by the search.
    "variogram_model": ["linear", "exponential", "power", "gaussian", "spherical"],
    "n_closest_points": list(range(2, 16)),
    # NOTE(review): for the 2D methods PyKrige's Krige wrapper appears to use only the
    # first element of these tuples - confirm. If so, (1,1), (1,0.5) and (1,0) are
    # the same setting, and (0,1) means a scaling of 0.
    "anisotropy_scaling": [(0,1), (0.5,1), (1,1), (1,0.5), (1,0)],
    "anisotropy_angle": [(0, 0, 0), (30, 0, 0), (60, 0, 0), (90, 0, 0)],
}

feature_bal8 = ntd8[['htst','field_CENTRAL AZERI', 'field_EAST AZERI', 'field_WEST AZERI']].values.reshape(-1,4)
coord_bal8 = ntd8[['xmean', 'ymean']].values
target_bal8 = ntd8['phit_w_avg'].values

# Same 70/30 split as the random_state=42 runs above, so the results are comparable.
f_train, f_test, xy_train, xy_test, target_train, target_test = train_test_split(
    feature_bal8, coord_bal8, target_bal8, test_size=0.3, random_state=42
)

# Kriging on coordinates only: the regression features are not used by this search.
estimator = GridSearchCV(Krige(), param_dict, verbose=True, return_train_score=True)
estimator.fit(X=xy_train, y=target_train)

if hasattr(estimator, "best_score_"):
    print("best_score R² = {:.3f}".format(estimator.best_score_))
    print("best_params = ", estimator.best_params_)

In [157]:
# TODO: compute a standalone linear regression to compare its result with the kriging below.

In [None]:
# Refit a Krige model with the grid-search winner and predict the test coordinates.
best_params = estimator.best_params_
model = Krige(**best_params)
model.fit(xy_train, target_train)
pred_krige = model.predict(xy_test)

# Predicted vs. true values side by side with the test coordinates.
pred_krige_df = pd.DataFrame(zip(pred_krige, target_test), columns=['phit_w_avg_pred', 'phit_w_avg_true'])
xy_krige_df = pd.DataFrame(xy_test, columns=['x', 'y'])
pred_krige = pd.concat((xy_krige_df, pred_krige_df), axis=1)

# qc = 1 when the prediction lies within +/-0.0115 of the true value (bounds included).
pred_krige['up5%'] = pred_krige['phit_w_avg_true'] + 0.0115
pred_krige['down5%'] = pred_krige['phit_w_avg_true'] - 0.0115
inside_band_krige = pred_krige['phit_w_avg_pred'].between(pred_krige['down5%'], pred_krige['up5%'])
pred_krige['qc'] = np.where(inside_band_krige, 1, 0)
pred_krige['model'] = 'krige_42'
display(pred_krige.value_counts('qc', normalize=True))

custom_palette = {0: 'red', 1: 'green'}
sns.scatterplot(data=pred_krige, x='phit_w_avg_true', y='phit_w_avg_pred', hue='qc',
                s=50, alpha=0.5, ec='black', palette=custom_palette)
# 1:1 line (red) and the +/-0.0115 tolerance band (green).
krige_plot_limits = [0.15, 0.27]
for krige_shift, krige_colour in ((0, 'red'), (0.0115, 'green'), (-0.0115, 'green')):
    plt.plot(krige_plot_limits, [0.15 + krige_shift, 0.27 + krige_shift], color=krige_colour, ls='--')
plt.grid()
plt.xlim(*krige_plot_limits)
plt.ylim(*krige_plot_limits)

## Summary

In [None]:
def _qc_counts(pred_frame):
    """Return the ``qc`` value counts of ``pred_frame`` plus a ``norm`` share column.

    The result has the columns ``qc``, ``count`` and ``norm`` (count divided by
    the total, rounded to 2 decimals). The explicit renames keep the column
    names the same whether or not pandas names them by default.
    """
    counts = pred_frame.qc.value_counts().rename('count').rename_axis('qc').reset_index()
    counts['norm'] = (counts['count'] / counts['count'].sum()).round(2)
    return counts


qc_k42 = _qc_counts(pred_kbase_42[pred_kbase_42.model == 'feature_42'])
qc_kf42 = _qc_counts(pred_kbase_f42[pred_kbase_f42.model == 'feature_f42'])
qc_kf33 = _qc_counts(pred_kbase_f33[pred_kbase_f33.model == 'feature_f33'])
qc_krige = _qc_counts(pred_krige[pred_krige.model == 'krige_42'])

# Output labels fixed: they were misspelled as "Quantati" / "Quantaty".
print('Quantity of test points:', len(f_test), 'Quantity of train points:', len(f_train))
# NOTE(review): the model descriptions below are hard-coded text, not read from the
# result frames - confirm they still match the current run.
print("\nKriging base 42 Model name: SVR(C=0.1, gamma='auto') n_closer_points: 9 rk_method: ordinary vmodel: exponential\n", 
      qc_k42, 
      "\nKriging feature f42 Model name: LinearRegression(fit_intercept=False) n_closer_points: 9 rk_method: ordinary vmodel: gaussian\n", 
      qc_kf42, 
      "\nKriging feature f33 Model name: SVR(C=0.1, gamma='auto') n_closer_points: 2 rk_method: ordinary vmodel: power\n", 
      qc_kf33, 
      "\nAnithotropy test 42 {'anisotropy_angle': (60, 0, 0), 'anisotropy_scaling': (0.5, 1), 'method': 'universal', 'n_closest_points': 2, 'variogram_model': 'gaussian'}\n", 
      qc_krige)

# Testing the influence of ML, kriging, and ML + kriging on the prediction

In [None]:
# 1. write a function that computes the prediction for different random_state values with the base kriging and model settings  +
# 2. compute the stability of the predictions over 100 different random_state values                                          +
# 3. compute the prediction separately for regression, regression + kriging, and kriging - base settings everywhere           +
# 4. add a porosity calculation from the neighbouring wells

## No features

### ml + kriging

In [None]:
def ml_kriging_prediction_bal8(models, random_state_value, test_size_value, verbose=True):
    """Search regression-kriging settings on the `htst` feature and predict with the best one.

    The global ``ntd8`` frame is split into train/test parts, every estimator in
    ``models`` is scored with ``RegressionKriging`` for ``n_closest_points`` in
    2..15, and the configuration with the highest ``rk_score`` is refitted to
    predict the test split.

    Parameters
    ----------
    models
        Non-empty iterable of regression estimators to try.
    random_state_value
        ``random_state`` passed to ``train_test_split``.
    test_size_value
        ``test_size`` passed to ``train_test_split``.
    verbose
        When true (default) print the current ``n_closest_points`` value.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``prediction_df``: predicted vs. true ``phit_w_avg`` on the test split,
        the chosen model and settings, the +/-0.0115 band and the 0/1 ``qc`` flag.
        ``ml_kriging``: one row per (model, n_closest_points) pair with
        ``reg_score`` and ``rk_score``.

    Raises
    ------
    ValueError
        If ``models`` is empty (previously this failed with an UnboundLocalError
        after the data had already been split).
    """
    models = list(models)
    if not models:
        raise ValueError("models must contain at least one regression estimator")

    feature_bal8 = ntd8[['htst']].values
    coord_bal8 = ntd8[['xmean', 'ymean']].values
    target_bal8 = ntd8['phit_w_avg'].values

    f_train, f_test, xy_train, xy_test, target_train, target_test = train_test_split(
        feature_bal8, coord_bal8, target_bal8, test_size=test_size_value, random_state=random_state_value
    )

    def model_mlkrige_run(f_train, f_test, xy_train, xy_test, target_train, target_test, models):
        """Score every (model, n_closest_points) pair on the test split."""
        per_model_frames = []
        for m in models:
            rows = []
            for nn in range(2, 16):
                if verbose:
                    print("n_closest_points:", nn)
                m_rk = RegressionKriging(regression_model=m, n_closest_points=nn, verbose=False)
                m_rk.fit(f_train, xy_train, target_train)
                rows.append({'model': m,
                             'n_closest_points': nn,
                             'reg_score': m_rk.regression_model.score(f_test, target_test),
                             'rk_score': m_rk.score(f_test, xy_test, target_test)})
            result = pd.DataFrame(rows)
            result['random_state'] = random_state_value
            result['test_size'] = test_size_value
            per_model_frames.append(result)
        # Concatenate once, after all models have been scored.
        return pd.concat(per_model_frames)

    ml_kriging = model_mlkrige_run(f_train, f_test, xy_train, xy_test, target_train, target_test, models)

    def model_mlkrige_best_res(f_train, f_test, xy_train, xy_test, target_train, target_test, models_df):
        """Refit the row with the highest rk_score and flag test predictions inside the band."""
        best_row = models_df.sort_values(by='rk_score', ascending=False).iloc[0]
        model_name = best_row['model']
        # Cast to a plain int: the value comes back from pandas as a NumPy scalar.
        nn_points = int(best_row['n_closest_points'])
        m_rk = RegressionKriging(regression_model=model_name, n_closest_points=nn_points)
        m_rk.fit(f_train, xy_train, target_train)

        pred = m_rk.predict(f_test, xy_test)
        pred_df = pd.DataFrame(zip(pred, target_test), columns=['phit_w_avg_pred', 'phit_w_avg_true'])
        pred_df['model'] = model_name
        pred_df['n_closest_points'] = nn_points
        pred_df['random_state'] = random_state_value
        pred_df['test_size'] = test_size_value
        pred_df['up_1.15pu'] = pred_df.phit_w_avg_true+0.0115
        pred_df['down_1.15pu'] = pred_df.phit_w_avg_true-0.0115
        # qc = 1 when the prediction lies within +/-0.0115 of the true value (bounds included).
        pred_df['qc'] = np.where((pred_df.phit_w_avg_pred >= pred_df['down_1.15pu']) & (pred_df.phit_w_avg_pred <= pred_df['up_1.15pu']), 1, 0)
        return pred_df

    prediction_df = model_mlkrige_best_res(f_train, f_test, xy_train, xy_test, target_train, target_test, ml_kriging)

    return prediction_df, ml_kriging
# NOTE(review): svr_model / rf_model / lr_model are the objects an earlier cell left in the kernel.
models_test = [svr_model, rf_model, lr_model]
prediction_df, ml_kriging = ml_kriging_prediction_bal8(models_test, random_state_value=42, test_size_value=0.5)

# Share of test points per QC class; the title reports the share inside the band (qc == 1).
qc_calc = prediction_df['qc'].value_counts(normalize=True).reset_index()
share_inside_band = qc_calc.loc[qc_calc['qc'] == 1, 'proportion'].values[0].round(2)

custom_palette = {0: 'red', 1: 'green'}
sns.scatterplot(
    data=prediction_df, x='phit_w_avg_true', y='phit_w_avg_pred',
    s=30, hue='qc', alpha=0.5, ec='black', palette=custom_palette,
)
sns.lineplot(x=[0.16, 0.27], y=[0.16, 0.27], color='blue', ls='--')
plt.title(f'qc = 1 {share_inside_band}');

In [None]:
# Show the two configurations with the highest regression-kriging score.
ml_kriging.sort_values(by='rk_score', ascending=False).head(2)

In [None]:
# Peek at the first two test-set predictions of the best configuration.
prediction_df.head(2)

### ml

In [None]:
# Baseline: SVR on the `htst` feature alone, without kriging, on the random_state=42 / 50% split.
svr_model = SVR()

feature_bal8 = ntd8[['htst']].values.reshape(-1, 1)
coord_bal8 = ntd8[['xmean', 'ymean']].values
target_bal8 = ntd8['phit_w_avg'].values

f_train, f_test, xy_train, xy_test, target_train, target_test = train_test_split(
    feature_bal8, coord_bal8, target_bal8, test_size=0.5, random_state=42
)

svr_model.fit(f_train, target_train)
pred_svr = svr_model.predict(f_test)

# Predicted vs. true values plus run metadata and the +/-0.0115 tolerance band.
pred_svr_df = pd.DataFrame(zip(pred_svr, target_test), columns=['phit_w_avg_pred', 'phit_w_avg_true'])
pred_svr_df = pred_svr_df.assign(**{
    'model': type(svr_model).__name__,
    'random_state': 42,
    'test_size': 0.5,
    'up_1.15pu': pred_svr_df['phit_w_avg_true'] + 0.0115,
    'down_1.15pu': pred_svr_df['phit_w_avg_true'] - 0.0115,
})
# qc = 1 when the prediction lies inside the band (bounds included).
svr_inside_band = pred_svr_df['phit_w_avg_pred'].between(pred_svr_df['down_1.15pu'], pred_svr_df['up_1.15pu'])
pred_svr_df['qc'] = np.where(svr_inside_band, 1, 0)

qc_calc = pred_svr_df['qc'].value_counts(normalize=True).reset_index()
svr_share_inside = qc_calc.loc[qc_calc['qc'] == 1, 'proportion'].values[0].round(2)

custom_palette = {0: 'red', 1: 'green'}
sns.scatterplot(
    data=pred_svr_df, x='phit_w_avg_true', y='phit_w_avg_pred',
    s=30, hue='qc', alpha=0.5, ec='black', palette=custom_palette,
)
sns.lineplot(x=[0.16, 0.27], y=[0.16, 0.27], color='blue', ls='--')
plt.title(f'qc = 1 {svr_share_inside}');

### kriging

In [None]:
# Baseline: kriging on the well coordinates alone, on the random_state=42 / 50% split.
feature_bal8 = ntd8[['htst']].values.reshape(-1, 1)
coord_bal8 = ntd8[['xmean', 'ymean']].values
target_bal8 = ntd8['phit_w_avg'].values

f_train, f_test, xy_train, xy_test, target_train, target_test = train_test_split(
    feature_bal8, coord_bal8, target_bal8, test_size=0.5, random_state=42
)

kriging = Krige(n_closest_points=2, method='ordinary', variogram_model='linear')
kriging.fit(x=xy_train, y=target_train)
pred_kriging = kriging.predict(xy_test)

# Predicted vs. true values plus run metadata and the +/-0.0115 tolerance band.
kriging_df = pd.DataFrame(zip(pred_kriging, target_test), columns=['phit_w_avg_pred', 'phit_w_avg_true'])
kriging_df = kriging_df.assign(**{
    'model': type(kriging).__name__,
    'n_closest_points': 2,
    'random_state': 42,
    'test_size': 0.5,
    'up_1.15pu': kriging_df['phit_w_avg_true'] + 0.0115,
    'down_1.15pu': kriging_df['phit_w_avg_true'] - 0.0115,
})
# qc = 1 when the prediction lies inside the band (bounds included).
kriging_inside_band = kriging_df['phit_w_avg_pred'].between(kriging_df['down_1.15pu'], kriging_df['up_1.15pu'])
kriging_df['qc'] = np.where(kriging_inside_band, 1, 0)

qc_calc = kriging_df['qc'].value_counts(normalize=True).reset_index()
kriging_share_inside = qc_calc.loc[qc_calc['qc'] == 1, 'proportion'].values[0].round(2)

custom_palette = {0: 'red', 1: 'green'}
sns.scatterplot(
    data=kriging_df, x='phit_w_avg_true', y='phit_w_avg_pred',
    s=30, hue='qc', alpha=0.5, ec='black', palette=custom_palette,
)
sns.lineplot(x=[0.16, 0.27], y=[0.16, 0.27], color='blue', ls='--')
plt.title(f'qc = 1 {kriging_share_inside}');

## 1 simple feature

### ml+kriging

In [None]:
def ml_kriging_prediction_feature_bal8(models, random_state_value, test_size_value, verbose=True):
    """Search regression-kriging settings on `htst` + field_* features and predict with the best one.

    The global ``ntd8`` frame is split into train/test parts, every estimator in
    ``models`` is scored with ``RegressionKriging`` for ``n_closest_points`` in
    2..15, and the configuration with the highest ``rk_score`` is refitted to
    predict the test split.

    Parameters
    ----------
    models
        Non-empty iterable of regression estimators to try.
    random_state_value
        ``random_state`` passed to ``train_test_split``.
    test_size_value
        ``test_size`` passed to ``train_test_split``.
    verbose
        When true (default) print the current ``n_closest_points`` value.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``prediction_df``: predicted vs. true ``phit_w_avg`` on the test split,
        the chosen model and settings, the +/-0.0115 band and the 0/1 ``qc`` flag.
        ``ml_kriging``: one row per (model, n_closest_points) pair with
        ``reg_score`` and ``rk_score``.

    Raises
    ------
    ValueError
        If ``models`` is empty (previously this failed with an UnboundLocalError
        after the data had already been split).
    """
    models = list(models)
    if not models:
        raise ValueError("models must contain at least one regression estimator")

    feature_bal8 = ntd8[['htst','field_CENTRAL AZERI', 'field_EAST AZERI', 'field_WEST AZERI']].values
    coord_bal8 = ntd8[['xmean', 'ymean']].values
    target_bal8 = ntd8['phit_w_avg'].values

    f_train, f_test, xy_train, xy_test, target_train, target_test = train_test_split(
        feature_bal8, coord_bal8, target_bal8, test_size=test_size_value, random_state=random_state_value
    )

    def model_mlkrige_run(f_train, f_test, xy_train, xy_test, target_train, target_test, models):
        """Score every (model, n_closest_points) pair on the test split."""
        per_model_frames = []
        for m in models:
            rows = []
            for nn in range(2, 16):
                if verbose:
                    print("n_closest_points:", nn)
                m_rk = RegressionKriging(regression_model=m, n_closest_points=nn, verbose=False)
                m_rk.fit(f_train, xy_train, target_train)
                rows.append({'model': m,
                             'n_closest_points': nn,
                             'reg_score': m_rk.regression_model.score(f_test, target_test),
                             'rk_score': m_rk.score(f_test, xy_test, target_test)})
            result = pd.DataFrame(rows)
            result['random_state'] = random_state_value
            result['test_size'] = test_size_value
            per_model_frames.append(result)
        # Concatenate once, after all models have been scored.
        return pd.concat(per_model_frames)

    ml_kriging = model_mlkrige_run(f_train, f_test, xy_train, xy_test, target_train, target_test, models)

    def model_mlkrige_best_res(f_train, f_test, xy_train, xy_test, target_train, target_test, models_df):
        """Refit the row with the highest rk_score and flag test predictions inside the band."""
        best_row = models_df.sort_values(by='rk_score', ascending=False).iloc[0]
        model_name = best_row['model']
        # Cast to a plain int: the value comes back from pandas as a NumPy scalar.
        nn_points = int(best_row['n_closest_points'])
        m_rk = RegressionKriging(regression_model=model_name, n_closest_points=nn_points)
        m_rk.fit(f_train, xy_train, target_train)

        pred = m_rk.predict(f_test, xy_test)
        pred_df = pd.DataFrame(zip(pred, target_test), columns=['phit_w_avg_pred', 'phit_w_avg_true'])
        pred_df['model'] = model_name
        pred_df['n_closest_points'] = nn_points
        pred_df['random_state'] = random_state_value
        pred_df['test_size'] = test_size_value
        pred_df['up_1.15pu'] = pred_df.phit_w_avg_true+0.0115
        pred_df['down_1.15pu'] = pred_df.phit_w_avg_true-0.0115
        # qc = 1 when the prediction lies within +/-0.0115 of the true value (bounds included).
        pred_df['qc'] = np.where((pred_df.phit_w_avg_pred >= pred_df['down_1.15pu']) & (pred_df.phit_w_avg_pred <= pred_df['up_1.15pu']), 1, 0)
        return pred_df

    prediction_df = model_mlkrige_best_res(f_train, f_test, xy_train, xy_test, target_train, target_test, ml_kriging)

    return prediction_df, ml_kriging
# NOTE(review): svr_model / rf_model / lr_model are the objects an earlier cell left in the kernel.
models_test = [svr_model, rf_model, lr_model]
prediction_f_df, ml_f_kriging = ml_kriging_prediction_feature_bal8(
    models_test, random_state_value=42, test_size_value=0.5
)

# Share of test points per QC class; the title reports the share inside the band (qc == 1).
qc_calc_ml_kri = prediction_f_df['qc'].value_counts(normalize=True).reset_index()
ml_kri_share_inside = qc_calc_ml_kri.loc[qc_calc_ml_kri['qc'] == 1, 'proportion'].values[0].round(2)

custom_palette = {0: 'red', 1: 'green'}
sns.scatterplot(
    data=prediction_f_df, x='phit_w_avg_true', y='phit_w_avg_pred',
    s=30, hue='qc', alpha=0.5, ec='black', palette=custom_palette,
)
sns.lineplot(x=[0.16, 0.27], y=[0.16, 0.27], color='blue', ls='--')
plt.title(f'qc = 1 {ml_kri_share_inside}');

In [None]:
# Show the two configurations with the highest regression-kriging score.
ml_f_kriging.sort_values(by='rk_score', ascending=False).head(2)

In [None]:
# Peek at this section's predictions. The cell used to show `prediction_df`,
# which belongs to the "No features" run.
prediction_f_df.head(2)

### ml

In [None]:
# Baseline: linear regression on `htst` + field_* features, without kriging,
# on the random_state=42 / 50% split.
lr_model = LinearRegression(fit_intercept=False)

feature_bal8 = ntd8[['htst','field_CENTRAL AZERI', 'field_EAST AZERI', 'field_WEST AZERI']].values.reshape(-1,4)
coord_bal8 = ntd8[['xmean', 'ymean']].values
target_bal8 = ntd8['phit_w_avg'].values

f_train, f_test, xy_train, xy_test, target_train, target_test = train_test_split(
    feature_bal8, coord_bal8, target_bal8, test_size=0.5, random_state=42
)
lr_model.fit(f_train, target_train)
pred_lr = lr_model.predict(f_test)
pred_lr_df = pd.DataFrame(zip(pred_lr, target_test), columns=['phit_w_avg_pred', 'phit_w_avg_true'])
# Fixed: label rows with the estimator class. `pred_lr.__class__.__name__` gave the
# type of the prediction array ('ndarray') instead.
pred_lr_df['model'] = lr_model.__class__.__name__
pred_lr_df['random_state'] = 42
pred_lr_df['test_size'] = 0.5
pred_lr_df['up_1.15pu'] = pred_lr_df.phit_w_avg_true+0.0115
pred_lr_df['down_1.15pu'] = pred_lr_df.phit_w_avg_true-0.0115
# qc = 1 when the prediction lies within +/-0.0115 of the true value (bounds included).
pred_lr_df['qc'] = np.where((pred_lr_df.phit_w_avg_pred >= pred_lr_df['down_1.15pu']) & (pred_lr_df.phit_w_avg_pred <= pred_lr_df['up_1.15pu']), 1, 0)

# Fixed: take the QC share from this cell's predictions. It was computed from
# `pred_svr_df`, i.e. the SVR run of the "No features" section.
qc_calc_ml = pred_lr_df.qc.value_counts(normalize=True).reset_index()
custom_palette = {0: 'red', 1: 'green'}
sns.scatterplot(data=pred_lr_df, x='phit_w_avg_true', y='phit_w_avg_pred', s=30, hue='qc', alpha=0.5, ec='black', palette=custom_palette)
sns.lineplot(x=[0.16,0.27], y=[0.16,0.27], color='blue', ls='--')
plt.title('qc = 1 ' + str(qc_calc_ml[qc_calc_ml.qc==1]['proportion'].values[0].round(2)));

### kriging

In [None]:
# Baseline: kriging on the well coordinates alone, on the random_state=42 / 50% split.
feature_bal8 = ntd8[['htst','field_CENTRAL AZERI', 'field_EAST AZERI', 'field_WEST AZERI']].values.reshape(-1, 4)
coord_bal8 = ntd8[['xmean', 'ymean']].values
target_bal8 = ntd8['phit_w_avg'].values

f_train, f_test, xy_train, xy_test, target_train, target_test = train_test_split(
    feature_bal8, coord_bal8, target_bal8, test_size=0.5, random_state=42
)

kriging = Krige(n_closest_points=2, method='ordinary', variogram_model='linear')
kriging.fit(x=xy_train, y=target_train)
pred_kriging = kriging.predict(xy_test)

# Predicted vs. true values plus run metadata and the +/-0.0115 tolerance band.
kriging_df = pd.DataFrame(zip(pred_kriging, target_test), columns=['phit_w_avg_pred', 'phit_w_avg_true'])
kriging_df = kriging_df.assign(**{
    'model': type(kriging).__name__,
    'n_closest_points': 2,
    'random_state': 42,
    'test_size': 0.5,
    'up_1.15pu': kriging_df['phit_w_avg_true'] + 0.0115,
    'down_1.15pu': kriging_df['phit_w_avg_true'] - 0.0115,
})
# qc = 1 when the prediction lies inside the band (bounds included).
kriging_f_inside_band = kriging_df['phit_w_avg_pred'].between(kriging_df['down_1.15pu'], kriging_df['up_1.15pu'])
kriging_df['qc'] = np.where(kriging_f_inside_band, 1, 0)

qc_calc_kriging = kriging_df['qc'].value_counts(normalize=True).reset_index()
kriging_f_share_inside = qc_calc_kriging.loc[qc_calc_kriging['qc'] == 1, 'proportion'].values[0].round(2)

custom_palette = {0: 'red', 1: 'green'}
sns.scatterplot(
    data=kriging_df, x='phit_w_avg_true', y='phit_w_avg_pred',
    s=30, hue='qc', alpha=0.5, ec='black', palette=custom_palette,
)
sns.lineplot(x=[0.16, 0.27], y=[0.16, 0.27], color='blue', ls='--')
plt.title(f'qc = 1 {kriging_f_share_inside}');

In [None]:
# Share of test points inside the tolerance band (qc == 1) for each approach, to 3 decimals.
share_qc1_kriging = qc_calc_kriging.loc[qc_calc_kriging['qc'] == 1, 'proportion'].values[0].round(3)
share_qc1_ml = qc_calc_ml.loc[qc_calc_ml['qc'] == 1, 'proportion'].values[0].round(3)
share_qc1_ml_kri = qc_calc_ml_kri.loc[qc_calc_ml_kri['qc'] == 1, 'proportion'].values[0].round(3)
print('phit_w_avg prediction\nkriging:', share_qc1_kriging,
      'ml:', share_qc1_ml,
      'ml + kriging:', share_qc1_ml_kri)

### synthetic data

In [None]:
# Regression-kriging comparison on the California housing data.
svr_model = SVR(C=0.1, gamma="auto")
rf_model = RandomForestRegressor(n_estimators=100)
lr_model = LinearRegression(copy_X=True, fit_intercept=False)
models = [svr_model, rf_model, lr_model]

housing = fetch_california_housing()

# Kriging is memory intensive, so only the first 5000 rows are used.
# The last two columns serve as coordinates, the remaining ones as regression features.
housing_rows = slice(None, 5000)
p = housing["data"][housing_rows, :-2]
x = housing["data"][housing_rows, -2:]
target = housing["target"][housing_rows]

p_train, p_test, x_train, x_test, target_train, target_test = train_test_split(
    p, x, target, test_size=0.3, random_state=42
)

separator_line = "=" * 40
for m in models:
    print(separator_line)
    print("regression model:", type(m).__name__)
    m_rk = RegressionKriging(regression_model=m, n_closest_points=10)
    m_rk.fit(p_train, x_train, target_train)
    regression_only_score = m_rk.regression_model.score(p_test, target_test)
    print("Regression Score: ", regression_only_score)
    regression_kriging_score = m_rk.score(p_test, x_test, target_test)
    print("RK score: ", regression_kriging_score)

In [None]:
# Baseline: random forest alone on the same California housing split.
rf_model = RandomForestRegressor(n_estimators=100)
p = housing["data"][:5000, :-2]
x = housing["data"][:5000, -2:]
target = housing["target"][:5000]

p_train, p_test, x_train, x_test, target_train, target_test = train_test_split(
    p, x, target, test_size=0.3, random_state=42
)

rf_model.fit(p_train, target_train)
pred_rf = rf_model.predict(p_test)
# Built-in round() works for both a plain float and a NumPy scalar; `.round(3)`
# raises AttributeError when the score is returned as a plain Python float.
score_rf = round(rf_model.score(p_test, target_test), 3)
print("RandomForest Score: ", score_rf)

In [None]:
# Baseline: ordinary kriging on the coordinates alone, same California housing split.
p = housing["data"][:5000, :-2]
x = housing["data"][:5000, -2:]
target = housing["target"][:5000]

p_train, p_test, x_train, x_test, target_train, target_test = train_test_split(
    p, x, target, test_size=0.3, random_state=42
)

kriging = Krige(method='ordinary', variogram_model='linear')
kriging.fit(x=x_train, y=target_train)
pred_kriging = kriging.predict(x_test)
# Built-in round() works for both a plain float and a NumPy scalar; `.round(3)`
# raises AttributeError when the score is returned as a plain Python float.
score_kriging = round(kriging.score(x_test, target_test), 3)
print("Kriging Score: ", score_kriging)
# The trailing "RFR + kriging:" print was removed: it had no value attached.

In [None]:
# Random forest + kriging of its residuals, same California housing split.
rf_model = RandomForestRegressor(n_estimators=100)
p = housing["data"][:5000, :-2]
x = housing["data"][:5000, -2:]
target = housing["target"][:5000]

p_train, p_test, x_train, x_test, target_train, target_test = train_test_split(
    p, x, target, test_size=0.3, random_state=42
)

m_rk = RegressionKriging(regression_model=rf_model)
m_rk.fit(p_train, x_train, target_train)
# Built-in round() works for both a plain float and a NumPy scalar; `.round(3)`
# raises AttributeError when the score is returned as a plain Python float.
score_rf_kriging = round(m_rk.score(p_test, x_test, target_test), 3)
print("RK score: ", score_rf_kriging)

In [None]:
# Side-by-side test scores of the three approaches computed in the cells above.
print('synthetic dataset\nkriging:', score_kriging, 'ml:', score_rf, 'ml + kriging:', score_rf_kriging)

In [None]:
# Check the model with different numbers of features: each pass drops the last `i`
# columns from the regression features, while the coordinates stay the last two columns.
rf_model = RandomForestRegressor(n_estimators=100)
for i in range(2, 8):
    x = housing["data"][:5000, -2:]
    target = housing["target"][:5000]
    p = housing["data"][:5000, :-i]

    p_train, p_test, x_train, x_test, target_train, target_test = train_test_split(
        p, x, target, test_size=0.3, random_state=42
    )

    print("=" * 40)
    print('i:', i)
    print("regression model:", type(rf_model).__name__)

    m_rk = RegressionKriging(regression_model=rf_model, n_closest_points=10)
    m_rk.fit(p_train, x_train, target_train)

    regression_only_score = m_rk.regression_model.score(p_test, target_test)
    print("Regression Score: ", regression_only_score)
    regression_kriging_score = m_rk.score(p_test, x_test, target_test)
    print("RK score: ", regression_kriging_score)
