## Import libs

In [1]:
#Import libs 
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import statistics as st
import math
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from tqdm import tqdm
import textwrap
from statistics import mean
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score as r2, mean_absolute_error as mae, mean_squared_error as mse, accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.inspection import permutation_importance
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
import random
import mlflow
import mlflow.sklearn
# Notebook-wide pandas display settings (they affect rendering only, not the data).
# Show three digits of precision.
pd.set_option("display.precision", 3)
# Format every float with exactly three decimals.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Show at most 18 columns when a frame is displayed.
pd.set_option('display.max_columns', 18)

## Upload main data

In [2]:
# # Loading the ACG_wells_JOINT_BEST_v6.csv file
# path = 'C:\\jupyter\\SPP\\input\\'
# data_init = pd.read_csv(path + 'ACG_wells_JOINT_BEST_v6.csv', sep=',')
# # Data cleaning of TL-dataset
# df = data_init.copy()
# df = df[1:]
# #Select only necessary data
# df_cln = df[['wellName', 'DEPTH', 'AREA', 'BADPORLOG', 'Casings', 'FORMATION',
#             'FLANK1', 'FLANK2', 'Fluidcode', 'Fluidcode_mod', 'FLUIDCODE_PP',
#             'LPERM', 'PHIT', 'NET', 
#             'GR_N', 'GRMATRIX', 'GRSHALE','VSH', 'NPSS', 'RHOB', 'RHOF', 'RHOMA', 
#             'RDEEP',  'SON', 'SONSH', 
#             'TVD_SCS','TST', 'DEVI','HAZI','X', 'Y', 'Dip_Azimuth', 'Dip_TRU']]
# #Fill up nan and -9999 values with 0
# df_cln = df_cln.fillna(0)
# df_cln = df_cln.replace(-9999, 0)
# df_cln = df_cln.replace('-9999', '0')
# #Assign proper datatypes for df
# dicttypes = {'wellName':'string', 'DEPTH':'float', 'AREA':'int', 'BADPORLOG':'int', 'Casings':'float', 'FLANK1':'int', 'FLANK2':'int',
#              'Fluidcode':'int', 'Fluidcode_mod':'int','FLUIDCODE_PP':'int','FORMATION':'string', 'GR_N':'float', 'GRMATRIX':'float', 
#              'GRSHALE':'float', 'LPERM':'float', 'NPSS':'float',
#              'PHIT':'float', 'NET':'float', 'RDEEP':'float', 'RHOB':'float', 'RHOF':'float', 'RHOMA':'float', 'TVD_SCS':'float', 'TST':'float',
#              'VSH':'float', 'X':'float', 'Y':'float', 'Dip_Azimuth':'float', 'Dip_TRU':'float'}
# df_cln = df_cln.astype(dicttypes, errors='ignore')
# df_cln.loc[df_cln.FORMATION=='0', 'FORMATION']='None'
# #Save data to parquet
# df_cln.to_parquet('ACG_wells_JOINT_BEST_v6.parquet.gzip', compression='gzip')

# Load the per-well metadata (status, wellhead coordinates, field) and normalise it.
path = 'C:\\jupyter\\SPP\\input\\'
metadata_init = pd.read_csv(path + 'ACG_wells_metadata.csv', sep=',')
metadata = metadata_init.copy().rename(columns={'X': 'X_wellhead', 'Y': 'Y_wellhead'})

# Status: trim, lower-case, then collapse the known spelling variants.
metadata['Status'] = metadata.Status.str.strip().str.lower()
status_fixes = {
    'oil': 'production oil',
    'oil producer': 'production oil',
    'production': 'production oil',
    'produiction oil': 'production oil',
    'production_oil': 'production oil',
    'abandoned production oil': 'abandoned oil',
    'abandoned  oil': 'abandoned oil',
    'abandoned oi': 'abandoned oil',
    'injector  - water': 'injector - water',
    'injector water': 'injector - water',
    'injetor  - water': 'injector - water',
    'abandoned injector - water per b': 'abandoned injector - water',
    'plugged and abandoned': 'p&a',
}
for raw_status, clean_status in status_fixes.items():
    metadata.loc[metadata.Status == raw_status, 'Status'] = clean_status

# Wellhead coordinates: patch two specific bad values, then set well C39 explicitly.
metadata.loc[metadata.X_wellhead == 118.270, 'X_wellhead'] = 526258.84
metadata.loc[metadata.Y_wellhead == 526261.510, 'Y_wellhead'] = 4435802.01
is_c39 = metadata.well == 'C39'
metadata.loc[is_c39, 'X_wellhead'] = 526258.840
metadata.loc[is_c39, 'Y_wellhead'] = 4435802.010

# Field names: first rename by field value, then override individual wells.
field_renames = {'West Azeri': 'WEST AZERI', 'COP': 'WEST CHIRAG'}
for raw_field, clean_field in field_renames.items():
    metadata.loc[metadata.field == raw_field, 'field'] = clean_field
field_by_well = {
    'AZERI2': 'WEST AZERI',
    'AZERI3': 'WEST AZERI',
    'B31': 'CENTRAL AZERI',
    'J28_bpQIP': 'WEST CHIRAG',
}
for well_name, field_name in field_by_well.items():
    metadata.loc[metadata.well == well_name, 'field'] = field_name

# Read the cleaned well-log table from parquet and attach the metadata by well name.
path = 'C:\\jupyter\\SPP\\input\\'
df_prq = pd.read_parquet(path + 'ACG_wells_JOINT_BEST_v6.parquet.gzip')
df_prq = df_prq.rename(columns={'wellName':'well'})
df_prq = df_prq.set_index('well').join(metadata.set_index('well')).reset_index()
# print('wells in df totally:', len(df_prq.well.unique()))
# Filter data with bad_well_list.
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the joined frame.
bad_well_list = ['E10Z','Predrill_J01Z', 'Predrill_J08', 'J28_bpQIP']
df_prq = df_prq[~df_prq.well.isin(bad_well_list)].copy()
# gross_pay = 1 where Fluidcode_mod > 0, otherwise 0.
# A NaN Fluidcode_mod compares False and therefore gives 0, as the original comment
# intended; the previous two-step .loc assignment left NaN and made astype('int') raise.
df_prq['gross_pay'] = (df_prq.Fluidcode_mod > 0).astype('int')
# XY coordinates of the first logged row of every (well, formation),
# kept only for Balakhany formations with positive coordinates.
xy_coord = df_prq[['well', 'FORMATION', 'X', 'Y']]
xy_coord = xy_coord.groupby(['well', 'FORMATION']).apply(lambda x: x.iloc[0]).drop(columns=['well', 'FORMATION']).reset_index()
xy_coord = xy_coord[xy_coord.FORMATION.str.contains('Balakhany') & (xy_coord.X>0) & (xy_coord.Y>0)]
# DEPTH and TVD_SCS of the first logged row of every (well, formation), positive TVD_SCS only.
df_prq_tvdss = df_prq[['well','DEPTH','FORMATION','TVD_SCS']].groupby(['well','FORMATION']).apply(lambda x: x.iloc[0])
df_prq_tvdss = df_prq_tvdss.drop(['well','FORMATION'], axis=1).reset_index()
df_prq_tvdss = df_prq_tvdss[df_prq_tvdss.TVD_SCS>0]

# Assigning numerical values instead of text field names.
FIELD_NUM = {
    'DDGG': 1,
    'DWG': 2,
    'WEST CHIRAG': 3,
    'CHIRAG': 4,
    'WEST AZERI': 5,
    'CENTRAL AZERI': 6,
    'EAST AZERI': 7,
}
# NOTE(review): df_fu_tst_final is only created further down, in the TST-thickness cell,
# and that cell rebuilds the frame from scratch (dropping field_num). On a fresh kernel
# this statement used to raise NameError. It is guarded here so Restart & Run All works;
# the lasting fix is to move this mapping to just after df_fu_tst_final is built.
if 'df_fu_tst_final' in globals():
    for field_name, field_code in FIELD_NUM.items():
        df_fu_tst_final.loc[df_fu_tst_final.field == field_name, 'field_num'] = field_code
else:
    print('field_num not assigned: df_fu_tst_final is not defined yet')

## Data preparation

### Cleaning dataset for outliers

In [5]:
# Load the initial KHtst table (v3) and attach formation-top XY, top TVD_SCS and field.
path = 'C:\\jupyter\\SPP\\inputoutput\\'
df_khtst = pd.read_csv(path + 'df_prq_khtst_v3.csv')
fm_key = ['well', 'FORMATION']
df_khtst_xy = df_khtst.set_index(fm_key).join(xy_coord.set_index(fm_key)).reset_index()
df_khtst_xy_tvd = df_khtst_xy.set_index(fm_key).join(df_prq_tvdss.set_index(fm_key)).reset_index()
well_field = df_prq.groupby('well')['field'].apply(lambda x: x.iloc[0])
df_khtst_xy_tvd_fld = df_khtst_xy_tvd.set_index('well').join(well_field).reset_index()

# Remove KHtst outliers with the 1.5 * IQR rule, separately for every Balakhany VIII / X
# unit and for each field group (Azeri fields first, then Chirag-area fields).
fm_list_8_10 = ['Balakhany VIII', 'Balakhany VIII sand', 'Balakhany VIII 25','Balakhany VIII 20',
                'Balakhany VIII 15', 'Balakhany VIII 10', 'Balakhany VIII 5',
                'Balakhany X', 'Balakhany X sand', 'Balakhany X 40', 'Balakhany X 20']
azr_lst = ['CENTRAL AZERI', 'WEST AZERI', 'EAST AZERI']
chg_lst = ['CHIRAG', 'DWG', 'DDGG', 'WEST CHIRAG']
df_lst = []
for field_group in (azr_lst, chg_lst):
    in_group = df_khtst_xy_tvd_fld.field.isin(field_group)
    for fm in fm_list_8_10:
        df_khtst_fm = df_khtst_xy_tvd_fld[(df_khtst_xy_tvd_fld.FORMATION == fm) & in_group]
        Q1 = df_khtst_fm['KHtst'].quantile(0.25)
        Q3 = df_khtst_fm['KHtst'].quantile(0.75)
        IQR = Q3 - Q1
        too_low = df_khtst_fm['KHtst'] < (Q1 - 1.5 * IQR)
        too_high = df_khtst_fm['KHtst'] > (Q3 + 1.5 * IQR)
        # Rows whose KHtst is NaN compare False on both sides and are therefore kept.
        df_lst.append(df_khtst_fm[~(too_low | too_high)])
df_khtst_bal_qcl = pd.concat(df_lst)

### Getting TST-thick Bal VIII & X + uploading df_prq_htst_avgprop_v1

In [57]:
# TST thickness of every Balakhany VIII / X unit per well:
# TST of the last logged row minus TST of the first logged row of the unit.
is_bal_8_or_10 = (df_prq.FORMATION.str.contains('Balakhany VIII')) | (df_prq.FORMATION.str.contains('Balakhany X'))
df_fu_tst = df_prq[is_bal_8_or_10][['well', 'DEPTH','FORMATION','TST']]
tst_per_unit = df_fu_tst.groupby(['well','FORMATION'])['TST']
df_fu_tst_top = tst_per_unit.apply(lambda x: x.iloc[0]).reset_index().rename(columns={'TST':'TST_top'})
df_fu_tst_bot = tst_per_unit.apply(lambda x: x.iloc[-1]).reset_index().rename(columns={'TST':'TST_bot'})
unit_key = ['well', 'FORMATION']
df_fu_tst_final = df_fu_tst_top.set_index(unit_key).join(df_fu_tst_bot.set_index(unit_key)).reset_index()
df_fu_tst_final['TST_interv'] = (df_fu_tst_final.TST_bot - df_fu_tst_final.TST_top).round(0)
# Attach formation-top XY, top DEPTH/TVD_SCS and the field of the well.
df_fu_tst_final = df_fu_tst_final.set_index(unit_key).join(xy_coord.set_index(unit_key)).reset_index()
df_fu_tst_final = df_fu_tst_final.set_index(unit_key).join(df_prq_tvdss.set_index(unit_key)).reset_index()
field_of_well = df_prq.groupby('well')['field'].apply(lambda x: x.iloc[0])
df_fu_tst_final = df_fu_tst_final.set_index('well').join(field_of_well).reset_index()
# Keep only units with a positive rounded thickness.
df_fu_tst_final = df_fu_tst_final[df_fu_tst_final.TST_interv > 0]
# Load the per-layer table (h_tst and average properties) and list the wells that
# survived the KHtst outlier cleaning for each sand.
path = 'C:\\jupyter\\SPP\\inputoutput\\'
df_htst_avgprop = pd.read_csv(path + 'df_prq_htst_avgprop_v1.csv')
well_no_outliers8 = df_khtst_bal_qcl[df_khtst_bal_qcl.FORMATION == 'Balakhany VIII sand'].well.unique()
well_no_outliers10 = df_khtst_bal_qcl[df_khtst_bal_qcl.FORMATION == 'Balakhany X sand'].well.unique()
# Cutoffs: a row is kept only if h_tst and md_perm_avg are both above these values.
cutoff_h_tst = 0.5
cutoff_perm_avg = 5
# .copy() makes the filtered table independent of df_htst_avgprop, so the derived
# columns below are not written onto a slice of the loaded frame.
df_htst_avgprop_nz = df_htst_avgprop[(df_htst_avgprop.h_tst > cutoff_h_tst) &
                                     (df_htst_avgprop.md_perm_avg > cutoff_perm_avg)].copy()
# Thickness-weighted products (numerators of the weighted averages).
df_htst_avgprop_nz['kavg_htst'] = df_htst_avgprop_nz.h_tst * df_htst_avgprop_nz.md_perm_avg
df_htst_avgprop_nz['phit_htst'] = df_htst_avgprop_nz.h_tst * df_htst_avgprop_nz.md_phit_avg
df_htst_avgprop_nz['vsh_htst'] = df_htst_avgprop_nz.h_tst * df_htst_avgprop_nz.md_vsh_avg
# Sum of h_tst per well & formation (denominator of the weighted averages).
unit_key = ['well', 'FORMATION']
df_htst_fm = df_htst_avgprop_nz.groupby(unit_key)['h_tst'].sum().reset_index()
df_htst_fm = df_htst_fm.rename(columns={'h_tst':'gross_tst'})
# Weighted averages: sum(h * prop) / sum(h).
df_htst_avgprop_nz_avgpropsum = df_htst_avgprop_nz.groupby(unit_key)[['phit_htst','vsh_htst']].sum().reset_index()
df_htst_avgprop_nz_avgpropsum_join = df_htst_avgprop_nz_avgpropsum.set_index(unit_key).join(
                                     df_htst_fm.set_index(unit_key)).reset_index()
df_htst_avgprop_nz_avgpropsum_join['phit_wavg'] = df_htst_avgprop_nz_avgpropsum_join.phit_htst / df_htst_avgprop_nz_avgpropsum_join.gross_tst
df_htst_avgprop_nz_avgpropsum_join['vsh_wavg'] = df_htst_avgprop_nz_avgpropsum_join.vsh_htst / df_htst_avgprop_nz_avgpropsum_join.gross_tst
hpv_cols = ['well','FORMATION','gross_tst','phit_wavg','vsh_wavg']
final_cols = ['well', 'FORMATION', 'X', 'Y', 'DEPTH', 'TVD_SCS', 'field', 'gross_tst',
              'TST_interv', 'kavg_htst', 'phit_wavg', 'vsh_wavg']
# Balakhany VIII sand: weighted averages plus the summed k*h per well.
df_8bal_hpv = df_htst_avgprop_nz_avgpropsum_join[
              df_htst_avgprop_nz_avgpropsum_join.FORMATION == 'Balakhany VIII sand'][hpv_cols]
df_8bal_permh = df_htst_avgprop_nz[df_htst_avgprop_nz.FORMATION == 'Balakhany VIII sand'].groupby(unit_key)['kavg_htst'].sum().reset_index()
df_8bal_phhpv = df_8bal_hpv.set_index(unit_key).join(df_8bal_permh.set_index(unit_key)).reset_index()
# Balakhany X sand: same construction.
df_10bal_hpv = df_htst_avgprop_nz_avgpropsum_join[
               df_htst_avgprop_nz_avgpropsum_join.FORMATION == 'Balakhany X sand'][hpv_cols]
df_10bal_permh = df_htst_avgprop_nz[df_htst_avgprop_nz.FORMATION == 'Balakhany X sand'].groupby(unit_key)['kavg_htst'].sum().reset_index()
df_10bal_phhpv = df_10bal_hpv.set_index(unit_key).join(df_10bal_permh.set_index(unit_key)).reset_index()
# Attach the TST interval / location columns and rename TST_interv -> interv_tst.
df_8bal_phhpv_tstint = df_8bal_phhpv.set_index(unit_key).join(df_fu_tst_final.set_index(unit_key)).reset_index()
df_8bal_phhpv_tstint = df_8bal_phhpv_tstint[final_cols].rename(columns={'TST_interv':'interv_tst'})
df_avgprop8_final_wa = df_8bal_phhpv_tstint.copy()
df_10bal_phhpv_tstint = df_10bal_phhpv.set_index(unit_key).join(df_fu_tst_final.set_index(unit_key)).reset_index()
df_10bal_phhpv_tstint = df_10bal_phhpv_tstint[final_cols].rename(columns={'TST_interv':'interv_tst'})
df_avgprop10_final_wa = df_10bal_phhpv_tstint.copy()
# Keep only the wells that passed the outlier cleaning for the respective sand.
df_avgprop_bal10_wa = df_avgprop10_final_wa[df_avgprop10_final_wa.FORMATION.str.contains('Balakhany X sand') &
                                            df_avgprop10_final_wa.well.isin(well_no_outliers10)]
df_avgprop_bal8_wa = df_avgprop8_final_wa[df_avgprop8_final_wa.FORMATION.str.contains('Balakhany VIII sand') &
                                          df_avgprop8_final_wa.well.isin(well_no_outliers8)]

### Preparing the dataset for the X_train/X_test split

In [10]:
# Wells that survived the outlier cleaning for Balakhany VIII sand:
# once restricted to the Azeri fields, once for all fields.
azr_lst = ['CENTRAL AZERI', 'WEST AZERI', 'EAST AZERI']
chg_lst = ['CHIRAG', 'DWG', 'DDGG', 'WEST CHIRAG']
is_bal8_sand = df_khtst_bal_qcl.FORMATION == 'Balakhany VIII sand'
well_clean_azr = df_khtst_bal_qcl[is_bal8_sand & (df_khtst_bal_qcl.field.isin(azr_lst))].well
well_clean_all = df_khtst_bal_qcl[is_bal8_sand].well
#Calculation of Euclidean Distances for the top of Balakhany VIII sand.
def well_dist_calc(formation='Balakhany VIII sand'):
    """Return the pairwise well-to-well distance table for one formation.

    Rows of the notebook-level frame ``df_khtst_xy_tvd`` are used when they belong
    to ``formation``, have positive X and Y, and a non-missing TVD_SCS. Distances
    are Euclidean over the three columns (X, Y, TVD_SCS).

    Parameters
    ----------
    formation : str
        Value of the FORMATION column to select.

    Returns
    -------
    pandas.DataFrame
        First column ``well``; then one column per selected well (named after it)
        holding the distance from the row's well to that well. The index is 0..n-1,
        in the same order as the columns.

    NOTE(review): a later cell defines ``well_dist_calc`` again (reading
    ``df_khtst_xy``); once that cell runs, it replaces this definition.
    """
    data = df_khtst_xy_tvd[(df_khtst_xy_tvd.FORMATION == formation) &
                           (df_khtst_xy_tvd.X > 0) & (df_khtst_xy_tvd.Y > 0) &
                           (~df_khtst_xy_tvd.TVD_SCS.isna())]
    distance_fm = pd.DataFrame(euclidean_distances(data[['X', 'Y', 'TVD_SCS']]), columns=list(data.well))
    # Put the well names in front, aligned by position with the matrix rows.
    distance_fm.insert(0, 'well', data.well.reset_index(drop=True))
    return distance_fm
# Distance tables for the two sands, built with the well_dist_calc defined just above.
# NOTE(review): both names are assigned again further down, by the cell that reloads
# df_prq_khtst_v3.csv and redefines well_dist_calc; after that cell runs these values are replaced.
dist_bal8 = well_dist_calc('Balakhany VIII sand')
dist_bal10 = well_dist_calc('Balakhany X sand')  

### Euclidean-distance-based dataset: Balakhany VIII sand + X sand

In [58]:
# Reload the KHtst table and attach X, Y and TVD_SCS of the first logged row
# of every (well, formation) taken directly from df_prq.
path = 'C:\\jupyter\\SPP\\inputoutput\\'
df_khtst = pd.read_csv(path + 'df_prq_khtst_v3.csv')
unit_key = ['well', 'FORMATION']
first_row_xyz = df_prq[['well','FORMATION','X','Y','TVD_SCS']].groupby(unit_key).apply(lambda x: x.iloc[0])
first_row_xyz = first_row_xyz.drop(unit_key, axis=1)
df_khtst_xy = df_khtst.set_index(unit_key).join(first_row_xyz).reset_index()
#Calculation of Euclidean Distances for the top of Balakhany VIII sand & Balakhany X sand
def well_dist_calc(formation='Balakhany VIII sand'):
    """Return the pairwise well-to-well distance table for one formation.

    Rows of the notebook-level frame ``df_khtst_xy`` are used when they belong to
    ``formation``, have positive X and Y, and a non-missing TVD_SCS. Rows with a
    missing TVD_SCS are skipped because ``euclidean_distances`` rejects NaN input.
    Distances are Euclidean over the three columns (X, Y, TVD_SCS).

    Parameters
    ----------
    formation : str
        Value of the FORMATION column to select.

    Returns
    -------
    pandas.DataFrame
        First column ``well``; then one column per selected well (named after it)
        holding the distance from the row's well to that well. The index is 0..n-1,
        in the same order as the columns.

    NOTE(review): this redefines the ``well_dist_calc`` of an earlier cell (which
    reads ``df_khtst_xy_tvd``); whichever cell ran last wins.
    """
    data = df_khtst_xy[(df_khtst_xy.FORMATION == formation) &
                       (df_khtst_xy.X > 0) & (df_khtst_xy.Y > 0) &
                       (df_khtst_xy.TVD_SCS.notna())]
    distance_fm = pd.DataFrame(euclidean_distances(data[['X', 'Y', 'TVD_SCS']]), columns=list(data.well))
    # Put the well names in front, aligned by position with the matrix rows.
    distance_fm.insert(0, 'well', data.well.reset_index(drop=True))
    return distance_fm
dist_bal8 = well_dist_calc('Balakhany VIII sand')
dist_bal10 = well_dist_calc('Balakhany X sand')
# Wells that survived the KHtst outlier cleaning, per sand.
well_clean_8 = df_khtst_bal_qcl[(df_khtst_bal_qcl.FORMATION == 'Balakhany VIII sand')].well
well_clean_10 = df_khtst_bal_qcl[(df_khtst_bal_qcl.FORMATION == 'Balakhany X sand')].well


def _nearest_kh_dist_features(dist_matrix, formation, n_neighbours=3):
    """Build, for every well, the distance to and KHtst of its nearest neighbours.

    dist_matrix  -- output of well_dist_calc: column 'well' plus one distance column per well,
                    rows and distance columns in the same order.
    formation    -- FORMATION value used to look up KHtst in df_khtst_xy.
    n_neighbours -- how many nearest wells to keep (columns dist1..N, kh1..N).

    Returns a frame with columns dist1..N, kh1..N, well. khK is the KHtst of the
    well at distance distK. The previous implementation sorted the distances but
    took the KH values in file order, so the two triples did not correspond; it
    also indexed df_khtst with a boolean mask built from df_khtst_xy.
    """
    kh_by_well = (df_khtst_xy[df_khtst_xy.FORMATION == formation][['well', 'KHtst']]
                  .drop_duplicates('well')
                  .set_index('well')['KHtst'])
    well_names = pd.Index(dist_matrix.well)
    distances = dist_matrix.drop(columns='well').to_numpy(dtype=float)
    records = []
    for pos, well_name in enumerate(well_names):
        # Drop the well itself by position, then rank the others by distance.
        others = pd.Series(np.delete(distances[pos], pos), index=well_names.delete(pos))
        nearest = others.sort_values(kind='stable').head(n_neighbours)
        record = {'well': well_name}
        for rank, (neighbour, dist) in enumerate(nearest.items(), start=1):
            record[f'dist{rank}'] = dist
            record[f'kh{rank}'] = kh_by_well.get(neighbour, np.nan)
        records.append(record)
    ranks = range(1, n_neighbours + 1)
    columns = [f'dist{r}' for r in ranks] + [f'kh{r}' for r in ranks] + ['well']
    # Wells with fewer than n_neighbours neighbours get NaN in the missing slots.
    return pd.DataFrame(records, columns=columns)


def _keep_clean_positive(frame, clean_wells):
    """Keep rows of outlier-free wells whose kh1..kh3 and KHtst are all positive."""
    keep = (frame.well.isin(clean_wells) &
            (frame.kh1 > 0) & (frame.kh2 > 0) & (frame.kh3 > 0) &
            (frame.KHtst > 0))
    return frame[keep].reset_index(drop=True)


# Balakhany VIII sand: neighbour features + own KHtst (target) + field.
df_well_kh_dist8 = _nearest_kh_dist_features(dist_bal8, 'Balakhany VIII sand')
df_khtst_xy_bal8 = df_khtst_xy[df_khtst_xy.FORMATION=='Balakhany VIII sand'][['well', 'FORMATION', 'KHtst']]
df_well_kh_dist_bal8 = df_well_kh_dist8.set_index('well').join(df_khtst_xy_bal8.set_index('well')).reset_index()
df_well_kh_dist_bal8_fld = df_well_kh_dist_bal8.set_index('well').join(metadata[['well','field']].set_index('well')).reset_index()
df_well_kh_dist_bal8_fld = _keep_clean_positive(df_well_kh_dist_bal8_fld, well_clean_8)
# Balakhany X sand: same construction.
df_well_kh_dist10 = _nearest_kh_dist_features(dist_bal10, 'Balakhany X sand')
df_khtst_xy_bal10 = df_khtst_xy[df_khtst_xy.FORMATION=='Balakhany X sand'][['well', 'FORMATION', 'KHtst']]
df_well_kh_dist_bal10 = df_well_kh_dist10.set_index('well').join(df_khtst_xy_bal10.set_index('well')).reset_index()
df_well_kh_dist_bal10_fld = df_well_kh_dist_bal10.set_index('well').join(metadata[['well','field']].set_index('well')).reset_index()
df_well_kh_dist_bal10_fld = _keep_clean_positive(df_well_kh_dist_bal10_fld, well_clean_10)
df_well_kh_dist_all = pd.concat([df_well_kh_dist_bal8_fld, df_well_kh_dist_bal10_fld])
# df_well_kh_dist_all

### XY based on EuclDist Balakhany VIII sand & X sand

In [None]:
#Collecting XY based on Euclidean Distances for the top of Balakhany VIII sand.
# For every well: X, Y and KHtst of its three nearest wells (x1..3, y1..3, kh1..3).
# Within a row the three triples refer to the same wells, listed in df_khtst_xy row
# order (not in order of distance).
# Changes versus the previous version: the candidate table is filtered once instead of
# on every iteration, a result-less expression statement is removed, and KHtst is read
# from the same rows as X/Y instead of indexing df_khtst with a mask built from df_khtst_xy.
data = df_khtst_xy[(df_khtst_xy.FORMATION == 'Balakhany VIII sand') & (df_khtst_xy.X > 0) & (df_khtst_xy.Y > 0)]
neighbour_rows = []
for num, well_name in enumerate(dist_bal8.well[:]):
    # Row of the distance table for this well, sorted ascending; position 0 is the
    # well itself (distance 0), positions 1..3 are the three nearest wells.
    well_dist3 = dist_bal8[dist_bal8.well == well_name].T[1:].sort_values(by=num)[1:4].reset_index()['index']
    neighbours = data[data.well.isin(well_dist3)]
    record = {'well': well_name}
    for rank, (x_nb, y_nb, kh_nb) in enumerate(zip(neighbours.X, neighbours.Y, neighbours.KHtst), start=1):
        record[f'x{rank}'] = x_nb
        record[f'y{rank}'] = y_nb
        record[f'kh{rank}'] = kh_nb
    neighbour_rows.append(record)
# Fixed column order; wells with fewer than three neighbours get NaN and are removed
# by the kh > 0 filters below.
df_well_kh_xy = pd.DataFrame(neighbour_rows,
                             columns=['x1', 'x2', 'x3', 'y1', 'y2', 'y3', 'kh1', 'kh2', 'kh3', 'well'])
df_khtst_xy_bal8 = df_khtst_xy[df_khtst_xy.FORMATION=='Balakhany VIII sand'][['well', 'FORMATION', 'KHtst']]
df_well_kh_xy_bal8 = df_well_kh_xy.set_index('well').join(df_khtst_xy_bal8.set_index('well')).reset_index()
df_well_kh_xy_bal8_fld = df_well_kh_xy_bal8.set_index('well').join(metadata[['well','field']].set_index('well')).reset_index()
# Rows of outlier-free wells with positive neighbour KH and positive own KHtst.
bal8_valid = ((df_well_kh_xy_bal8_fld.well.isin(well_clean_all)) &
              (df_well_kh_xy_bal8_fld.kh1>0) &
              (df_well_kh_xy_bal8_fld.kh2>0) &
              (df_well_kh_xy_bal8_fld.kh3>0) &
              (df_well_kh_xy_bal8_fld.KHtst > 0))
# Making up dataset with xy for azeri field
df_well_kh_xy_bal8_fld_azr = df_well_kh_xy_bal8_fld[(df_well_kh_xy_bal8_fld.field.isin(azr_lst)) & bal8_valid].reset_index(drop=True)
# Making up dataset with xy for chirag & azeri fields
df_well_kh_xy_bal8_fld_all = df_well_kh_xy_bal8_fld[bal8_valid].reset_index(drop=True)
df_well_kh_xy_bal8_fld_all.head(3)

In [None]:
#Collecting XY based on Euclidean Distances for the top of Balakhany X sand.
# For every well: X, Y and KHtst of its three nearest wells (x1..3, y1..3, kh1..3).
# Within a row the three triples refer to the same wells, listed in df_khtst_xy row
# order (not in order of distance).
# Changes versus the previous version: the candidate table is filtered once instead of
# on every iteration, a result-less expression statement is removed, KHtst is read from
# the same rows as X/Y, and the clean-well filter now uses the Balakhany X sand list
# (it used well_clean_all, which is built from Balakhany VIII sand).
data = df_khtst_xy[(df_khtst_xy.FORMATION == 'Balakhany X sand') & (df_khtst_xy.X > 0) & (df_khtst_xy.Y > 0)]
neighbour_rows = []
for num, well_name in enumerate(dist_bal10.well[:]):
    # Row of the distance table for this well, sorted ascending; position 0 is the
    # well itself (distance 0), positions 1..3 are the three nearest wells.
    well_dist3 = dist_bal10[dist_bal10.well == well_name].T[1:].sort_values(by=num)[1:4].reset_index()['index']
    neighbours = data[data.well.isin(well_dist3)]
    record = {'well': well_name}
    for rank, (x_nb, y_nb, kh_nb) in enumerate(zip(neighbours.X, neighbours.Y, neighbours.KHtst), start=1):
        record[f'x{rank}'] = x_nb
        record[f'y{rank}'] = y_nb
        record[f'kh{rank}'] = kh_nb
    neighbour_rows.append(record)
# Fixed column order; wells with fewer than three neighbours get NaN and are removed
# by the kh > 0 filters below.
df_well_kh_xy = pd.DataFrame(neighbour_rows,
                             columns=['x1', 'x2', 'x3', 'y1', 'y2', 'y3', 'kh1', 'kh2', 'kh3', 'well'])
df_khtst_xy_bal10 = df_khtst_xy[df_khtst_xy.FORMATION=='Balakhany X sand'][['well', 'FORMATION', 'KHtst']]
df_well_kh_xy_bal10 = df_well_kh_xy.set_index('well').join(df_khtst_xy_bal10.set_index('well')).reset_index()
df_well_kh_xy_bal10_fld = df_well_kh_xy_bal10.set_index('well').join(metadata[['well','field']].set_index('well')).reset_index()
# Outlier-free wells for Balakhany X sand, computed here so the cell does not depend
# on a list built for another formation.
well_clean_bal10 = df_khtst_bal_qcl[df_khtst_bal_qcl.FORMATION == 'Balakhany X sand'].well
bal10_valid = ((df_well_kh_xy_bal10_fld.well.isin(well_clean_bal10)) &
               (df_well_kh_xy_bal10_fld.kh1>0) &
               (df_well_kh_xy_bal10_fld.kh2>0) &
               (df_well_kh_xy_bal10_fld.kh3>0) &
               (df_well_kh_xy_bal10_fld.KHtst > 0))
# Making up dataset with xy for azeri field
df_well_kh_xy_bal10_fld_azr = df_well_kh_xy_bal10_fld[(df_well_kh_xy_bal10_fld.field.isin(azr_lst)) & bal10_valid].reset_index(drop=True)
# Making up dataset with xy for chirag & azeri fields
df_well_kh_xy_bal10_fld_all = df_well_kh_xy_bal10_fld[bal10_valid].reset_index(drop=True)
df_well_kh_xy_bal10_fld_all.head(3)

In [13]:
# # Storage for obsolete code
# svr = SVR(kernel = 'rbf', C=5, coef0=0.001, degree=1, gamma='auto')
# sc_X = StandardScaler()
# sc_y = StandardScaler()
# X_train = sc_X.fit_transform(X_train)
# X_test = sc_X.transform(X_test)
# y_train = sc_y.fit_transform(y_train.reshape(-1, 1))
# y_test = sc_y.transform(y_test.reshape(-1, 1)) 

# # If you're using rbf (Radial basis function) kernel, you can use 
# # sklearn.inspection.permutation_importance as follows to get feature importance.
# from sklearn.inspection import permutation_importance
# import numpy as np
# import matplotlib.pyplot as plt
# svc =  SVC(kernel='rbf', C=2)
# svc.fit(X_train, y_train)
# perm_importance = permutation_importance(svc, X_test, y_test)
# feature_names = ['feature1', 'feature2', 'feature3', ...... ]
# features = np.array(feature_names)
# sorted_idx = perm_importance.importances_mean.argsort()
# plt.barh(features[sorted_idx], perm_importance.importances_mean[sorted_idx])
# plt.xlabel("Permutation Importance")

## Prediction with SVR

### The 70/30 split for xy-kh of Bal8+10 

#### Run SVR

In [None]:
#Joining vsh_wavg data
df_well_kh_xy_bal8_fld_vsh_all = df_well_kh_xy_bal8_fld_all.set_index('well').join(df_avgprop_bal8_wa[['well','vsh_wavg']].set_index('well')).reset_index()
df_well_kh_xy_bal10_fld_vsh_all = df_well_kh_xy_bal10_fld_all.set_index('well').join(df_avgprop_bal10_wa[['well','vsh_wavg']].set_index('well')).reset_index()
# Drop wells without a weighted-average Vsh; reset_index() keeps the old index as column 'index'.
df_well_kh_xy_bal8_fld_vsh_all = df_well_kh_xy_bal8_fld_vsh_all[~df_well_kh_xy_bal8_fld_vsh_all.vsh_wavg.isna()].reset_index()
df_well_kh_xy_bal10_fld_vsh_all = df_well_kh_xy_bal10_fld_vsh_all[~df_well_kh_xy_bal10_fld_vsh_all.vsh_wavg.isna()].reset_index()
#Joining Bal VIII + Bal X
df_well_kh_xy_fld_vsh_all = pd.concat([df_well_kh_xy_bal8_fld_vsh_all,df_well_kh_xy_bal10_fld_vsh_all]).reset_index()
# The two reset_index() calls above left the helper columns 'level_0' and 'index'.
df_well_kh_xy_fld_vsh_all = df_well_kh_xy_fld_vsh_all.drop(['level_0', 'index'], axis=1)
# Working with 70/30 split and united df Bal VIII and Bal X
df_well_kh_xy_fld_vsh_all['FORMATION'] = df_well_kh_xy_fld_vsh_all.FORMATION.astype('string')
df_well_kh_xy_fld_vsh_all['field'] = df_well_kh_xy_fld_vsh_all.field.astype('string')
df_kh_xy_vsh_ohe = pd.get_dummies(df_well_kh_xy_fld_vsh_all, columns = ['FORMATION', 'field'])
# X_train/x_test data splitting.
# Column 0 of both arrays is the well name, so each row can be traced back after the shuffle.
y = np.array(df_kh_xy_vsh_ohe[['well','KHtst']])
x = np.array(df_kh_xy_vsh_ohe[['well','x1', 'x2', 'x3', 'y1', 'y2', 'y3', 'kh1', 'kh2', 'kh3', 'vsh_wavg',
                               'FORMATION_Balakhany VIII sand','FORMATION_Balakhany X sand', 'field_CENTRAL AZERI', 'field_CHIRAG', 'field_DDGG',
                               'field_DWG', 'field_EAST AZERI', 'field_WEST AZERI', 'field_WEST CHIRAG']])
# The split seed is drawn at random and printed so that a run can be repeated by hand.
num = random.randint(0,100)
print('num', num)
x_train_init, x_test_init, y_train_init, y_test_init = train_test_split(x, y, test_size=0.3, random_state=num)
# Taking well names from train/test datasets.
# Fixed: the x_* names used column 1 (the x1 feature) instead of column 0 (well),
# and y_test_wells was read from the training labels.
x_train_wells = x_train_init[:,0]
x_test_wells = x_test_init[:,0]
y_train_wells = y_train_init[:,0]
y_test_wells = y_test_init[:,0]
# Features / target without the well-name column.
x_train = x_train_init[:,1:]
x_test = x_test_init[:,1:]
y_train = y_train_init[:,1]
y_test = y_test_init[:,1]
# GridSearch for ML-model
# NOTE(review): the search runs on unscaled features, while the pipeline built right after
# this block standardises them before the SVR. C / gamma / epsilon chosen here may not be
# the best values for the scaled model -- consider searching over the pipeline itself.
svr_gr_sr = SVR()
grid_param_SVR = {'kernel' : (['rbf']),
                  'C' : [10, 100, 500, 1000, 2000, 3000],
                  'gamma':[0.005, 0.01, 0.5],
                  'epsilon': [0.001,0.01, 1, 5]}
# MAE is an error, so greater_is_better=False makes the search minimise it.
scorer = make_scorer(mae, greater_is_better=False)
gd_sr_SVR = GridSearchCV(estimator = svr_gr_sr, param_grid = grid_param_SVR, scoring=scorer, cv = 15)
gd_sr_SVR.fit(x_train, y_train)
GS_setting = gd_sr_SVR.best_params_
print(GS_setting)
# Applying Pipeline for ML-model
svr = Pipeline([("scaler",StandardScaler()),("svr",SVR(kernel = 'rbf', C=GS_setting['C'], 
                                                       gamma = GS_setting['gamma'], epsilon=GS_setting['epsilon']))])
svr.fit(x_train, y_train)
y_pred_train = svr.predict(x_train)
y_pred_test = svr.predict(x_test)
print('---------------------')
print('r2_train', r2(y_train, y_pred_train).round(2), 'x_train', x_train.shape)
print('r2_test', r2(y_test, y_pred_test).round(2), 'x_test', x_test.shape)
print('mae_train', mae(y_train, y_pred_train).round(0))
print('mae_test', mae(y_test, y_pred_test).round(0))
# QC of predicted values for train & test datasets
df_res_train = pd.DataFrame(zip(y_train_wells, y_train, y_pred_train), columns=['well', 'actual','predict'])
df_res_train['l_limit'] = df_res_train.actual*0.75
df_res_train['h_limit'] = df_res_train.actual*1.25
df_res_train['qc'] = 'out'
df_res_train.loc[(df_res_train.predict >= df_res_train.l_limit) & (df_res_train.predict <= df_res_train.h_limit), 'qc'] = 'in'
df_res_test = pd.DataFrame(zip(y_test_wells, y_test, y_pred_test), columns=['well', 'actual','predict'])
df_res_test['l_limit'] = df_res_test.actual*0.75
df_res_test['h_limit'] = df_res_test.actual*1.25
df_res_test['qc'] = 'out'
df_res_test.loc[(df_res_test.predict >= df_res_test.l_limit) & (df_res_test.predict <= df_res_test.h_limit), 'qc'] = 'in'

#### Reporting

In [None]:
# Permutation importance train set
# Feature names in the exact column order used to build x_train/x_test. Taking them from
# an explicit list (instead of df_kh_xy_vsh_ohe.columns) keeps the box labels correct even
# when that frame gains, loses or reorders columns in other cells.
pi_feature_names = np.array(['x1', 'x2', 'x3', 'y1', 'y2', 'y3', 'kh1', 'kh2', 'kh3', 'vsh_wavg',
                             'FORMATION_Balakhany VIII sand', 'FORMATION_Balakhany X sand',
                             'field_CENTRAL AZERI', 'field_CHIRAG', 'field_DDGG',
                             'field_DWG', 'field_EAST AZERI', 'field_WEST AZERI', 'field_WEST CHIRAG'])
# NOTE: the variable keeps its historical name 'result_pi_test' although it holds train-set results
result_pi_test = permutation_importance(svr, x_train, y_train, n_repeats=10, random_state=num, n_jobs=2)
# Sort features from least to most important so the most important box ends up on top
sorted_importances_idx = result_pi_test.importances_mean.argsort()
importances = pd.DataFrame(result_pi_test.importances[sorted_importances_idx].T,
                           columns=pi_feature_names[sorted_importances_idx])
ax = importances.plot.box(vert=False, whis=10)
ax.set_title("Permut Imp (train set)")
ax.axvline(x=0, color="k", linestyle="--")
# No scoring is passed, so the estimator's own score (R2 for a regressor) is what decreases
ax.set_xlabel("Decrease in R2 score")
ax.figure.tight_layout()

In [None]:
# Permutation importance test set
# Feature names in the exact column order used to build x_train/x_test. Taking them from
# an explicit list (instead of df_kh_xy_vsh_ohe.columns) keeps the box labels correct even
# when that frame gains, loses or reorders columns in other cells.
pi_feature_names = np.array(['x1', 'x2', 'x3', 'y1', 'y2', 'y3', 'kh1', 'kh2', 'kh3', 'vsh_wavg',
                             'FORMATION_Balakhany VIII sand', 'FORMATION_Balakhany X sand',
                             'field_CENTRAL AZERI', 'field_CHIRAG', 'field_DDGG',
                             'field_DWG', 'field_EAST AZERI', 'field_WEST AZERI', 'field_WEST CHIRAG'])
result_pi_test = permutation_importance(svr, x_test, y_test, n_repeats=10, random_state=num, n_jobs=2)
# Sort features from least to most important so the most important box ends up on top
sorted_importances_idx = result_pi_test.importances_mean.argsort()
importances = pd.DataFrame(result_pi_test.importances[sorted_importances_idx].T,
                           columns=pi_feature_names[sorted_importances_idx])
ax = importances.plot.box(vert=False, whis=10)
ax.set_title("Permut Imp (test set)")
ax.axvline(x=0, color="k", linestyle="--")
# No scoring is passed, so the estimator's own score (R2 for a regressor) is what decreases
ax.set_xlabel("Decrease in R2 score")
ax.figure.tight_layout()

In [None]:
# Making up the x-plot for train dataset
# .get(..., 0) keeps the summary working when every well lands in a single QC class
qc_counts = df_res_train['qc'].value_counts()
n_wells = df_res_train.shape[0]
n_out = qc_counts.get('out', 0)
n_in = qc_counts.get('in', 0)
print('wells total:', n_wells)
print('wells unpredicted:', n_out, round(n_out/n_wells, 3), 'v/v')
print('wells predicted:', n_in, round(n_in/n_wells, 3), 'v/v')
max_val = 14000
# An explicit colour map pins 'in' to green and 'out' to red; a colour *sequence* would
# assign colours by order of appearance in the data and could swap them
fig1_ml = px.scatter(df_res_train, x='actual', y='predict',
                     color='qc',
                     hover_data=['well'],
                     width=400, height=400,
                     color_discrete_map={'in': 'green', 'out': 'red'})
fig1_ml.update_traces(marker=dict(size=10,opacity=0.75,line=dict(color='rgb(47, 57, 61)', width=1)))
# Reference lines: perfect prediction (solid) and the +/-25% QC envelope (dashed)
fig2_ml=px.line(x=[0,max_val], y=[0,max_val])
fig2_1_ml=px.line(x=[0,max_val], y=[0,max_val*1.25])
fig2_2_ml=px.line(x=[0,max_val], y=[0,max_val*0.75])
fig2_ml.update_traces(line=dict(color = 'blue'))
fig2_1_ml.update_traces(line=dict(color = 'blue', dash='dash'))
fig2_2_ml.update_traces(line=dict(color = 'blue', dash='dash'))
fig3_ml = go.Figure(data = fig1_ml.data + fig2_ml.data + fig2_1_ml.data + fig2_2_ml.data)
fig3_ml.update_layout(title = 'Comparison Actual vs Pred Train',width=600,height=400, xaxis_title='actual', yaxis_title='predict',
                      margin=dict(l=10,r=10,b=10,t=40))

In [None]:
# Making up the x-plot for test dataset
# .get(..., 0) keeps the summary working when every well lands in a single QC class
qc_counts = df_res_test['qc'].value_counts()
n_wells = df_res_test.shape[0]
n_out = qc_counts.get('out', 0)
n_in = qc_counts.get('in', 0)
print('wells total:', n_wells)
print('wells unpredicted:', n_out, round(n_out/n_wells, 3), 'v/v')
print('wells predicted:', n_in, round(n_in/n_wells, 3), 'v/v')
max_val = 14000
# An explicit colour map pins 'in' to green and 'out' to red; a colour *sequence* would
# assign colours by order of appearance in the data and could swap them
fig1_ml = px.scatter(df_res_test, x='actual', y='predict',
                     color='qc',
                     hover_data=['well'],
                     width=400, height=400,
                     color_discrete_map={'in': 'green', 'out': 'red'})
fig1_ml.update_traces(marker=dict(size=10,opacity=0.75,line=dict(color='rgb(47, 57, 61)', width=1)))
# Reference lines: perfect prediction (solid) and the +/-25% QC envelope (dashed)
fig2_ml=px.line(x=[0,max_val], y=[0,max_val])
fig2_1_ml=px.line(x=[0,max_val], y=[0,max_val*1.25])
fig2_2_ml=px.line(x=[0,max_val], y=[0,max_val*0.75])
fig2_ml.update_traces(line=dict(color = 'blue'))
fig2_1_ml.update_traces(line=dict(color = 'blue', dash='dash'))
fig2_2_ml.update_traces(line=dict(color = 'blue', dash='dash'))
fig3_ml = go.Figure(data = fig1_ml.data + fig2_ml.data + fig2_1_ml.data + fig2_2_ml.data)
fig3_ml.update_layout(title = 'Comparison Actual vs Pred',width=600,height=400, xaxis_title='actual', yaxis_title='predict',
                      margin=dict(l=10,r=10,b=10,t=40))

### The loop for XY-dataset of united Balakhany VIII+X

#### Data preparation

In [19]:
# #Part for log10(kh)
# res['test_nat'] = 10**(res.test)
# res['predict_nat'] = 10**(res.predict)
# res['l_test'] = res.test_nat*0.75
# res['h_test'] = res.test_nat*1.25
# res['qc'] = 'out'
# res.loc[(res.predict_nat >= res.l_test) & (res.predict_nat <= res.h_test), 'qc'] = 'in'
# print('wells total:', res.shape[0])
# print('wells unpredicted:', res['qc'].value_counts()['out'], (res['qc'].value_counts()['out']/res.shape[0]).round(3), 'v/v')
# print('wells predicted:', res['qc'].value_counts()['in'], (res['qc'].value_counts()['in']/res.shape[0]).round(3), 'v/v')


In [20]:
# # Basic version of calculation script
# # Starting of the loop for Balakhany VIII chirag & azeri
# y_test_lst = []
# y_pred_test_lst = []
# well_exclude_lst = []
# gs_settings_lst = []
# metrics_r2_mae_lst = []
# df_well_kh_xy_bal8_fld_vsh_all = df_well_kh_xy_bal8_fld_vsh_all.sample(frac = 1).reset_index()
# for i in tqdm(range(len(df_well_kh_xy_bal8_fld_vsh_all))):
#     #Making up the feature and target datasets
#     df_wo_well = df_well_kh_xy_bal8_fld_vsh_all.drop([i])
#     well_exclude = df_well_kh_xy_bal8_fld_vsh_all.iloc[i]['well']
#     well_exclude_lst.append(well_exclude)
#     y_train = np.array(df_wo_well['KHtst'])
#     x_train = np.array(df_wo_well[['x1', 'x2', 'x3', 'y1', 'y2', 'y3', 'kh1', 'kh2', 'kh3', 'vsh_wavg']])
#     well_train = np.array(df_wo_well['well'])
#     y_test = np.array(df_well_kh_xy_bal8_fld_vsh_all.iloc[i]['KHtst'])
#     y_test_lst.append(y_test)
#     x_test = np.array(df_well_kh_xy_bal8_fld_vsh_all.iloc[i][['x1', 'x2', 'x3', 'y1', 'y2', 'y3', 'kh1', 'kh2', 'kh3', 'vsh_wavg']])
#     # GridSearch for ML-model
#     svr_gr_sr = SVR()
#     grid_param_SVR = {'kernel' : (['rbf']),
#                       'C' : [0.1, 1, 10, 100, 300],
#                       'gamma':[0.01, 0.5, 1, 5],
#                       'epsilon': [0.001,0.01, 1]}
#     scorer = make_scorer(mae, greater_is_better=False)
#     gd_sr_SVR = GridSearchCV(estimator = svr_gr_sr, param_grid = grid_param_SVR, scoring=scorer, cv = 10)
#     gd_sr_SVR.fit(x_train, y_train)
#     GS_setting = gd_sr_SVR.best_params_
#     gs_settings_lst.append((GS_setting['C'],GS_setting['gamma'], GS_setting['epsilon']))
#     # Statement Pipeline of ML-model
#     # svr = SVR(kernel = 'rbf', C=500, epsilon=0.001, gamma=0.5)
#     svr = SVR(kernel = 'rbf', C=GS_setting['C'], epsilon=GS_setting['epsilon'], gamma=GS_setting['gamma'])   
#     # # Apply StandardScaller to X & Y
#     sc_x = StandardScaler()
#     sc_y = StandardScaler()
#     x_train = sc_x.fit_transform(x_train)
#     x_test = sc_x.transform([x_test])
#     y_train = sc_y.fit_transform(y_train.reshape(-1, 1))
#     y_test = sc_y.transform(y_test.reshape(-1, 1)) 
#     ## Fitting the ML-model
#     svr.fit(x_train, y_train)
#     y_pred_train = svr.predict(x_train)
#     y_pred_test = svr.predict(x_test)
#     # # Block of data naturalization
#     y_pred_test_nat = sc_y.inverse_transform(y_pred_test.reshape(-1, 1))
#     y_pred_test_lst.append(y_pred_test_nat[0][0])
#     # Metrics computation for the ML-model
#     r2_train = r2(y_train, y_pred_train).round(2)
#     mae_train = mae(y_train, y_pred_train)
#     mae_train_nat = sc_y.inverse_transform([[mae_train]])
#     metrics_r2_mae_lst.append((r2_train, mae_train_nat[0][0]))
# # Building up of dataframe
# res = pd.DataFrame(zip(y_test_lst,y_pred_test_lst,well_exclude_lst, gs_settings_lst, metrics_r2_mae_lst), 
#                    columns = ['test','predict','well_excl','gs_setting', 'metrics_r2_mae'])
# res['l_test'] = res.test*0.75
# res['h_test'] = res.test*1.25
# res['qc'] = 'out'
# res.loc[(res.predict >= res.l_test) & (res.predict <= res.h_test), 'qc'] = 'in'
# print('wells total:', res.shape[0])
# print('wells unpredicted:', res['qc'].value_counts()['out'])
# print('wells predicted:', res['qc'].value_counts()['in'])

In [21]:
# Attach the per-well vsh_wavg value to each formation table (left join keyed on well)
vsh_bal8 = df_avgprop_bal8_wa[['well', 'vsh_wavg']].set_index('well')
vsh_bal10 = df_avgprop_bal10_wa[['well', 'vsh_wavg']].set_index('well')
df_well_kh_xy_bal8_fld_vsh_all = df_well_kh_xy_bal8_fld_all.set_index('well').join(vsh_bal8).reset_index()
df_well_kh_xy_bal10_fld_vsh_all = df_well_kh_xy_bal10_fld_all.set_index('well').join(vsh_bal10).reset_index()
# Keep only the wells for which a vsh_wavg value was found
has_vsh_bal8 = df_well_kh_xy_bal8_fld_vsh_all.vsh_wavg.notna()
has_vsh_bal10 = df_well_kh_xy_bal10_fld_vsh_all.vsh_wavg.notna()
df_well_kh_xy_bal8_fld_vsh_all = df_well_kh_xy_bal8_fld_vsh_all[has_vsh_bal8].reset_index()
df_well_kh_xy_bal10_fld_vsh_all = df_well_kh_xy_bal10_fld_vsh_all[has_vsh_bal10].reset_index()
# Stack Bal VIII on top of Bal X and discard the index columns left by the resets
df_well_kh_xy_fld_vsh_all = (
    pd.concat([df_well_kh_xy_bal8_fld_vsh_all, df_well_kh_xy_bal10_fld_vsh_all])
    .reset_index()
    .drop(columns=['level_0', 'index'])
)
# One-hot encode the two categorical columns
df_kh_xy_vsh_ohe = pd.get_dummies(df_well_kh_xy_fld_vsh_all, columns=['FORMATION', 'field'])

#### Run SVR

In [None]:
# Leave-one-well-out SVR loop over the united Balakhany VIII + X xy-dataset:
# every well is predicted by a model trained on all the other wells.
xy_feature_cols = ['x1', 'x2', 'x3', 'y1', 'y2', 'y3', 'kh1', 'kh2', 'kh3',
                   'vsh_wavg', 'FORMATION_Balakhany VIII sand', 'FORMATION_Balakhany X sand',
                   'field_CENTRAL AZERI', 'field_CHIRAG', 'field_DDGG',
                   'field_DWG', 'field_EAST AZERI', 'field_WEST AZERI',
                   'field_WEST CHIRAG']
y_test_lst = []
y_pred_test_lst = []
well_exclude_lst = []
gs_settings_lst = []
metrics_r2_mae_lst = []
# Loop-invariant search settings (a single candidate, so best_params_ is always this one)
grid_param_SVR = {'kernel' : (['rbf']),
                  'C' : [500],
                  'gamma':[0.005],
                  'epsilon': [0.01]}
scorer = make_scorer(mae, greater_is_better=False)
# Shuffle the rows. drop=True keeps the frame free of a stray 'index' column, so the cell
# can be re-run any number of times and positional column lookups elsewhere stay valid.
df_kh_xy_vsh_ohe = df_kh_xy_vsh_ohe.sample(frac = 1).reset_index(drop=True)
for i in tqdm(range(len(df_kh_xy_vsh_ohe))):
    # Making up the feature and target datasets (row label i == row position i after the reset)
    df_wo_well = df_kh_xy_vsh_ohe.drop([i])
    well_exclude = df_kh_xy_vsh_ohe.iloc[i]['well']
    well_exclude_lst.append(well_exclude)
    y_train = df_wo_well['KHtst'].to_numpy(dtype=float)
    x_train = df_wo_well[xy_feature_cols].to_numpy(dtype=float)
    # Store the held-out target as a plain float so the result column is numeric
    y_test = float(df_kh_xy_vsh_ohe.iloc[i]['KHtst'])
    y_test_lst.append(y_test)
    # .loc[[i], ...] keeps the 2-D shape (1, n_features) expected by the scaler and the model
    x_test = df_kh_xy_vsh_ohe.loc[[i], xy_feature_cols].to_numpy(dtype=float)
    # GridSearch for ML-model
    # NOTE(review): the search sees unscaled x/y while the final model below is fitted on
    # standardized data - harmless with a one-candidate grid, misleading with a wider one.
    svr_gr_sr = SVR()
    gd_sr_SVR = GridSearchCV(estimator = svr_gr_sr, param_grid = grid_param_SVR, scoring=scorer, cv = 5)
    gd_sr_SVR.fit(x_train, y_train)
    GS_setting = gd_sr_SVR.best_params_
    gs_settings_lst.append((GS_setting['C'],GS_setting['gamma'], GS_setting['epsilon']))
    # Final model with the selected hyperparameters
    svr = SVR(kernel = 'rbf', C=GS_setting['C'],  gamma=GS_setting['gamma'], epsilon=GS_setting['epsilon'])
    # Apply StandardScaler to X & Y (fitted on the training wells only)
    sc_x = StandardScaler()
    sc_y = StandardScaler()
    x_train = sc_x.fit_transform(x_train)
    x_test = sc_x.transform(x_test)
    # ravel(): SVR expects a 1-D target
    y_train = sc_y.fit_transform(y_train.reshape(-1, 1)).ravel()
    # Fitting the ML-model
    svr.fit(x_train, y_train)
    y_pred_train = svr.predict(x_train)
    y_pred_test = svr.predict(x_test)
    # Bring the held-out prediction back to natural units
    y_pred_test_nat = sc_y.inverse_transform(y_pred_test.reshape(-1, 1))
    y_pred_test_lst.append(y_pred_test_nat[0][0])
    # Metrics computation for the ML-model
    r2_train = round(r2(y_train, y_pred_train), 2)
    mae_train = mae(y_train, y_pred_train)
    # An MAE is a difference of targets: only the scaling is undone. inverse_transform()
    # would also add the target mean and overstate the error.
    mae_train_nat = mae_train * sc_y.scale_[0]
    metrics_r2_mae_lst.append((r2_train, round(mae_train_nat, 5)))
# Building up of dataframe; a prediction is 'in' when it is within +/-25% of the test value
res_loop = pd.DataFrame(zip(y_test_lst,y_pred_test_lst,well_exclude_lst, gs_settings_lst, metrics_r2_mae_lst),
                   columns = ['test','predict','well_excl','gs_setting', 'metrics_r2_mae'])
res_loop['l_test'] = res_loop.test*0.75
res_loop['h_test'] = res_loop.test*1.25
res_loop['qc'] = 'out'
res_loop.loc[(res_loop.predict >= res_loop.l_test) & (res_loop.predict <= res_loop.h_test), 'qc'] = 'in'
# .get(..., 0) keeps the summary working when every well lands in a single QC class
qc_counts = res_loop['qc'].value_counts()
n_out = qc_counts.get('out', 0)
n_in = qc_counts.get('in', 0)
print('wells total:', res_loop.shape[0])
print('wells unpredicted:', n_out, round(n_out/res_loop.shape[0], 3), 'v/v')
print('wells predicted:', n_in, round(n_in/res_loop.shape[0], 3), 'v/v')
mae_df_xy = round(mae(res_loop.test, res_loop.predict), 0)
r2_df_xy = round(r2(res_loop.test, res_loop.predict), 2)
print('mae:', mae_df_xy, 'mDm')
print('r2:', r2_df_xy)

#### Reporting

In [None]:
# Making up the final x-plot (leave-one-out xy-kh SVR results)
max_val = 14000
# An explicit colour map pins 'in' to green and 'out' to red; a colour *sequence* would
# assign colours by order of appearance in the data and could swap them
fig1_ml = px.scatter(res_loop, x='test', y='predict',
                     color='qc',
                     hover_data=['well_excl'],
                     width=400, height=400,
                     color_discrete_map={'in': 'green', 'out': 'red'})
fig1_ml.update_traces(marker=dict(size=10,opacity=0.75,line=dict(color='rgb(47, 57, 61)', width=1)))
# Reference lines: perfect prediction (solid) and the +/-25% QC envelope (dashed)
fig2_ml=px.line(x=[0,max_val], y=[0,max_val])
fig2_1_ml=px.line(x=[0,max_val], y=[0,max_val*1.25])
fig2_2_ml=px.line(x=[0,max_val], y=[0,max_val*0.75])
fig2_ml.update_traces(line=dict(color = 'blue'))
fig2_1_ml.update_traces(line=dict(color = 'blue', dash='dash'))
fig2_2_ml.update_traces(line=dict(color = 'blue', dash='dash'))
fig3_ml = go.Figure(data = fig1_ml.data + fig2_ml.data + fig2_1_ml.data + fig2_2_ml.data)
fig3_ml.update_layout(title = 'Comparison Actual vs Pred xy-kh SVR',width=600,height=400, xaxis_title='test', yaxis_title='predict',
                      margin=dict(l=10,r=10,b=10,t=40))

In [None]:
# Drawing the map distribution of predicted data
# Coordinates of the Balakhany VIII wells that have valid X/Y and a TVD_SCS value
res_xy = df_khtst_xy_tvd[(df_khtst_xy_tvd.FORMATION == 'Balakhany VIII sand') &
                            (df_khtst_xy_tvd.X > 0) & (df_khtst_xy_tvd.Y > 0) &
                            (~df_khtst_xy_tvd.TVD_SCS.isna())][['well','FORMATION','X','Y']]
# Left join: predicted wells without a matching coordinate row get NaN X/Y and are not drawn
# NOTE(review): res_loop is built from the united Bal VIII + X dataset while the coordinates
# are filtered to Balakhany VIII only - confirm this is intended.
res_xy_qc = res_loop.set_index('well_excl').join(res_xy.set_index('well')).reset_index()
res_xy_qc['predict'] = res_xy_qc.predict.round(0)
# Mean wellhead position per field, drawn as black squares
field_avg_coord = metadata.groupby('field')[['X_wellhead','Y_wellhead']].mean().reset_index()
# An explicit colour map pins 'in' to green and 'out' to red; a colour *sequence* would
# assign colours by order of appearance in the data and could swap them
fig_map = px.scatter(res_xy_qc, x='X', y='Y',
                     color='qc',
                     hover_data=['well_excl', 'test', 'predict'],
                     color_discrete_map={'in': 'green', 'out': 'red'})
fig_map.update_traces(marker=dict(size=10,opacity=0.75,line=dict(color='rgb(47, 57, 61)', width=1)))
fig_pltfm = px.scatter(field_avg_coord, x='X_wellhead', y='Y_wellhead',hover_data=['field'])
fig_pltfm.update_traces(marker=dict(size=15,opacity=0.75,color='black', symbol="square"))
fig_final = go.Figure(data = fig_map.data + fig_pltfm.data)
fig_final.update_layout(title = 'Comparison Actual vs Pred on map, Balakhany VIII, SVR()', width=1000, height=500,
                        margin=dict(l=0, r=0, t=40, b=0))

### The loop for dist-dataset of united Balakhany VIII+X

#### Data preparation

In [25]:
# Load the KHtst table from csv and attach well coordinates to it
path = 'C:\\jupyter\\SPP\\inputoutput\\'
df_khtst = pd.read_csv(path + 'df_prq_khtst_v3.csv')
# First X/Y/TVD_SCS record of every (well, FORMATION) pair found in df_prq
first_xyz_per_fm = (
    df_prq[['well','FORMATION','X','Y','TVD_SCS']]
    .groupby(['well','FORMATION'])
    .apply(lambda grp: grp.iloc[0])
    .drop(['well','FORMATION'], axis=1)
)
# Left join on (well, FORMATION); the keys are turned back into ordinary columns afterwards
df_khtst_xy = df_khtst.set_index(['well','FORMATION']).join(first_xyz_per_fm).reset_index()
#Calculation of Euclidean Distances for the top of Balakhany VIII sand & Balakhany X sand
def well_dist_calc(formation='Balakhany VIII sand'):
    """Return the well-to-well Euclidean distance table of one formation.

    Rows of the module-level ``df_khtst_xy`` frame are kept when their FORMATION
    equals ``formation`` and both X and Y are positive. Distances are computed
    on the (X, Y, TVD_SCS) triplets of those rows.

    Returns a DataFrame whose first column is ``well`` and whose remaining
    columns are named after the same wells, in the same order as the rows.
    """
    in_formation = df_khtst_xy.FORMATION == formation
    has_coords = (df_khtst_xy.X > 0) & (df_khtst_xy.Y > 0)
    data = df_khtst_xy[in_formation & has_coords]
    well_names = list(data.well)
    distance_fm = pd.DataFrame(euclidean_distances(data[['X', 'Y', 'TVD_SCS']]), columns=well_names)
    # Put the row labels in front as an ordinary 'well' column
    distance_fm.insert(0, 'well', well_names)
    return distance_fm
dist_bal8 = well_dist_calc('Balakhany VIII sand')
dist_bal10 = well_dist_calc('Balakhany X sand')
# Preparation dataset for X_train/x_test data splitting
well_clean_8 = df_khtst_bal_qcl[(df_khtst_bal_qcl.FORMATION == 'Balakhany VIII sand')].well
well_clean_10 = df_khtst_bal_qcl[(df_khtst_bal_qcl.FORMATION == 'Balakhany X sand')].well


def _offset_well_features(dist_df, formation, n_offsets=3):
    """Build the dist/kh feature table of one formation.

    For every well of ``dist_df`` (output of ``well_dist_calc``) the ``n_offsets``
    nearest *other* wells are located; their distances go to dist1..distN and their
    KHtst values to kh1..khN, nearest first, so that distK and khK always describe
    the same offset well. An offset well without a KHtst row for ``formation``
    yields NaN (such rows are removed by the ``kh > 0`` filters applied afterwards).

    Parameters
    ----------
    dist_df : DataFrame with a 'well' column followed by one distance column per well,
        columns in the same order as the rows.
    formation : FORMATION value used to pick KHtst from the module-level ``df_khtst_xy``.
    n_offsets : number of nearest offset wells to keep.

    Returns
    -------
    DataFrame with columns dist1..distN, kh1..khN, well.
    """
    # One KHtst per well, so reindex() below can return the values in nearest-first order
    kh_by_well = (df_khtst_xy[df_khtst_xy.FORMATION == formation]
                  .drop_duplicates(subset='well')
                  .set_index('well')['KHtst'])
    well_names = np.array(dist_df.well)
    dist_values = dist_df.drop(columns='well').to_numpy(dtype=float)
    dist_cols = ['dist' + str(k + 1) for k in range(n_offsets)]
    kh_cols = ['kh' + str(k + 1) for k in range(n_offsets)]
    records = []
    for pos, well_name in enumerate(well_names):
        # Nearest first; the well itself is excluded explicitly rather than assumed to sort first
        order = [j for j in np.argsort(dist_values[pos], kind='stable') if j != pos][:n_offsets]
        dists = dist_values[pos, order]
        khs = kh_by_well.reindex(well_names[order]).to_numpy(dtype=float)
        record = {}
        for k in range(n_offsets):
            # Pad with NaN when the formation has fewer than n_offsets other wells
            record[dist_cols[k]] = dists[k] if k < len(order) else np.nan
            record[kh_cols[k]] = khs[k] if k < len(order) else np.nan
        record['well'] = well_name
        records.append(record)
    return pd.DataFrame(records, columns=dist_cols + kh_cols + ['well'])


# Balakhany VIII: features -> join target KHtst -> join field -> keep QC-clean wells with positive kh
df_well_kh_dist8 = _offset_well_features(dist_bal8, 'Balakhany VIII sand')
df_khtst_xy_bal8 = df_khtst_xy[df_khtst_xy.FORMATION=='Balakhany VIII sand'][['well', 'FORMATION', 'KHtst']]
df_well_kh_dist_bal8 = df_well_kh_dist8.set_index('well').join(df_khtst_xy_bal8.set_index('well')).reset_index()
df_well_kh_dist_bal8_fld = df_well_kh_dist_bal8.set_index('well').join(metadata[['well','field']].set_index('well')).reset_index()
df_well_kh_dist_bal8_fld = df_well_kh_dist_bal8_fld[(df_well_kh_dist_bal8_fld.well.isin(well_clean_8)) &
                                                    (df_well_kh_dist_bal8_fld.kh1>0) &
                                                    (df_well_kh_dist_bal8_fld.kh2>0) &
                                                    (df_well_kh_dist_bal8_fld.kh3>0) &
                                                    (df_well_kh_dist_bal8_fld.KHtst > 0)].reset_index().drop('index', axis=1)
# Balakhany X: same pipeline
df_well_kh_dist10 = _offset_well_features(dist_bal10, 'Balakhany X sand')
df_khtst_xy_bal10 = df_khtst_xy[df_khtst_xy.FORMATION=='Balakhany X sand'][['well', 'FORMATION', 'KHtst']]
df_well_kh_dist_bal10 = df_well_kh_dist10.set_index('well').join(df_khtst_xy_bal10.set_index('well')).reset_index()
df_well_kh_dist_bal10_fld = df_well_kh_dist_bal10.set_index('well').join(metadata[['well','field']].set_index('well')).reset_index()
df_well_kh_dist_bal10_fld = df_well_kh_dist_bal10_fld[(df_well_kh_dist_bal10_fld.well.isin(well_clean_10)) &
                                                    (df_well_kh_dist_bal10_fld.kh1>0) &
                                                    (df_well_kh_dist_bal10_fld.kh2>0) &
                                                    (df_well_kh_dist_bal10_fld.kh3>0) &
                                                    (df_well_kh_dist_bal10_fld.KHtst > 0)].reset_index().drop('index', axis=1)
# NOTE: the united frame keeps the per-formation row labels (they repeat); reset before label-based access
df_well_kh_dist_all = pd.concat([df_well_kh_dist_bal8_fld, df_well_kh_dist_bal10_fld])
# df_well_kh_dist_all

In [None]:
# Distribution of the average distance to the three offset wells over the dataset
# Select by name and copy, so adding the 'avg' column never writes into a slice of
# df_well_kh_dist_all
dist_avg_all = df_well_kh_dist_all[['well', 'dist1', 'dist2', 'dist3']].copy()
dist_avg_all['avg'] = dist_avg_all[['dist1', 'dist2', 'dist3']].mean(axis=1)

fig = go.Figure()
fig.add_trace(go.Histogram(
              x=dist_avg_all.avg,
              xbins=dict(start=0, end=3000, size=50), marker_color='red', name='grey'))
fig.update_traces(opacity=0.5)
# The x axis shows the average offset distance (it was mislabelled 'tst_thickness')
fig.update_layout(title_text='Distribution average dist for offsets wells into dataset',
                  xaxis_title_text='avg distance to 3 nearest offset wells', yaxis_title_text='Count',
                  autosize=True, width=1000, height=300, margin=dict(l=10,r=10,b=10,t=40))
fig.update_xaxes(nticks=40, showgrid=True)

#### Run SVR

In [None]:
# Leave-one-well-out SVR loop over the united Balakhany VIII + X dist-dataset:
# every well is predicted by a model trained on all the other wells.
dist_feature_cols = ['dist1', 'dist2', 'dist3', 'kh1', 'kh2', 'kh3',
                     'FORMATION_Balakhany VIII sand', 'FORMATION_Balakhany X sand',
                     'field_CENTRAL AZERI', 'field_CHIRAG', 'field_DDGG',
                     'field_DWG', 'field_EAST AZERI', 'field_WEST AZERI',
                     'field_WEST CHIRAG']
# drop=True: the old row labels carry no information and would only add 'index'/'level_0' columns
df_dist_all_ohe = pd.get_dummies(df_well_kh_dist_all, columns = ['FORMATION','field']).reset_index(drop=True)
y_test_lst = []
y_pred_test_lst = []
well_exclude_lst = []
gs_settings_lst = []
metrics_r2_mae_lst = []
# Loop-invariant search settings (a single candidate, so best_params_ is always this one)
grid_param_SVR = {'kernel' : (['rbf']),
                  'C' : [500],
                  'gamma':[0.005],
                  'epsilon': [0.001]}
scorer = make_scorer(mae, greater_is_better=False)
# Shuffle the rows and renumber them 0..n-1 so that row label i == row position i
df_dist_all_ohe = df_dist_all_ohe.sample(frac = 1).reset_index(drop=True)
for i in tqdm(range(len(df_dist_all_ohe))[:]):
    # Making up the feature and target datasets
    df_wo_well = df_dist_all_ohe.drop([i])
    well_exclude = df_dist_all_ohe.iloc[i]['well']
    well_exclude_lst.append(well_exclude)
    y_train = df_wo_well['KHtst'].to_numpy(dtype=float)
    x_train = df_wo_well[dist_feature_cols].to_numpy(dtype=float)
    # Store the held-out target as a plain float so the result column is numeric
    y_test = float(df_dist_all_ohe.iloc[i]['KHtst'])
    y_test_lst.append(y_test)
    # .loc[[i], ...] keeps the 2-D shape (1, n_features) expected by the scaler and the model
    x_test = df_dist_all_ohe.loc[[i], dist_feature_cols].to_numpy(dtype=float)
    # GridSearch for ML-model
    # NOTE(review): the search sees unscaled x/y while the final model below is fitted on
    # standardized data - harmless with a one-candidate grid, misleading with a wider one.
    svr_gr_sr = SVR()
    gd_sr_SVR = GridSearchCV(estimator = svr_gr_sr, param_grid = grid_param_SVR, scoring=scorer, cv = 5)
    gd_sr_SVR.fit(x_train, y_train)
    GS_setting = gd_sr_SVR.best_params_
    gs_settings_lst.append((GS_setting['C'],GS_setting['gamma'], GS_setting['epsilon']))
    # Final model with the selected hyperparameters
    svr = SVR(kernel = 'rbf', C=GS_setting['C'],  gamma=GS_setting['gamma'], epsilon=GS_setting['epsilon'])
    # Apply StandardScaler to X & Y (fitted on the training wells only)
    sc_x = StandardScaler()
    sc_y = StandardScaler()
    x_train = sc_x.fit_transform(x_train)
    x_test = sc_x.transform(x_test)
    # ravel(): SVR expects a 1-D target
    y_train = sc_y.fit_transform(y_train.reshape(-1, 1)).ravel()
    # Fitting the ML-model
    svr.fit(x_train, y_train)
    y_pred_train = svr.predict(x_train)
    y_pred_test = svr.predict(x_test)
    # Bring the held-out prediction back to natural units
    y_pred_test_nat = sc_y.inverse_transform(y_pred_test.reshape(-1, 1))
    y_pred_test_lst.append(y_pred_test_nat[0][0])
    # Metrics computation for the ML-model
    r2_train = round(r2(y_train, y_pred_train), 2)
    mae_train = mae(y_train, y_pred_train)
    # An MAE is a difference of targets: only the scaling is undone. inverse_transform()
    # would also add the target mean and overstate the error.
    mae_train_nat = mae_train * sc_y.scale_[0]
    metrics_r2_mae_lst.append((r2_train, round(mae_train_nat, 0)))
# Building up of dataframe; a prediction is 'in' when it is within +/-25% of the test value
res_split = pd.DataFrame(zip(y_test_lst,y_pred_test_lst,well_exclude_lst, gs_settings_lst, metrics_r2_mae_lst),
                            columns = ['test','predict','well_excl','gs_setting', 'metrics_r2_mae'])
res_split['l_test'] = res_split.test*0.75
res_split['h_test'] = res_split.test*1.25
res_split['qc'] = 'out'
res_split.loc[(res_split.predict >= res_split.l_test) & (res_split.predict <= res_split.h_test), 'qc'] = 'in'
# .get(..., 0) keeps the summary working when every well lands in a single QC class
qc_counts = res_split['qc'].value_counts()
n_out = qc_counts.get('out', 0)
n_in = qc_counts.get('in', 0)
print('wells total:', res_split.shape[0])
print('wells unpredicted:', n_out, round(n_out/res_split.shape[0], 3), 'v/v')
print('wells predicted:', n_in, round(n_in/res_split.shape[0], 3), 'v/v')
mae_df_xy = round(mae(res_split.test, res_split.predict), 0)
r2_df_xy = round(r2(res_split.test, res_split.predict), 2)
print('mae:', mae_df_xy, 'mDm')
print('r2:', r2_df_xy)

#### Reporting

In [None]:
# Making up the final x-plot (leave-one-out dist-kh SVR results)
max_val = 14000
# An explicit colour map pins 'in' to green and 'out' to red; a colour *sequence* assigns
# colours by order of appearance in the data, so it had to be flipped by hand per run
fig1_ml = px.scatter(res_split, x='test', y='predict',
                     color='qc',
                     hover_data=['well_excl'],
                     width=400, height=400,
                     color_discrete_map={'in': 'green', 'out': 'red'})
fig1_ml.update_traces(marker=dict(size=10,opacity=0.75,line=dict(color='rgb(47, 57, 61)', width=1)))
# Reference lines: perfect prediction (solid) and the +/-25% QC envelope (dashed)
fig2_ml=px.line(x=[0,max_val], y=[0,max_val])
fig2_1_ml=px.line(x=[0,max_val], y=[0,max_val*1.25])
fig2_2_ml=px.line(x=[0,max_val], y=[0,max_val*0.75])
fig2_ml.update_traces(line=dict(color = 'blue'))
fig2_1_ml.update_traces(line=dict(color = 'blue', dash='dash'))
fig2_2_ml.update_traces(line=dict(color = 'blue', dash='dash'))
fig3_ml = go.Figure(data = fig1_ml.data + fig2_ml.data + fig2_1_ml.data + fig2_2_ml.data)
fig3_ml.update_layout(title = 'Comparison Actual vs Pred dist-kh SVR',width=600,height=400, xaxis_title='test', yaxis_title='pred',
                      margin=dict(l=10,r=10,b=10,t=40))

### C04-test or Influence of x_train.shape on prediction

In [None]:
# Influence of the training-set size on the prediction of one fixed well (row i):
# the model is refitted on rows 6..j for growing j and the prediction for row i is tracked.
i=5
c04_feature_cols = ['x1', 'x2', 'x3', 'y1', 'y2', 'y3', 'kh1', 'kh2', 'kh3',
                    'vsh_wavg',
                    'FORMATION_Balakhany VIII sand','FORMATION_Balakhany X sand',
                    'field_CENTRAL AZERI', 'field_CHIRAG', 'field_DDGG',
                    'field_DWG', 'field_EAST AZERI', 'field_WEST AZERI', 'field_WEST CHIRAG']
# Columns are addressed by name: positional lookups break as soon as the frame gains or
# loses a column (e.g. an 'index' column left behind by reset_index())
c04_row = df_kh_xy_vsh_ohe.iloc[i]
c04_well = c04_row['well']
print('well:', c04_well, 'KHtst:', c04_row['KHtst'])
y_test = float(c04_row['KHtst'])
x_test = c04_row[c04_feature_cols].to_numpy(dtype=float)
# Hyperparameters are fixed here instead of being read from a GS_setting left over from
# whichever grid search happened to run last
svr_params = {'kernel': 'rbf', 'C': 500, 'gamma': 0.005, 'epsilon': 0.001}
final_lst = []
# Training rows always start at 6, so the test row (5) is never part of the training set
for j in range(len(df_kh_xy_vsh_ohe))[16::5]:
    df_wo_well = df_kh_xy_vsh_ohe.iloc[6:j]
    y_train = df_wo_well['KHtst'].to_numpy(dtype=float)
    x_train = df_wo_well[c04_feature_cols].to_numpy(dtype=float)
    svr = Pipeline([("scaler",StandardScaler()),("svr",SVR(**svr_params))])
    # Fitting the ML-model
    svr.fit(x_train, y_train)
    y_pred_test = svr.predict([x_test])
    final_lst.append((y_test, y_pred_test[0], x_train.shape[0]))

c04test = pd.DataFrame(final_lst, columns=['actual', 'predict', 'N_dataset'])
predict = px.line(c04test, x='N_dataset', y='predict')
actual = px.line(c04test, x='N_dataset', y='actual')
predict.update_traces(line=dict(color = 'blue'))
actual.update_traces(line=dict(color = 'red'))
figC04 = go.Figure(data = predict.data + actual.data)
figC04.update_layout(title = 'Influence of x_train.shape on prediction ' + str(c04_well),
                    width=1000,height=300,
                    xaxis_title='x_train.shape', yaxis_title='KHtst',
                    margin=dict(l=10,r=10,b=10,t=40))
figC04.update_yaxes(range=[0,c04test.predict.max()+100])
figC04.show()

### Nadir's dataset original

In [30]:
# Read Nadir's table (df_xr_yr_vsh.csv) from the local input directory
path = 'C:\\jupyter\\SPP\\input\\'
df_nadir_xy = pd.read_csv(f'{path}df_xr_yr_vsh.csv')

In [None]:
# Leave-one-out (LOO) evaluation of an RBF-SVR on Nadir's dataset.
# Each row is held out once, the model is refit on all remaining rows and the held-out
# KHtst is predicted; a prediction within +/-25 % of the actual value is flagged 'in'.
y_test_lst = []
y_pred_test_lst = []
well_exclude_lst = []
gs_settings_lst = []
metrics_r2_mae_lst = []
# Feature columns shared by the training matrix and the held-out sample
nadir_feature_cols = ['X', 'Y', 'TVD_SCS', 'vsh_avg_md',
                      'X_new', 'Y_new', 'field_CENTRAL AZERI', 'field_CHIRAG', 'field_DDGG',
                      'field_DWG', 'field_EAST AZERI', 'field_WEST AZERI',
                      'field_WEST CHIRAG']
# Shuffle the rows, then rebuild a 0..n-1 index. Without the reset, drop([i]) (label based)
# and iloc[i] (position based) address different rows, so the "held-out" sample would stay
# in the training set while an unrelated row is removed from it.
df_kh_xy_vsh_ohe = df_nadir_xy.sample(frac = 1).reset_index(drop=True)
for i in tqdm(range(len(df_kh_xy_vsh_ohe))):
    #Making up the feature and target datasets
    held_out = df_kh_xy_vsh_ohe.iloc[i]
    df_wo_well = df_kh_xy_vsh_ohe.drop([i])
    well_exclude = held_out['well']
    well_exclude_lst.append(well_exclude)
    y_train = np.array(df_wo_well['KHtst'])
    x_train = np.array(df_wo_well[nadir_feature_cols])
    well_train = np.array(df_wo_well['well'])
    y_test = np.array(held_out['KHtst'])
    # Store a plain float (not a 0-d array) so the result column gets a numeric dtype
    y_test_lst.append(float(y_test))
    x_test = np.array(held_out[nadir_feature_cols])
    # GridSearch for ML-model
    # NOTE(review): the search runs on unscaled data while the final model below is fit on
    # standardized x and y. Harmless with this one-point grid, but it would tune the wrong
    # problem as soon as the grid is widened.
    svr_gr_sr = SVR()
    # 2000 0.005 5
    grid_param_SVR = {'kernel' : (['rbf']),
                      'C' : [500],
                      'gamma':[0.005],
                      'epsilon': [0.001]}
    scorer = make_scorer(mae, greater_is_better=False)
    gd_sr_SVR = GridSearchCV(estimator = svr_gr_sr, param_grid = grid_param_SVR, scoring=scorer, cv = 5)
    gd_sr_SVR.fit(x_train, y_train)
    GS_setting = gd_sr_SVR.best_params_
    gs_settings_lst.append((GS_setting['C'],GS_setting['gamma'], GS_setting['epsilon']))
    # Statement of ML-model
    svr = SVR(kernel = 'rbf', C=GS_setting['C'],  gamma=GS_setting['gamma'], epsilon=GS_setting['epsilon'])
    # Apply StandardScaler to X & Y (both scalers are fit on the training part only)
    sc_x = StandardScaler()
    sc_y = StandardScaler()
    x_train = sc_x.fit_transform(x_train)
    x_test = sc_x.transform([x_test])
    # SVR expects a 1-D target, so flatten the scaled column vector
    y_train = sc_y.fit_transform(y_train.reshape(-1, 1)).ravel()
    y_test = sc_y.transform(y_test.reshape(-1, 1))
    # Fitting the ML-model
    svr.fit(x_train, y_train)
    y_pred_train = svr.predict(x_train)
    y_pred_test = svr.predict(x_test)
    # Block of data naturalization: bring the prediction back to KHtst units
    y_pred_test_nat = sc_y.inverse_transform(y_pred_test.reshape(-1, 1))
    y_pred_test_lst.append(y_pred_test_nat[0][0])
    # Metrics computation for the ML-model (builtin round() works for NumPy scalars and plain floats)
    r2_train = round(r2(y_train, y_pred_train), 2)
    mae_train = mae(y_train, y_pred_train)
    # An error is a difference of targets, so it converts back with the scale factor only;
    # inverse_transform() would also add the target mean and overstate the MAE.
    mae_train_nat = mae_train * sc_y.scale_[0]
    metrics_r2_mae_lst.append((r2_train, round(mae_train_nat, 5)))
# Building up of dataframe
res_nadir = pd.DataFrame(zip(y_test_lst,y_pred_test_lst,well_exclude_lst, gs_settings_lst, metrics_r2_mae_lst),
                   columns = ['test','predict','well_excl','gs_setting', 'metrics_r2_mae'])
# Acceptance band: prediction must fall within 75 %..125 % of the actual value
res_nadir['l_test'] = res_nadir.test*0.75
res_nadir['h_test'] = res_nadir.test*1.25
res_nadir['qc'] = 'out'
res_nadir.loc[(res_nadir.predict >= res_nadir.l_test) & (res_nadir.predict <= res_nadir.h_test), 'qc'] = 'in'
# .get() keeps the report working when every well lands in the same class
qc_counts = res_nadir['qc'].value_counts()
n_out = qc_counts.get('out', 0)
n_in = qc_counts.get('in', 0)
print('wells total:', res_nadir.shape[0])
print('wells unpredicted:', n_out, round(n_out/res_nadir.shape[0], 3), 'v/v')
print('wells predicted:', n_in, round(n_in/res_nadir.shape[0], 3), 'v/v')
mae_df_xy = round(mae(res_nadir.test, res_nadir.predict), 0)
r2_df_xy = round(r2(res_nadir.test, res_nadir.predict), 2)
print('mae:', mae_df_xy, 'mDm')
print('r2:', r2_df_xy)

In [None]:
# Making up the final x-plot: actual vs predicted KHtst with the 1:1 line and the +/-25 % band
max_val = 14000
# Map colours to the qc classes explicitly; a colour *sequence* is assigned in order of
# first appearance in the data, so 'in'/'out' could swap colours between runs.
fig1_ml = px.scatter(res_nadir, x='test', y='predict',
                     color='qc',
                     hover_data=['well_excl'],
                     width=400, height=400,
                     color_discrete_map={'in': 'green', 'out': 'red'})
fig1_ml.update_traces(marker=dict(size=10,opacity=0.75,line=dict(color='rgb(47, 57, 61)', width=1)))
# Reference lines: y = x (solid) and y = 1.25x / y = 0.75x (dashed)
fig2_ml=px.line(x=[0,max_val], y=[0,max_val])
fig2_1_ml=px.line(x=[0,max_val], y=[0,max_val*1.25])
fig2_2_ml=px.line(x=[0,max_val], y=[0,max_val*0.75])
fig2_ml.update_traces(line=dict(color = 'blue'))
fig2_1_ml.update_traces(line=dict(color = 'blue', dash='dash'))
fig2_2_ml.update_traces(line=dict(color = 'blue', dash='dash'))
# Merge all traces into one figure; the layout set here replaces the per-figure sizes above
fig3_ml = go.Figure(data = fig1_ml.data + fig2_ml.data + fig2_1_ml.data + fig2_2_ml.data)
fig3_ml.update_layout(title = 'Comparison Actual vs Pred Nadirs dataset',width=600,height=400, xaxis_title='test', yaxis_title='pred',
                      margin=dict(l=10,r=10,b=10,t=40))

### Nadir's dataset based on ALL Bal8+10 FU

#### Data preparation

In [None]:
#Reading csv with initial KHtst_v3, joining xy-coord & TVD_SCS tops of formation
path = 'C:\\jupyter\\SPP\\inputoutput\\'
df_khtst = pd.read_csv(path + 'df_prq_khtst_v3.csv')
df_khtst_xy = df_khtst.set_index(['well','FORMATION']).join(xy_coord.set_index(['well','FORMATION'])).reset_index()
df_khtst_xy_tvd = df_khtst_xy.set_index(['well', 'FORMATION']).join(df_prq_tvdss.set_index(['well','FORMATION'])).reset_index()
# The field of a well is taken from the first df_prq row of that well
df_khtst_xy_tvd_fld = df_khtst_xy_tvd.set_index('well').join(df_prq.groupby('well')['field'].apply(lambda x: x.iloc[0])).reset_index()
# Preparation dataset for X_train/x_test data splitting based on outliers cleaned data
well_clean_all = df_khtst_bal_qcl[(df_khtst_bal_qcl.FORMATION.str.contains('Balakhany VIII'))].well
df_khtst_xy_tvd_fld_bal = df_khtst_xy_tvd_fld[  df_khtst_xy_tvd_fld.FORMATION.str.contains('Balakhany VIII') |
                                                df_khtst_xy_tvd_fld.FORMATION.str.contains('Balakhany X')].drop('DEPTH', axis=1)
#Calculation of TST-thickness for ALL Balakhany FU
df_fu_tst = df_prq[(df_prq.FORMATION.str.contains('Balakhany VIII')) | (df_prq.FORMATION.str.contains('Balakhany X'))]
df_fu_tst = df_fu_tst[['well', 'DEPTH','FORMATION','TST']]
# First / last TST sample of every (well, formation) group define the interval top / bottom
df_fu_tst_top = df_fu_tst.groupby(['well','FORMATION'])['TST'].apply(lambda x: x.iloc[0]).reset_index()
df_fu_tst_top = df_fu_tst_top.rename(columns={'TST':'TST_top'})
df_fu_tst_bot = df_fu_tst.groupby(['well','FORMATION'])['TST'].apply(lambda x: x.iloc[-1]).reset_index()
df_fu_tst_bot = df_fu_tst_bot.rename(columns={'TST':'TST_bot'})
df_fu_tst_final = df_fu_tst_top.set_index(['well','FORMATION']).join(df_fu_tst_bot.set_index(['well','FORMATION'])).reset_index()
df_fu_tst_final['TST_interv'] = round((df_fu_tst_final.TST_bot - df_fu_tst_final.TST_top),0)
df_fu_tst_final = df_fu_tst_final.set_index(['well','FORMATION']).join(xy_coord.set_index(['well','FORMATION'])).reset_index()
df_fu_tst_final = df_fu_tst_final.set_index(['well', 'FORMATION']).join(df_prq_tvdss.set_index(['well','FORMATION'])).reset_index()
df_fu_tst_final = df_fu_tst_final.set_index('well').join(df_prq.groupby('well')['field'].apply(lambda x: x.iloc[0])).reset_index()
# Keep only intervals with a positive rounded thickness
df_fu_tst_final = df_fu_tst_final[(df_fu_tst_final.TST_interv > 0)]
#Reading df_prq_htst_avgprop_v1 and getting outliers
df_htst_avgprop = pd.read_csv(path + 'df_prq_htst_avgprop_v1.csv')
well_no_outliers8 = df_khtst_bal_qcl[df_khtst_bal_qcl.FORMATION == 'Balakhany VIII sand'].well.unique()
well_no_outliers10 = df_khtst_bal_qcl[df_khtst_bal_qcl.FORMATION == 'Balakhany X sand'].well.unique()
#Preparation weighted average df_htst_avgprop-dataset
cutoff_h_tst = 0.5
cutoff_perm_avg = 5
#Applying filtration to dataset with cutoffs
# .copy() makes the filtered frame independent, so the column assignments below are not
# chained assignments on a slice of df_htst_avgprop (SettingWithCopyWarning).
df_htst_avgprop_nz = df_htst_avgprop[(df_htst_avgprop.h_tst > cutoff_h_tst) & (df_htst_avgprop.md_perm_avg > cutoff_perm_avg)].copy()
#Multiplying htst by resprop values (thickness-weighted products)
df_htst_avgprop_nz['kavg_htst'] = df_htst_avgprop_nz.h_tst * df_htst_avgprop_nz.md_perm_avg
df_htst_avgprop_nz['phit_htst'] = df_htst_avgprop_nz.h_tst * df_htst_avgprop_nz.md_phit_avg
df_htst_avgprop_nz['vsh_htst'] = df_htst_avgprop_nz.h_tst * df_htst_avgprop_nz.md_vsh_avg
#Summarizing h_tst via well & formation
df_htst_fm = df_htst_avgprop_nz.groupby(['well','FORMATION'])['h_tst'].sum().reset_index()
df_htst_fm = df_htst_fm.rename(columns={'h_tst':'gross_tst'})
#Calculating weighted averages: sum(h_tst * property) / sum(h_tst)
df_htst_avgprop_nz_avgpropsum = df_htst_avgprop_nz.groupby(['well','FORMATION'])[['phit_htst','vsh_htst']].sum().reset_index()
df_htst_avgprop_nz_avgpropsum_join = df_htst_avgprop_nz_avgpropsum.set_index(
                                     ['well','FORMATION']).join(df_htst_fm.set_index(['well','FORMATION'])).reset_index()
df_htst_avgprop_nz_avgpropsum_join['phit_wavg'] = df_htst_avgprop_nz_avgpropsum_join.phit_htst / df_htst_avgprop_nz_avgpropsum_join.gross_tst
df_htst_avgprop_nz_avgpropsum_join['vsh_wavg'] = df_htst_avgprop_nz_avgpropsum_join.vsh_htst / df_htst_avgprop_nz_avgpropsum_join.gross_tst
df_bal_hpv = df_htst_avgprop_nz_avgpropsum_join[
              df_htst_avgprop_nz_avgpropsum_join.FORMATION.str.contains('Balakhany')][['well','FORMATION','gross_tst','phit_wavg','vsh_wavg']]
# kavg_htst summed per (well, formation) is the target of the models below
df_bal_permh = df_htst_avgprop_nz[df_htst_avgprop_nz.FORMATION.str.contains('Balakhany')].groupby(['well','FORMATION'])['kavg_htst'].sum().reset_index()
df_bal_phhpv = df_bal_hpv.set_index(['well','FORMATION']).join(df_bal_permh.set_index(['well','FORMATION'])).reset_index()
#Preparing x,y matrices for ML
df_bal_phhpv_tstint = df_bal_phhpv.set_index(['well','FORMATION']).join(df_fu_tst_final.set_index(['well','FORMATION'])).reset_index()
df_bal_phhpv_tstint = df_bal_phhpv_tstint.rename(columns={'TST_interv':'interv_tst', 'gross_tst':'rock_tst'})
df_bal_phhpv_tstint = df_bal_phhpv_tstint[['well','FORMATION','X', 'Y','TVD_SCS','field','interv_tst','rock_tst', 'vsh_wavg', 'kavg_htst']]
# Rows without coordinates or depth cannot be used as model input
df_bal_avgprop = df_bal_phhpv_tstint[df_bal_phhpv_tstint.X.notna() & df_bal_phhpv_tstint.Y.notna() & df_bal_phhpv_tstint.TVD_SCS.notna()]
df_bal_avgprop_ohe = pd.get_dummies(df_bal_avgprop, columns = ['FORMATION', 'field'])
# Rotating field across the middle to reflect x and y more geologically sensible
def rotate(x, y, xo=None, yo=None, theta_deg=34):
    """Rotate the point (x, y) around the pivot (xo, yo) by ``theta_deg`` degrees.

    The standard 2-D rotation matrix is applied, i.e. positive angles turn the point
    counter-clockwise in a right-handed x/y frame.

    Parameters
    ----------
    x, y : float
        Coordinates of the point to rotate.
    xo, yo : float, optional
        Pivot of the rotation. When omitted, the median of ``df_khtst_xy['X']`` /
        ``df_khtst_xy['Y']`` (notebook global) is used, which is the original behaviour.
        Pass them explicitly to avoid recomputing the medians on every call.
    theta_deg : float, optional
        Rotation angle in degrees; defaults to 34.

    Returns
    -------
    list
        ``[xr, yr]`` - the rotated coordinates.
    """
    theta = (math.pi/180)*theta_deg
    if xo is None:
        xo = st.median(np.array(df_khtst_xy['X'].to_list()))
    if yo is None:
        yo = st.median(np.array(df_khtst_xy['Y'].to_list()))
    xr = math.cos(theta)*(x-xo)-math.sin(theta)*(y-yo) + xo
    yr = math.sin(theta)*(x-xo)+math.cos(theta)*(y-yo) + yo
    return [xr,yr]
# Append the rotated coordinates, then keep rows with 100 < kavg_htst < 13000
xy_rotated = df_bal_avgprop_ohe.apply(lambda row: rotate(row['X'], row['Y']), axis=1, result_type='expand')
df_bal_avgprop_ohe[['X_new', 'Y_new']] = xy_rotated
kh_in_range = (df_bal_avgprop_ohe.kavg_htst > 100) & (df_bal_avgprop_ohe.kavg_htst < 13000)
df_bal_avgprop_ohe = df_bal_avgprop_ohe[kh_in_range]
print('features: ',df_bal_avgprop_ohe.columns)
print('dataset size: ',df_bal_avgprop_ohe.shape)

#### Nadir's dataset 70/30 split SVR

In [None]:
# X_train/x_test data splitting (the well name travels in column 0 so it survives the shuffle)
bal_feature_cols = ['X_new', 'Y_new','TVD_SCS', 'interv_tst', 'rock_tst', 'vsh_wavg',
                    'FORMATION_Balakhany VIII', 'FORMATION_Balakhany VIII 10',
                    'FORMATION_Balakhany VIII 15', 'FORMATION_Balakhany VIII 20',
                    'FORMATION_Balakhany VIII 25', 'FORMATION_Balakhany VIII 5',
                    'FORMATION_Balakhany VIII sand', 'FORMATION_Balakhany X',
                    'FORMATION_Balakhany X 20', 'FORMATION_Balakhany X 40',
                    'FORMATION_Balakhany X 50', 'FORMATION_Balakhany X sand',
                    'field_CENTRAL AZERI', 'field_CHIRAG', 'field_DDGG', 'field_DWG',
                    'field_EAST AZERI', 'field_WEST AZERI', 'field_WEST CHIRAG']
y = np.array(df_bal_avgprop_ohe[['well','kavg_htst']])
x = np.array(df_bal_avgprop_ohe[['well'] + bal_feature_cols])
# The seed is drawn at random and printed so that a given split can be reproduced later
num = random.randint(0,100)
print('num', num)
x_train_init, x_test_init, y_train_init, y_test_init = train_test_split(x, y, test_size=0.3, random_state=num)
# Taking well names from train/test datasets
y_train_wells = y_train_init[:,0]
y_test_wells = y_test_init[:,0]
# The arrays are object-typed because of the well-name column; cast the numeric part to float
x_train = x_train_init[:,1:].astype(float)
x_test = x_test_init[:,1:].astype(float)
y_train = y_train_init[:,1].astype(float)
y_test = y_test_init[:,1].astype(float)
# GridSearch for ML-model.
# The search is run over the same scaler+SVR pipeline that is fitted afterwards; tuning a
# bare SVR on unscaled features would select C/gamma/epsilon for a different model.
svr_gr_sr = Pipeline([("scaler",StandardScaler()),("svr",SVR())])
grid_param_SVR = {'svr__kernel' : ['rbf'],
                  'svr__C' : [10, 100, 500, 1000, 2000, 3000],
                  'svr__gamma':[0.005, 0.01, 0.5],
                  'svr__epsilon': [0.001,0.01, 1, 5]}
scorer = make_scorer(mae, greater_is_better=False)
gd_sr_SVR = GridSearchCV(estimator = svr_gr_sr, param_grid = grid_param_SVR, scoring=scorer, cv = 15)
gd_sr_SVR.fit(x_train, y_train)
# Strip the pipeline step prefix so GS_setting keeps its plain 'C'/'gamma'/'epsilon'/'kernel' keys
GS_setting = {name.split('__', 1)[1]: value for name, value in gd_sr_SVR.best_params_.items()}
print(GS_setting)
# Applying Pipeline for ML-model
svr = Pipeline([("scaler",StandardScaler()),("svr",SVR(kernel = 'rbf', C=GS_setting['C'],
                                                       gamma = GS_setting['gamma'], epsilon=GS_setting['epsilon']))])
svr.fit(x_train, y_train)
y_pred_train = svr.predict(x_train)
y_pred_test = svr.predict(x_test)
print('---------------------')
# builtin round() works for NumPy scalars and plain Python floats alike
print('r2_train', round(r2(y_train, y_pred_train), 2), 'x_train', x_train.shape)
print('r2_test', round(r2(y_test, y_pred_test), 2), 'x_test', x_test.shape)
print('mae_train', round(mae(y_train, y_pred_train), 0))
print('mae_test', round(mae(y_test, y_pred_test), 0))
# QC of predicted values for train & test datasets
def _qc_frame(wells, actual, predict):
    """Tabulate actual vs predicted values and flag predictions within 75 %..125 % of actual as 'in'."""
    qc = pd.DataFrame(zip(wells, actual, predict), columns=['well', 'actual','predict'])
    qc['l_limit'] = qc.actual*0.75
    qc['h_limit'] = qc.actual*1.25
    qc['qc'] = 'out'
    qc.loc[(qc.predict >= qc.l_limit) & (qc.predict <= qc.h_limit), 'qc'] = 'in'
    return qc
df_svr_train = _qc_frame(y_train_wells, y_train, y_pred_train)
df_svr_test = _qc_frame(y_test_wells, y_test, y_pred_test)

#### Run SVR

In [None]:
# Leave-one-out evaluation of the scaler+SVR pipeline on the full Balakhany dataset:
# every row is held out once and predicted by a model fitted on all other rows.
y_test_lst = []
y_pred_test_lst = []
well_exclude_lst = []
gs_settings_lst = []
metrics_r2_mae_lst = []
# Feature columns shared by the training matrix and the held-out sample
loo_feature_cols = ['X_new', 'Y_new','TVD_SCS', 'interv_tst', 'rock_tst', 'vsh_wavg',
                    'FORMATION_Balakhany VIII', 'FORMATION_Balakhany VIII 10',
                    'FORMATION_Balakhany VIII 15', 'FORMATION_Balakhany VIII 20',
                    'FORMATION_Balakhany VIII 25', 'FORMATION_Balakhany VIII 5',
                    'FORMATION_Balakhany VIII sand', 'FORMATION_Balakhany X',
                    'FORMATION_Balakhany X 20', 'FORMATION_Balakhany X 40',
                    'FORMATION_Balakhany X 50', 'FORMATION_Balakhany X sand',
                    'field_CENTRAL AZERI', 'field_CHIRAG', 'field_DDGG', 'field_DWG',
                    'field_EAST AZERI', 'field_WEST AZERI', 'field_WEST CHIRAG']
# Fixed hyper-parameters, kept in grid-style form (one value per key); defined once
# because they do not change between iterations.
# {'C': 500, 'epsilon': 0.001, 'gamma': 0.005, 'kernel': 'rbf'}
# NOTE(review): the reference line above lists epsilon 0.001 while 0.01 is used - confirm.
grid_param_SVR = {  'C' : [500],
                    'epsilon': [0.01],
                    'gamma':[0.005],
                    'kernel' : (['rbf'])}
# NOTE(review): this overwrites the notebook-wide GS_setting with list values; cells that
# read GS_setting['C'] as a scalar must re-run their own grid search first.
GS_setting = grid_param_SVR
# Shuffle and rebuild a 0..n-1 index so that drop([i]) and iloc[i] address the same row
df_bal_avgprop_ohe_gbr = df_bal_avgprop_ohe.sample(frac = 1).reset_index(drop=True)
for i in tqdm(range(len(df_bal_avgprop_ohe_gbr))):
    #Making up the feature and target datasets
    held_out = df_bal_avgprop_ohe_gbr.iloc[i]
    df_wo_well = df_bal_avgprop_ohe_gbr.drop([i])
    well_exclude = held_out['well']
    well_exclude_lst.append(well_exclude)
    y_train = np.array(df_wo_well['kavg_htst'])
    x_train = np.array(df_wo_well[loo_feature_cols])
    well_train = np.array(df_wo_well['well'])
    y_test = np.array(held_out['kavg_htst'])
    # Store a plain float (not a 0-d array) so the result column gets a numeric dtype
    y_test_lst.append(float(y_test))
    x_test = np.array(held_out[loo_feature_cols])
    gs_settings_lst.append((GS_setting['kernel'],GS_setting['C'],GS_setting['gamma'], GS_setting['epsilon']))
    # Statement of ML-model
    svr = Pipeline([("scaler",StandardScaler()),("svr",SVR( kernel = GS_setting['kernel'][0],
                                                            C = GS_setting['C'][0],
                                                            gamma = GS_setting['gamma'][0],
                                                            epsilon = GS_setting['epsilon'][0]))])
    # Fitting the ML-model
    svr.fit(x_train, y_train)
    y_pred_train = svr.predict(x_train)
    y_pred_test = svr.predict([x_test])
    y_pred_test_lst.append(y_pred_test[0])
    # Metrics computation for the ML-model (collected per iteration; not part of res_svr)
    r2_train = round(r2(y_train, y_pred_train), 2)
    mae_train = mae(y_train, y_pred_train)
    metrics_r2_mae_lst.append((r2_train, round(mae_train, 0)))
# Building up of dataframe
res_svr = pd.DataFrame( zip(y_test_lst,y_pred_test_lst,well_exclude_lst, gs_settings_lst),
                        columns = ['test','predict','well', 'gs_setting',])
# Acceptance band: prediction must fall within 75 %..125 % of the actual value
res_svr['l_test'] = res_svr.test*0.75
res_svr['h_test'] = res_svr.test*1.25
res_svr['qc'] = 'out'
res_svr.loc[(res_svr.predict >= res_svr.l_test) & (res_svr.predict <= res_svr.h_test), 'qc'] = 'in'
# .get() keeps the report working when every well lands in the same class
qc_counts = res_svr['qc'].value_counts()
n_out = qc_counts.get('out', 0)
n_in = qc_counts.get('in', 0)
print('wells total:', res_svr.shape[0])
print('wells unpredicted:', n_out, round(n_out/res_svr.shape[0], 3), 'v/v')
print('wells predicted:', n_in, round(n_in/res_svr.shape[0], 3), 'v/v')
mae_df_xy = round(mae(res_svr.test, res_svr.predict), 0)
r2_df_xy = round(r2(res_svr.test, res_svr.predict), 2)
print('mae:', mae_df_xy, 'mDm')
print('r2:', r2_df_xy)

#### Reporting

In [None]:
# Making up the final x-plot: actual vs predicted kavg_htst with the 1:1 line and the +/-25 % band
max_val = 14000
# Map colours to the qc classes explicitly; a colour *sequence* is assigned in order of
# first appearance in the data, so 'in'/'out' could swap colours between runs.
fig1_ml = px.scatter(res_svr, x='test', y='predict',
                     color='qc',
                     hover_data=['well'],
                     width=400, height=400,
                     color_discrete_map={'in': 'green', 'out': 'red'})
fig1_ml.update_traces(marker=dict(size=10,opacity=0.75,line=dict(color='rgb(47, 57, 61)', width=1)))
# Reference lines: y = x (solid) and y = 1.25x / y = 0.75x (dashed)
fig2_ml=px.line(x=[0,max_val], y=[0,max_val])
fig2_1_ml=px.line(x=[0,max_val], y=[0,max_val*1.25])
fig2_2_ml=px.line(x=[0,max_val], y=[0,max_val*0.75])
fig2_ml.update_traces(line=dict(color = 'blue'))
fig2_1_ml.update_traces(line=dict(color = 'blue', dash='dash'))
fig2_2_ml.update_traces(line=dict(color = 'blue', dash='dash'))
# Merge all traces into one figure; the layout set here replaces the per-figure sizes above
fig3_ml = go.Figure(data = fig1_ml.data + fig2_ml.data + fig2_1_ml.data + fig2_2_ml.data)
# The plotted results (res_svr) come from the SVR run, so the title names SVR rather than GBR
fig3_ml.update_layout(title = 'Comparison Actual vs Pred xy-kh rotated full Balakhany SVR',width=600,height=400, xaxis_title='test', yaxis_title='predict',
                      margin=dict(l=10,r=10,b=10,t=40))

### Shahriyar request

#### Data preparation

In [37]:
# Load the k*h (TST) table and attach X, Y, TVD_SCS taken from the first df_prq row
# of every (well, FORMATION) group
path = 'C:\\jupyter\\SPP\\inputoutput\\'
df_khtst = pd.read_csv(path + 'df_prq_khtst_v3.csv')
fm_keys = ['well','FORMATION']
first_xyz = df_prq[fm_keys + ['X','Y','TVD_SCS']].groupby(fm_keys).apply(lambda grp: grp.iloc[0])
# the group keys are already in the index, so drop their column copies before joining
first_xyz = first_xyz.drop(fm_keys, axis=1)
df_khtst_xy = df_khtst.set_index(fm_keys).join(first_xyz).reset_index()
#Calculation of Euclidean Distances for the top of Balakhany VIII sand & Balakhany X sand
def well_dist_calc(formation='Balakhany VIII sand'):
    """Build the well-to-well distance table for one formation.

    Rows of the notebook-global ``df_khtst_xy`` that belong to ``formation`` and have
    positive X and Y are kept; pairwise Euclidean distances are then computed on the
    columns (X, Y, TVD_SCS).

    Returns a DataFrame with a default integer index whose first column ``well`` names
    the well of each row and whose remaining columns (one per well, in row order) hold
    the distances to that well.
    """
    has_xy = (df_khtst_xy.X > 0) & (df_khtst_xy.Y > 0)
    data = df_khtst_xy[(df_khtst_xy.FORMATION == formation) & has_xy]
    well_names = list(data.well)
    distance_fm = pd.DataFrame(euclidean_distances(data[['X', 'Y', 'TVD_SCS']]), columns=well_names)
    # put the row labels in front of the distance columns
    distance_fm.insert(0, 'well', well_names)
    return distance_fm
dist_bal8 = well_dist_calc('Balakhany VIII sand')
dist_bal10 = well_dist_calc('Balakhany X sand')
# Preparation dataset for X_train/x_test data splitting
well_clean_8 = df_khtst_bal_qcl[(df_khtst_bal_qcl.FORMATION == 'Balakhany VIII sand')].well
well_clean_10 = df_khtst_bal_qcl[(df_khtst_bal_qcl.FORMATION == 'Balakhany X sand')].well

def _nearest3_dist_kh(dist_df, formation):
    """For every well return the distances to, and KHtst of, its three nearest wells.

    Parameters
    ----------
    dist_df : DataFrame
        Output of ``well_dist_calc``: column ``well`` followed by one distance column per
        well, in the same order as the rows.
    formation : str
        Formation whose KHtst values (from the notebook-global ``df_khtst``) are looked up.

    Returns
    -------
    DataFrame
        Columns ``dist1..dist3, kh1..kh3, well``. ``khN`` belongs to the same neighbour as
        ``distN`` (nearest first). Previously the kh values were taken in ``df_khtst`` row
        order and therefore did not line up with the sorted distances. Missing neighbours
        or KHtst values are NaN.
    """
    well_names = dist_df['well'].to_numpy()
    dist_matrix = dist_df.drop(columns='well').to_numpy(dtype=float)
    kh_by_well = (df_khtst[df_khtst.FORMATION == formation]
                  .drop_duplicates('well')
                  .set_index('well')['KHtst'])
    records = []
    for pos, well_name in enumerate(well_names):
        # Sort all wells by distance and exclude the well itself by position, not by
        # assuming it is the first entry of the sorted list
        order = np.argsort(dist_matrix[pos], kind='stable')
        nearest = order[order != pos][:3]
        pad = [np.nan] * (3 - len(nearest))
        dists = dist_matrix[pos, nearest].tolist() + pad
        khs = kh_by_well.reindex(well_names[nearest]).tolist() + pad
        records.append(dists + khs + [well_name])
    return pd.DataFrame(records, columns=['dist1', 'dist2', 'dist3', 'kh1', 'kh2', 'kh3', 'well'])

# Balakhany VIII sand: neighbour features + own KHtst + field, restricted to QC-clean wells
df_well_kh_dist8 = _nearest3_dist_kh(dist_bal8, 'Balakhany VIII sand')
df_khtst_xy_bal8 = df_khtst_xy[df_khtst_xy.FORMATION=='Balakhany VIII sand'][['well', 'FORMATION', 'KHtst']]
df_well_kh_dist_bal8 = df_well_kh_dist8.set_index('well').join(df_khtst_xy_bal8.set_index('well')).reset_index()
df_well_kh_dist_bal8_fld = df_well_kh_dist_bal8.set_index('well').join(metadata[['well','field']].set_index('well')).reset_index()
df_well_kh_dist_bal8_fld = df_well_kh_dist_bal8_fld[(df_well_kh_dist_bal8_fld.well.isin(well_clean_8)) &
                                                    (df_well_kh_dist_bal8_fld.kh1>0) &
                                                    (df_well_kh_dist_bal8_fld.kh2>0) &
                                                    (df_well_kh_dist_bal8_fld.kh3>0) &
                                                    (df_well_kh_dist_bal8_fld.KHtst > 0)].reset_index(drop=True)
# Balakhany X sand: same steps
df_well_kh_dist10 = _nearest3_dist_kh(dist_bal10, 'Balakhany X sand')
df_khtst_xy_bal10 = df_khtst_xy[df_khtst_xy.FORMATION=='Balakhany X sand'][['well', 'FORMATION', 'KHtst']]
df_well_kh_dist_bal10 = df_well_kh_dist10.set_index('well').join(df_khtst_xy_bal10.set_index('well')).reset_index()
df_well_kh_dist_bal10_fld = df_well_kh_dist_bal10.set_index('well').join(metadata[['well','field']].set_index('well')).reset_index()
df_well_kh_dist_bal10_fld = df_well_kh_dist_bal10_fld[(df_well_kh_dist_bal10_fld.well.isin(well_clean_10)) &
                                                    (df_well_kh_dist_bal10_fld.kh1>0) &
                                                    (df_well_kh_dist_bal10_fld.kh2>0) &
                                                    (df_well_kh_dist_bal10_fld.kh3>0) &
                                                    (df_well_kh_dist_bal10_fld.KHtst > 0)].reset_index(drop=True)
df_well_kh_dist_all = pd.concat([df_well_kh_dist_bal8_fld, df_well_kh_dist_bal10_fld])
#Calculation of TST-thickness Balakhany VIII & X
df_fu_tst = df_prq[(df_prq.FORMATION.str.contains('Balakhany VIII')) | (df_prq.FORMATION.str.contains('Balakhany X'))]
df_fu_tst = df_fu_tst[['well', 'DEPTH','FORMATION','TST']]
# First / last TST sample of every (well, formation) group define the interval top / bottom
df_fu_tst_top = df_fu_tst.groupby(['well','FORMATION'])['TST'].apply(lambda x: x.iloc[0]).reset_index()
df_fu_tst_top = df_fu_tst_top.rename(columns={'TST':'TST_top'})
df_fu_tst_bot = df_fu_tst.groupby(['well','FORMATION'])['TST'].apply(lambda x: x.iloc[-1]).reset_index()
df_fu_tst_bot = df_fu_tst_bot.rename(columns={'TST':'TST_bot'})
df_fu_tst_final = df_fu_tst_top.set_index(['well','FORMATION']).join(df_fu_tst_bot.set_index(['well','FORMATION'])).reset_index()
df_fu_tst_final['TST_interv'] = round((df_fu_tst_final.TST_bot - df_fu_tst_final.TST_top),0)
df_fu_tst_final = df_fu_tst_final.set_index(['well','FORMATION']).join(xy_coord.set_index(['well','FORMATION'])).reset_index()
df_fu_tst_final = df_fu_tst_final.set_index(['well', 'FORMATION']).join(df_prq_tvdss.set_index(['well','FORMATION'])).reset_index()
df_fu_tst_final = df_fu_tst_final.set_index('well').join(df_prq.groupby('well')['field'].apply(lambda x: x.iloc[0])).reset_index()
# Keep only intervals with a positive rounded thickness
df_fu_tst_final = df_fu_tst_final[(df_fu_tst_final.TST_interv > 0)]
#Reading df_prq_htst_avgprop_v1 and getting outliers
path = 'C:\\jupyter\\SPP\\inputoutput\\'
df_htst_avgprop = pd.read_csv(path + 'df_prq_htst_avgprop_v1.csv')
well_no_outliers8 = df_khtst_bal_qcl[df_khtst_bal_qcl.FORMATION == 'Balakhany VIII sand'].well.unique()
well_no_outliers10 = df_khtst_bal_qcl[df_khtst_bal_qcl.FORMATION == 'Balakhany X sand'].well.unique()
#Preparation weighted average df_htst_avgprop-dataset
cutoff_h_tst = 0.5
cutoff_perm_avg = 5
#Applying filtration to dataset with cutoffs
# .copy() makes the filtered frame independent, so the column assignments below are not
# chained assignments on a slice of df_htst_avgprop (SettingWithCopyWarning).
df_htst_avgprop_nz = df_htst_avgprop[(df_htst_avgprop.h_tst > cutoff_h_tst) & (df_htst_avgprop.md_perm_avg > cutoff_perm_avg)].copy()
#Multiplying htst by resprop values (thickness-weighted products)
df_htst_avgprop_nz['kavg_htst'] = df_htst_avgprop_nz.h_tst * df_htst_avgprop_nz.md_perm_avg
df_htst_avgprop_nz['phit_htst'] = df_htst_avgprop_nz.h_tst * df_htst_avgprop_nz.md_phit_avg
df_htst_avgprop_nz['vsh_htst'] = df_htst_avgprop_nz.h_tst * df_htst_avgprop_nz.md_vsh_avg
#Summarizing h_tst via well & formation
df_htst_fm = df_htst_avgprop_nz.groupby(['well','FORMATION'])['h_tst'].sum().reset_index()
df_htst_fm = df_htst_fm.rename(columns={'h_tst':'gross_tst'})
#Calculating weighted averages: sum(h_tst * property) / sum(h_tst)
df_htst_avgprop_nz_avgpropsum = df_htst_avgprop_nz.groupby(['well','FORMATION'])[['phit_htst','vsh_htst']].sum().reset_index()
df_htst_avgprop_nz_avgpropsum_join = df_htst_avgprop_nz_avgpropsum.set_index(
                                     ['well','FORMATION']).join(df_htst_fm.set_index(['well','FORMATION'])).reset_index()
df_htst_avgprop_nz_avgpropsum_join['phit_wavg'] = df_htst_avgprop_nz_avgpropsum_join.phit_htst / df_htst_avgprop_nz_avgpropsum_join.gross_tst
df_htst_avgprop_nz_avgpropsum_join['vsh_wavg'] = df_htst_avgprop_nz_avgpropsum_join.vsh_htst / df_htst_avgprop_nz_avgpropsum_join.gross_tst
# Balakhany VIII sand: weighted averages joined with the summed kavg_htst
df_8bal_hpv = df_htst_avgprop_nz_avgpropsum_join[
              df_htst_avgprop_nz_avgpropsum_join.FORMATION == 'Balakhany VIII sand'][['well','FORMATION','gross_tst','phit_wavg','vsh_wavg']]
df_8bal_permh = df_htst_avgprop_nz[df_htst_avgprop_nz.FORMATION == 'Balakhany VIII sand'].groupby(['well','FORMATION'])['kavg_htst'].sum().reset_index()
df_8bal_phhpv = df_8bal_hpv.set_index(['well','FORMATION']).join(df_8bal_permh.set_index(['well','FORMATION'])).reset_index()
# Balakhany X sand: same steps
df_10bal_hpv = df_htst_avgprop_nz_avgpropsum_join[
              df_htst_avgprop_nz_avgpropsum_join.FORMATION == 'Balakhany X sand'][['well','FORMATION','gross_tst','phit_wavg','vsh_wavg']]
df_10bal_permh = df_htst_avgprop_nz[df_htst_avgprop_nz.FORMATION == 'Balakhany X sand'].groupby(['well','FORMATION'])['kavg_htst'].sum().reset_index()
df_10bal_phhpv = df_10bal_hpv.set_index(['well','FORMATION']).join(df_10bal_permh.set_index(['well','FORMATION'])).reset_index()
# #Preparing x,y matrices for ML
df_8bal_phhpv_tstint = df_8bal_phhpv.set_index(['well','FORMATION']).join(df_fu_tst_final.set_index(['well','FORMATION'])).reset_index()
df_8bal_phhpv_tstint = df_8bal_phhpv_tstint[['well', 'FORMATION', 'X', 'Y', 'DEPTH', 'TVD_SCS', 'field', 'gross_tst',
                                             'TST_interv', 'kavg_htst', 'phit_wavg', 'vsh_wavg']]
df_8bal_phhpv_tstint = df_8bal_phhpv_tstint.rename(columns={'TST_interv':'interv_tst'})
df_avgprop8_final_wa = df_8bal_phhpv_tstint.copy()
df_10bal_phhpv_tstint = df_10bal_phhpv.set_index(['well','FORMATION']).join(df_fu_tst_final.set_index(['well','FORMATION'])).reset_index()
df_10bal_phhpv_tstint = df_10bal_phhpv_tstint[['well', 'FORMATION', 'X', 'Y', 'DEPTH', 'TVD_SCS', 'field', 'gross_tst',
                                             'TST_interv', 'kavg_htst', 'phit_wavg', 'vsh_wavg']]
df_10bal_phhpv_tstint = df_10bal_phhpv_tstint.rename(columns={'TST_interv':'interv_tst'})
df_avgprop10_final_wa = df_10bal_phhpv_tstint.copy()
#Selecting data for Bal8 & Bal10 (only wells that passed the outlier QC)
df_avgprop_bal10_wa = df_avgprop10_final_wa[df_avgprop10_final_wa.FORMATION.str.contains('Balakhany X sand') &
                                          df_avgprop10_final_wa.well.isin(well_no_outliers10)]
df_avgprop_bal8_wa = df_avgprop8_final_wa[df_avgprop8_final_wa.FORMATION.str.contains('Balakhany VIII sand') &
                                          df_avgprop8_final_wa.well.isin(well_no_outliers8)]
df_avgprop_bal_wa = pd.concat([df_avgprop_bal8_wa, df_avgprop_bal10_wa])
# For Shahriyar: attach the nearest-well distance / kh features per (well, FORMATION);
# 'field' is dropped from the right side because the left frame already carries it
df_dist_kh_bal_shahriayr =  df_avgprop_bal_wa.set_index(['well','FORMATION']).join(
                            df_well_kh_dist_all.drop('field',axis=1).set_index(['well','FORMATION'])
                            ).reset_index()
#rotate x,y around xo,yo by theta
def rotate(x, y, xo=None, yo=None, theta_deg=34):
    """Rotate the point (x, y) around the pivot (xo, yo) by ``theta_deg`` degrees.

    The standard 2-D rotation matrix is applied, i.e. positive angles turn the point
    counter-clockwise in a right-handed x/y frame.

    Parameters
    ----------
    x, y : float
        Coordinates of the point to rotate.
    xo, yo : float, optional
        Pivot of the rotation. When omitted, the median of ``df_khtst_xy['X']`` /
        ``df_khtst_xy['Y']`` (notebook global) is used, which is the original behaviour.
        Pass them explicitly to avoid recomputing the medians on every call.
    theta_deg : float, optional
        Rotation angle in degrees; defaults to 34.

    Returns
    -------
    list
        ``[xr, yr]`` - the rotated coordinates.
    """
    theta = (math.pi/180)*theta_deg
    if xo is None:
        xo = st.median(np.array(df_khtst_xy['X'].to_list()))
    if yo is None:
        yo = st.median(np.array(df_khtst_xy['Y'].to_list()))
    xr = math.cos(theta)*(x-xo)-math.sin(theta)*(y-yo) + xo
    yr = math.sin(theta)*(x-xo)+math.cos(theta)*(y-yo) + yo
    return [xr,yr]
# Rotated coordinates: rotate() is evaluated row by row and its [xr, yr] result is
# expanded into the two new columns.
df_dist_kh_bal_shahriayr[['X_new', 'Y_new']] = df_dist_kh_bal_shahriayr.apply(
    lambda rec: rotate(rec['X'], rec['Y']),
    axis=1,
    result_type='expand',
)
# Modelling table: keep the needed columns, one-hot encode FORMATION and drop the rows
# that have no TVD_SCS or no kh1 value.
df_dist_kh_bal_shahriayr_final = (
    df_dist_kh_bal_shahriayr[['well', 'FORMATION', 'X_new', 'Y_new', 'TVD_SCS', 'kh1', 'kh2', 'kh3',
                              'interv_tst', 'gross_tst', 'kavg_htst']]
    .pipe(pd.get_dummies, columns=['FORMATION'])
    .loc[lambda d: d.TVD_SCS.notna() & d.kh1.notna()]
)
# df_dist_kh_bal_shahriayr_final.to_csv('df_dist_kh_bal_shahriayr_final.csv', index=False)

#### 70/30 splits

In [None]:
# X_train/x_test data splitting
# The well name travels in column 0 of both x and y only so that it can be recovered
# after the shuffle; it is stripped again before fitting.
# NOTE: the target 'kavg_htst' must NOT be part of the feature matrix (target leakage),
# so the feature list matches the one used in the leave-one-out cell below.
feature_cols = ['X_new', 'Y_new', 'TVD_SCS', 'kh1', 'kh2', 'kh3', 'interv_tst', 'gross_tst',
                'FORMATION_Balakhany VIII sand', 'FORMATION_Balakhany X sand']
y = np.array(df_dist_kh_bal_shahriayr_final[['well', 'kavg_htst']])
x = np.array(df_dist_kh_bal_shahriayr_final[['well'] + feature_cols])
# The seed is drawn at random but printed, so a given split can be reproduced afterwards
num = random.randint(0,100)
print('num', num)
x_train_init, x_test_init, y_train_init, y_test_init = train_test_split(x, y, test_size=0.3, random_state=num)
# Taking well names from train/test datasets
y_train_wells = y_train_init[:,0]
y_test_wells = y_test_init[:,0]
# x / y are object arrays because of the well-name column -> cast the numeric part to float
x_train = x_train_init[:,1:].astype(float)
x_test = x_test_init[:,1:].astype(float)
y_train = y_train_init[:,1].astype(float)
y_test = y_test_init[:,1].astype(float)
# GridSearch for ML-model
# The search runs on the same scaler+SVR pipeline that is used for prediction, so the
# selected C / gamma / epsilon are tuned for standardised features and the scaler is
# re-fitted inside every CV fold.
svr_gr_sr = Pipeline([("scaler", StandardScaler()), ("svr", SVR())])
grid_param_SVR = {'svr__kernel' : ['rbf'],
                  'svr__C' : [10, 100, 500, 1000, 2000, 3000],
                  'svr__gamma': [0.005, 0.01, 0.5],
                  'svr__epsilon': [0.001, 0.01, 1, 5]}
# MAE is an error -> greater_is_better=False makes GridSearchCV minimise it
scorer = make_scorer(mae, greater_is_better=False)
gd_sr_SVR = GridSearchCV(estimator = svr_gr_sr, param_grid = grid_param_SVR, scoring=scorer, cv = 15)
gd_sr_SVR.fit(x_train, y_train)
# Strip the 'svr__' step prefix so GS_setting keeps its plain keys ('C', 'gamma', ...)
GS_setting = {name.split('__', 1)[1]: value for name, value in gd_sr_SVR.best_params_.items()}
print(GS_setting)
# Applying Pipeline for ML-model
# best_estimator_ is the pipeline with the best parameters, already refitted on the whole
# training set (GridSearchCV default refit=True).
svr = gd_sr_SVR.best_estimator_
y_pred_train = svr.predict(x_train)
y_pred_test = svr.predict(x_test)
print('---------------------')
# built-in round() works whether the metric comes back as a NumPy scalar or a Python float
print('r2_train', round(r2(y_train, y_pred_train), 2), 'x_train', x_train.shape)
print('r2_test', round(r2(y_test, y_pred_test), 2), 'x_test', x_test.shape)
print('mae_train', round(mae(y_train, y_pred_train), 0))
print('mae_test', round(mae(y_test, y_pred_test), 0))
# QC of predicted values for train & test datasets
def _qc_table(wells, actual, predicted, tol=0.25):
    """Build the QC frame: a prediction is 'in' when it lies within +/-tol of the actual value.

    Returns a DataFrame with columns well, actual, predict, l_limit, h_limit, qc.
    """
    qc_df = pd.DataFrame({'well': wells, 'actual': actual, 'predict': predicted})
    qc_df['l_limit'] = qc_df.actual*(1 - tol)
    qc_df['h_limit'] = qc_df.actual*(1 + tol)
    # between() is inclusive on both ends, i.e. l_limit <= predict <= h_limit
    qc_df['qc'] = np.where(qc_df.predict.between(qc_df.l_limit, qc_df.h_limit), 'in', 'out')
    return qc_df
df_svr_train = _qc_table(y_train_wells, y_train, y_pred_train)
df_svr_test = _qc_table(y_test_wells, y_test, y_pred_test)

#### Run SVR

In [None]:
# Leave-one-out validation of the SVR model:
# every row (well/formation) is held out once, the pipeline is fitted on all remaining
# rows and the held-out kavg_htst is predicted.
feature_cols = ['X_new', 'Y_new', 'TVD_SCS', 'kh1', 'kh2', 'kh3', 'interv_tst', 'gross_tst',
                'FORMATION_Balakhany VIII sand', 'FORMATION_Balakhany X sand']
# Fixed hyper-parameters (kept in GridSearch-style lists), defined once outside the loop.
# Recorded search result: {'C': 500, 'epsilon': 0.001, 'gamma': 0.005, 'kernel': 'rbf'}
# NOTE(review): the recorded result lists epsilon=0.001 while 0.01 is used here - confirm.
grid_param_SVR = {  'C' : [500],
                    'epsilon': [0.01],
                    'gamma':[0.005],
                    'kernel' : (['rbf'])}
GS_setting = grid_param_SVR
# Starting of the loop
y_test_lst = []
y_pred_test_lst = []
well_exclude_lst = []
gs_settings_lst = []
metrics_r2_mae_lst = []
# Shuffle the rows and renumber them 0..n-1 so that drop([i]) and iloc[i] address the same row
df_dist_kh_bal_shahriayr_svr = df_dist_kh_bal_shahriayr_final.sample(frac = 1).reset_index(drop=True)
for i in tqdm(range(len(df_dist_kh_bal_shahriayr_svr))):
    #Making up the feature and target datasets
    held_out = df_dist_kh_bal_shahriayr_svr.iloc[i]
    df_wo_well = df_dist_kh_bal_shahriayr_svr.drop([i])
    well_exclude = held_out['well']
    well_exclude_lst.append(well_exclude)
    # Explicit float arrays: the one-hot columns and the mixed-type row would otherwise
    # yield object arrays
    y_train = df_wo_well['kavg_htst'].to_numpy(dtype=float)
    x_train = df_wo_well[feature_cols].to_numpy(dtype=float)
    well_train = np.array(df_wo_well['well'])
    # Plain float (not a 0-d array) so the 'test' column of the result frame is numeric
    y_test = float(held_out['kavg_htst'])
    y_test_lst.append(y_test)
    x_test = held_out[feature_cols].to_numpy(dtype=float)
    gs_settings_lst.append((GS_setting['kernel'],GS_setting['C'],GS_setting['gamma'], GS_setting['epsilon']))
    # Statement of ML-model
    svr = Pipeline([("scaler",StandardScaler()),("svr",SVR( kernel = GS_setting['kernel'][0],
                                                            C = GS_setting['C'][0],
                                                            gamma = GS_setting['gamma'][0],
                                                            epsilon = GS_setting['epsilon'][0]))])
    # Fitting the ML-model
    svr.fit(x_train, y_train)
    y_pred_train = svr.predict(x_train)
    # predict() expects a 2-D input -> wrap the single held-out row in a list
    y_pred_test = svr.predict([x_test])
    y_pred_test_lst.append(y_pred_test[0])
    # Metrics computation for the ML-model (training fit of this fold)
    # built-in round() works whether the metric is a NumPy scalar or a Python float
    r2_train = round(r2(y_train, y_pred_train), 2)
    mae_train = mae(y_train, y_pred_train)
    metrics_r2_mae_lst.append((r2_train, round(mae_train, 0)))
# Building up of dataframe
res_svr_sha = pd.DataFrame( zip(y_test_lst,y_pred_test_lst,well_exclude_lst, gs_settings_lst),
                        columns = ['test','predict','well', 'gs_setting',])
# A prediction counts as 'in' when it lies within +/-25% of the actual value
res_svr_sha['l_test'] = res_svr_sha.test*0.75
res_svr_sha['h_test'] = res_svr_sha.test*1.25
res_svr_sha['qc'] = 'out'
res_svr_sha.loc[(res_svr_sha.predict >= res_svr_sha.l_test) & (res_svr_sha.predict <= res_svr_sha.h_test), 'qc'] = 'in'
# reindex() guarantees both labels exist, so a run with no 'in' (or no 'out') rows
# reports 0 instead of raising KeyError
n_wells = res_svr_sha.shape[0]
qc_counts = res_svr_sha['qc'].value_counts().reindex(['in', 'out'], fill_value=0)
print('wells total:', n_wells)
print('wells unpredicted:', qc_counts['out'], round(qc_counts['out']/n_wells, 3), 'v/v')
print('wells predicted:', qc_counts['in'], round(qc_counts['in']/n_wells, 3), 'v/v')
mae_df_xy = round(mae(res_svr_sha.test, res_svr_sha.predict), 0)
r2_df_xy = round(r2(res_svr_sha.test, res_svr_sha.predict), 2)
print('mae:', mae_df_xy, 'mDm')
print('r2:', r2_df_xy)

#### Reporting

In [None]:
# Making up the final x-plot: actual ('test') vs predicted kavg_htst from the leave-one-out run
# Upper end of the 1:1 and +/-25% guide lines (same units as the 'test' column)
max_val = 14000
# Explicit colour map: a colour *sequence* is assigned in order of first appearance of the
# 'qc' values, which would swap red/green depending on the (shuffled) row order.
fig1_ml = px.scatter(res_svr_sha, x='test', y='predict',
                     color='qc',
                     hover_data=['well'],
                     width=400, height=400,
                     color_discrete_map={'out': 'red', 'in': 'green'})
fig1_ml.update_traces(marker=dict(size=10,opacity=0.75,line=dict(color='rgb(47, 57, 61)', width=1)))
# Guide lines: 1:1 (solid) and the +25% / -25% acceptance limits (dashed)
fig2_ml=px.line(x=[0,max_val], y=[0,max_val])
fig2_1_ml=px.line(x=[0,max_val], y=[0,max_val*1.25])
fig2_2_ml=px.line(x=[0,max_val], y=[0,max_val*0.75])
fig2_ml.update_traces(line=dict(color = 'blue'))
fig2_1_ml.update_traces(line=dict(color = 'blue', dash='dash'))
fig2_2_ml.update_traces(line=dict(color = 'blue', dash='dash'))
# Only the traces are merged; the layout (size, titles) is set on the combined figure below
fig3_ml = go.Figure(data = fig1_ml.data + fig2_ml.data + fig2_1_ml.data + fig2_2_ml.data)
# update_layout() returns the figure, so as the last expression of the cell it is displayed
fig3_ml.update_layout(title = 'Comparison Actual vs Pred Shahriyar SVR',width=600,height=400, xaxis_title='test', yaxis_title='predict',
                      margin=dict(l=10,r=10,b=10,t=40))