## Import libs

In [11]:
#Import libs 
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import statistics as st
import math
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from tqdm import tqdm
import textwrap
from statistics import mean
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score as r2, mean_absolute_error as mae, mean_squared_error as mse, accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.inspection import permutation_importance
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
import random
import mlflow
import mlflow.sklearn
# Notebook-wide pandas display settings: three decimals for floats, at most 18 columns.
pd.options.display.precision = 3
pd.options.display.float_format = lambda value: '%.3f' % value
pd.options.display.max_columns = 18

## Upload main data

In [12]:
# # Link to my OneDrive where the actual version of TL-dataset is stored
# https://eigcom-my.sharepoint.com/:x:/g/personal/taras_dolgushin_eilink_az/ET4iIiIeUUNCvtixzYiBLFkBc6qYHetw0H0WIqJrR1B6Uw?e=ZUJEIZ

In [13]:
# Load the raw TL-dataset export (ACG_wells_JOINT_BEST_v6.csv)
path = 'C:\\jupyter\\SPP\\input\\'
data_init = pd.read_csv(path + 'ACG_wells_JOINT_BEST_v6.csv', sep=',')
# Data cleaning of TL-dataset: work on a copy so data_init stays untouched
df = data_init.copy()
# Skip the first data row -- NOTE(review): presumably a units/descriptor row; confirm against the CSV
df = df[1:]
# Select only the necessary columns
df_cln = df[['wellName', 'DEPTH', 'AREA', 'BADPORLOG', 'Casings', 'FORMATION',
            'FLANK1', 'FLANK2', 'Fluidcode', 'Fluidcode_mod', 'FLUIDCODE_PP',
            'LPERM', 'PHIT', 'NET', 
            'GR_N', 'GRMATRIX', 'GRSHALE','VSH', 'NPSS', 'RHOB', 'RHOF', 'RHOMA', 
            'RDEEP',  'SON', 'SONSH', 
            'TVD_SCS','TST', 'DEVI','HAZI','X', 'Y', 'Dip_Azimuth', 'Dip_TRU']]
# Replace NaN and the -9999 marker (numeric or text form) with 0
df_cln = df_cln.fillna(0)
df_cln = df_cln.replace(-9999, 0)
df_cln = df_cln.replace('-9999', '0')
# Assign proper datatypes; errors='ignore' suppresses casts that fail instead of raising
dicttypes = {'wellName':'string', 'DEPTH':'float', 'AREA':'int', 'BADPORLOG':'int', 'Casings':'float', 'FLANK1':'int', 'FLANK2':'int',
             'Fluidcode':'int', 'Fluidcode_mod':'int','FLUIDCODE_PP':'int','FORMATION':'string', 'GR_N':'float', 'GRMATRIX':'float', 
             'GRSHALE':'float', 'LPERM':'float', 'NPSS':'float',
             'PHIT':'float', 'NET':'float', 'RDEEP':'float', 'RHOB':'float', 'RHOF':'float', 'RHOMA':'float', 'TVD_SCS':'float', 'TST':'float',
             'VSH':'float', 'X':'float', 'Y':'float', 'Dip_Azimuth':'float', 'Dip_TRU':'float'}
df_cln = df_cln.astype(dicttypes, errors='ignore')
# FORMATION cells that ended up as the text '0' after the fill above get the label 'None'
df_cln.loc[df_cln.FORMATION=='0', 'FORMATION']='None'
# Save data to parquet in the input folder: the read-back further down loads it from
# `path`, so writing to the working directory could leave that read on a stale/missing file
df_cln.to_parquet(path + 'ACG_wells_JOINT_BEST_v6.parquet.gzip', compression='gzip')

# Load the well metadata table and normalise its Status / coordinate / field values.
path = 'C:\\jupyter\\SPP\\input\\'
metadata_init = pd.read_csv(path + 'ACG_wells_metadata.csv', sep=',')
metadata = metadata_init.copy().rename(columns={'X':'X_wellhead', 'Y':'Y_wellhead'})
metadata['Status'] = metadata['Status'].str.strip()
metadata['Status'] = metadata['Status'].str.lower()

# Spelling variants of Status -> canonical label (applied one by one, in this order)
status_fixes = [
    ('oil', 'production oil'),
    ('oil producer', 'production oil'),
    ('production', 'production oil'),
    ('produiction oil', 'production oil'),
    ('production_oil', 'production oil'),
    ('abandoned production oil', 'abandoned oil'),
    ('abandoned  oil', 'abandoned oil'),
    ('abandoned oi', 'abandoned oil'),
    ('injector  - water', 'injector - water'),
    ('injector water', 'injector - water'),
    ('injetor  - water', 'injector - water'),
    ('abandoned injector - water per b', 'abandoned injector - water'),
    ('plugged and abandoned', 'p&a'),
]
for variant, canonical in status_fixes:
    metadata.loc[metadata.Status == variant, 'Status'] = canonical

# Hard-coded wellhead coordinate corrections: first by the wrong value, then for well C39
metadata.loc[metadata.X_wellhead==118.270, 'X_wellhead'] = 526258.84
metadata.loc[metadata.Y_wellhead==526261.510, 'Y_wellhead'] = 4435802.01
is_c39 = metadata.well=='C39'
metadata.loc[is_c39, 'X_wellhead'] = 526258.840
metadata.loc[is_c39, 'Y_wellhead'] = 4435802.010

# Field name corrections: by current field label first, then per well
for old_field, new_field in [('West Azeri', 'WEST AZERI'), ('COP', 'WEST CHIRAG')]:
    metadata.loc[metadata.field==old_field, 'field'] = new_field
for well_id, new_field in [('AZERI2', 'WEST AZERI'), ('AZERI3', 'WEST AZERI'),
                           ('B31', 'CENTRAL AZERI'), ('J28_bpQIP', 'WEST CHIRAG')]:
    metadata.loc[metadata.well==well_id, 'field'] = new_field

#Read data from parquet and attach the per-well metadata
path = 'C:\\jupyter\\SPP\\input\\'
df_prq = pd.read_parquet(path + 'ACG_wells_JOINT_BEST_v6.parquet.gzip')
df_prq = df_prq.rename(columns={'wellName':'well'})
df_prq = df_prq.set_index('well').join(metadata.set_index('well')).reset_index()
# print('wells in df totally:', len(df_prq.well.unique()))
# Filter data with bad_well_list; .copy() so the new column below is added to an
# independent frame rather than to a slice of the joined one
bad_well_list = ['E10Z','Predrill_J01Z', 'Predrill_J08', 'J28_bpQIP']
df_prq = df_prq[~df_prq.well.isin(bad_well_list)].copy()
#gross_pay = 1 where Fluidcode_mod > 0, otherwise 0. A NaN Fluidcode_mod compares False
#and therefore gives 0 (as intended) instead of a NaN that breaks the integer cast.
df_prq['gross_pay'] = (df_prq.Fluidcode_mod > 0).astype('int')
#Getting XY coords of Balakhany formation tops (first row of every well/formation group)
xy_coord = df_prq[['well', 'FORMATION', 'X', 'Y']]
xy_coord = xy_coord.groupby(['well', 'FORMATION']).apply(lambda x: x.iloc[0]).drop(columns=['well', 'FORMATION']).reset_index()
xy_coord = xy_coord[xy_coord.FORMATION.str.contains('Balakhany') & (xy_coord.X>0) & (xy_coord.Y>0)]
#Find top TVD_SCS for each formation (first row of every well/formation group, TVD_SCS > 0 only)
df_prq_tvdss = df_prq[['well','DEPTH','FORMATION','TVD_SCS']].groupby(['well','FORMATION']).apply(lambda x: x.iloc[0])
df_prq_tvdss = df_prq_tvdss.drop(['well','FORMATION'], axis=1).reset_index()
df_prq_tvdss = df_prq_tvdss[df_prq_tvdss.TVD_SCS>0]

## Data preparation

### Cleaning dataset for outliers

In [14]:
# Load KHtst per well/formation and attach formation-top XY, top TVD_SCS and field.
path = 'C:\\jupyter\\SPP\\inputoutput\\'
df_khtst = pd.read_csv(path + 'df_prq_khtst_v3.csv')
fm_keys = ['well', 'FORMATION']
df_khtst_xy = df_khtst.set_index(fm_keys).join(xy_coord.set_index(fm_keys)).reset_index()
df_khtst_xy_tvd = df_khtst_xy.set_index(fm_keys).join(df_prq_tvdss.set_index(fm_keys)).reset_index()
field_by_well = df_prq.groupby('well')['field'].apply(lambda x: x.iloc[0])
df_khtst_xy_tvd_fld = df_khtst_xy_tvd.set_index('well').join(field_by_well).reset_index()

# Drop KHtst outliers (1.5 * IQR rule) separately per formation and per field group:
# every Balakhany VIII / X unit is processed for the Azeri group first, then for Chirag.
fm_list_8_10 = ['Balakhany VIII', 'Balakhany VIII sand', 'Balakhany VIII 25','Balakhany VIII 20', 
                'Balakhany VIII 15', 'Balakhany VIII 10', 'Balakhany VIII 5',
                'Balakhany X', 'Balakhany X sand', 'Balakhany X 40', 'Balakhany X 20'] 
azr_lst = ['CENTRAL AZERI', 'WEST AZERI', 'EAST AZERI']
chg_lst = ['CHIRAG', 'DWG', 'DDGG', 'WEST CHIRAG']
df_lst = []
for field_group in (azr_lst, chg_lst):
    for fm in fm_list_8_10:
        in_subset = (df_khtst_xy_tvd_fld.FORMATION == fm) & (df_khtst_xy_tvd_fld.field.isin(field_group))
        df_khtst_fm = df_khtst_xy_tvd_fld[in_subset]
        Q1 = df_khtst_fm['KHtst'].quantile(0.25)
        Q3 = df_khtst_fm['KHtst'].quantile(0.75)
        IQR = Q3 - Q1
        below_fence = df_khtst_fm['KHtst'] < (Q1 - 1.5 * IQR)
        above_fence = df_khtst_fm['KHtst'] > (Q3 + 1.5 * IQR)
        # Rows that are neither below nor above the fences are kept
        df_khtst_fm_qcl = df_khtst_fm[~below_fence & ~above_fence]
        df_lst.append(df_khtst_fm_qcl)
df_khtst_bal_qcl = pd.concat(df_lst)

### TST thickness of Balakhany VIII & X + loading df_prq_htst_avgprop_v1

In [15]:
# TST thickness of every Balakhany VIII / X unit per well:
# TST of the last row minus TST of the first row of each well/formation group.
fm_name = df_prq.FORMATION.str
df_fu_tst = df_prq[fm_name.contains('Balakhany VIII') | fm_name.contains('Balakhany X')]
df_fu_tst = df_fu_tst[['well', 'DEPTH','FORMATION','TST']]
fm_keys = ['well','FORMATION']
tst_groups = df_fu_tst.groupby(fm_keys)['TST']
df_fu_tst_top = tst_groups.apply(lambda x: x.iloc[0]).reset_index().rename(columns={'TST':'TST_top'})
df_fu_tst_bot = tst_groups.apply(lambda x: x.iloc[-1]).reset_index().rename(columns={'TST':'TST_bot'})
df_fu_tst_final = (
    df_fu_tst_top.set_index(fm_keys)
    .join(df_fu_tst_bot.set_index(fm_keys))
    .reset_index()
)
df_fu_tst_final['TST_interv'] = (df_fu_tst_final.TST_bot - df_fu_tst_final.TST_top).round(0)
# Attach formation-top XY, top DEPTH / TVD_SCS and the field of the well
df_fu_tst_final = (
    df_fu_tst_final.set_index(fm_keys)
    .join(xy_coord.set_index(fm_keys))
    .join(df_prq_tvdss.set_index(fm_keys))
    .reset_index()
)
field_by_well = df_prq.groupby('well')['field'].apply(lambda x: x.iloc[0])
df_fu_tst_final = df_fu_tst_final.set_index('well').join(field_by_well).reset_index()
# Keep only intervals with a positive thickness
df_fu_tst_final = df_fu_tst_final[df_fu_tst_final.TST_interv > 0]
#Reading df_prq_htst_avgprop_v1 and getting the wells that survived the outlier cleaning
path = 'C:\\jupyter\\SPP\\inputoutput\\' 
df_htst_avgprop = pd.read_csv(path + 'df_prq_htst_avgprop_v1.csv')
well_no_outliers8 = df_khtst_bal_qcl[df_khtst_bal_qcl.FORMATION == 'Balakhany VIII sand'].well.unique()
well_no_outliers10 = df_khtst_bal_qcl[df_khtst_bal_qcl.FORMATION == 'Balakhany X sand'].well.unique()
#Preparation weighted average df_htst_avgprop-dataset
cutoff_h_tst = 0.5
cutoff_perm_avg = 5
#Applying filtration to dataset with cutoffs; .copy() because columns are added below and
#must land on an independent frame, not on a slice of df_htst_avgprop
df_htst_avgprop_nz = df_htst_avgprop[(df_htst_avgprop.h_tst > cutoff_h_tst) &
                                     (df_htst_avgprop.md_perm_avg > cutoff_perm_avg)].copy()
#Multiplying htst by resprop values (thickness-weighted terms)
df_htst_avgprop_nz['kavg_htst'] = df_htst_avgprop_nz.h_tst * df_htst_avgprop_nz.md_perm_avg
df_htst_avgprop_nz['phit_htst'] = df_htst_avgprop_nz.h_tst * df_htst_avgprop_nz.md_phit_avg
df_htst_avgprop_nz['vsh_htst'] = df_htst_avgprop_nz.h_tst * df_htst_avgprop_nz.md_vsh_avg
#Summarizing h_tst via well & formation
df_htst_fm = df_htst_avgprop_nz.groupby(['well','FORMATION'])['h_tst'].sum().reset_index()
df_htst_fm = df_htst_fm.rename(columns={'h_tst':'gross_tst'})
#Calculating weighted averages: sum(h_tst * property) / sum(h_tst)
df_htst_avgprop_nz_avgpropsum = df_htst_avgprop_nz.groupby(['well','FORMATION'])[['phit_htst','vsh_htst']].sum().reset_index()
df_htst_avgprop_nz_avgpropsum_join = df_htst_avgprop_nz_avgpropsum.set_index(
                                     ['well','FORMATION']).join(df_htst_fm.set_index(['well','FORMATION'])).reset_index()
df_htst_avgprop_nz_avgpropsum_join['phit_wavg'] = df_htst_avgprop_nz_avgpropsum_join.phit_htst / df_htst_avgprop_nz_avgpropsum_join.gross_tst
df_htst_avgprop_nz_avgpropsum_join['vsh_wavg'] = df_htst_avgprop_nz_avgpropsum_join.vsh_htst / df_htst_avgprop_nz_avgpropsum_join.gross_tst


def _weighted_props_for(formation):
    """Build the weighted-average tables of one formation.

    formation = exact FORMATION label to select

    Returns (hpv, permh, phhpv, tstint):
      hpv    - gross_tst, phit_wavg, vsh_wavg per well
      permh  - summed kavg_htst per well
      phhpv  - hpv joined with permh
      tstint - phhpv joined with df_fu_tst_final, reduced to the model columns,
               with TST_interv renamed to interv_tst
    """
    keys = ['well','FORMATION']
    summed = df_htst_avgprop_nz_avgpropsum_join
    hpv = summed[summed.FORMATION == formation][['well','FORMATION','gross_tst','phit_wavg','vsh_wavg']]
    permh = df_htst_avgprop_nz[df_htst_avgprop_nz.FORMATION == formation].groupby(keys)['kavg_htst'].sum().reset_index()
    phhpv = hpv.set_index(keys).join(permh.set_index(keys)).reset_index()
    tstint = phhpv.set_index(keys).join(df_fu_tst_final.set_index(keys)).reset_index()
    tstint = tstint[['well', 'FORMATION', 'X', 'Y', 'DEPTH', 'TVD_SCS', 'field', 'gross_tst',
                     'TST_interv', 'kavg_htst', 'phit_wavg', 'vsh_wavg']]
    tstint = tstint.rename(columns={'TST_interv':'interv_tst'})
    return hpv, permh, phhpv, tstint


# #Preparing x,y matrices for ML
df_8bal_hpv, df_8bal_permh, df_8bal_phhpv, df_8bal_phhpv_tstint = _weighted_props_for('Balakhany VIII sand')
df_avgprop8_final_wa = df_8bal_phhpv_tstint.copy()
df_10bal_hpv, df_10bal_permh, df_10bal_phhpv, df_10bal_phhpv_tstint = _weighted_props_for('Balakhany X sand')
df_avgprop10_final_wa = df_10bal_phhpv_tstint.copy()
#Selecting data for Bal8 & Bal10: only wells that passed the outlier cleaning
df_avgprop_bal10_wa = df_avgprop10_final_wa[df_avgprop10_final_wa.FORMATION.str.contains('Balakhany X sand') & 
                                          df_avgprop10_final_wa.well.isin(well_no_outliers10)]
df_avgprop_bal8_wa = df_avgprop8_final_wa[df_avgprop8_final_wa.FORMATION.str.contains('Balakhany VIII sand') & 
                                          df_avgprop8_final_wa.well.isin(well_no_outliers8)]

### Preparing the dataset for X_train/X_test

In [16]:
# Preparation dataset for X_train/x_test data splitting based on outliers cleaned data
azr_lst = ['CENTRAL AZERI', 'WEST AZERI', 'EAST AZERI']
chg_lst = ['CHIRAG', 'DWG', 'DDGG', 'WEST CHIRAG']
well_clean_azr = df_khtst_bal_qcl[(df_khtst_bal_qcl.FORMATION == 'Balakhany VIII sand') & 
                                  (df_khtst_bal_qcl.field.isin(azr_lst))].well
well_clean_all = df_khtst_bal_qcl[(df_khtst_bal_qcl.FORMATION == 'Balakhany VIII sand')].well
#Calculation of Euclidean Distances for the top of Balakhany VIII sand.
def well_dist_calc(formation='Balakhany VIII sand'):
    """
    Pairwise Euclidean distances between wells at the top of one formation.

    formation = exact FORMATION label; rows of df_khtst_xy_tvd with X > 0, Y > 0 and a
                non-missing TVD_SCS are used
    Returns a dataframe with a 'well' column followed by one distance column per well
    (same order as the rows), computed on the (X, Y, TVD_SCS) coordinates.

    NOTE: the next cell defines well_dist_calc again (based on df_khtst_xy) and
    recomputes dist_bal8 / dist_bal10, so this version is superseded once that cell runs.
    """
    data = df_khtst_xy_tvd[(df_khtst_xy_tvd.FORMATION == formation) & 
                            (df_khtst_xy_tvd.X > 0) & (df_khtst_xy_tvd.Y > 0) &
                            (~df_khtst_xy_tvd.TVD_SCS.isna())]
    # Well names re-indexed 0..n-1 so they line up with the rows of the distance matrix
    row_name = data.well.reset_index().drop(['index'], axis=1)
    distance_fm = pd.DataFrame(euclidean_distances(data[['X', 'Y', 'TVD_SCS']]), columns=list(data.well))
    # set_index/reset_index moves the 'well' column to the front
    return distance_fm.join(row_name).set_index('well').reset_index()
dist_bal8 = well_dist_calc('Balakhany VIII sand')
dist_bal10 = well_dist_calc('Balakhany X sand')  

### Euclidean-distance based dataset: Balakhany VIII sand + X sand

In [17]:
#Uploading k_htst data from csv-file & Calculation of Euclidean Distances
path = 'C:\\jupyter\\SPP\\inputoutput\\'
df_khtst = pd.read_csv(path + 'df_prq_khtst_v3.csv')
# X, Y, TVD_SCS of the first row of every well/formation group of df_prq
df_khtst_xy = df_khtst.set_index(['well','FORMATION']).join(
                                 df_prq[['well','FORMATION','X','Y','TVD_SCS']].groupby(
                                 ['well','FORMATION']).apply(lambda x: x.iloc[0]).drop(
                                 ['well','FORMATION'], axis=1)).reset_index()
#Calculation of Euclidean Distances for the top of Balakhany VIII sand & Balakhany X sand
def well_dist_calc(formation='Balakhany VIII sand'):
    """
    Pairwise Euclidean distances between wells at the top of one formation.

    formation = exact FORMATION label; rows of df_khtst_xy with X > 0 and Y > 0 are used
    Returns a dataframe with a 'well' column followed by one distance column per well
    (same order as the rows), computed on the (X, Y, TVD_SCS) coordinates.

    NOTE: this replaces the well_dist_calc defined in the previous cell, which was
    based on df_khtst_xy_tvd.
    """
    data = df_khtst_xy[(df_khtst_xy.FORMATION == formation) & (df_khtst_xy.X > 0) & (df_khtst_xy.Y > 0)]
    # Well names re-indexed 0..n-1 so they line up with the rows of the distance matrix
    row_name = data.well.reset_index().drop(['index'], axis=1)
    distance_fm = pd.DataFrame(euclidean_distances(data[['X', 'Y', 'TVD_SCS']]), columns=list(data.well))
    # set_index/reset_index moves the 'well' column to the front
    return distance_fm.join(row_name).set_index('well').reset_index()
dist_bal8 = well_dist_calc('Balakhany VIII sand')
dist_bal10 = well_dist_calc('Balakhany X sand')    
# Preparation dataset for X_train/x_test data splitting: wells that passed outlier cleaning
well_clean_8 = df_khtst_bal_qcl[(df_khtst_bal_qcl.FORMATION == 'Balakhany VIII sand')].well
well_clean_10 = df_khtst_bal_qcl[(df_khtst_bal_qcl.FORMATION == 'Balakhany X sand')].well

def _nearest_kh_features(dist_table, kh_table, formation, n_neighbours=3):
    """
    One row per well with the distances to its nearest wells and their KHtst.

    dist_table   = output of well_dist_calc: 'well' column + one distance column per well
    kh_table     = dataframe with 'well', 'FORMATION' and 'KHtst' columns
    formation    = formation whose KHtst values are looked up
    n_neighbours = number of nearest wells to keep

    Returns a dataframe with columns dist1..distN, kh1..khN, well. kh<i> is the KHtst
    of the well found at dist<i>, i.e. both are ordered from nearest to farthest.
    Raises ValueError when fewer than n_neighbours other wells are available.
    """
    kh_by_well = kh_table[kh_table.FORMATION == formation].set_index('well')['KHtst']
    dist_cols = [f'dist{i}' for i in range(1, n_neighbours + 1)]
    kh_cols = [f'kh{i}' for i in range(1, n_neighbours + 1)]
    records = []
    for num, well_name in enumerate(dist_table.well):
        # Row `num` holds the distances from well_name to every well; its own column is
        # dropped by name so a co-located well can never push it into the neighbour list
        distances = dist_table.iloc[num].drop(['well', well_name]).astype('float')
        nearest = distances.sort_values(kind='mergesort').iloc[:n_neighbours]
        if len(nearest) < n_neighbours:
            raise ValueError(f'well {well_name}: only {len(nearest)} neighbour wells available')
        record = dict(zip(dist_cols, nearest.to_numpy()))
        # Look KHtst up by neighbour name, in distance order, so kh<i> pairs with dist<i>
        record.update(zip(kh_cols, kh_by_well.reindex(nearest.index).to_numpy()))
        record['well'] = well_name
        records.append(record)
    return pd.DataFrame(records, columns=dist_cols + kh_cols + ['well'])


def _keep_clean_positive(frame, clean_wells):
    """Keep wells listed in clean_wells whose KHtst and kh1..kh3 are all > 0; index is reset."""
    keep = (frame.well.isin(clean_wells) & (frame.kh1 > 0) & (frame.kh2 > 0) &
            (frame.kh3 > 0) & (frame.KHtst > 0))
    return frame[keep].reset_index(drop=True)


# Balakhany VIII sand: neighbour features + own KHtst + field
df_well_kh_dist8 = _nearest_kh_features(dist_bal8, df_khtst_xy, 'Balakhany VIII sand')
df_khtst_xy_bal8 = df_khtst_xy[df_khtst_xy.FORMATION=='Balakhany VIII sand'][['well', 'FORMATION', 'KHtst']]
df_well_kh_dist_bal8 = df_well_kh_dist8.set_index('well').join(df_khtst_xy_bal8.set_index('well')).reset_index()
df_well_kh_dist_bal8_fld = df_well_kh_dist_bal8.set_index('well').join(metadata[['well','field']].set_index('well')).reset_index()
df_well_kh_dist_bal8_fld = _keep_clean_positive(df_well_kh_dist_bal8_fld, well_clean_8)
# Balakhany X sand: same processing
df_well_kh_dist10 = _nearest_kh_features(dist_bal10, df_khtst_xy, 'Balakhany X sand')
df_khtst_xy_bal10 = df_khtst_xy[df_khtst_xy.FORMATION=='Balakhany X sand'][['well', 'FORMATION', 'KHtst']]
df_well_kh_dist_bal10 = df_well_kh_dist10.set_index('well').join(df_khtst_xy_bal10.set_index('well')).reset_index()
df_well_kh_dist_bal10_fld = df_well_kh_dist_bal10.set_index('well').join(metadata[['well','field']].set_index('well')).reset_index()
df_well_kh_dist_bal10_fld = _keep_clean_positive(df_well_kh_dist_bal10_fld, well_clean_10)
df_well_kh_dist_all = pd.concat([df_well_kh_dist_bal8_fld, df_well_kh_dist_bal10_fld])
# df_well_kh_dist_all

### XY based on EuclDist Balakhany VIII sand & X sand

In [None]:
#Collecting XY based on Euclidean Distances for the top of Balakhany VIII sand.
# Formation rows with valid coordinates; the filter does not depend on the loop variable,
# so it is evaluated once
data = df_khtst_xy[(df_khtst_xy.FORMATION == 'Balakhany VIII sand') & (df_khtst_xy.X > 0) & (df_khtst_xy.Y > 0)]
df_collect = []
for num, well_name in enumerate(dist_bal8.well[:]):
    # Three nearest wells; the well's own column is dropped by name before sorting
    distances = dist_bal8.iloc[num].drop(['well', well_name]).astype('float')
    well_dist3 = distances.sort_values(kind='mergesort').iloc[:3].index
    # X, Y and KHtst are read from the same rows, so x<i>, y<i>, kh<i> describe the same
    # neighbour well (listed in table order, not in distance order)
    neighbours = data[data.well.isin(well_dist3)]
    if len(neighbours) != 3:
        raise ValueError(f'well {well_name}: expected 3 neighbour rows, got {len(neighbours)}')
    result = dict(zip(['x1', 'x2', 'x3'], neighbours['X'].to_numpy()))
    result.update(zip(['y1', 'y2', 'y3'], neighbours['Y'].to_numpy()))
    result.update(zip(['kh1', 'kh2', 'kh3'], neighbours['KHtst'].to_numpy()))
    result['well'] = well_name
    df_collect.append(result)
df_well_kh_xy = pd.DataFrame(df_collect, columns=['x1', 'x2', 'x3', 'y1', 'y2', 'y3', 'kh1', 'kh2', 'kh3', 'well'])
df_khtst_xy_bal8 = df_khtst_xy[df_khtst_xy.FORMATION=='Balakhany VIII sand'][['well', 'FORMATION', 'KHtst']]
df_well_kh_xy_bal8 = df_well_kh_xy.set_index('well').join(df_khtst_xy_bal8.set_index('well')).reset_index()
df_well_kh_xy_bal8_fld = df_well_kh_xy_bal8.set_index('well').join(metadata[['well','field']].set_index('well')).reset_index()
# Rows kept for modelling: outlier-cleaned wells with positive own and neighbour KH
valid_rows = ((df_well_kh_xy_bal8_fld.well.isin(well_clean_all)) &
              (df_well_kh_xy_bal8_fld.kh1>0) &
              (df_well_kh_xy_bal8_fld.kh2>0) &
              (df_well_kh_xy_bal8_fld.kh3>0) &
              (df_well_kh_xy_bal8_fld.KHtst > 0))
# Making up dataset with xy for azeri field
df_well_kh_xy_bal8_fld_azr = df_well_kh_xy_bal8_fld[df_well_kh_xy_bal8_fld.field.isin(azr_lst) &
                                                    valid_rows].reset_index(drop=True)
# Making up dataset with xy for chirag & azeri fields
df_well_kh_xy_bal8_fld_all = df_well_kh_xy_bal8_fld[valid_rows].reset_index(drop=True)
df_well_kh_xy_bal8_fld_all.head(3)

In [None]:
#Collecting XY based on Euclidean Distances for the top of Balakhany X sand.
# Formation rows with valid coordinates; the filter does not depend on the loop variable,
# so it is evaluated once
data = df_khtst_xy[(df_khtst_xy.FORMATION == 'Balakhany X sand') & (df_khtst_xy.X > 0) & (df_khtst_xy.Y > 0)]
df_collect = []
for num, well_name in enumerate(dist_bal10.well[:]):
    # Three nearest wells; the well's own column is dropped by name before sorting
    distances = dist_bal10.iloc[num].drop(['well', well_name]).astype('float')
    well_dist3 = distances.sort_values(kind='mergesort').iloc[:3].index
    # X, Y and KHtst are read from the same rows, so x<i>, y<i>, kh<i> describe the same
    # neighbour well (listed in table order, not in distance order)
    neighbours = data[data.well.isin(well_dist3)]
    if len(neighbours) != 3:
        raise ValueError(f'well {well_name}: expected 3 neighbour rows, got {len(neighbours)}')
    result = dict(zip(['x1', 'x2', 'x3'], neighbours['X'].to_numpy()))
    result.update(zip(['y1', 'y2', 'y3'], neighbours['Y'].to_numpy()))
    result.update(zip(['kh1', 'kh2', 'kh3'], neighbours['KHtst'].to_numpy()))
    result['well'] = well_name
    df_collect.append(result)
df_well_kh_xy = pd.DataFrame(df_collect, columns=['x1', 'x2', 'x3', 'y1', 'y2', 'y3', 'kh1', 'kh2', 'kh3', 'well'])
df_khtst_xy_bal10 = df_khtst_xy[df_khtst_xy.FORMATION=='Balakhany X sand'][['well', 'FORMATION', 'KHtst']]
df_well_kh_xy_bal10 = df_well_kh_xy.set_index('well').join(df_khtst_xy_bal10.set_index('well')).reset_index()
df_well_kh_xy_bal10_fld = df_well_kh_xy_bal10.set_index('well').join(metadata[['well','field']].set_index('well')).reset_index()
# Wells that passed the outlier cleaning for Balakhany X sand. The list used before
# (well_clean_all) is built from the Balakhany VIII sand rows and does not apply here.
well_clean_bal10 = df_khtst_bal_qcl[df_khtst_bal_qcl.FORMATION == 'Balakhany X sand'].well
# Rows kept for modelling: outlier-cleaned wells with positive own and neighbour KH
valid_rows = ((df_well_kh_xy_bal10_fld.well.isin(well_clean_bal10)) &
              (df_well_kh_xy_bal10_fld.kh1>0) &
              (df_well_kh_xy_bal10_fld.kh2>0) &
              (df_well_kh_xy_bal10_fld.kh3>0) &
              (df_well_kh_xy_bal10_fld.KHtst > 0))
# Making up dataset with xy for azeri field
df_well_kh_xy_bal10_fld_azr = df_well_kh_xy_bal10_fld[df_well_kh_xy_bal10_fld.field.isin(azr_lst) &
                                                      valid_rows].reset_index(drop=True)
# Making up dataset with xy for chirag & azeri fields
df_well_kh_xy_bal10_fld_all = df_well_kh_xy_bal10_fld[valid_rows].reset_index(drop=True)
df_well_kh_xy_bal10_fld_all.head(3)

## Prediction with RFR

In [None]:
# Wells whose Balakhany VIII sand or X sand interval contains two distinct Casings values
fm_keys = ['well','FORMATION']
# Distinct Casings values of every well/formation group
df_prq_csg_name = df_prq.groupby(fm_keys)['Casings'].apply(lambda x: x.unique()).reset_index()
# Number of distinct Casings values inside Balakhany VIII sand
df_prq_csg = df_prq[df_prq.FORMATION == 'Balakhany VIII sand'][['well','FORMATION','Casings']]
df_prq_csg_8 = (df_prq_csg.groupby(fm_keys)['Casings']
                .apply(lambda x: len(x.unique()))
                .reset_index()
                .rename(columns={'Casings':'csg_qty_bal8'}))
# ... and inside Balakhany X sand
df_prq_csg = df_prq[df_prq.FORMATION == 'Balakhany X sand'][['well','FORMATION','Casings']]
df_prq_csg_10 = (df_prq_csg.groupby(fm_keys)['Casings']
                 .apply(lambda x: len(x.unique()))
                 .reset_index()
                 .rename(columns={'Casings':'csg_qty_bal10'}))
# Attach the three tables to the outlier-cleaned KHtst dataset
df_khtst_bal_qcl_nm = df_khtst_bal_qcl.set_index(fm_keys).join(df_prq_csg_name.set_index(fm_keys)).reset_index()
df_khtst_bal_qcl10 = df_khtst_bal_qcl_nm.set_index(fm_keys).join(df_prq_csg_10.set_index(fm_keys)).reset_index()
df_khtst_bal_qcl_csg = df_khtst_bal_qcl10.set_index(fm_keys).join(df_prq_csg_8.set_index(fm_keys)).reset_index()
two_casings = (df_khtst_bal_qcl_csg.csg_qty_bal10 ==2) | (df_khtst_bal_qcl_csg.csg_qty_bal8 ==2)
df_khtst_bal_qcl_csg_sel = df_khtst_bal_qcl_csg[two_casings].sort_values(by='FORMATION')
df_khtst_bal_qcl_csg_sel[['well','FORMATION','KHtst','field','Casings']].reset_index(drop=True)

### Legacy RFR for Bal8_sand & Bal10_sand

In [None]:
# MLflow tracking: server address and the experiment that runs are logged under
MLFLOW_TRACKING_URI = "http://16.171.23.137:5000"
MLFLOW_EXPERIMENT = 'SPP_RandForReg'
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT)

In [22]:
def result_ml_plot(res, dataset, kh, max_val):
    """
    Show an Actual-vs-Predicted scatter plot with a 1:1 line and two dashed guide lines
    (slopes 1.25 and 0.75).

    res = dataframe with 'Actual' and 'Predicted' columns (results of ML prediction)
    dataset = dataframe holding 'well' and the kh column; well names are attached to res
              by matching 'Actual' against the kh values
    kh = name of the column with KH values, also used for the axis titles
    max_val = upper end of the reference lines
    Returns the result of Figure.show().
    """
    well_by_kh = dataset[['well', kh]].set_index(kh)
    final = res.set_index('Actual').join(well_by_kh).reset_index()
    points = px.scatter(final, x='Actual', y='Predicted', hover_data=['well'], width=400, height=400)
    points.update_traces(marker=dict(size=10))
    # Reference lines through the origin
    x_span = [0, max_val]
    unity = px.line(x=x_span, y=[0,max_val])
    upper = px.line(x=x_span, y=[0,max_val*1.25])
    lower = px.line(x=x_span, y=[0,max_val*0.75])
    unity.update_traces(line=dict(color = 'red'))
    for guide in (upper, lower):
        guide.update_traces(line=dict(color = 'red', dash='dash'))
    combined = go.Figure(data = points.data + unity.data + upper.data + lower.data)
    combined.update_layout(title = 'Comparison Actual vs Pred',width=600,height=400,
                           xaxis_title=kh + '_test', yaxis_title=kh + '_pred',
                           margin=dict(l=10,r=10,b=10,t=40))
    return combined.show()
def metric_result_print(y_train,y_pred_train,y_test, y_pred):
    """
    Print R2, MAE and the square root of MSE, first for the test split, then for train.

    y_train, y_pred_train = actual and predicted targets of the training split
    y_test, y_pred = actual and predicted targets of the test split

    The metric values are converted with float() and rounded with the built-in round(),
    so the function works whether the metric functions return NumPy scalars or plain
    Python floats. The square root is taken from the unrounded MSE; rounding first
    distorts small errors (e.g. an MSE of 0.0004 would be reported as 0.000).
    """
    splits = (('test', y_test, y_pred), ('train', y_train, y_pred_train))
    for label, y_true, y_hat in splits:
        r2_val = float(r2(y_true, y_hat))
        mae_val = float(mae(y_true, y_hat))
        rmse_val = float(np.sqrt(mse(y_true, y_hat)))
        print(f'R2 {label}: {round(r2_val, 2)}',
              f'MAE {label}: {round(mae_val, 3)}',
              f'sqrt MSE {label}: {rmse_val:.3f}')
def conv_log10_nat(y_log10):
    """Convert log10 values back to natural scale: returns the list [10**v for each v]."""
    return [10**value for value in y_log10]

In [23]:
# Load df_prq_htst_avgprop_v1, list the outlier-free wells and add kavg_htst = h_tst * md_perm_avg
path = 'C:\\jupyter\\SPP\\inputoutput\\'
df_htst_avgprop = pd.read_csv(path + 'df_prq_htst_avgprop_v1.csv')
well_no_outliers8 = df_khtst_bal_qcl.loc[df_khtst_bal_qcl.FORMATION == 'Balakhany VIII sand', 'well'].unique()
well_no_outliers10 = df_khtst_bal_qcl.loc[df_khtst_bal_qcl.FORMATION == 'Balakhany X sand', 'well'].unique()
df_htst_avgprop['kavg_htst'] = df_htst_avgprop['h_tst'] * df_htst_avgprop['md_perm_avg']
# Per well/formation: summed thickness and k*h, plain (unweighted) mean of porosity and Vsh
fm_keys = ['well','FORMATION']
df_avgprop_gb = (
    df_htst_avgprop.groupby(fm_keys)
    .agg(h_tst=('h_tst', 'sum'),
         kavg_htst=('kavg_htst', 'sum'),
         md_phit_avg=('md_phit_avg', 'mean'),
         md_vsh_avg=('md_vsh_avg', 'mean'))
    .reset_index()
)
# Attach the interval table (coordinates, depths, field, TST interval) and keep the model columns
df_avgprop_gb_tstint = df_avgprop_gb.set_index(fm_keys).join(df_fu_tst_final.set_index(fm_keys)).reset_index()
model_columns = ['well', 'FORMATION', 'X', 'Y', 'DEPTH', 'TVD_SCS', 'field', 'h_tst', 'TST_interv',
                 'kavg_htst', 'md_phit_avg', 'md_vsh_avg']
df_avgprop_gb_tstint = df_avgprop_gb_tstint[model_columns].rename(
    columns={'TST_interv':'interv_tst', 'h_tst':'gross_tst'})
df_avgprop_final = df_avgprop_gb_tstint.copy()
# Subsets: Bal VIII sand split by field group (Chirag / Azeri) and Bal X sand, outlier-free wells only
azr_lst = ['CENTRAL AZERI', 'WEST AZERI', 'EAST AZERI']
chg_lst = ['CHIRAG', 'DWG', 'DDGG', 'WEST CHIRAG']
is_bal8 = df_avgprop_final.FORMATION.str.contains('Balakhany VIII sand')
is_clean8 = df_avgprop_final.well.isin(well_no_outliers8)
df_avgprop_bal8chi = df_avgprop_final[is_bal8 & df_avgprop_final.field.isin(chg_lst) & is_clean8].dropna()
df_avgprop_bal8azr = df_avgprop_final[is_bal8 & df_avgprop_final.field.isin(azr_lst) & is_clean8].dropna()
is_bal10 = df_avgprop_final.FORMATION.str.contains('Balakhany X sand')
df_avgprop_bal10 = df_avgprop_final[is_bal10 & df_avgprop_final.well.isin(well_no_outliers10)].dropna()
# Train/test split for Bal VIII sand, Chirag group: target kavg_htst, all other numeric columns as features
non_feature_columns = ['well','FORMATION','field','kavg_htst']
y1 = np.array(df_avgprop_bal8chi['kavg_htst'].values)
x1 = np.array(df_avgprop_bal8chi.drop(columns=non_feature_columns))
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.33, random_state=42)

#### Run RFR Bal8_sand smpl Chirag

In [None]:
# Grid search of the RandomForestRegressor hyper-parameters for Bal VIII sand (simple avg, Chirag).
# cv=None means the default 5-fold cross-validation; candidates are ranked by R2.
# The forest is seeded so that repeated searches return the same best_params_.
RF1 = RandomForestRegressor(random_state=42)
grid_param_RF1 = {
    'bootstrap': [True, False],
    'max_depth': [None, 10, 50, 75, 100, 150, 200, 500],
    'min_samples_leaf': [1, 2, 3, 5, 10],
    # An integer min_samples_split must be >= 2. The former value 1 is rejected by
    # scikit-learn, so every candidate using it was a failed fit scored as NaN.
    'min_samples_split': [2, 3, 5, 10, 20],
    'n_estimators': [10, 25, 50, 100, 200]}
gd_sr_RF1 = GridSearchCV(estimator=RF1, param_grid=grid_param_RF1, scoring='r2', cv=None, n_jobs=-1)
gd_sr_RF1.fit(x1_train, y1_train)
print(gd_sr_RF1.best_params_)

In [None]:
# RandomForestRegressor for Bal VIII sand (simple avg, Chirag)
# with mlflow.start_run(run_name='rfm_gs_r2'):
      # mlflow.set_tag("model_name", "RandForReg")
RF_setting = {'bootstrap': True,
              'max_depth': 150,
              'min_samples_leaf': 1,
              'min_samples_split': 5,
              'n_estimators': 10}
# Fixed seed: without it the bootstrap samples change on every run,
# so the metrics and the cross-plot below would not be reproducible.
RF1 = RandomForestRegressor(**RF_setting, random_state=42)
RF1.fit(x1_train, y1_train)
# Predictions for the train set (to judge over-fitting) and for the held-out test set
y1_pred_train = RF1.predict(x1_train)
y1_pred = RF1.predict(x1_test)
# Combining the actual and predicted test values into a single df
df_results_v1 = pd.DataFrame({'Actual': y1_test, 'Predicted': y1_pred})
result_ml_plot(res = df_results_v1, dataset = df_avgprop_bal8chi, kh='kavg_htst', max_val=14000)
metric_result_print(y1_train, y1_pred_train, y1_test, y1_pred)

      # mlflow.log_param("bootstrap", RF_setting['bootstrap'])
      # mlflow.log_param("max_depth", RF_setting['max_depth'])
      # mlflow.log_param("min_samples_leaf", RF_setting['min_samples_leaf'])
      # mlflow.log_param("min_samples_split", RF_setting['min_samples_split'])
      # mlflow.log_param("n_estimators", RF_setting['n_estimators'])

      # mlflow.log_metric("r2_test", r2_test)
      # mlflow.log_metric("mae_test", mae_test)
      # mlflow.log_metric("mse_test", mse_test)

      # mlflow.sklearn.log_model(RF, "RFR_models")

#### Run RFR Bal8_sand wa Chirag

In [27]:
# Reading df_prq_htst_avgprop_v1 and taking the well lists from df_khtst_bal_qcl
# (used below as the "no outliers" wells per formation)
path = 'C:\\jupyter\\SPP\\inputoutput\\' 
df_htst_avgprop = pd.read_csv(path + 'df_prq_htst_avgprop_v1.csv')
well_no_outliers8 = df_khtst_bal_qcl[df_khtst_bal_qcl.FORMATION == 'Balakhany VIII sand'].well.unique()
well_no_outliers10 = df_khtst_bal_qcl[df_khtst_bal_qcl.FORMATION == 'Balakhany X sand'].well.unique()
# Preparation of the thickness-weighted average df_htst_avgprop-dataset
cutoff_h_tst = 0.5
cutoff_perm_avg = 5
# Keep only rows above both cutoffs. .copy() makes the result an independent frame, so the
# column assignments below do not write into a slice of df_htst_avgprop (SettingWithCopy).
df_htst_avgprop_nz = df_htst_avgprop[(df_htst_avgprop.h_tst > cutoff_h_tst) &
                                     (df_htst_avgprop.md_perm_avg > cutoff_perm_avg)].copy()
# Multiplying h_tst by the reservoir property values
df_htst_avgprop_nz['kavg_htst'] = df_htst_avgprop_nz.h_tst * df_htst_avgprop_nz.md_perm_avg
df_htst_avgprop_nz['phit_htst'] = df_htst_avgprop_nz.h_tst * df_htst_avgprop_nz.md_phit_avg
df_htst_avgprop_nz['vsh_htst'] = df_htst_avgprop_nz.h_tst * df_htst_avgprop_nz.md_vsh_avg
# Summing h_tst per well & formation
df_htst_fm = df_htst_avgprop_nz.groupby(['well','FORMATION'])['h_tst'].sum().reset_index()
df_htst_fm = df_htst_fm.rename(columns={'h_tst':'gross_tst'})
# Weighted averages: sum(h * property) / sum(h) per well & formation
df_htst_avgprop_nz_avgpropsum = df_htst_avgprop_nz.groupby(['well','FORMATION'])[['phit_htst','vsh_htst']].sum().reset_index()
df_htst_avgprop_nz_avgpropsum_join = df_htst_avgprop_nz_avgpropsum.set_index(
                                     ['well','FORMATION']).join(df_htst_fm.set_index(['well','FORMATION'])).reset_index()
df_htst_avgprop_nz_avgpropsum_join['phit_wavg'] = df_htst_avgprop_nz_avgpropsum_join.phit_htst / df_htst_avgprop_nz_avgpropsum_join.gross_tst
df_htst_avgprop_nz_avgpropsum_join['vsh_wavg'] = df_htst_avgprop_nz_avgpropsum_join.vsh_htst / df_htst_avgprop_nz_avgpropsum_join.gross_tst
# Balakhany VIII sand: weighted properties joined with the summed k*h
df_8bal_hpv = df_htst_avgprop_nz_avgpropsum_join[
              df_htst_avgprop_nz_avgpropsum_join.FORMATION == 'Balakhany VIII sand'][['well','FORMATION','gross_tst','phit_wavg','vsh_wavg']]
df_8bal_permh = df_htst_avgprop_nz[df_htst_avgprop_nz.FORMATION == 'Balakhany VIII sand'].groupby(['well','FORMATION'])['kavg_htst'].sum().reset_index()
df_8bal_phhpv = df_8bal_hpv.set_index(['well','FORMATION']).join(df_8bal_permh.set_index(['well','FORMATION'])).reset_index()
# Balakhany X sand: same construction
df_10bal_hpv = df_htst_avgprop_nz_avgpropsum_join[
              df_htst_avgprop_nz_avgpropsum_join.FORMATION == 'Balakhany X sand'][['well','FORMATION','gross_tst','phit_wavg','vsh_wavg']]
df_10bal_permh = df_htst_avgprop_nz[df_htst_avgprop_nz.FORMATION == 'Balakhany X sand'].groupby(['well','FORMATION'])['kavg_htst'].sum().reset_index()
df_10bal_phhpv = df_10bal_hpv.set_index(['well','FORMATION']).join(df_10bal_permh.set_index(['well','FORMATION'])).reset_index()
# Preparing x,y matrices for ML: attach the df_fu_tst_final columns and keep the model columns
df_8bal_phhpv_tstint = df_8bal_phhpv.set_index(['well','FORMATION']).join(df_fu_tst_final.set_index(['well','FORMATION'])).reset_index()
df_8bal_phhpv_tstint = df_8bal_phhpv_tstint[['well', 'FORMATION', 'X', 'Y', 'DEPTH', 'TVD_SCS', 'field', 'gross_tst',
                                             'TST_interv', 'kavg_htst', 'phit_wavg', 'vsh_wavg']]
df_8bal_phhpv_tstint = df_8bal_phhpv_tstint.rename(columns={'TST_interv':'interv_tst'})
df_avgprop8_final_wa = df_8bal_phhpv_tstint.copy()
df_10bal_phhpv_tstint = df_10bal_phhpv.set_index(['well','FORMATION']).join(df_fu_tst_final.set_index(['well','FORMATION'])).reset_index()
df_10bal_phhpv_tstint = df_10bal_phhpv_tstint[['well', 'FORMATION', 'X', 'Y', 'DEPTH', 'TVD_SCS', 'field', 'gross_tst',
                                             'TST_interv', 'kavg_htst', 'phit_wavg', 'vsh_wavg']]
df_10bal_phhpv_tstint = df_10bal_phhpv_tstint.rename(columns={'TST_interv':'interv_tst'})
df_avgprop10_final_wa = df_10bal_phhpv_tstint.copy()
# Selecting data for Bal8 & Bal10 for Chi / Azr
azr_lst = ['CENTRAL AZERI', 'WEST AZERI', 'EAST AZERI']
chg_lst = ['CHIRAG', 'DWG', 'DDGG', 'WEST CHIRAG']
df_avgprop_bal8chi_wa = df_avgprop8_final_wa[df_avgprop8_final_wa.FORMATION.str.contains('Balakhany VIII sand') & 
                                            df_avgprop8_final_wa.field.isin(chg_lst) & df_avgprop8_final_wa.well.isin(well_no_outliers8)].dropna()
df_avgprop_bal8azr_wa = df_avgprop8_final_wa[df_avgprop8_final_wa.FORMATION.str.contains('Balakhany VIII sand') & 
                                            df_avgprop8_final_wa.field.isin(azr_lst) & df_avgprop8_final_wa.well.isin(well_no_outliers8)].dropna()
# .dropna() added so this subset is NaN-free like the other subsets built in this cell
df_avgprop_bal10_wa = df_avgprop10_final_wa[df_avgprop10_final_wa.FORMATION.str.contains('Balakhany X sand') & 
                                          df_avgprop10_final_wa.well.isin(well_no_outliers10)].dropna()
# X_train/x_test data splitting for Bal VIII Chirag (weighted avg)
y2 = np.array(df_avgprop_bal8chi_wa['kavg_htst'].values)
x2 = np.array(df_avgprop_bal8chi_wa.drop(['well','FORMATION','field','kavg_htst'], axis=1))
x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, test_size=0.33, random_state=10)

In [None]:
# Grid search of the RandomForestRegressor hyper-parameters for Bal VIII sand Chirag (weighted avg).
# cv=None means the default 5-fold cross-validation; candidates are ranked by R2.
# The forest is seeded so that repeated searches return the same best_params_.
RF2 = RandomForestRegressor(random_state=42)
grid_param_RF2 = {
    'bootstrap': [True, False],
    'max_depth': [None, 10, 50, 75, 100, 150, 200, 500],
    'min_samples_leaf': [1, 2, 3, 5, 10],
    # An integer min_samples_split must be >= 2. The former value 1 is rejected by
    # scikit-learn, so every candidate using it was a failed fit scored as NaN.
    'min_samples_split': [2, 3, 5, 10, 20],
    'n_estimators': [10, 25, 50, 100, 200]}
gd_sr_RF2 = GridSearchCV(estimator=RF2, param_grid=grid_param_RF2, scoring='r2', cv=None, n_jobs=-1)
gd_sr_RF2.fit(x2_train, y2_train)
print(gd_sr_RF2.best_params_)

In [None]:
# RandomForestRegressor for Bal VIII sand Chirag weighted avg df_htst_avgprop-dataset 
# with mlflow.start_run(run_name='rfm_weighted_avg_gs_r2'):
# mlflow.set_tag("model_name", "RandForReg")
RF_setting = {'bootstrap': True,
              'max_depth': None,
              'min_samples_leaf': 2,
              'min_samples_split': 5,
              'n_estimators': 10}
# Fixed seed: without it the bootstrap samples change on every run,
# so the metrics, the cross-plot and the importances below would not be reproducible.
RF2 = RandomForestRegressor(**RF_setting, random_state=42)
RF2.fit(x2_train, y2_train)
# Predictions for the train set (to judge over-fitting) and for the held-out test set
y2_pred_train = RF2.predict(x2_train)
y2_pred = RF2.predict(x2_test)
# Combining the actual and predicted test values into a single df
df_results_v2 = pd.DataFrame({'Actual': y2_test, 'Predicted': y2_pred})
result_ml_plot(res = df_results_v2, dataset = df_avgprop_bal8chi_wa, kh='kavg_htst', max_val=14000)
metric_result_print(y2_train, y2_pred_train, y2_test, y2_pred)
      # mlflow.log_param("bootstrap", RF_setting['bootstrap'])
      # mlflow.log_param("max_depth", RF_setting['max_depth'])
      # mlflow.log_param("min_samples_leaf", RF_setting['min_samples_leaf'])
      # mlflow.log_param("min_samples_split", RF_setting['min_samples_split'])
      # mlflow.log_param("n_estimators", RF_setting['n_estimators'])

      # mlflow.log_metric("r2_test", r2_test)
      # mlflow.log_metric("mae_test", mae_test)
      # mlflow.log_metric("mse_test", mse_test)

      # mlflow.sklearn.log_model(RF, "RFR_models")

In [None]:
# Permutation importance for Bal VIII sand wa Chirag test set (model RF2, data x2/y2)
result_pi_test = permutation_importance(RF2, x2_test, y2_test, n_repeats=10, random_state=42, n_jobs=2)
# Order the features from least to most important by mean score drop
sorted_importances_idx = result_pi_test.importances_mean.argsort()
importances = pd.DataFrame(result_pi_test.importances[sorted_importances_idx].T,
                           columns=df_avgprop_bal8chi_wa.drop(['well','FORMATION','field','kavg_htst'], axis=1).columns[sorted_importances_idx])
ax = importances.plot.box(vert=False, whis=10)
ax.set_title("Permut Imp Bal VIII sand wa Chirag (test set)")
ax.axvline(x=0, color="k", linestyle="--")
# No scoring is passed, so the regressor's default score (R2) is used: the axis is a drop in R2, not accuracy
ax.set_xlabel("Decrease in R2 score")
ax.figure.tight_layout()

In [None]:
# Permutation importance for Bal VIII sand wa Chirag train set (model RF2, data x2/y2)
result_pi_train = permutation_importance(RF2, x2_train, y2_train, n_repeats=10, random_state=42, n_jobs=2)
# Order the features from least to most important by mean score drop
sorted_importances_idx_train = result_pi_train.importances_mean.argsort()
importances_train = pd.DataFrame(result_pi_train.importances[sorted_importances_idx_train].T,
                                 columns=df_avgprop_bal8chi_wa.drop(['well','FORMATION','field','kavg_htst'], axis=1).columns[sorted_importances_idx_train])
ax = importances_train.plot.box(vert=False, whis=10)
ax.set_title("Permut Imp Bal VIII sand wa Chirag (train set)")
ax.axvline(x=0, color="k", linestyle="--")
# No scoring is passed, so the regressor's default score (R2) is used: the axis is a drop in R2, not accuracy
ax.set_xlabel("Decrease in R2 score")
ax.figure.tight_layout()

#### Run RFR Bal8_sand wa Azeri

In [32]:
# Balakhany VIII sand, Azeri fields, weighted-average dataset:
# keep the no-outlier wells only and drop rows with any NaN
df_avgprop_bal8azr_wa = df_avgprop8_final_wa.loc[
    lambda d: d.FORMATION.str.contains('Balakhany VIII sand')
              & d.field.isin(azr_lst)
              & d.well.isin(well_no_outliers8)
].dropna()
# Target is kavg_htst; features are every column except the identifiers and the target
y2_1 = df_avgprop_bal8azr_wa['kavg_htst'].to_numpy(copy=True)
x2_1 = df_avgprop_bal8azr_wa.drop(columns=['well', 'FORMATION', 'field', 'kavg_htst']).to_numpy()
# 67/33 train/test split with a fixed seed
x2_1_train, x2_1_test, y2_1_train, y2_1_test = train_test_split(
    x2_1, y2_1, test_size=0.33, random_state=15)

In [None]:
# Display on map x_train & x_test for Azeri
fig = go.Figure()
field_avg_coord_chg = field_avg_coord[field_avg_coord.field.isin(chg_lst)]
# The platform trace below plots the Azeri platforms, so the Azeri subset must be defined
# in this cell; it used to be commented out and the cell relied on a leftover variable.
field_avg_coord_azr = field_avg_coord[field_avg_coord.field.isin(azr_lst)]
# Train and test wells: columns 0 and 1 of the feature matrix are X and Y; marker size scales with kavg_htst
fig.add_trace(go.Scatter(x=x2_1_train[:,0], y=x2_1_train[:,1], 
                         marker=dict(color='rgb(255, 255, 255)', size=y2_1_train*0.01, line=dict(color='rgb(252, 48, 3)', width=3)),
                         mode='markers', name='train set'))
fig.add_trace(go.Scatter(x=x2_1_test[:,0], y=x2_1_test[:,1], 
                         marker=dict(color='rgb(255, 255, 255)', size=y2_1_test*0.01, line=dict(color='rgb(52, 61, 235)', width=3)),
                         mode='markers', name='test set'))
# All wells of the dataset: size = kavg_htst, colour = TVD_SCS
fig.add_trace(go.Scatter(x=list(df_avgprop_bal8azr_wa.X), y=list(df_avgprop_bal8azr_wa.Y), customdata = df_avgprop_bal8azr_wa[['well', 'kavg_htst']],
                         marker=dict(color=df_avgprop_bal8azr_wa.TVD_SCS, size=df_avgprop_bal8azr_wa.kavg_htst*0.01, colorscale='Viridis_r',  showscale=True,
                         line=dict(color='rgb(47, 57, 61)', width=0.5)),
                         mode='markers', name='kavg_htst wells', hovertemplate="well:%{customdata[0]}, kavg_htst:%{customdata[1]}<extra></extra>"))
# Platform locations of the Azeri fields
fig.add_trace(go.Scatter(x=field_avg_coord_azr.X_wellhead, y=field_avg_coord_azr.Y_wellhead, customdata = field_avg_coord_azr[['field']],
                         text=field_avg_coord_azr['field'], textposition="middle right",
                         marker=dict(color='rgb(0, 0,0)', size=12),
                         mode='markers+text', name='Platforms', 
                         marker_symbol='square', hovertemplate="%{customdata[0]}<extra></extra>"))
fig.update_layout(title_text='Balakhany VIII sand KHtst, size=f(KHtst), color=f(TVD_SCS) for RFR prediction Azeri',
                  autosize=True, width=1000, height=500, margin=dict(l=10,r=10,b=10,t=50))
fig.update_layout(legend=dict( yanchor="top", y=1, xanchor="right", x=1, bgcolor='rgba(255,255,255,1)', bordercolor='Black',borderwidth=1))
fig.show()

In [None]:
# Grid search of the RandomForestRegressor hyper-parameters for Bal VIII sand Azeri (weighted avg).
# cv=None means the default 5-fold cross-validation; candidates are ranked by R2.
# The forest is seeded so that repeated searches return the same best_params_.
RF2_1 = RandomForestRegressor(random_state=42)
grid_param_RF2_1 = {
    'bootstrap': [True, False],
    'max_depth': [None, 10, 50, 75, 100, 150, 200, 500],
    'min_samples_leaf': [1, 2, 3, 5, 10],
    # An integer min_samples_split must be >= 2. The former value 1 is rejected by
    # scikit-learn, so every candidate using it was a failed fit scored as NaN.
    'min_samples_split': [2, 3, 5, 10, 20],
    'n_estimators': [10, 25, 50, 100, 200]}
gd_sr_RF2_1 = GridSearchCV(estimator=RF2_1, param_grid=grid_param_RF2_1, scoring='r2', cv=None, n_jobs=-1)
gd_sr_RF2_1.fit(x2_1_train, y2_1_train)
print(gd_sr_RF2_1.best_params_)

In [None]:
# RandomForestRegressor for Bal VIII sand Azeri weighted avg df_htst_avgprop-dataset 
RF_setting = {'bootstrap': True,
              'max_depth': None,
              'min_samples_leaf': 2,
              'min_samples_split': 3,
              'n_estimators': 10}
# Fixed seed: without it the bootstrap samples change on every run,
# so the metrics, the cross-plot and the importances below would not be reproducible.
RF2_1 = RandomForestRegressor(**RF_setting, random_state=42)
RF2_1.fit(x2_1_train, y2_1_train)
# Predictions for the train set (to judge over-fitting) and for the held-out test set
y2_1_pred_train = RF2_1.predict(x2_1_train)
y2_1_pred = RF2_1.predict(x2_1_test)
# Combining the actual and predicted test values into a single df
df_results_v2_1 = pd.DataFrame({'Actual': y2_1_test, 'Predicted': y2_1_pred})
result_ml_plot(res = df_results_v2_1, dataset = df_avgprop_bal8azr_wa, kh='kavg_htst', max_val=3000)
metric_result_print(y2_1_train, y2_1_pred_train, y2_1_test, y2_1_pred)

In [None]:
# Impurity-based feature importances of RF2_1 (Bal VIII sand wa Azeri), smallest first
feature_names = df_avgprop_bal8azr_wa.drop(columns=['well', 'FORMATION', 'field', 'kavg_htst']).columns
mdi_importances = pd.Series(RF2_1.feature_importances_, index=feature_names).sort_values()
ax = mdi_importances.plot.barh()
ax.set_title("RFR Feature Importances Bal VIII sand wa Azeri")
ax.figure.tight_layout()

In [None]:
# Permutation importance for Bal VIII sand wa Azeri test set
result_pi_test = permutation_importance(RF2_1, x2_1_test, y2_1_test, n_repeats=10, random_state=42, n_jobs=2)
# Order the features from least to most important by mean score drop
sorted_importances_idx = result_pi_test.importances_mean.argsort()
importances = pd.DataFrame(result_pi_test.importances[sorted_importances_idx].T,
                           columns=df_avgprop_bal8azr_wa.drop(['well','FORMATION','field','kavg_htst'], axis=1).columns[sorted_importances_idx])
ax = importances.plot.box(vert=False, whis=10)
ax.set_title("Permut Imp Bal VIII sand wa Azeri (test set)")
ax.axvline(x=0, color="k", linestyle="--")
# No scoring is passed, so the regressor's default score (R2) is used: the axis is a drop in R2, not accuracy
ax.set_xlabel("Decrease in R2 score")
ax.figure.tight_layout()

In [None]:
# Permutation importance for Bal VIII sand wa Azeri training set
result_pi_train = permutation_importance(RF2_1, x2_1_train, y2_1_train, n_repeats=10, random_state=42, n_jobs=2)
# Order the features from least to most important by mean score drop
sorted_importances_idx_train = result_pi_train.importances_mean.argsort()
importances_train = pd.DataFrame(result_pi_train.importances[sorted_importances_idx_train].T,
                                 columns=df_avgprop_bal8azr_wa.drop(['well','FORMATION','field','kavg_htst'], axis=1).columns[sorted_importances_idx_train])
ax = importances_train.plot.box(vert=False, whis=10)
ax.set_title("Permut Imp Bal VIII sand wa Azeri (train set)")
ax.axvline(x=0, color="k", linestyle="--")
# No scoring is passed, so the regressor's default score (R2) is used: the axis is a drop in R2, not accuracy
ax.set_xlabel("Decrease in R2 score")
ax.figure.tight_layout()

#### Run RFR Bal10_sand smpl

In [None]:
# Balakhany X sand, simple-average dataset:
# keep the no-outlier wells only and drop rows with any NaN
df_avgprop_bal10 = df_avgprop_final.loc[
    lambda d: d.FORMATION.str.contains('Balakhany X sand')
              & d.well.isin(well_no_outliers10)
].dropna()
# Target is kavg_htst; features are every column except the identifiers and the target
y3 = df_avgprop_bal10['kavg_htst'].to_numpy(copy=True)
x3 = df_avgprop_bal10.drop(columns=['well', 'FORMATION', 'field', 'kavg_htst']).to_numpy()
# 67/33 train/test split with a fixed seed
x3_train, x3_test, y3_train, y3_test = train_test_split(
    x3, y3, test_size=0.33, random_state=36)

In [None]:
# Grid search of the RandomForestRegressor hyper-parameters for Bal X sand (simple avg).
# cv=None means the default 5-fold cross-validation; candidates are ranked by R2.
# The forest is seeded so that repeated searches return the same best_params_.
RF3 = RandomForestRegressor(random_state=42)
grid_param_RF3 = {
    'bootstrap': [True, False],
    'max_depth': [None, 10, 50, 75, 100, 150, 200, 500],
    'min_samples_leaf': [1, 2, 3, 5, 10],
    # An integer min_samples_split must be >= 2. The former value 1 is rejected by
    # scikit-learn, so every candidate using it was a failed fit scored as NaN.
    'min_samples_split': [2, 3, 5, 10, 20],
    'n_estimators': [10, 25, 50, 100, 200]}
gd_sr_RF3 = GridSearchCV(estimator=RF3, param_grid=grid_param_RF3, scoring='r2', cv=None, n_jobs=-1)
gd_sr_RF3.fit(x3_train, y3_train)
print(gd_sr_RF3.best_params_)

In [None]:
# RandomForestRegressor for Bal X simple avg df_htst_avgprop-dataset 
RF_setting = {'bootstrap': True,
              'max_depth': 100,
              'min_samples_leaf': 2,
              'min_samples_split': 3,
              'n_estimators': 25}
# Fixed seed: without it the bootstrap samples change on every run,
# so the metrics and the cross-plot below would not be reproducible.
RF3 = RandomForestRegressor(**RF_setting, random_state=42)
RF3.fit(x3_train, y3_train)
# Predictions for the train set (to judge over-fitting) and for the held-out test set
y3_pred_train = RF3.predict(x3_train)
y3_pred = RF3.predict(x3_test)
# Combining the actual and predicted test values into a single df
df_results_v3 = pd.DataFrame({'Actual': y3_test, 'Predicted': y3_pred})
result_ml_plot(res = df_results_v3, dataset = df_avgprop_bal10, kh='kavg_htst', max_val=6000)
metric_result_print(y3_train, y3_pred_train, y3_test, y3_pred)

#### Run RFR Bal10_sand wa

In [None]:
# Balakhany X sand, weighted-average dataset:
# keep the no-outlier wells only and drop rows with any NaN
df_avgprop_bal10_wa = df_avgprop10_final_wa.loc[
    lambda d: d.FORMATION.str.contains('Balakhany X sand')
              & d.well.isin(well_no_outliers10)
].dropna()
# Target is kavg_htst; features are every column except the identifiers and the target
y4 = df_avgprop_bal10_wa['kavg_htst'].to_numpy(copy=True)
x4 = df_avgprop_bal10_wa.drop(columns=['well', 'FORMATION', 'field', 'kavg_htst']).to_numpy()
# 67/33 train/test split with a fixed seed
x4_train, x4_test, y4_train, y4_test = train_test_split(
    x4, y4, test_size=0.33, random_state=98)

In [None]:
# Grid search of the RandomForestRegressor hyper-parameters for Bal X sand (weighted avg).
# cv=None means the default 5-fold cross-validation; candidates are ranked by R2.
# The forest is seeded so that repeated searches return the same best_params_.
RF4 = RandomForestRegressor(random_state=42)
grid_param_RF4 = {'bootstrap': [True, False],
                  'max_depth': [None, 10, 50, 75, 100, 150, 200, 500],
                  'min_samples_leaf': [1, 2, 3, 5, 10],
                  # An integer min_samples_split must be >= 2. The former value 1 is rejected by
                  # scikit-learn, so every candidate using it was a failed fit scored as NaN.
                  'min_samples_split': [2, 3, 5, 10, 20],
                  'n_estimators': [10, 25, 50, 100, 200]}
gd_sr_RF4 = GridSearchCV(estimator=RF4, param_grid=grid_param_RF4, scoring='r2', cv=None, n_jobs=-1)
gd_sr_RF4.fit(x4_train, y4_train)
print(gd_sr_RF4.best_params_)

In [None]:
# RandomForestRegressor for Bal X weighted avg df_htst_avgprop-dataset 
RF_setting = {'bootstrap': True,
              'max_depth': 100,
              'min_samples_leaf': 1,
              'min_samples_split': 5,
              'n_estimators': 10}
# Fixed seed: without it the bootstrap samples change on every run,
# so the metrics, the cross-plot and the importances below would not be reproducible.
RF4 = RandomForestRegressor(**RF_setting, random_state=42)
RF4.fit(x4_train, y4_train)
# Predictions for the train set (to judge over-fitting) and for the held-out test set
y4_pred_train = RF4.predict(x4_train)
y4_pred = RF4.predict(x4_test)
# Combining the actual and predicted test values into a single df
df_results_v4 = pd.DataFrame({'Actual': y4_test, 'Predicted': y4_pred})
result_ml_plot(res = df_results_v4, dataset = df_avgprop_bal10_wa, kh='kavg_htst', max_val=6000)
metric_result_print(y4_train, y4_pred_train, y4_test, y4_pred)

In [None]:
# Permutation importance for Bal X sand wa test set
result_pi_test = permutation_importance(RF4, x4_test, y4_test, n_repeats=10, random_state=42, n_jobs=2)
# Order the features from least to most important by mean score drop
sorted_importances_idx = result_pi_test.importances_mean.argsort()
importances = pd.DataFrame(result_pi_test.importances[sorted_importances_idx].T,
                           columns=df_avgprop_bal10_wa.drop(['well','FORMATION','field','kavg_htst'], axis=1).columns[sorted_importances_idx])
ax = importances.plot.box(vert=False, whis=10)
ax.set_title("Permut Imp Bal X sand wa (test set)")
ax.axvline(x=0, color="k", linestyle="--")
# No scoring is passed, so the regressor's default score (R2) is used: the axis is a drop in R2, not accuracy
ax.set_xlabel("Decrease in R2 score")
ax.figure.tight_layout()

In [None]:
# Permutation importance for Bal X sand wa train set
result_pi_train = permutation_importance(RF4, x4_train, y4_train, n_repeats=10, random_state=42, n_jobs=2)
# Order the features from least to most important by mean score drop
sorted_importances_idx_train = result_pi_train.importances_mean.argsort()
importances_train = pd.DataFrame(result_pi_train.importances[sorted_importances_idx_train].T,
                                 columns=df_avgprop_bal10_wa.drop(['well','FORMATION','field','kavg_htst'], axis=1).columns[sorted_importances_idx_train])
ax = importances_train.plot.box(vert=False, whis=10)
ax.set_title("Permut Imp Bal X sand wa (train set)")
ax.axvline(x=0, color="k", linestyle="--")
# No scoring is passed, so the regressor's default score (R2) is used: the axis is a drop in R2, not accuracy
ax.set_xlabel("Decrease in R2 score")
ax.figure.tight_layout()

## Creation of the dist-kh dataset

In [None]:
# Loading k_htst data from csv-file
path = 'C:\\jupyter\\SPP\\inputoutput\\'
df_khtst = pd.read_csv(path + 'df_prq_khtst_v3.csv')
# Attach to every (well, FORMATION) row the X / Y / TVD_SCS of its first df_prq row.
# drop_duplicates(keep='first') picks the same row as groupby(...).apply(lambda x: x.iloc[0])
# without the deprecated apply-over-grouping-columns pattern; rows with a missing key are
# dropped first because groupby ignored them as well.
df_khtst_xy = df_khtst.set_index(['well','FORMATION']).join(
                                 df_prq[['well','FORMATION','X','Y','TVD_SCS']]
                                 .dropna(subset=['well','FORMATION'])
                                 .drop_duplicates(subset=['well','FORMATION'], keep='first')
                                 .set_index(['well','FORMATION'])).reset_index()
# Pairwise Euclidean distances between the wells of one formation
def well_dist_calc(formation='Balakhany VIII sand'):
    """Return the well-to-well distance table for `formation`.

    Reads the global df_khtst_xy, keeps the rows of `formation` whose X and Y are
    both positive and computes Euclidean distances over (X, Y, TVD_SCS).
    The result has a leading 'well' column (row labels) followed by one distance
    column per well, in the same order as the rows.
    """
    has_location = (df_khtst_xy.X > 0) & (df_khtst_xy.Y > 0)
    located = df_khtst_xy[(df_khtst_xy.FORMATION == formation) & has_location]
    well_names = list(located.well)
    distances = pd.DataFrame(euclidean_distances(located[['X', 'Y', 'TVD_SCS']]), columns=well_names)
    # Row labels go into the first column, matching the column order of the matrix
    distances.insert(0, 'well', well_names)
    return distances
dist_bal8 = well_dist_calc(formation='Balakhany VIII sand')
dist_bal10 = well_dist_calc(formation='Balakhany X sand')

In [None]:
# Preparation of the Balakhany VIII sand dist-kh dataset for X_train/x_test data splitting
azr_lst = ['CENTRAL AZERI', 'WEST AZERI', 'EAST AZERI']
chg_lst = ['CHIRAG', 'DWG', 'DDGG', 'WEST CHIRAG']
well_clean_azr = df_khtst_bal_qcl[(df_khtst_bal_qcl.FORMATION == 'Balakhany VIII sand') & 
                                  (df_khtst_bal_qcl.field.isin(azr_lst))].well
well_clean_all = df_khtst_bal_qcl[(df_khtst_bal_qcl.FORMATION == 'Balakhany VIII sand')].well


def build_kh_dist_features(dist, kh_by_well, n_neighbors=3):
    """Return one row per well with the distances to, and KHtst of, its nearest wells.

    Parameters
    ----------
    dist : DataFrame
        'well' column followed by one distance column per well (layout of well_dist_calc).
        Assumes well names are unique within the table.
    kh_by_well : Series
        KHtst indexed by well name (unique index).
    n_neighbors : int
        Number of nearest wells to keep.

    Returns
    -------
    DataFrame with columns dist1..distN, kh1..khN, well. dist<i> and kh<i> always belong
    to the same neighbour, ordered from nearest to farthest; missing values are NaN.
    """
    dist_matrix = dist.set_index('well')
    records = []
    for well_name in dist_matrix.index:
        # Remove the well itself by name instead of assuming it sorts first
        # (a twin well at distance 0 could otherwise take its place).
        nearest = dist_matrix.loc[well_name].drop(well_name).astype(float).nsmallest(n_neighbors)
        record = {}
        for rank, (neighbour, distance) in enumerate(nearest.items(), start=1):
            record[f'dist{rank}'] = distance
            # Look the KH up per neighbour so kh<rank> matches dist<rank>
            record[f'kh{rank}'] = kh_by_well.get(neighbour, np.nan)
        record['well'] = well_name
        records.append(record)
    ranks = range(1, n_neighbors + 1)
    columns = [f'dist{i}' for i in ranks] + [f'kh{i}' for i in ranks] + ['well']
    return pd.DataFrame(records, columns=columns)


df_khtst_xy_bal8 = df_khtst_xy[df_khtst_xy.FORMATION=='Balakhany VIII sand'][['well', 'FORMATION', 'KHtst']]
kh_by_well_bal8 = df_khtst_xy_bal8.drop_duplicates('well').set_index('well')['KHtst']
df_well_kh_dist = build_kh_dist_features(dist_bal8, kh_by_well_bal8)
# Add the well's own KHtst (the target) and its field
df_well_kh_dist_bal8 = df_well_kh_dist.set_index('well').join(df_khtst_xy_bal8.set_index('well')).reset_index()
df_well_kh_dist_bal8_fld = df_well_kh_dist_bal8.set_index('well').join(metadata[['well','field']].set_index('well')).reset_index()
# Rows are usable only when the target and all three neighbour KH values are positive
kh_all_positive = (df_well_kh_dist_bal8_fld[['kh1', 'kh2', 'kh3', 'KHtst']] > 0).all(axis=1)
df_well_kh_dist_bal8_fld_azr = df_well_kh_dist_bal8_fld[(df_well_kh_dist_bal8_fld.field.isin(azr_lst)) & 
                                                        (df_well_kh_dist_bal8_fld.well.isin(well_clean_azr)) &
                                                        kh_all_positive].reset_index(drop=True)
df_well_kh_dist_bal8_fld_all = df_well_kh_dist_bal8_fld[(df_well_kh_dist_bal8_fld.well.isin(well_clean_all)) &
                                                        kh_all_positive].reset_index(drop=True)
# df_well_kh_dist_bal8_fld_azr[['kh1','kh2', 'kh3', 'KHtst' ]] = df_well_kh_dist_bal8_fld_azr[['kh1','kh2', 'kh3','KHtst' ]].apply(lambda x: np.log10(x))
# df_well_kh_dist_bal8_fld_azr = df_well_kh_dist_bal8_fld_azr[~((df_well_kh_dist_bal8_fld_azr.well.str.contains('Z')) | 
#                                                              (df_well_kh_dist_bal8_fld_azr.well.str.contains('Y')))].reset_index()

In [None]:
# Preparation of the dist-kh dataset for Balakhany X sand.
# This cell used to be a verbatim copy of the Balakhany VIII preparation (dist_bal8 and
# 'Balakhany VIII sand' everywhere). It now builds the Balakhany X data from dist_bal10 under
# its own *_bal10 names and leaves the Balakhany VIII frames untouched.
# Needs azr_lst and metadata to be defined already.
well_clean_azr_bal10 = df_khtst_bal_qcl[(df_khtst_bal_qcl.FORMATION == 'Balakhany X sand') &
                                        (df_khtst_bal_qcl.field.isin(azr_lst))].well
well_clean_all_bal10 = df_khtst_bal_qcl[(df_khtst_bal_qcl.FORMATION == 'Balakhany X sand')].well


def build_kh_dist_features(dist, kh_by_well, n_neighbors=3):
    """Return one row per well with the distances to, and KHtst of, its nearest wells.

    Parameters
    ----------
    dist : DataFrame
        'well' column followed by one distance column per well (layout of well_dist_calc).
        Assumes well names are unique within the table.
    kh_by_well : Series
        KHtst indexed by well name (unique index).
    n_neighbors : int
        Number of nearest wells to keep.

    Returns
    -------
    DataFrame with columns dist1..distN, kh1..khN, well. dist<i> and kh<i> always belong
    to the same neighbour, ordered from nearest to farthest; missing values are NaN.
    """
    dist_matrix = dist.set_index('well')
    records = []
    for well_name in dist_matrix.index:
        # Remove the well itself by name instead of assuming it sorts first
        # (a twin well at distance 0 could otherwise take its place).
        nearest = dist_matrix.loc[well_name].drop(well_name).astype(float).nsmallest(n_neighbors)
        record = {}
        for rank, (neighbour, distance) in enumerate(nearest.items(), start=1):
            record[f'dist{rank}'] = distance
            # Look the KH up per neighbour so kh<rank> matches dist<rank>
            record[f'kh{rank}'] = kh_by_well.get(neighbour, np.nan)
        record['well'] = well_name
        records.append(record)
    ranks = range(1, n_neighbors + 1)
    columns = [f'dist{i}' for i in ranks] + [f'kh{i}' for i in ranks] + ['well']
    return pd.DataFrame(records, columns=columns)


df_khtst_xy_bal10 = df_khtst_xy[df_khtst_xy.FORMATION=='Balakhany X sand'][['well', 'FORMATION', 'KHtst']]
kh_by_well_bal10 = df_khtst_xy_bal10.drop_duplicates('well').set_index('well')['KHtst']
df_well_kh_dist_b10 = build_kh_dist_features(dist_bal10, kh_by_well_bal10)
# Add the well's own KHtst (the target) and its field
df_well_kh_dist_bal10 = df_well_kh_dist_b10.set_index('well').join(df_khtst_xy_bal10.set_index('well')).reset_index()
df_well_kh_dist_bal10_fld = df_well_kh_dist_bal10.set_index('well').join(metadata[['well','field']].set_index('well')).reset_index()
# Rows are usable only when the target and all three neighbour KH values are positive
kh_all_positive_bal10 = (df_well_kh_dist_bal10_fld[['kh1', 'kh2', 'kh3', 'KHtst']] > 0).all(axis=1)
df_well_kh_dist_bal10_fld_azr = df_well_kh_dist_bal10_fld[(df_well_kh_dist_bal10_fld.field.isin(azr_lst)) &
                                                          (df_well_kh_dist_bal10_fld.well.isin(well_clean_azr_bal10)) &
                                                          kh_all_positive_bal10].reset_index(drop=True)
df_well_kh_dist_bal10_fld_all = df_well_kh_dist_bal10_fld[(df_well_kh_dist_bal10_fld.well.isin(well_clean_all_bal10)) &
                                                          kh_all_positive_bal10].reset_index(drop=True)
# df_well_kh_dist_bal10_fld_azr[['kh1','kh2', 'kh3', 'KHtst' ]] = df_well_kh_dist_bal10_fld_azr[['kh1','kh2', 'kh3','KHtst' ]].apply(lambda x: np.log10(x))
# df_well_kh_dist_bal10_fld_azr = df_well_kh_dist_bal10_fld_azr[~((df_well_kh_dist_bal10_fld_azr.well.str.contains('Z')) | 
#                                                                (df_well_kh_dist_bal10_fld_azr.well.str.contains('Y')))].reset_index()

### The loop with RFR for dist-kh dataset

In [None]:
# Settings for ML-model
RF_setting = {'bootstrap':True, 
              'max_depth':20, 
              'min_samples_leaf':2, 
              'min_samples_split':2,
              'n_estimators':5}
# Leave-one-out loop: every well is predicted by a model trained on all the other wells
feature_cols = ['dist1', 'dist2', 'dist3', 'kh1', 'kh2', 'kh3']
y_test_lst = []
y_pred_lst = []
well_exclude_lst = []
for i in tqdm(range(len(df_well_kh_dist_bal8_fld_all))):
    # The frame has a 0..n-1 index, so label i is also position i
    df_wo_well = df_well_kh_dist_bal8_fld_all.drop([i])
    held_out = df_well_kh_dist_bal8_fld_all.iloc[i]
    well_exclude = held_out['well']
    well_exclude_lst.append(well_exclude)
    y_train = np.array(df_wo_well['KHtst'])
    x_train = np.array(df_wo_well[feature_cols], dtype=float)
    y_test = float(held_out['KHtst'])
    x_test = np.array(held_out[feature_cols], dtype=float)
    y_test_lst.append(y_test)
    # Seeded model: otherwise the in/out counts and the metrics change on every run
    RF = RandomForestRegressor(**RF_setting, random_state=42)
    RF.fit(x_train, y_train)
    y_pred = RF.predict([x_test]).round(0) 
    y_pred_lst.append(y_pred[0])
# Building up of dataframe: a prediction is 'in' when it lies within +/-25% of the actual value
res_rfr = pd.DataFrame({'test': y_test_lst, 'predict': y_pred_lst, 'well_excl': well_exclude_lst})
res_rfr['l_test'] = res_rfr.test*0.75
res_rfr['h_test'] = res_rfr.test*1.25
res_rfr['qc'] = 'out'
res_rfr.loc[(res_rfr.predict >= res_rfr.l_test) & (res_rfr.predict <= res_rfr.h_test), 'qc'] = 'in'
# Count by comparison rather than value_counts()[...], which raises KeyError when a class is absent
n_total = res_rfr.shape[0]
n_out = int((res_rfr.qc == 'out').sum())
n_in = int((res_rfr.qc == 'in').sum())
print('wells total:', n_total)
print('wells unpredicted:', n_out, round(n_out / n_total, 3), 'v/v')
print('wells predicted:', n_in, round(n_in / n_total, 3), 'v/v')
# Built-in round() works whether the metric comes back as a numpy or a plain Python float
mae_df_xy = round(mae(res_rfr.test, res_rfr.predict), 0)
r2_df_xy = round(r2(res_rfr.test, res_rfr.predict), 2)
print('mae:', mae_df_xy, 'mDm')
print('r2:', r2_df_xy)
# Making up the final x-plot
max_val = 14000
# Explicit colour map: with a plain colour sequence the colours follow the order in which the
# qc values first appear, so 'in' could end up red.
fig1_ml = px.scatter(res_rfr, x='test', y='predict', color='qc', hover_data=['well_excl'], width=400, height=400,
                     color_discrete_map={'out': 'red', 'in': 'green'})
fig1_ml.update_traces(marker=dict(size=10,opacity=0.75,line=dict(color='rgb(47, 57, 61)', width=1)))
# 1:1 line and the +/-25% envelope
fig2_ml=px.line(x=[0,max_val], y=[0,max_val])
fig2_1_ml=px.line(x=[0,max_val], y=[0,max_val*1.25])
fig2_2_ml=px.line(x=[0,max_val], y=[0,max_val*0.75])
fig2_ml.update_traces(line=dict(color = 'blue'))
fig2_1_ml.update_traces(line=dict(color = 'blue', dash='dash'))
fig2_2_ml.update_traces(line=dict(color = 'blue', dash='dash'))
fig3_ml = go.Figure(data = fig1_ml.data + fig2_ml.data + fig2_1_ml.data + fig2_2_ml.data)
fig3_ml.update_layout(title = 'Comparison Actual vs Pred dist-kh RFR',width=600,height=400, xaxis_title='test', yaxis_title='predict',
                      margin=dict(l=10,r=10,b=10,t=40))

### Run RFR for Bal VIII sand train/test for azeri

In [None]:
# X_train/x_test data splitting
# The split is done first so the grid search below is fitted on THIS cell's data,
# not on whatever x_train / y_train an earlier cell left in the kernel.
# NOTE(review): df_well_kh_dist_bal8_fld_azr is not created in this cell - confirm it
# exists before this cell on a fresh "Restart & Run All".
y = np.array(df_well_kh_dist_bal8_fld_azr['KHtst'].values)
x = np.array(df_well_kh_dist_bal8_fld_azr.drop(['well','FORMATION','field','KHtst'], axis=1))
w = np.array(df_well_kh_dist_bal8_fld_azr['well'].values)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

#Gridsearch test run for RandForRegr Bal VIII sand
RF = RandomForestRegressor()
grid_param_RF = {
    'bootstrap': [True, False],
    'max_depth': [None, 10, 50, 75, 100, 150, 200, 500],
    'min_samples_leaf': [1, 2, 3, 5, 10],
    # An integer min_samples_split must be >= 2; the former value 1 only produced failed fits.
    'min_samples_split': [2, 3, 5, 10, 20],
    'n_estimators': [10, 25, 50, 100, 200]}
gd_sr_RF = GridSearchCV(estimator = RF, param_grid = grid_param_RF, scoring='r2', cv = None, n_jobs = -1)
gd_sr_RF.fit(x_train, y_train)
print(gd_sr_RF.best_params_)

# Hand-picked hyper-parameters used for the actual run (also reused by later cells).
RF_setting = {'bootstrap':True, 
              'max_depth':75, 
              'min_samples_leaf':1, 
              'min_samples_split':10,
              'n_estimators':25}
RF = RandomForestRegressor(bootstrap= RF_setting['bootstrap'], 
                           max_depth=RF_setting['max_depth'], 
                           min_samples_leaf=RF_setting['min_samples_leaf'], 
                           min_samples_split=RF_setting['min_samples_split'], 
                           n_estimators=RF_setting['n_estimators']) 
# RF = Pipeline([("scaler",StandardScaler()),("rfr",RandomForestRegressor(bootstrap= RF_setting['bootstrap'], 
#                                                                         max_depth=RF_setting['max_depth'], 
#                                                                         min_samples_leaf=RF_setting['min_samples_leaf'], 
#                                                                         min_samples_split=RF_setting['min_samples_split'], 
#                                                                         n_estimators=RF_setting['n_estimators']))])
# #hard version with target transformer
# ttr = TransformedTargetRegressor(regressor=RandomForestRegressor(bootstrap= RF_setting['bootstrap'], 
#                                                                  max_depth=RF_setting['max_depth'], 
#                                                                  min_samples_leaf=RF_setting['min_samples_leaf'], 
#                                                                  min_samples_split=RF_setting['min_samples_split'], 
#                                                                  n_estimators=RF_setting['n_estimators']), 
#                                 transformer = StandardScaler())
# RF = Pipeline([("scaler",StandardScaler()),("RF",ttr)])

RF.fit(x_train, y_train)
#Returning our prediction values for the test data
y_pred_train = RF.predict(x_train)
y_pred = RF.predict(x_test)
# # Converting log-data to natural values
# y_pred_nat = conv_log10_nat(y_pred)
# y_test_nat = conv_log10_nat(y_test)
# y_pred_train_nat = conv_log10_nat(y_pred_train)
# y_train_nat = conv_log10_nat(y_train)
#Combining the actual and predicted values into a single df
df_results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df_results_train = pd.DataFrame({'Actual': y_train, 'Predicted': y_pred_train})
# result_ml_plot / metric_result_print are helpers defined elsewhere in the notebook.
result_ml_plot(res = df_results, dataset = df_well_kh_dist_bal8_fld_azr, kh='KHtst', max_val=4000)
metric_result_print(y_train,y_pred_train,y_test, y_pred)
result_ml_plot(res = df_results_train, dataset = df_well_kh_dist_bal8_fld_azr, kh='KHtst', max_val=4000)

In [None]:
# Feature importances for Bal VIII sand, Azeri
feature_names = df_well_kh_dist_bal8_fld_azr.drop(['well','FORMATION','field','KHtst'], axis=1).columns
# RF is fitted above as a plain RandomForestRegressor, so its impurity-based importances
# are read straight from the estimator (mdi_importances was previously never assigned).
mdi_importances = pd.Series(RF.feature_importances_, index=feature_names).sort_values(ascending=True)
# mdi_importances = pd.Series(ttr.regressor_.feature_importances_, index=feature_names).sort_values(ascending=True) #for TransformedTargetRegressor
# mdi_importances = pd.Series(RF.steps[1][1].feature_importances_, index=feature_names).sort_values(ascending=True) #for pure Pipeline
ax = mdi_importances.plot.barh()
ax.set_title("RFR Feature Importances Bal VIII Azeri")
ax.figure.tight_layout()

## Creation of the xy-dist-kh dataset

In [None]:
#Reading csv with initial KHtst_v3, joining xy-coord & TVD_SCS tops of formation
path = 'C:\\jupyter\\SPP\\inputoutput\\'
df_khtst = pd.read_csv(path + 'df_prq_khtst_v3.csv')
# Left-join the coordinates onto the KHtst table by (well, FORMATION).
df_khtst_xy = df_khtst.set_index(['well','FORMATION']).join(xy_coord.set_index(['well','FORMATION'])).reset_index()
# The column is named KH_log10, so use the base-10 logarithm (np.log is the natural log).
df_khtst_xy['KH_log10'] = np.log10(df_khtst_xy.KHtst).round()
# Add the frame df_prq_tvdss by (well, FORMATION), then the per-well frame df_prq_wstat.
df_khtst_xy_tvd = df_khtst_xy.set_index(['well', 'FORMATION']).join(df_prq_tvdss.set_index(['well','FORMATION'])).reset_index()
df_khtst_xy_tvd = df_khtst_xy_tvd.set_index('well').join(df_prq_wstat.set_index('well')).reset_index()
# Attach the field taken from the first df_prq row of each well.
df_khtst_xy_tvd_fld = df_khtst_xy_tvd.set_index('well').join(df_prq.groupby('well')['field'].apply(lambda x: x.iloc[0])).reset_index()
# Outlier removal for the Balakhany VIII & X units, done separately for the AZR and CHG
# field groups: rows whose KHtst lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are dropped.
fm_list_8_10 = ['Balakhany VIII', 'Balakhany VIII sand', 'Balakhany VIII 25','Balakhany VIII 20', 
             'Balakhany VIII 15', 'Balakhany VIII 10', 'Balakhany VIII 5',
             'Balakhany X', 'Balakhany X sand', 'Balakhany X 40', 'Balakhany X 20'] 
azr_lst = ['CENTRAL AZERI', 'WEST AZERI', 'EAST AZERI']
chg_lst = ['CHIRAG', 'DWG', 'DDGG', 'WEST CHIRAG']
df_lst = []
# Azeri fields are processed first, then the Chirag group, formation by formation.
for field_group in (azr_lst, chg_lst):
    in_group = df_khtst_xy_tvd_fld.field.isin(field_group)
    for fm in fm_list_8_10:
        df_khtst_fm = df_khtst_xy_tvd_fld[(df_khtst_xy_tvd_fld.FORMATION == fm) & in_group]
        kh_values = df_khtst_fm['KHtst']
        Q1 = kh_values.quantile(0.25)
        Q3 = kh_values.quantile(0.75)
        IQR = Q3 - Q1
        too_low = kh_values < (Q1 - 1.5 * IQR)
        too_high = kh_values > (Q3 + 1.5 * IQR)
        # Keep every row that is neither below nor above the fences.
        df_khtst_fm_qcl = df_khtst_fm[~(too_low | too_high)]
        df_lst.append(df_khtst_fm_qcl)
df_khtst_bal_qcl = pd.concat(df_lst)
# Re-read the KHtst table and attach X / Y / TVD_SCS taken from df_prq.
path = 'C:\\jupyter\\SPP\\inputoutput\\'
df_khtst = pd.read_csv(path + 'df_prq_khtst_v3.csv')
# First df_prq row of every (well, FORMATION) group
# (presumably the formation top - depends on df_prq row order, TODO confirm).
fm_first_rows = df_prq[['well','FORMATION','X','Y','TVD_SCS']].groupby(
    ['well','FORMATION']).apply(lambda x: x.iloc[0])
fm_first_rows = fm_first_rows.drop(['well','FORMATION'], axis=1)
df_khtst_xy = df_khtst.set_index(['well','FORMATION']).join(fm_first_rows).reset_index()
# Wells that survived the outlier cleaning for Balakhany VIII sand:
# Azeri fields only, and all fields.
azr_lst = ['CENTRAL AZERI', 'WEST AZERI', 'EAST AZERI']
chg_lst = ['CHIRAG', 'DWG', 'DDGG', 'WEST CHIRAG']
is_bal8_sand = df_khtst_bal_qcl.FORMATION == 'Balakhany VIII sand'
well_clean_azr = df_khtst_bal_qcl[is_bal8_sand & (df_khtst_bal_qcl.field.isin(azr_lst))].well
well_clean_all = df_khtst_bal_qcl[is_bal8_sand].well
#Calculation of Euclidean Distances for the top of Balakhany VIII sand.
def well_dist_calc(formation='Balakhany VIII sand'):
    """Return the pairwise well-to-well distance table for one formation.

    Rows of the global ``df_khtst_xy`` with the given FORMATION and with
    X > 0 and Y > 0 are used. Distances are Euclidean over the columns
    (X, Y, TVD_SCS).

    Returns a DataFrame whose first column ``well`` holds the row's well name,
    followed by one distance column per well, in the same order as the rows.
    """
    has_xy = (df_khtst_xy.X > 0) & (df_khtst_xy.Y > 0)
    fm_rows = df_khtst_xy[(df_khtst_xy.FORMATION == formation) & has_xy]
    well_names = list(fm_rows.well)
    dist_table = pd.DataFrame(euclidean_distances(fm_rows[['X', 'Y', 'TVD_SCS']]), columns=well_names)
    # Put the well name in front so each row is labelled.
    dist_table.insert(0, 'well', well_names)
    return dist_table
dist_bal8 = well_dist_calc('Balakhany VIII sand')
dist_bal10 = well_dist_calc('Balakhany X sand')  
#Collecting XY based on Euclidean Distances for the top of Balakhany VIII sand.
# For every well: X, Y and KHtst of its three nearest neighbours.
df_collect = []
# Loop-invariant: Balakhany VIII sand rows with valid coordinates.
data = df_khtst_xy[(df_khtst_xy.FORMATION == 'Balakhany VIII sand') & (df_khtst_xy.X > 0) & (df_khtst_xy.Y > 0)]
for num, well_name in enumerate(dist_bal8.well[:]):
    # Row of distances for this well, sorted ascending; position 0 is the well itself
    # (distance 0), so [1:4] yields the names of the three nearest neighbours.
    # (A stray statement that read well_dist3 before this assignment was removed.)
    well_dist3 = dist_bal8[dist_bal8.well == well_name].T[1:].sort_values(by=num)[1:4].reset_index()['index']
    # Transposed (well, X, Y) frame of the neighbours: row 1 holds X, row 2 holds Y.
    neighbours_xy = data[data.well.isin(well_dist3)][['well','X','Y']].T
    well_dist3_x = neighbours_xy[1:2].reset_index().drop('index', axis=1)
    well_dist3_y = neighbours_xy[2:3].reset_index().drop('index', axis=1)
    well_dist3_y.columns =['y1', 'y2', 'y3']
    well_dist3_x.columns =['x1', 'x2', 'x3']
    # Both masks now come from df_khtst itself instead of relying on df_khtst and
    # df_khtst_xy sharing the same row index.
    well_kh3 = df_khtst[(df_khtst.well.isin(list(well_dist3))) & 
                        (df_khtst.FORMATION == 'Balakhany VIII sand')]['KHtst'].reset_index()
    well_kh3 = well_kh3.T
    well_kh3_res = well_kh3.reset_index()[1:].drop('index', axis=1).reset_index().drop('index', axis=1)
    well_kh3_res.columns =['kh1', 'kh2', 'kh3']
    concat_df = pd.concat([well_dist3_x, well_dist3_y, well_kh3_res], axis=1)
    result = concat_df.join(pd.DataFrame([well_name], columns=['well']))
    df_collect.append(result)
df_well_kh_xy = pd.concat(df_collect).reset_index().drop('index', axis=1)
# Attach the well's own KHtst and its field to the neighbour features.
bal8_rows = df_khtst_xy.FORMATION=='Balakhany VIII sand'
df_khtst_xy_bal8 = df_khtst_xy.loc[bal8_rows, ['well', 'FORMATION', 'KHtst']]
df_well_kh_xy_bal8 = df_well_kh_xy.set_index('well').join(df_khtst_xy_bal8.set_index('well')).reset_index()
well_fields = metadata[['well','field']].set_index('well')
df_well_kh_xy_bal8_fld = df_well_kh_xy_bal8.set_index('well').join(well_fields).reset_index()
# Rows kept for modelling: outlier-cleaned wells with positive KHtst for the well
# itself and for all three neighbours.
usable_rows = ((df_well_kh_xy_bal8_fld.well.isin(well_clean_all)) &
               (df_well_kh_xy_bal8_fld.kh1>0) &
               (df_well_kh_xy_bal8_fld.kh2>0) &
               (df_well_kh_xy_bal8_fld.kh3>0) &
               (df_well_kh_xy_bal8_fld.KHtst > 0))
# Making up dataset with xy for azeri field
azeri_rows = df_well_kh_xy_bal8_fld.field.isin(azr_lst)
df_well_kh_xy_bal8_fld_azr = df_well_kh_xy_bal8_fld[azeri_rows & usable_rows].reset_index(drop=True)
# Making up dataset with xy for chirag & azeri fields
df_well_kh_xy_bal8_fld_all = df_well_kh_xy_bal8_fld[usable_rows].reset_index(drop=True)
df_well_kh_xy_bal8_fld_all.head(3)

### The loop with RFR for xy-kh dataset

In [None]:
# Leave-one-well-out loop with RFR on the xy-kh dataset (chirag & azeri fields):
# each row is held out in turn, the model is fitted on the rest and predicts the held-out KHtst.
# NOTE(review): RF_setting comes from an earlier cell - confirm that cell runs before this one.
feature_cols = ['x1', 'x2', 'x3', 'y1', 'y2', 'y3', 'kh1', 'kh2', 'kh3']
y_test_lst = []
y_pred_lst = []
well_exclude_lst = []
for i in tqdm(range(len(df_well_kh_xy_bal8_fld_all))):
    # drop([i]) is label-based and iloc[i] positional; they agree because the frame's index was reset.
    df_wo_well = df_well_kh_xy_bal8_fld_all.drop([i])
    held_out = df_well_kh_xy_bal8_fld_all.iloc[i]
    well_exclude = held_out['well']
    well_exclude_lst.append(well_exclude)
    y_train = np.array(df_wo_well['KHtst'])
    # Explicit float arrays instead of object arrays.
    x_train = np.array(df_wo_well[feature_cols], dtype=float)
    well_train = np.array(df_wo_well['well'])
    y_test = float(held_out['KHtst'])
    y_test_lst.append(y_test)
    x_test = np.array(held_out[feature_cols], dtype=float)
    # Statement of ML-model
    RF = RandomForestRegressor(bootstrap= RF_setting['bootstrap'], 
                               max_depth=RF_setting['max_depth'], 
                               min_samples_leaf=RF_setting['min_samples_leaf'], 
                               min_samples_split=RF_setting['min_samples_split'], 
                               n_estimators=RF_setting['n_estimators'])
    RF.fit(x_train, y_train)
    y_pred = RF.predict([x_test]).round(0) 
    y_pred_lst.append(y_pred[0])
# Building up of dataframe
res_rfrxy = pd.DataFrame(zip(y_test_lst,y_pred_lst,well_exclude_lst), columns = ['test','predict','well_excl'])
# A prediction is flagged 'in' when it lies within +/-25 % of the tested value
# (this QC block used to be written out twice).
res_rfrxy['l_test'] = res_rfrxy.test*0.75
res_rfrxy['h_test'] = res_rfrxy.test*1.25
res_rfrxy['qc'] = 'out'
res_rfrxy.loc[(res_rfrxy.predict >= res_rfrxy.l_test) & (res_rfrxy.predict <= res_rfrxy.h_test), 'qc'] = 'in'
# Count with boolean masks: value_counts()['out'] raises KeyError when a class is absent.
n_wells = res_rfrxy.shape[0]
n_out = int((res_rfrxy['qc'] == 'out').sum())
n_in = int((res_rfrxy['qc'] == 'in').sum())
print('wells total:', n_wells)
print('wells unpredicted:', n_out, round(n_out/n_wells, 3), 'v/v')
print('wells predicted:', n_in, round(n_in/n_wells, 3), 'v/v')
# Built-in round() works whether the metric is returned as a Python or a NumPy float.
mae_df_xy = round(mae(res_rfrxy.test, res_rfrxy.predict), 0)
r2_df_xy = round(r2(res_rfrxy.test, res_rfrxy.predict), 2)
print('mae:', mae_df_xy, 'mDm')
print('r2:', r2_df_xy)
# Making up the final x-plot
max_val = 14000
# An explicit colour map keeps 'out' red and 'in' green regardless of which class appears first.
fig1_ml = px.scatter(res_rfrxy, x='test', y='predict', color='qc', hover_data=['well_excl'], width=400, height=400,
                     color_discrete_map={'out': 'red', 'in': 'green'})
fig1_ml.update_traces(marker=dict(size=10,opacity=0.75,line=dict(color='rgb(47, 57, 61)', width=1)))
# 1:1 line plus the +/-25 % acceptance envelope.
fig2_ml=px.line(x=[0,max_val], y=[0,max_val])
fig2_1_ml=px.line(x=[0,max_val], y=[0,max_val*1.25])
fig2_2_ml=px.line(x=[0,max_val], y=[0,max_val*0.75])
fig2_ml.update_traces(line=dict(color = 'blue'))
fig2_1_ml.update_traces(line=dict(color = 'blue', dash='dash'))
fig2_2_ml.update_traces(line=dict(color = 'blue', dash='dash'))
fig3_ml = go.Figure(data = fig1_ml.data + fig2_ml.data + fig2_1_ml.data + fig2_2_ml.data)
fig3_ml.update_layout(title = 'Comparison Actual vs Pred xy-kh RFR',width=600,height=400, xaxis_title='test', yaxis_title='predict',
                      margin=dict(l=10,r=10,b=10,t=40))

## Nadir's dataset based on ALL Bal8+10 FU

### Data preparation

In [None]:
# Load KHtst_v3 and enrich it, step by step, with coordinates, the df_prq_tvdss frame and the field name.
path = 'C:\\jupyter\\SPP\\inputoutput\\'
df_khtst = pd.read_csv(path + 'df_prq_khtst_v3.csv')
well_fm_key = ['well','FORMATION']
df_khtst_xy = df_khtst.set_index(well_fm_key).join(xy_coord.set_index(well_fm_key)).reset_index()
df_khtst_xy_tvd = df_khtst_xy.set_index(well_fm_key).join(df_prq_tvdss.set_index(well_fm_key)).reset_index()
# Field of a well = field of its first row in df_prq.
field_by_well = df_prq.groupby('well')['field'].apply(lambda x: x.iloc[0])
df_khtst_xy_tvd_fld = df_khtst_xy_tvd.set_index('well').join(field_by_well).reset_index()
# Wells left after outlier cleaning, for any formation whose name contains 'Balakhany VIII'.
well_clean_all = df_khtst_bal_qcl[(df_khtst_bal_qcl.FORMATION.str.contains('Balakhany VIII'))].well
# Keep only the Balakhany VIII / Balakhany X rows and discard the DEPTH column.
is_bal_8_or_10 = (df_khtst_xy_tvd_fld.FORMATION.str.contains('Balakhany VIII') |
                  df_khtst_xy_tvd_fld.FORMATION.str.contains('Balakhany X'))
df_khtst_xy_tvd_fld_bal = df_khtst_xy_tvd_fld[is_bal_8_or_10].drop('DEPTH', axis=1)
# TST interval per (well, FORMATION) for all Balakhany VIII / X units:
# last TST value of the group minus the first one, rounded to whole units.
is_bal_fu = (df_prq.FORMATION.str.contains('Balakhany VIII')) | (df_prq.FORMATION.str.contains('Balakhany X'))
df_fu_tst = df_prq[is_bal_fu][['well', 'DEPTH','FORMATION','TST']]
tst_by_fu = df_fu_tst.groupby(['well','FORMATION'])['TST']
df_fu_tst_top = tst_by_fu.apply(lambda x: x.iloc[0]).reset_index().rename(columns={'TST':'TST_top'})
df_fu_tst_bot = tst_by_fu.apply(lambda x: x.iloc[-1]).reset_index().rename(columns={'TST':'TST_bot'})
df_fu_tst_final = df_fu_tst_top.set_index(['well','FORMATION']).join(df_fu_tst_bot.set_index(['well','FORMATION'])).reset_index()
df_fu_tst_final['TST_interv'] = (df_fu_tst_final.TST_bot - df_fu_tst_final.TST_top).round(0)
# Add coordinates, the df_prq_tvdss frame and the field name (first df_prq row of each well).
df_fu_tst_final = df_fu_tst_final.set_index(['well','FORMATION']).join(xy_coord.set_index(['well','FORMATION'])).reset_index()
df_fu_tst_final = df_fu_tst_final.set_index(['well', 'FORMATION']).join(df_prq_tvdss.set_index(['well','FORMATION'])).reset_index()
field_by_well = df_prq.groupby('well')['field'].apply(lambda x: x.iloc[0])
df_fu_tst_final = df_fu_tst_final.set_index('well').join(field_by_well).reset_index()
# Only strictly positive intervals are kept.
df_fu_tst_final = df_fu_tst_final[(df_fu_tst_final.TST_interv > 0)]
#Reading df_prq_htst_avgprop_v1 and getting outliers
path = 'C:\\jupyter\\SPP\\inputoutput\\' 
df_htst_avgprop = pd.read_csv(path + 'df_prq_htst_avgprop_v1.csv')
# Wells that passed the IQR outlier cleaning, per sand.
well_no_outliers8 = df_khtst_bal_qcl[df_khtst_bal_qcl.FORMATION == 'Balakhany VIII sand'].well.unique()
well_no_outliers10 = df_khtst_bal_qcl[df_khtst_bal_qcl.FORMATION == 'Balakhany X sand'].well.unique()
#Preparation weighted average df_htst_avgprop-dataset
cutoff_h_tst = 0.5
cutoff_perm_avg = 5
#Applying filtration to dataset with cutoffs
# .copy() gives an independent frame, so the column assignments below do not write
# into a slice of df_htst_avgprop (SettingWithCopyWarning pattern).
df_htst_avgprop_nz = df_htst_avgprop[(df_htst_avgprop.h_tst > cutoff_h_tst) &
                                     (df_htst_avgprop.md_perm_avg > cutoff_perm_avg)].copy()
#Multiplying htst by resprop values
df_htst_avgprop_nz['kavg_htst'] = df_htst_avgprop_nz.h_tst * df_htst_avgprop_nz.md_perm_avg
df_htst_avgprop_nz['phit_htst'] = df_htst_avgprop_nz.h_tst * df_htst_avgprop_nz.md_phit_avg
df_htst_avgprop_nz['vsh_htst'] = df_htst_avgprop_nz.h_tst * df_htst_avgprop_nz.md_vsh_avg
#Summarizing h_tst via well & formation
df_htst_fm = df_htst_avgprop_nz.groupby(['well','FORMATION'])['h_tst'].sum().reset_index()
df_htst_fm = df_htst_fm.rename(columns={'h_tst':'gross_tst'})
#Calculating weighted averages
# Thickness-weighted mean = sum(h_tst * property) / sum(h_tst) within each (well, FORMATION).
df_htst_avgprop_nz_avgpropsum = df_htst_avgprop_nz.groupby(['well','FORMATION'])[['phit_htst','vsh_htst']].sum().reset_index()
df_htst_avgprop_nz_avgpropsum_join = df_htst_avgprop_nz_avgpropsum.set_index(
                                     ['well','FORMATION']).join(df_htst_fm.set_index(['well','FORMATION'])).reset_index()
df_htst_avgprop_nz_avgpropsum_join['phit_wavg'] = df_htst_avgprop_nz_avgpropsum_join.phit_htst / df_htst_avgprop_nz_avgpropsum_join.gross_tst
df_htst_avgprop_nz_avgpropsum_join['vsh_wavg'] = df_htst_avgprop_nz_avgpropsum_join.vsh_htst / df_htst_avgprop_nz_avgpropsum_join.gross_tst
# Balakhany rows only: summed thickness with weighted averages, and summed h_tst * permeability.
df_bal_hpv = df_htst_avgprop_nz_avgpropsum_join[
              df_htst_avgprop_nz_avgpropsum_join.FORMATION.str.contains('Balakhany')][['well','FORMATION','gross_tst','phit_wavg','vsh_wavg']]     
df_bal_permh = df_htst_avgprop_nz[df_htst_avgprop_nz.FORMATION.str.contains('Balakhany')].groupby(['well','FORMATION'])['kavg_htst'].sum().reset_index()
df_bal_phhpv = df_bal_hpv.set_index(['well','FORMATION']).join(df_bal_permh.set_index(['well','FORMATION'])).reset_index()
# df_bal_phhpv
# Assemble the modelling table: averaged properties joined with the TST interval,
# coordinates and field, then one-hot encoded.
df_bal_phhpv_tstint = df_bal_phhpv.set_index(['well','FORMATION']).join(df_fu_tst_final.set_index(['well','FORMATION'])).reset_index()
df_bal_phhpv_tstint = df_bal_phhpv_tstint.rename(columns={'TST_interv':'interv_tst', 'gross_tst':'rock_tst'})
model_cols = ['well','FORMATION','X', 'Y','TVD_SCS','field','interv_tst','rock_tst', 'vsh_wavg', 'kavg_htst']
df_bal_phhpv_tstint = df_bal_phhpv_tstint[model_cols]
# Rows without a complete X / Y / TVD_SCS triple are not usable.
has_location = df_bal_phhpv_tstint.X.notna() & df_bal_phhpv_tstint.Y.notna() & df_bal_phhpv_tstint.TVD_SCS.notna()
df_bal_avgprop = df_bal_phhpv_tstint[has_location]
df_bal_avgprop_ohe = pd.get_dummies(df_bal_avgprop, columns = ['FORMATION', 'field'])
# Rotating field across the middle to reflect x and y more geologically sensible
def rotate(x, y, theta_deg=34, origin=None):
    """Rotate the point (x, y) counter-clockwise around an origin.

    Parameters
    ----------
    x, y : float
        Coordinates of the point to rotate.
    theta_deg : float, default 34
        Rotation angle in degrees.
    origin : tuple(float, float) or None, default None
        Centre of rotation (xo, yo). When None, the medians of the X and Y
        columns of the global ``df_khtst_xy`` are used.

    Returns
    -------
    list
        ``[xr, yr]`` - the rotated coordinates.
    """
    theta = (math.pi/180)*theta_deg
    if origin is None:
        # pandas' median skips NaN; statistics.median sorts the raw list and gives an
        # unreliable result as soon as a coordinate is missing.
        xo = df_khtst_xy['X'].median()
        yo = df_khtst_xy['Y'].median()
    else:
        xo, yo = origin
    xr = math.cos(theta)*(x-xo)-math.sin(theta)*(y-yo) + xo
    yr = math.sin(theta)*(x-xo)+math.cos(theta)*(y-yo) + yo
    return [xr,yr]
# Rotated coordinates: rotate() returns [xr, yr], expanded here into two new columns.
rotated_xy = df_bal_avgprop_ohe.apply(lambda row: rotate(row['X'], row['Y']), axis=1, result_type='expand')
df_bal_avgprop_ohe[['X_new', 'Y_new']] = rotated_xy
# Keep rows whose kavg_htst lies strictly between the two bounds.
kavg_low, kavg_high = 100, 13000
in_kavg_range = (df_bal_avgprop_ohe.kavg_htst < kavg_high) & (df_bal_avgprop_ohe.kavg_htst > kavg_low)
df_bal_avgprop_ohe = df_bal_avgprop_ohe[in_kavg_range]
print('features: ',df_bal_avgprop_ohe.columns)
print('dataset size: ',df_bal_avgprop_ohe.shape)

### Nadir's dataset 70/30 split RFR

In [None]:
# X_train/x_test data splitting
# The well name travels as column 0 of both arrays so it can be recovered after the
# shuffle; that makes the arrays object-typed, hence the float casts further down.
y = np.array(df_bal_avgprop_ohe[[   'well','kavg_htst']])
x = np.array(df_bal_avgprop_ohe[[   'well','X_new', 'Y_new','TVD_SCS', 'interv_tst', 'rock_tst', 'vsh_wavg',
                                    'FORMATION_Balakhany VIII', 'FORMATION_Balakhany VIII 10',
                                    'FORMATION_Balakhany VIII 15', 'FORMATION_Balakhany VIII 20',
                                    'FORMATION_Balakhany VIII 25', 'FORMATION_Balakhany VIII 5',
                                    'FORMATION_Balakhany VIII sand', 'FORMATION_Balakhany X',
                                    'FORMATION_Balakhany X 20', 'FORMATION_Balakhany X 40',
                                    'FORMATION_Balakhany X 50', 'FORMATION_Balakhany X sand',
                                    'field_CENTRAL AZERI', 'field_CHIRAG', 'field_DDGG', 'field_DWG',
                                    'field_EAST AZERI', 'field_WEST AZERI', 'field_WEST CHIRAG']])
# Random split seed; it is printed so a particular split can be reproduced.
num = random.randint(0,100)
print('num', num)
x_train_init, x_test_init, y_train_init, y_test_init = train_test_split(x, y, test_size=0.3, random_state=num)
# Taking well names from train/test datasets
y_train_wells = y_train_init[:,0]
y_test_wells = y_test_init[:,0]
# Strip the well-name column and convert the remaining object arrays to float.
x_train = x_train_init[:,1:].astype(float)
x_test = x_test_init[:,1:].astype(float)
y_train = y_train_init[:,1].astype(float)
y_test = y_test_init[:,1].astype(float)
#Gridsearch test run for RandForRegr Bal VIII sand
rfr_gr_sr = RandomForestRegressor()
grid_param_RFR = {
    'bootstrap': [True, False],
    'max_depth': [10, 50, 75, 100],
    'min_samples_leaf': [1, 2, 3, 5],
    # An integer min_samples_split must be >= 2; the former value 1 only produced failed fits.
    'min_samples_split': [2, 3, 5],
    'n_estimators': [10, 25, 50, 100]}
# MAE is an error, so it is negated for GridSearchCV, which maximises the score.
scorer = make_scorer(mae, greater_is_better=False)
gd_sr_RFR = GridSearchCV(estimator = rfr_gr_sr, param_grid = grid_param_RFR, scoring=scorer, cv = 15, n_jobs = -1)
gd_sr_RFR.fit(x_train, y_train)
GS_setting = gd_sr_RFR.best_params_
print(GS_setting)
# Applying Pipeline for ML-model
rfr = Pipeline([("scaler",StandardScaler()),("rfr",RandomForestRegressor(   bootstrap= GS_setting['bootstrap'], 
                                                                            max_depth=GS_setting['max_depth'], 
                                                                            min_samples_leaf=GS_setting['min_samples_leaf'], 
                                                                            min_samples_split=GS_setting['min_samples_split'], 
                                                                            n_estimators=GS_setting['n_estimators']))])
rfr.fit(x_train, y_train)
y_pred_train = rfr.predict(x_train)
y_pred_test = rfr.predict(x_test)
print('---------------------')
# Built-in round() works whether the metric is returned as a Python or a NumPy float.
print('r2_train', round(r2(y_train, y_pred_train), 2), 'x_train', x_train.shape)
print('r2_test', round(r2(y_test, y_pred_test), 2), 'x_test', x_test.shape)
print('mae_train', round(mae(y_train, y_pred_train), 0))
print('mae_test', round(mae(y_test, y_pred_test), 0))
# QC of predicted values for train & test datasets
# A prediction is flagged 'in' when it lies within +/-25 % of the actual value.
df_rfr_train = pd.DataFrame(zip(y_train_wells, y_train, y_pred_train), columns=['well', 'actual','predict'])
df_rfr_train['l_limit'] = df_rfr_train.actual*0.75
df_rfr_train['h_limit'] = df_rfr_train.actual*1.25
df_rfr_train['qc'] = 'out'
# The mask is built from df_rfr_train itself (it previously referenced df_gbr_train).
df_rfr_train.loc[(df_rfr_train.predict >= df_rfr_train.l_limit) & (df_rfr_train.predict <= df_rfr_train.h_limit), 'qc'] = 'in'
df_rfr_test = pd.DataFrame(zip(y_test_wells, y_test, y_pred_test), columns=['well', 'actual','predict'])
df_rfr_test['l_limit'] = df_rfr_test.actual*0.75
df_rfr_test['h_limit'] = df_rfr_test.actual*1.25
df_rfr_test['qc'] = 'out'
df_rfr_test.loc[(df_rfr_test.predict >= df_rfr_test.l_limit) & (df_rfr_test.predict <= df_rfr_test.h_limit), 'qc'] = 'in'

### Run RFR

In [None]:
# Leave-one-row-out loop with RFR over df_bal_avgprop_ohe: each row is held out in turn,
# the model is fitted on the remaining rows and predicts the held-out kavg_htst.
# NOTE(review): the table appears to hold one row per (well, FORMATION), so other units of
# the held-out well may remain in the training set - confirm this is intended.
y_test_lst = []
y_pred_test_lst = []
well_exclude_lst = []
gs_settings_lst = []
metrics_r2_mae_lst = []
# Loop-invariant pieces, hoisted out of the loop.
feature_cols = ['X_new', 'Y_new','TVD_SCS', 'interv_tst', 'rock_tst', 'vsh_wavg',
                'FORMATION_Balakhany VIII', 'FORMATION_Balakhany VIII 10',
                'FORMATION_Balakhany VIII 15', 'FORMATION_Balakhany VIII 20',
                'FORMATION_Balakhany VIII 25', 'FORMATION_Balakhany VIII 5',
                'FORMATION_Balakhany VIII sand', 'FORMATION_Balakhany X',
                'FORMATION_Balakhany X 20', 'FORMATION_Balakhany X 40',
                'FORMATION_Balakhany X 50', 'FORMATION_Balakhany X sand',
                'field_CENTRAL AZERI', 'field_CHIRAG', 'field_DDGG', 'field_DWG',
                'field_EAST AZERI', 'field_WEST AZERI', 'field_WEST CHIRAG']
# Fixed hyper-parameters (no search is run inside the loop)
# {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
grid_param_RFR = {  'bootstrap' : True,
                    'max_depth': [10],
                    'min_samples_leaf':[2],
                    'min_samples_split' : [5],
                    'n_estimators': [50]}
GS_setting = grid_param_RFR
gs_setting_record = (   GS_setting['bootstrap'],GS_setting['max_depth'],GS_setting['min_samples_leaf'], 
                        GS_setting['min_samples_split'], GS_setting['n_estimators'])
# Shuffle the rows (unseeded) and reset the index so that label i == position i below.
df_bal_avgprop_ohe_gbr = df_bal_avgprop_ohe.sample(frac = 1).reset_index().drop('index', axis=1)
for i in tqdm(range(len(df_bal_avgprop_ohe_gbr))):
    #Making up the feature and target datasets
    df_wo_well = df_bal_avgprop_ohe_gbr.drop([i])
    held_out = df_bal_avgprop_ohe_gbr.iloc[i]
    well_exclude = held_out['well']
    well_exclude_lst.append(well_exclude)
    y_train = np.array(df_wo_well['kavg_htst'])
    # Explicit float arrays instead of object arrays.
    x_train = np.array(df_wo_well[feature_cols], dtype=float)
    well_train = np.array(df_wo_well['well'])
    y_test = float(held_out['kavg_htst'])
    y_test_lst.append(y_test)
    x_test = np.array(held_out[feature_cols], dtype=float)
    gs_settings_lst.append(gs_setting_record)
    # Statement of ML-model
    rfr = Pipeline([("scaler",StandardScaler()),("rfr",RandomForestRegressor(   bootstrap= GS_setting['bootstrap'], 
                                                                                max_depth=GS_setting['max_depth'][0], 
                                                                                min_samples_leaf=GS_setting['min_samples_leaf'][0], 
                                                                                min_samples_split=GS_setting['min_samples_split'][0],
                                                                                n_estimators=GS_setting['n_estimators'][0]))])
    # Fitting the ML-model
    rfr.fit(x_train, y_train)
    y_pred_train = rfr.predict(x_train)
    y_pred_test = rfr.predict([x_test])
    y_pred_test_lst.append(y_pred_test[0])
    # Metrics computation for the ML-model (training fit of this fold)
    r2_train = round(r2(y_train, y_pred_train), 2)
    mae_train = mae(y_train, y_pred_train)
    metrics_r2_mae_lst.append((r2_train, round(mae_train, 0)))
# Building up of dataframe
res_rfr = pd.DataFrame( zip(y_test_lst,y_pred_test_lst,well_exclude_lst, gs_settings_lst), 
                        columns = ['test','predict','well', 'gs_setting',])
# A prediction is flagged 'in' when it lies within +/-25 % of the actual value.
res_rfr['l_test'] = res_rfr.test*0.75
res_rfr['h_test'] = res_rfr.test*1.25
res_rfr['qc'] = 'out'
res_rfr.loc[(res_rfr.predict >= res_rfr.l_test) & (res_rfr.predict <= res_rfr.h_test), 'qc'] = 'in'
# Count with boolean masks: value_counts()['out'] raises KeyError when a class is absent.
n_wells = res_rfr.shape[0]
n_out = int((res_rfr['qc'] == 'out').sum())
n_in = int((res_rfr['qc'] == 'in').sum())
print('wells total:', n_wells)
print('wells unpredicted:', n_out, round(n_out/n_wells, 3), 'v/v')
print('wells predicted:', n_in, round(n_in/n_wells, 3), 'v/v')
# Built-in round() works whether the metric is returned as a Python or a NumPy float.
mae_df_xy = round(mae(res_rfr.test, res_rfr.predict), 0)
r2_df_xy = round(r2(res_rfr.test, res_rfr.predict), 2)
print('mae:', mae_df_xy, 'mDm')
print('r2:', r2_df_xy)

### Reporting

In [None]:
# Making up the final x-plot: actual vs predicted for the RFR leave-one-out run (res_rfr)
max_val = 14000
# An explicit colour map keeps 'out' red and 'in' green regardless of which class
# appears first in the data (a colour sequence is assigned by order of appearance).
fig1_ml = px.scatter(res_rfr, x='test', y='predict', 
                     color='qc', 
                     hover_data=['well'], 
                     width=400, height=400,
                     color_discrete_map={'out': 'red', 'in': 'green'})
fig1_ml.update_traces(marker=dict(size=10,opacity=0.75,line=dict(color='rgb(47, 57, 61)', width=1)))
# 1:1 line plus the +/-25 % acceptance envelope.
fig2_ml=px.line(x=[0,max_val], y=[0,max_val])
fig2_1_ml=px.line(x=[0,max_val], y=[0,max_val*1.25])
fig2_2_ml=px.line(x=[0,max_val], y=[0,max_val*0.75])
fig2_ml.update_traces(line=dict(color = 'blue'))
fig2_1_ml.update_traces(line=dict(color = 'blue', dash='dash'))
fig2_2_ml.update_traces(line=dict(color = 'blue', dash='dash'))
fig3_ml = go.Figure(data = fig1_ml.data + fig2_ml.data + fig2_1_ml.data + fig2_2_ml.data)
# The plotted frame holds random-forest results, so the title says RFR (it used to say GBR).
fig3_ml.update_layout(title = 'Comparison Actual vs Pred xy-kh rotated full Balakhany RFR',width=600,height=400, xaxis_title='test', yaxis_title='predict',
                      margin=dict(l=10,r=10,b=10,t=40))

## Shahriyar request

#### Data preparation

In [35]:
# Load the KHtst table and attach X / Y / TVD_SCS taken from df_prq.
path = 'C:\\jupyter\\SPP\\inputoutput\\'
df_khtst = pd.read_csv(path + 'df_prq_khtst_v3.csv')
# First df_prq row of every (well, FORMATION) group
# (presumably the formation top - depends on df_prq row order, TODO confirm).
fm_first_rows = df_prq[['well','FORMATION','X','Y','TVD_SCS']].groupby(
    ['well','FORMATION']).apply(lambda x: x.iloc[0])
fm_first_rows = fm_first_rows.drop(['well','FORMATION'], axis=1)
df_khtst_xy = df_khtst.set_index(['well','FORMATION']).join(fm_first_rows).reset_index()
#Calculation of Euclidean Distances for the top of Balakhany VIII sand & Balakhany X sand
def well_dist_calc(formation='Balakhany VIII sand'):
    """Build the pairwise well-to-well distance table for one formation.

    Reads the notebook-level ``df_khtst_xy`` frame, keeps the rows of ``formation``
    whose X and Y are both positive, and computes Euclidean distances over the
    (X, Y, TVD_SCS) columns.

    Parameters
    ----------
    formation : str
        Value of the FORMATION column to select.

    Returns
    -------
    pandas.DataFrame
        One row per selected well, with a default integer index: a leading 'well'
        column followed by one column per selected well (named after that well)
        holding the distance between the row well and the column well.
    """
    data = df_khtst_xy[(df_khtst_xy.FORMATION == formation) & (df_khtst_xy.X > 0) & (df_khtst_xy.Y > 0)]
    well_names = list(data.well)
    distance_fm = pd.DataFrame(euclidean_distances(data[['X', 'Y', 'TVD_SCS']]), columns=well_names)
    # Put the row labels in front directly instead of the former
    # join -> set_index -> reset_index round-trip (which also had a discarded no-op call).
    distance_fm.insert(0, 'well', well_names)
    return distance_fm
dist_bal8 = well_dist_calc('Balakhany VIII sand')
dist_bal10 = well_dist_calc('Balakhany X sand')
# Preparation dataset for X_train/x_test data splitting
well_clean_8 = df_khtst_bal_qcl[(df_khtst_bal_qcl.FORMATION == 'Balakhany VIII sand')].well
well_clean_10 = df_khtst_bal_qcl[(df_khtst_bal_qcl.FORMATION == 'Balakhany X sand')].well


def _nearest_wells_kh(dist_fm, formation, n_neighbours=3):
    """Collect distance and KHtst of the nearest wells for every well of a formation.

    Parameters
    ----------
    dist_fm : pandas.DataFrame
        Output of ``well_dist_calc``: a 'well' column followed by a square distance
        matrix whose columns are in the same order as the rows.
    formation : str
        FORMATION value used to look up KHtst in the notebook-level ``df_khtst_xy``.
    n_neighbours : int
        Number of nearest wells to report.

    Returns
    -------
    pandas.DataFrame
        Columns dist1..distN, kh1..khN, well. ``khK`` is the KHtst of the K-th
        nearest well, i.e. it is aligned with ``distK``. Missing neighbours or
        missing KHtst values are NaN.
    """
    # KHtst per well for this formation; if a well occurs twice its first row is used.
    kh_by_well = (df_khtst_xy[df_khtst_xy.FORMATION == formation]
                  .drop_duplicates(subset='well')
                  .set_index('well')['KHtst'])
    well_names = list(dist_fm['well'])
    dist_values = dist_fm.drop(columns='well').to_numpy(dtype=float)
    dist_cols = ['dist%d' % rank for rank in range(1, n_neighbours + 1)]
    kh_cols = ['kh%d' % rank for rank in range(1, n_neighbours + 1)]
    records = []
    for pos, well_name in enumerate(well_names):
        # Exclude the well itself by position rather than by assuming it sorts first,
        # then order the remaining wells by ascending distance.
        others = np.delete(np.arange(len(well_names)), pos)
        nearest = others[np.argsort(dist_values[pos, others], kind='stable')][:n_neighbours]
        pad = [np.nan] * (n_neighbours - len(nearest))
        dists = list(dist_values[pos, nearest]) + pad
        # Look KHtst up by neighbour name in nearest-first order; selecting the rows with
        # isin() returned them in file order, so kh1..kh3 did not match dist1..dist3.
        khs = [kh_by_well.get(well_names[j], np.nan) for j in nearest] + pad
        records.append(dists + khs + [well_name])
    return pd.DataFrame(records, columns=dist_cols + kh_cols + ['well'])


def _keep_clean_positive(kh_dist_fld, clean_wells):
    """Keep QC-passed wells whose three neighbour kh values and own KHtst are all > 0."""
    keep = (kh_dist_fld.well.isin(clean_wells) &
            (kh_dist_fld.kh1 > 0) &
            (kh_dist_fld.kh2 > 0) &
            (kh_dist_fld.kh3 > 0) &
            (kh_dist_fld.KHtst > 0))
    return kh_dist_fld[keep].reset_index(drop=True)


# Balakhany VIII sand: neighbour features + the well's own KHtst + field label
df_well_kh_dist8 = _nearest_wells_kh(dist_bal8, 'Balakhany VIII sand')
df_khtst_xy_bal8 = df_khtst_xy[df_khtst_xy.FORMATION=='Balakhany VIII sand'][['well', 'FORMATION', 'KHtst']]
df_well_kh_dist_bal8 = df_well_kh_dist8.set_index('well').join(df_khtst_xy_bal8.set_index('well')).reset_index()
df_well_kh_dist_bal8_fld = df_well_kh_dist_bal8.set_index('well').join(metadata[['well','field']].set_index('well')).reset_index()
df_well_kh_dist_bal8_fld = _keep_clean_positive(df_well_kh_dist_bal8_fld, well_clean_8)

# Balakhany X sand: same steps
df_well_kh_dist10 = _nearest_wells_kh(dist_bal10, 'Balakhany X sand')
df_khtst_xy_bal10 = df_khtst_xy[df_khtst_xy.FORMATION=='Balakhany X sand'][['well', 'FORMATION', 'KHtst']]
df_well_kh_dist_bal10 = df_well_kh_dist10.set_index('well').join(df_khtst_xy_bal10.set_index('well')).reset_index()
df_well_kh_dist_bal10_fld = df_well_kh_dist_bal10.set_index('well').join(metadata[['well','field']].set_index('well')).reset_index()
df_well_kh_dist_bal10_fld = _keep_clean_positive(df_well_kh_dist_bal10_fld, well_clean_10)

df_well_kh_dist_all = pd.concat([df_well_kh_dist_bal8_fld, df_well_kh_dist_bal10_fld])
# TST interval per (well, FORMATION) for the Balakhany VIII & X formations
in_bal8 = df_prq.FORMATION.str.contains('Balakhany VIII')
in_bal10 = df_prq.FORMATION.str.contains('Balakhany X')
df_fu_tst = df_prq[in_bal8 | in_bal10]
df_fu_tst = df_fu_tst[['well', 'DEPTH','FORMATION','TST']]
fm_keys = ['well', 'FORMATION']
tst_per_fm = df_fu_tst.groupby(fm_keys)['TST']
# First and last TST sample of every group, in the row order of df_prq
df_fu_tst_top = tst_per_fm.apply(lambda s: s.iloc[0]).reset_index().rename(columns={'TST':'TST_top'})
df_fu_tst_bot = tst_per_fm.apply(lambda s: s.iloc[-1]).reset_index().rename(columns={'TST':'TST_bot'})
df_fu_tst_final = df_fu_tst_top.set_index(fm_keys).join(df_fu_tst_bot.set_index(fm_keys)).reset_index()
df_fu_tst_final['TST_interv'] = (df_fu_tst_final.TST_bot - df_fu_tst_final.TST_top).round(0)
# Attach the columns of xy_coord and df_prq_tvdss, both keyed by (well, FORMATION)
for extra_fm_table in (xy_coord, df_prq_tvdss):
    df_fu_tst_final = df_fu_tst_final.set_index(fm_keys).join(extra_fm_table.set_index(fm_keys)).reset_index()
# Field label: first 'field' value found for each well in df_prq
field_per_well = df_prq.groupby('well')['field'].apply(lambda s: s.iloc[0])
df_fu_tst_final = df_fu_tst_final.set_index('well').join(field_per_well).reset_index()
# Keep only intervals with a positive thickness
df_fu_tst_final = df_fu_tst_final[df_fu_tst_final.TST_interv > 0]
# Read df_prq_htst_avgprop_v1 and list the wells present in df_khtst_bal_qcl per formation
path = 'C:\\jupyter\\SPP\\inputoutput\\' 
df_htst_avgprop = pd.read_csv(path + 'df_prq_htst_avgprop_v1.csv')
qcl_in_bal8 = df_khtst_bal_qcl.FORMATION == 'Balakhany VIII sand'
qcl_in_bal10 = df_khtst_bal_qcl.FORMATION == 'Balakhany X sand'
well_no_outliers8 = df_khtst_bal_qcl.loc[qcl_in_bal8, 'well'].unique()
well_no_outliers10 = df_khtst_bal_qcl.loc[qcl_in_bal10, 'well'].unique()
# Thickness-weighted average properties per (well, FORMATION) from df_htst_avgprop
cutoff_h_tst = 0.5
cutoff_perm_avg = 5
# Keep rows above both cutoffs. The explicit .copy() makes the column assignments
# below act on an independent frame instead of on a filtered slice of df_htst_avgprop.
df_htst_avgprop_nz = df_htst_avgprop[(df_htst_avgprop.h_tst > cutoff_h_tst) &
                                     (df_htst_avgprop.md_perm_avg > cutoff_perm_avg)].copy()
# Multiplying h_tst by the property values
df_htst_avgprop_nz['kavg_htst'] = df_htst_avgprop_nz.h_tst * df_htst_avgprop_nz.md_perm_avg
df_htst_avgprop_nz['phit_htst'] = df_htst_avgprop_nz.h_tst * df_htst_avgprop_nz.md_phit_avg
df_htst_avgprop_nz['vsh_htst'] = df_htst_avgprop_nz.h_tst * df_htst_avgprop_nz.md_vsh_avg
# Summed h_tst per well & formation
df_htst_fm = df_htst_avgprop_nz.groupby(['well','FORMATION'])['h_tst'].sum().reset_index()
df_htst_fm = df_htst_fm.rename(columns={'h_tst':'gross_tst'})
# Weighted averages: sum(h_tst * property) / sum(h_tst)
df_htst_avgprop_nz_avgpropsum = df_htst_avgprop_nz.groupby(['well','FORMATION'])[['phit_htst','vsh_htst']].sum().reset_index()
df_htst_avgprop_nz_avgpropsum_join = df_htst_avgprop_nz_avgpropsum.set_index(
                                     ['well','FORMATION']).join(df_htst_fm.set_index(['well','FORMATION'])).reset_index()
df_htst_avgprop_nz_avgpropsum_join['phit_wavg'] = df_htst_avgprop_nz_avgpropsum_join.phit_htst / df_htst_avgprop_nz_avgpropsum_join.gross_tst
df_htst_avgprop_nz_avgpropsum_join['vsh_wavg'] = df_htst_avgprop_nz_avgpropsum_join.vsh_htst / df_htst_avgprop_nz_avgpropsum_join.gross_tst


def _formation_summary(avgprop_join, avgprop_nz, formation):
    """Return (hpv, permh, phhpv) frames for one formation.

    hpv   : gross_tst / phit_wavg / vsh_wavg rows of ``avgprop_join`` for ``formation``
    permh : summed kavg_htst per (well, FORMATION) from ``avgprop_nz``
    phhpv : hpv left-joined with permh on (well, FORMATION)
    """
    keys = ['well', 'FORMATION']
    hpv = avgprop_join[avgprop_join.FORMATION == formation][keys + ['gross_tst', 'phit_wavg', 'vsh_wavg']]
    permh = avgprop_nz[avgprop_nz.FORMATION == formation].groupby(keys)['kavg_htst'].sum().reset_index()
    phhpv = hpv.set_index(keys).join(permh.set_index(keys)).reset_index()
    return hpv, permh, phhpv


df_8bal_hpv, df_8bal_permh, df_8bal_phhpv = _formation_summary(
    df_htst_avgprop_nz_avgpropsum_join, df_htst_avgprop_nz, 'Balakhany VIII sand')
df_10bal_hpv, df_10bal_permh, df_10bal_phhpv = _formation_summary(
    df_htst_avgprop_nz_avgpropsum_join, df_htst_avgprop_nz, 'Balakhany X sand')
# Assemble the per-formation tables used as ML input
fm_keys = ['well', 'FORMATION']
ml_columns = ['well', 'FORMATION', 'X', 'Y', 'DEPTH', 'TVD_SCS', 'field', 'gross_tst',
              'TST_interv', 'kavg_htst', 'phit_wavg', 'vsh_wavg']
tst_by_fm = df_fu_tst_final.set_index(fm_keys)
# Balakhany VIII: add the TST-interval columns, keep the ML columns, rename TST_interv
df_8bal_phhpv_tstint = df_8bal_phhpv.set_index(fm_keys).join(tst_by_fm).reset_index()
df_8bal_phhpv_tstint = df_8bal_phhpv_tstint[ml_columns].rename(columns={'TST_interv':'interv_tst'})
df_avgprop8_final_wa = df_8bal_phhpv_tstint.copy()
# Balakhany X: same steps
df_10bal_phhpv_tstint = df_10bal_phhpv.set_index(fm_keys).join(tst_by_fm).reset_index()
df_10bal_phhpv_tstint = df_10bal_phhpv_tstint[ml_columns].rename(columns={'TST_interv':'interv_tst'})
df_avgprop10_final_wa = df_10bal_phhpv_tstint.copy()
# Restrict each table to its formation and to the wells listed as outlier-free
bal10_rows = (df_avgprop10_final_wa.FORMATION.str.contains('Balakhany X sand') &
              df_avgprop10_final_wa.well.isin(well_no_outliers10))
df_avgprop_bal10_wa = df_avgprop10_final_wa[bal10_rows]
bal8_rows = (df_avgprop8_final_wa.FORMATION.str.contains('Balakhany VIII sand') &
             df_avgprop8_final_wa.well.isin(well_no_outliers8))
df_avgprop_bal8_wa = df_avgprop8_final_wa[bal8_rows]
df_avgprop_bal_wa = pd.concat([df_avgprop_bal8_wa, df_avgprop_bal10_wa])
# For Shahriyar: add the neighbour distance / kh columns ('field' is dropped on the
# right-hand side because the left-hand table already carries it)
neighbour_features = df_well_kh_dist_all.drop('field',axis=1).set_index(fm_keys)
df_dist_kh_bal_shahriayr = df_avgprop_bal_wa.set_index(fm_keys).join(neighbour_features).reset_index()
#rotate x,y around xo,yo by theta
def rotate(x, y, theta_deg=34, origin=None):
    """Rotate the point (x, y) counter-clockwise about a pivot.

    Parameters
    ----------
    x, y : float (or array-like supporting arithmetic)
        Coordinates to rotate.
    theta_deg : float, default 34
        Rotation angle in degrees.
    origin : tuple (xo, yo), optional
        Pivot of the rotation. When omitted, the pivot is the median X and median Y
        of the notebook-level ``df_khtst_xy`` frame, with NaN coordinates ignored.

    Returns
    -------
    list
        ``[xr, yr]``, the rotated coordinates.
    """
    if origin is None:
        # nanmedian keeps the pivot well-defined when some wells have no coordinates;
        # statistics.median sorts its input and gives arbitrary results with NaN present.
        xo = float(np.nanmedian(df_khtst_xy['X'].to_numpy(dtype=float)))
        yo = float(np.nanmedian(df_khtst_xy['Y'].to_numpy(dtype=float)))
    else:
        xo, yo = origin
    theta = math.radians(theta_deg)
    cos_t, sin_t = math.cos(theta), math.sin(theta)
    dx, dy = x - xo, y - yo
    xr = cos_t*dx - sin_t*dy + xo
    yr = sin_t*dx + cos_t*dy + yo
    return [xr, yr]
# Rotate every (X, Y) pair and store the result as X_new / Y_new
rotated_xy = df_dist_kh_bal_shahriayr.apply(lambda rec: rotate(rec['X'], rec['Y']), axis=1, result_type='expand')
df_dist_kh_bal_shahriayr[['X_new', 'Y_new']] = rotated_xy
# Columns kept for modelling; FORMATION is one-hot encoded afterwards
final_columns = ['well', 'FORMATION', 'X_new', 'Y_new', 'TVD_SCS', 'kh1', 'kh2', 'kh3',
                 'interv_tst', 'gross_tst', 'kavg_htst']
df_dist_kh_bal_shahriayr_final = df_dist_kh_bal_shahriayr[final_columns]
df_dist_kh_bal_shahriayr_final = pd.get_dummies(df_dist_kh_bal_shahriayr_final, columns = ['FORMATION'])
# Drop rows that lack a TVD_SCS value or the first neighbour's kh
has_tvd_and_kh1 = df_dist_kh_bal_shahriayr_final.TVD_SCS.notna() & df_dist_kh_bal_shahriayr_final.kh1.notna()
df_dist_kh_bal_shahriayr_final = df_dist_kh_bal_shahriayr_final[has_tvd_and_kh1]
# df_dist_kh_bal_shahriayr_final.to_csv('df_dist_kh_bal_shahriayr_final.csv', index=False)

### 70/30 splits

In [None]:
# X_train/x_test data splitting (70/30); the well name travels in column 0 of both
# arrays so that it can be recovered after the shuffle.
feature_cols = ['X_new', 'Y_new', 'TVD_SCS', 'kh1', 'kh2', 'kh3', 'interv_tst', 'gross_tst',
                'FORMATION_Balakhany VIII sand', 'FORMATION_Balakhany X sand']
y = np.array(df_dist_kh_bal_shahriayr_final[['well', 'kavg_htst']])
x = np.array(df_dist_kh_bal_shahriayr_final[['well'] + feature_cols])
# The split seed is drawn at random and printed so that a given run can be reproduced.
num = random.randint(0,100)
print('num', num)
x_train_init, x_test_init, y_train_init, y_test_init = train_test_split(x, y, test_size=0.3, random_state=num)
# Taking well names from train/test datasets
y_train_wells = y_train_init[:,0]
y_test_wells = y_test_init[:,0]
# Strip the well-name column and convert the remaining object arrays to float
x_train = x_train_init[:,1:].astype(float)
x_test = x_test_init[:,1:].astype(float)
y_train = y_train_init[:,1].astype(float)
y_test = y_test_init[:,1].astype(float)
#Gridsearch test run for RandForRegr
rfr_gr_sr = RandomForestRegressor()
grid_param_RFR = {
    'bootstrap': [True, False],
    'max_depth': [10, 50, 75, 100],
    'min_samples_leaf': [1, 2, 3, 5],
    # An integer min_samples_split must be >= 2 in scikit-learn; the former value 1
    # only produced failed fits, so it is removed from the grid.
    'min_samples_split': [2, 3, 5],
    'n_estimators': [10, 25, 50, 100]}
# greater_is_better=False makes the search minimise the mean absolute error
scorer = make_scorer(mae, greater_is_better=False)
gd_sr_RFR = GridSearchCV(estimator = rfr_gr_sr, param_grid = grid_param_RFR, scoring=scorer, cv = 15, n_jobs = -1)
gd_sr_RFR.fit(x_train, y_train)
GS_setting = gd_sr_RFR.best_params_
print(GS_setting)
# Applying Pipeline for ML-model: the keys of GS_setting are exactly the
# RandomForestRegressor arguments searched above.
rfr = Pipeline([("scaler",StandardScaler()),("rfr",RandomForestRegressor(**GS_setting))])
rfr.fit(x_train, y_train)
y_pred_train = rfr.predict(x_train)
y_pred_test = rfr.predict(x_test)
print('---------------------')
print('r2_train', r2(y_train, y_pred_train).round(2), 'x_train', x_train.shape)
print('r2_test', r2(y_test, y_pred_test).round(2), 'x_test', x_test.shape)
print('mae_train', mae(y_train, y_pred_train).round(0))
print('mae_test', mae(y_test, y_pred_test).round(0))


def _qc_frame(wells, actual, predicted, tol=0.25):
    """Tabulate actual vs predicted values and flag predictions within +/- ``tol`` of actual.

    Returns a DataFrame with columns well, actual, predict, l_limit, h_limit and qc,
    where qc is 'in' when l_limit <= predict <= h_limit and 'out' otherwise.
    """
    qc_df = pd.DataFrame(zip(wells, actual, predicted), columns=['well', 'actual','predict'])
    qc_df['l_limit'] = qc_df.actual*(1 - tol)
    qc_df['h_limit'] = qc_df.actual*(1 + tol)
    qc_df['qc'] = 'out'
    qc_df.loc[(qc_df.predict >= qc_df.l_limit) & (qc_df.predict <= qc_df.h_limit), 'qc'] = 'in'
    return qc_df


# QC of predicted values for train & test datasets.
# The train flags are computed from df_rfr_train itself; the previous code built the
# mask from df_gbr_train, a frame belonging to a different model.
df_rfr_train = _qc_frame(y_train_wells, y_train, y_pred_train)
df_rfr_test = _qc_frame(y_test_wells, y_test, y_pred_test)

### Run RFR

In [None]:
# Leave-one-out run: each row of the shuffled table is held out once, the model is
# fitted on all other rows and then predicts the held-out row.
y_test_lst = []
y_pred_test_lst = []
well_exclude_lst = []
gs_settings_lst = []
metrics_r2_mae_lst = []
feature_cols = ['X_new', 'Y_new', 'TVD_SCS', 'kh1', 'kh2', 'kh3', 'interv_tst', 'gross_tst',
                'FORMATION_Balakhany VIII sand', 'FORMATION_Balakhany X sand']
# Fixed hyper-parameters (no search is run here); defined once, outside the loop.
# {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
grid_param_RFR = {  'bootstrap' : True,
                    'max_depth': [10],
                    'min_samples_leaf':[2],
                    'min_samples_split' : [5],
                    'n_estimators': [50]}
GS_setting = grid_param_RFR
df_dist_kh_bal_shahriayr_rfr = df_dist_kh_bal_shahriayr_final.sample(frac = 1).reset_index(drop=True)
for i in tqdm(range(len(df_dist_kh_bal_shahriayr_rfr))):
    #Making up the feature and target datasets
    held_out = df_dist_kh_bal_shahriayr_rfr.iloc[i]
    df_wo_well = df_dist_kh_bal_shahriayr_rfr.drop([i])
    well_exclude = held_out['well']
    well_exclude_lst.append(well_exclude)
    y_train = np.array(df_wo_well['kavg_htst'])
    x_train = np.array(df_wo_well[feature_cols])
    well_train = np.array(df_wo_well['well'])
    # Plain floats keep the result columns numeric (0-d arrays were stored before)
    y_test = float(held_out['kavg_htst'])
    y_test_lst.append(y_test)
    x_test = np.array(held_out[feature_cols], dtype=float)
    gs_settings_lst.append((    GS_setting['bootstrap'],GS_setting['max_depth'],GS_setting['min_samples_leaf'],
                                GS_setting['min_samples_split'], GS_setting['n_estimators']))
    # Statement of ML-model
    rfr = Pipeline([("scaler",StandardScaler()),("rfr",RandomForestRegressor(   bootstrap= GS_setting['bootstrap'],
                                                                                max_depth=GS_setting['max_depth'][0],
                                                                                min_samples_leaf=GS_setting['min_samples_leaf'][0],
                                                                                min_samples_split=GS_setting['min_samples_split'][0],
                                                                                n_estimators=GS_setting['n_estimators'][0]))])
    # Fitting the ML-model
    rfr.fit(x_train, y_train)
    y_pred_train = rfr.predict(x_train)
    # predict expects a 2-D input, hence the single-row list
    y_pred_test = rfr.predict([x_test])
    y_pred_test_lst.append(y_pred_test[0])
    # Metrics computation for the ML-model (on the training rows of this iteration)
    r2_train = r2(y_train, y_pred_train).round(2)
    mae_train = mae(y_train, y_pred_train)
    metrics_r2_mae_lst.append((r2_train, mae_train.round(0)))
# Building up of dataframe
res_rfr_sha = pd.DataFrame( zip(y_test_lst,y_pred_test_lst,well_exclude_lst, gs_settings_lst),
                        columns = ['test','predict','well', 'gs_setting',])
# A prediction is flagged 'in' when it lies within 75 %..125 % of the actual value
res_rfr_sha['l_test'] = res_rfr_sha.test*0.75
res_rfr_sha['h_test'] = res_rfr_sha.test*1.25
res_rfr_sha['qc'] = 'out'
res_rfr_sha.loc[(res_rfr_sha.predict >= res_rfr_sha.l_test) & (res_rfr_sha.predict <= res_rfr_sha.h_test), 'qc'] = 'in'
# .get(..., 0) avoids a KeyError when every well falls into a single QC class
qc_counts = res_rfr_sha['qc'].value_counts()
wells_total = res_rfr_sha.shape[0]
wells_out = qc_counts.get('out', 0)
wells_in = qc_counts.get('in', 0)
print('wells total:', wells_total)
print('wells unpredicted:', wells_out, round(wells_out/wells_total, 3), 'v/v')
print('wells predicted:', wells_in, round(wells_in/wells_total, 3), 'v/v')
mae_df_xy = mae(res_rfr_sha.test, res_rfr_sha.predict).round(0)
r2_df_xy = r2(res_rfr_sha.test, res_rfr_sha.predict).round(2)
print('mae:', mae_df_xy, 'mDm')
print('r2:', r2_df_xy)

### Reporting

In [None]:
# Final cross-plot: actual ('test') vs predicted values of res_rfr_sha, coloured by QC flag.
max_val = 14000
# Pin each QC label to its colour. A positional colour sequence is handed out in order
# of first appearance, and res_rfr_sha is built from a shuffled table, so 'in'/'out'
# could otherwise swap colours from run to run.
qc_colors = {'in': 'green', 'out': 'red'}
fig1_ml = px.scatter(res_rfr_sha, x='test', y='predict',
                     color='qc',
                     hover_data=['well'],
                     width=400, height=400,
                     color_discrete_map=qc_colors)
fig1_ml.update_traces(marker=dict(size=10, opacity=0.75, line=dict(color='rgb(47, 57, 61)', width=1)))
# Reference lines: 1:1 (solid) and the +25 % / -25 % envelope (dashed).
fig2_ml = px.line(x=[0, max_val], y=[0, max_val])
fig2_1_ml = px.line(x=[0, max_val], y=[0, max_val*1.25])
fig2_2_ml = px.line(x=[0, max_val], y=[0, max_val*0.75])
fig2_ml.update_traces(line=dict(color='blue'))
fig2_1_ml.update_traces(line=dict(color='blue', dash='dash'))
fig2_2_ml.update_traces(line=dict(color='blue', dash='dash'))
# Merge the scatter and the three reference lines into one figure.
fig3_ml = go.Figure(data=fig1_ml.data + fig2_ml.data + fig2_1_ml.data + fig2_2_ml.data)
fig3_ml.update_layout(title = 'Comparison Actual vs Shahriyar RFR',width=600,height=400, xaxis_title='test', yaxis_title='predict',
                      margin=dict(l=10,r=10,b=10,t=40))