<a href="https://colab.research.google.com/github/yohanesnuwara/geostatistics/blob/main/project_notebooks/EDA_geoprovider_1240_wells.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Interactive exploratory data analysis (scatter plot) of 1,240 North Sea wells released by GeoProvider (2021)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import plotly.express as px

from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual, ToggleButtons
import ipywidgets as widgets

plt.style.use("classic")

## Load data

In [None]:
# Zenodo-hosted Excel workbook named in the URL as the public 1240-wells Norway data
url = "https://zenodo.org/record/4419060/files/RealPore%20Por%20Perm%20Lithology%20data%201240%20Wells%20Norway%20public.xlsx?download=1"

# Reading the workbook over the network takes 2-3 mins; only the first sheet is read
data = pd.read_excel(io=url, sheet_name=0)

# Preview the first three rows
data.head(n=3)

Unnamed: 0,seq numb,Data source file name,Well Name,Comments,NPDID,Plug or sample number,Measured Depth,Permeability horizontal ka (air),Permeability horizontal ke (klinkenberg corrected) also called kl. KL,Permeability vertical ka (air),Permeability vertical ke (klinkenberg corrected),Permeability horizontal kg (gas) KG HOR,Permeability horizontal ke (klinkenberg corrected) also called kl. KL.1,Permeability vertical kg (gas) KG VERT,Permeability vertical ke (klinkenberg corrected) .1,Nitrogen Permeability. Hor.,Klinkenberg corrected gas perm. Hor.,Nitrogen Permeability. Vert.,Klinkenberg corrected gas perm. Vert.,Permeabilty without correction specified KH horizontal 1/pm,Permeabilty without correction specified KV vertical 1/pm,Porosity. Horizontal PLUG,Porosity. Vertical PLUG,Porosity measurement from fluid summation,Porosity helium,porosity best of available,Porosity Sum,Pore Sturation Oil (So),Pore Sturation Water Sw Sw. S wtr. WTR.,gain density gr/cm3,Formation description original,Remarks on the table,Main Lithology Origin,Non sorted Transcription,main lithology,clean lithology,color,grain size,rounding,cement,sorting,sed structures,auxilaries,auxilaries.1,auxilaries.2,auxilaries.3,auxilaries.4,auxilaries.5,Unnamed: 48
0,1.0,1/2 - 1,1_2 -1,,1382,1,3126.03,0.07,0.39,,,,,,,,,,,,,16.17,,12.4,29.7,29.7,,27.4,45.2,2.65,,CORE 4,,,,,,,,,,,,,,,,,
1,2.0,1/2 - 1,1_2 -1,,1382,2,3126.33,0.6,77.0,,,,,,,,,,,,,18.0,,,24.0,24.0,,,,2.65,,CORE 4,,,,,,,,,,,,,,,,,
2,3.0,1/2 - 1,1_2 -1,,1382,3,3126.64,88.0,74.0,74.0,64.0,,,,,,,,,,,,,32.6,28.6,28.6,,59.8,17.5,2.66,,CORE 4,,,,,,,,,,,,,,,,,


## Preprocessing

In [None]:
# Categorical descriptor columns: missing entries get the explicit label "Unidentified"
column_list = ["clean lithology", "rounding", "sorting", "cement", "sed structures"]
data[column_list] = data[column_list].fillna("Unidentified")

# Map spelling / whitespace / capitalisation variants onto one label each
lithology_aliases = {
    "Limestone ": "Limestone", " Limestone": "Limestone",
    "Sandstone ": "Sandstone", "sandstone": "Sandstone",
    "Silstone": "Siltstone", "Siltstone)": "Siltstone",
    "siltstone": "Siltstone", "rock quartz": "quartz",
}
rounding_aliases = {"subangular ": "subangular"}
data["clean lithology"] = data["clean lithology"].replace(lithology_aliases)
data["rounding"] = data["rounding"].replace(rounding_aliases)

# Short names for the permeability, porosity and grain density columns
short_names = {
    "Klinkenberg corrected gas perm. Hor.": "permh_klinken",
    "porosity best of available": "porosity",
    "gain density gr/cm3": "grain_density",
}
data = data.rename(columns=short_names)

# The three renamed columns are dtype=object; cast them to float,
# turning non-numeric strings (e.g. "NMP") into NaN
numeric_cols = list(short_names.values())
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric, errors="coerce")

# Keep only the columns used for plotting, then discard rows that lack
# porosity, permeability or grain density
plot_columns = ["Well Name", "Measured Depth", "porosity", "permh_klinken",
                "grain_density", "clean lithology", "rounding", "sorting",
                "cement", "sed structures"]
data_select = (
    data[plot_columns]
    .copy()
    .dropna(subset=["porosity", "permh_klinken", "grain_density"])
)

## Scatter plot (interactive)

In [None]:
# @title Scatter Plot Attributes

x = "porosity" #@param ["porosity", "permh_klinken", "grain_density"] 
y = "permh_klinken" #@param ["porosity", "permh_klinken", "grain_density"]
c = "grain_density" #@param ["porosity", "permh_klinken", "grain_density"]
sort_by = "cement" #@param ["rounding", "clean lithology", "sorting", "cement", "sed structures"]

# Full span of the colour attribute: bounds both sliders and pins the colour scale
minc = float(data_select[c].min())
maxc = float(data_select[c].max())

@interact
def f(sort_by_value=list(data_select[sort_by].unique()),
      log_x=[False, True], log_y=[True, False],
      cmin=widgets.FloatSlider(value=minc, min=minc, max=maxc, step=.01),
      cmax=widgets.FloatSlider(value=maxc, min=minc, max=maxc, step=.01)):
  """Draw the x-vs-y scatter plot for one category of the `sort_by` column.

  The module-level names `x`, `y`, `c` and `sort_by` (set in the form above)
  pick the plotted columns; the arguments below are supplied by the widgets
  that `interact` builds from the defaults.

  Parameters
  ----------
  sort_by_value : value from the `sort_by` column
      Only rows whose `sort_by` entry equals this value are plotted.
  log_x, log_y : bool
      Whether the x / y axis uses a logarithmic scale.
  cmin, cmax : float
      Inclusive lower / upper bound on the colour attribute `c`; rows
      outside [cmin, cmax] are hidden.
  """
  # masking the sort by
  in_category = data_select[sort_by] == sort_by_value
  df = data_select[in_category]

  # masking based on min and max color; the bounds are inclusive so that the
  # default slider positions (exact data min/max) keep the extreme samples
  in_color_range = (df[c] >= cmin) & (df[c] <= cmax)
  df = df[in_color_range]

  cont_color = ["blue", "green", "red", "yellow"]

  # The fixed 0-50 window only suits a linear porosity axis. On a log axis
  # plotly express takes log10 of the range bounds and log10(0) raises, and
  # for other attributes the window would clip the data, so auto-range there.
  range_x = (0, 50) if (x == "porosity" and not log_x) else None

  fig = px.scatter(df, x=x, y=y, log_x=log_x, log_y=log_y, 
                   color=c, range_x=range_x, width=700, height=700,
                   color_continuous_scale=cont_color,
                   range_color=(minc, maxc),
                   hover_data=["Well Name", "sorting", "clean lithology", 
                               "rounding", "cement", "sed structures"])
  
  fig.update_layout(
      title={
          'text': "Scatter Plot for {}: {}".format(sort_by, sort_by_value),
          'y':0.95,
          'x':0.5,
          'xanchor': 'center',
          'yanchor': 'top'}) 
  fig.show() 

interactive(children=(Dropdown(description='sort_by_value', options=('fair cemented', 'well cemented', 'consol…

## Copyright

Interactive tool Copyright © Yohanes Nuwara (2021)

Data Copyright © GeoProvider https://geoprovider.no/ and licensed CC-BY 4.0 (2021)