In [1]:
import pandas as pd
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr, spearmanr
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import accuracy_score

def convert_label(value_map):
    """Build a label translator backed by ``value_map``.

    Args:
        value_map: Mapping from original labels to replacement values.

    Returns:
        A one-argument function that returns ``value_map[label]`` when
        ``label`` is a key of ``value_map`` and the label itself otherwise.
    """
    def mapper(label):
        # Labels without an entry pass through unchanged.
        return value_map[label] if label in value_map else label
    return mapper

In [2]:
# Load the Excel workbook into a pandas DataFrame.
df = pd.read_excel('az10.xlsx')
# Keep `df` as loaded and do all further processing on a separate copy.
data = df.copy()
# Preview the first rows.
data.head()

Unnamed: 0,Well ID,X,Y,DEPTH,CALI,DT,NPHI,PHIE,RHOB,SW,litho,VANH,VCLC,VDOL,VOL_QUARTZ,VOL_SHALE,Un
0,AZ-010,607255.5255,2054041.255,2555.1384,5.7268,61.7078,0.101,0.1201,2.5714,0.3912,LIMESTONE,0.1401,0.6317,0.108,0,0.0,-999.25
1,AZ-010,607255.5255,2054041.255,2555.2908,5.7468,65.2759,0.1378,0.148,2.5083,0.288,LIMESTONE,0.0288,0.5921,0.2312,0,0.0,-999.25
2,AZ-010,607255.5255,2054041.255,2555.4432,5.8124,68.7534,0.1657,0.173,2.4661,0.2421,LIMESTONE,0.0346,0.5207,0.2717,0,0.0,-999.25
3,AZ-010,607255.5255,2054041.255,2555.5956,5.8405,70.1109,0.1981,0.1984,2.4371,0.2994,MIX,0.0034,0.3592,0.4391,0,0.0,-999.25
4,AZ-010,607255.5255,2054041.255,2555.748,5.7661,73.7412,0.2272,0.2284,2.4057,0.3439,DOLOMITE,0.0001,0.2274,0.5441,0,0.0,-999.25


In [3]:
# Collect the distinct values found in the 'litho' column and display them.
unique_values = data['litho'].unique()
unique_values

array(['LIMESTONE', 'MIX', 'DOLOMITE', 'ANHYDRIDE', 'SANDSTONE', 'SHALE'],
      dtype=object)

# Drop data

In [4]:
# Remove the columns that are not used in the rest of the notebook.
data = data.drop(columns=['Well ID', 'VOL_QUARTZ', 'Un', 'X', 'Y'])

In [5]:

def auto_discretize_column(df, column_name, num_bins=3):
    """Compute equal-width bin edges for ``df[column_name]``.

    The range between the column's minimum and maximum is split into
    ``num_bins`` intervals of equal width. The first edge is then moved down
    and the last edge moved up by a positive padding, so that both the minimum
    and the maximum fall strictly inside the outer bins when the edges are
    passed to ``pd.cut`` (whose intervals are open on the left by default).

    Args:
        df: DataFrame holding the column.
        column_name: Name of the numeric column to analyse.
        num_bins: Number of bins to create (at least 1).

    Returns:
        A list of ``num_bins + 1`` edges in increasing order.

    Raises:
        ValueError: If ``num_bins`` is smaller than 1.

    Note:
        If the column holds a single distinct value, the interior edges are
        all equal; ``pd.cut`` rejects duplicate edges unless told otherwise.
    """
    if num_bins < 1:
        raise ValueError("num_bins must be at least 1")

    col_data = df[column_name]
    min_val = col_data.min()
    max_val = col_data.max()

    bin_size = (max_val - min_val) / num_bins
    bins = [min_val + i * bin_size for i in range(num_bins + 1)]

    # Widen the outer edges. The padding must be strictly positive: padding by
    # the raw maximum (as before) shrinks or leaves the range unchanged when
    # the maximum is negative or zero, which pushes the minimum out of the
    # first bin. For a positive maximum the padding equals the maximum, so the
    # edges are the same as they used to be.
    pad = abs(max_val)
    if pad == 0:
        pad = bin_size if bin_size > 0 else 1.0
    bins[0] -= pad
    bins[num_bins] += pad

    return bins

In [6]:

def fill_missing_values(df, columns_to_fill):
    """Fill missing entries with the row-wise mean of the other numeric columns.

    For each column in ``columns_to_fill``, every missing value is replaced by
    the mean of the numeric values found in the same row across all other
    columns. Columns are processed in the given order, so values filled for an
    earlier column contribute to the row means used for later ones.

    Args:
        df: DataFrame to fill. It is modified in place.
        columns_to_fill: Iterable of column names whose gaps should be filled.

    Returns:
        The same ``df`` object, after filling.

    Raises:
        KeyError: If a name in ``columns_to_fill`` is not a column of ``df``.
    """
    for col in columns_to_fill:
        # Every column except the one being filled takes part in the mean.
        related_cols = [c for c in df.columns if c != col]

        # numeric_only=True skips text/categorical columns; without it the
        # row-wise mean raises TypeError on recent pandas when such columns
        # are present.
        mean_value = df[related_cols].mean(axis=1, numeric_only=True)

        # fillna aligns on the index, so each gap gets its own row's mean.
        df[col] = df[col].fillna(mean_value)

    return df

In [60]:
# fill_data = fill_missing_values(data, ['CALI', 'DT', 'NPHI', 'PHIE', 'RHOB', 'SW', 'VANH', 'VCLC', 'VDOL', 'VOL_SHALE'])
# fill_data

# Discretization

In [7]:

# Work on a copy so that `data` keeps its current values.
data2 = data.copy()

# Replace lithology names with integer codes (note that code 3 is not assigned).
litho_codes = {'LIMESTONE': 0, 'MIX': 1, 'SANDSTONE': 2, 'DOLOMITE': 4, 'ANHYDRIDE': 5, 'SHALE': 6}
data2["litho"] = data2["litho"].apply(convert_label(litho_codes))

# Replace each listed column with the index (0..6) of the bin its value falls
# into, using edges computed per column by auto_discretize_column.
num_bins = 7
bin_labels = list(range(num_bins))
for column in ['PHIE', 'CALI', 'DT', 'DEPTH', 'NPHI', 'RHOB', 'SW', 'VANH', 'VCLC', 'VDOL', 'VOL_SHALE']:
    data2[column] = pd.cut(
        data2[column],
        bins=auto_discretize_column(data2, column, num_bins),
        labels=bin_labels,
    )




In [8]:
# Preview the first rows of `data2`.
data2.head()

Unnamed: 0,DEPTH,CALI,DT,NPHI,PHIE,RHOB,SW,litho,VANH,VCLC,VDOL,VOL_SHALE
0,0,0,1,1,2,4,2,0,6,5,0,0
1,0,0,2,2,3,3,1,0,6,5,1,0
2,0,1,2,2,3,3,1,0,6,4,2,0
3,0,1,2,3,4,3,2,1,6,3,3,0
4,0,0,3,3,5,2,2,4,6,1,4,0


In [9]:
# Cast the listed columns of `data` to 64-bit integers.
# NOTE(review): this operates on `data`, not on `data2`. If these columns still
# hold fractional readings, the cast discards the fractional part. Confirm
# whether `data2` was the intended target.
int_columns = ['CALI', 'DT', 'NPHI', 'PHIE', 'RHOB', 'SW', 'VANH', 'VCLC', 'VDOL', 'VOL_SHALE', 'DEPTH']
data = data.astype(dict.fromkeys(int_columns, 'int64'))


In [10]:
# Print column dtypes and non-null counts for `data`.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2302 entries, 0 to 2301
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   DEPTH      2302 non-null   int64 
 1   CALI       2302 non-null   int64 
 2   DT         2302 non-null   int64 
 3   NPHI       2302 non-null   int64 
 4   PHIE       2302 non-null   int64 
 5   RHOB       2302 non-null   int64 
 6   SW         2302 non-null   int64 
 7   litho      2302 non-null   object
 8   VANH       2302 non-null   int64 
 9   VCLC       2302 non-null   int64 
 10  VDOL       2302 non-null   int64 
 11  VOL_SHALE  2302 non-null   int64 
dtypes: int64(11), object(1)
memory usage: 215.9+ KB


In [63]:
# Get the number of rows and columns
# Report how many rows and columns `data` has.
num_rows = data.shape[0]
num_cols = data.shape[1]
print(f"The DataFrame has {num_rows} rows and {num_cols} columns.")

The DataFrame has 2302 rows and 12 columns.


In [64]:
# Print column dtypes and non-null counts for `data2`.
data2.info()
# Rebind `data` to a copy of `data2`; the previous contents of `data` are discarded.
data = data2.copy()
# Write the frame to CSV without the index column.
data.to_csv('data_cleaned.csv', index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2302 entries, 0 to 2301
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   DEPTH      2302 non-null   category
 1   CALI       2302 non-null   category
 2   DT         2302 non-null   category
 3   NPHI       2302 non-null   category
 4   PHIE       2302 non-null   category
 5   RHOB       2302 non-null   category
 6   SW         2302 non-null   category
 7   litho      2302 non-null   int64   
 8   VANH       2302 non-null   category
 9   VCLC       2302 non-null   category
 10  VDOL       2302 non-null   category
 11  VOL_SHALE  2302 non-null   category
dtypes: category(11), int64(1)
memory usage: 46.7 KB
