# K-Means Clustering Test



**The required libraries are imported as below.**

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans

sns.set_style('whitegrid')
%matplotlib inline

In [2]:
# Read the core data and discard every row that has a missing value.
df = pd.read_csv('Well_Core.csv').dropna()
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Well_Core.csv'

In [None]:
# Show the samples ordered from highest to lowest FZI (df itself is not modified).
df.sort_values(by='FZI', ascending=False)

In [None]:
# Plain scatter of FZI against PHIZ (regression line disabled).
sns.lmplot(
    data=df,
    x='PHIZ',
    y='FZI',
    palette='Set1',
    height=5,
    aspect=2,
    fit_reg=False,
)


In [None]:
# Range of the natural log of FZI.
log_fzi = np.log(df['FZI'])
print(log_fzi.min(), log_fzi.max())


In [None]:
# Core permeability (log scale) against core porosity for every sample.
f, ax = plt.subplots(figsize=(10, 7))

sns.scatterplot(data=df, x="CPOR", y='CPERM', ax=ax)
ax.set_yscale("log")
ax.grid(True)
ax.axis([0.001, 0.25, 0.001, 1000])  # [xmin, xmax, ymin, ymax]

ax.set(xlabel="Core Porosity", ylabel="Core Permeability")

In [None]:
# Columns used as clustering input, selected by name.
# (Positional alternative: df.iloc[:, 1:6] - first slice is rows, second is columns.)
cluster_columns = ['CPOR', 'CPERM', 'FZI']
test = df[cluster_columns]
test

In [None]:
# Eight log-spaced RQI values between 10**-2 and 10**0.19.
rqi_values = np.logspace(-2, 0.19, num=8)
param_space = {'RQI': rqi_values}
param_space

In [None]:
# Histogram of FZI for the clustering inputs.
# FacetGrid's `hue` must name a column of the data frame; the previous call
# passed the `param_space` dict, which cannot be used as a column key and makes
# the cell fail. `test` has no grouping column, so a single ungrouped histogram
# is drawn (the palette is dropped because it only applies to hue levels).
g = sns.FacetGrid(test, height=6, aspect=2)
g = g.map(plt.hist, 'FZI', bins=20, alpha=0.5)

## from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

In [None]:
def optimise_k_means(test, max_k, random_state=42):
    """Plot the K-Means elbow curve (inertia against number of clusters).

    Parameters
    ----------
    test : array-like of shape (n_samples, n_features)
        Data to cluster.
    max_k : int
        Exclusive upper bound on the cluster count: models are fitted for
        k = 1, 2, ..., max_k - 1.
    random_state : int or None, default 42
        Seed forwarded to ``KMeans`` so the curve is identical on every run.
        Pass ``None`` to restore unseeded behaviour.

    Returns
    -------
    None
        The figure is shown as a side effect.
    """
    # KMeans refuses to fit more clusters than there are samples, so never
    # ask for a k larger than the number of rows.
    upper = min(max_k, len(test) + 1)
    means = list(range(1, upper))
    inertias = [
        KMeans(n_clusters=k, random_state=random_state).fit(test).inertia_
        for k in means
    ]

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(means, inertias, 'o-')
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel("Inertia")
    ax.grid(True)
    plt.show()

In [None]:
# Elbow curve for k = 1 .. 29 on the selected features.
optimise_k_means(test, max_k=30)

In [None]:
# Fit the final model with 8 clusters. The seed makes the cluster assignment
# reproducible across kernel restarts (unseeded K-Means starts from random
# centroids and can label the same data differently on each run).
# NOTE(review): the features are clustered unscaled; confirm this is intended,
# since the column with the largest numeric range will dominate the distances.
kmeans = KMeans(n_clusters=8, random_state=42)
kmeans.fit(test)

In [None]:
# Cluster index of every row of `test`, taken from the model fitted above.
# (Calling fit_predict here would fit the whole model a second time.)
identified_clusters = kmeans.labels_
identified_clusters

In [None]:
# FZI (log scale) against core porosity, coloured by K-Means cluster.
f, ax = plt.subplots(figsize=(10, 7))

sns.scatterplot(x="CPOR", y="FZI", data=test, hue=identified_clusters,
                palette='Set1', ax=ax)
ax.set(yscale="log")
ax.grid(True)
ax.axis([0.001, 0.25, 0.001, 1000])  # [xmin, xmax, ymin, ymax]

# The y axis shows FZI; it was previously mislabelled as core permeability.
ax.set(xlabel="Core Porosity", ylabel="FZI")

In [None]:

def myfunc(value):
    """Return the hydraulic flow unit label for one record.

    `value` is any mapping-like object indexable by 'FZI' (for example a
    DataFrame row). FZI below 0.4 gives 'HFU1', FZI from 0.4 up to but not
    including 0.7 gives 'HFU2', and everything else gives 'HFU3'.
    """
    fzi = value['FZI']
    if fzi < 0.4:
        return 'HFU1'
    if fzi < 0.7:
        return 'HFU2'
    return 'HFU3'

# Label every row with its hydraulic flow unit (adds/overwrites the 'HFU' column).
df['HFU'] = df.apply(myfunc, axis=1)
df

**How many entries do we have in the dataset?**

In [None]:
# Print the frame summary: row count, column names, non-null counts and dtypes.
df.info()

**Display the basic statistics (mean, std, max, etc.).**

In [None]:
# Code here please so that you don't lose the output

In [None]:
# Summary statistics of the numeric columns, computed over complete rows only.
(
    df
    .dropna()
    .describe()
)

## Exploratory Data Analysis

Let's do some EDA here, always good to know your data!

**How is PHIZ related to RQI? Create a scatter plot, please. Luckily, we have the flow-unit labels (`HFU`), so pass them as the hue argument!** Any comments on the clusters?

In [None]:
# Code here please so that you don't lose the output

In [None]:
# RQI against PHIZ, one colour per hydraulic flow unit (no regression line).
sns.lmplot(
    data=df,
    x='PHIZ',
    y='RQI',
    hue='HFU',
    palette='Set1',
    height=5,
    aspect=2,
    fit_reg=False,
)

In [None]:
# Largest FZI value in the data set.
df['FZI'].max()

In [None]:
# Overlaid FZI histograms, one colour per hydraulic flow unit.
g = sns.FacetGrid(data=df, hue='HFU', palette='Set1', height=6, aspect=2)
g = g.map(plt.hist, 'FZI', bins=20, alpha=0.5)

In [None]:
# Log-log cross-plot of RQI against PHIZ, coloured by hydraulic flow unit.
f, ax = plt.subplots(figsize=(10, 7))

sns.scatterplot(data=df, x="PHIZ", y="RQI", hue='HFU', ax=ax)
ax.set_xscale("log")
ax.set_yscale("log")
ax.grid(True)
ax.axis([0.0015, 1, 0.005, 10])  # [xmin, xmax, ymin, ymax]
ax.set(xlabel="PHiZ", ylabel="RQI")



In [None]:
# Core permeability (log scale) against core porosity, coloured by flow unit.
f, ax = plt.subplots(figsize=(10, 7))

sns.scatterplot(data=df, x="CPOR", y="CPERM", hue='HFU', ax=ax)
ax.set_yscale("log")
ax.grid(True)
ax.axis([0.005, 0.25, 0.005, 1000])  # [xmin, xmax, ymin, ymax]
ax.set(xlabel="Core Porosity", ylabel="Core Permeability")

**To create the plot below (a stacked histogram), use [sns.FacetGrid](https://seaborn.pydata.org/generated/seaborn.FacetGrid.html).**

In [None]:
# Code here please so that you don't lose the output

In [None]:
# One RQI histogram panel per well.
g = sns.FacetGrid(data=df, col="WELL", height=3.5, aspect=0.65)
g.map(sns.histplot, "RQI")

In [None]:
# Keep only the HFU3 samples, dropping any row with a missing value.
is_hfu3 = df['HFU'] == 'HFU3'
df2 = df.loc[is_hfu3].dropna()
df2

In [None]:
# Porosity/permeability cross-plot (log permeability axis) for HFU3 only.
f, ax = plt.subplots(figsize=(5, 3))

sns.scatterplot(data=df2, x="CPOR", y="CPERM", ax=ax)
ax.set_yscale("log")
ax.grid(True)
ax.axis([0.06, 0.20, 1, 1000])  # [xmin, xmax, ymin, ymax]
ax.set(xlabel="Core Porosity", ylabel="Core Permeability")

In [None]:
# Regression inputs for HFU3: porosity as the single feature and the
# natural log of permeability as the target.
features = ['CPOR']
target = 'CPERM'

X = df2[features].to_numpy().reshape(-1, len(features))  # 2-D, one column per feature
y = np.log(df2[target].to_numpy())
print(X.shape, y.shape)

In [None]:
# Ordinary least squares fit of y on X.
from sklearn import linear_model

ols = linear_model.LinearRegression()
model = ols.fit(X=X, y=y)

In [None]:
# Fitted slope(s) and y-intercept of the linear model.
slope, intercept = model.coef_, model.intercept_
print(slope, intercept)

## The HFU 3 permeability  can be predicted from porosity, with the following linear model:
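The result of the regression is: $CPERM = e^{(41.80786208 \cdot CPOR - 1.4470765324424617)}$ (the model was fitted to the natural logarithm of permeability, `np.log(CPERM)`, so the base is $e$, not 10).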

In [None]:
# Accuracy assessment: coefficient of determination (R^2), evaluated on the
# same X and y the model was fitted to.
model.score(X, y)

In [None]:
# Predict at CPOR = 14% and 18%. scikit-learn expects a 2-D array with one
# column per feature, hence the reshape. The output is on the scale of y.
x_pred = np.array([0.14, 0.18]).reshape(-1, len(features))
model.predict(x_pred)

In [None]:
# 200 evenly spaced porosity values from 0 to 0.20, shaped (n_samples, n_features)
# as scikit-learn requires, and the model's prediction at each of them.
x_pred = np.linspace(0, 0.20, num=200).reshape(-1, len(features))
y_pred = model.predict(x_pred)

In [None]:
# Regression line and HFU3 data on the scale the model was fitted on.
plt.style.use('default')
plt.style.use('ggplot')

fig, ax = plt.subplots(figsize=(8, 5))

ax.plot(x_pred, y_pred, color='r', label='Regression model')
ax.scatter(X, y, edgecolor='k', facecolor='blue', alpha=0.7, label='HFU3 actual data')
# y holds np.log(CPERM), so the axis is the natural log of permeability,
# not permeability itself.
ax.set_ylabel('ln(Permeability (mD))', fontsize=10)
ax.set_xlabel('Porosity (vol/vol)', fontsize=10)
ax.axis([0.06, 0.20, 0, 10])  # [xmin, xmax, ymin, ymax]
ax.legend(facecolor='white', fontsize=11)
# %+.2f prints the intercept with its real sign; the earlier
# "- abs(intercept)" form showed a minus even for a positive intercept.
ax.text(0.05, 0.72, '$y = %.2f CPOR %+.2f $' % (model.coef_[0], model.intercept_), fontsize=17, transform=ax.transAxes)

fig.tight_layout()

In [None]:
# Keep only the HFU2 samples, dropping any row with a missing value.
is_hfu2 = df['HFU'] == 'HFU2'
df3 = df.loc[is_hfu2].dropna()
df3.head()

In [None]:
# Porosity/permeability cross-plot (log permeability axis) for HFU2 only.
f, ax = plt.subplots(figsize=(5, 3))

sns.scatterplot(data=df3, x="CPOR", y="CPERM", ax=ax)
ax.set_yscale("log")
ax.grid(True)
ax.axis([0.001, 0.15, 0.005, 50])  # [xmin, xmax, ymin, ymax]
ax.set(xlabel="Core Porosity", ylabel="Core Permeability")

In [None]:
# Regression inputs for HFU2: porosity as the single feature and the
# natural log of permeability as the target.
features = ['CPOR']
target = 'CPERM'

X = df3[features].to_numpy().reshape(-1, len(features))  # 2-D, one column per feature
y = np.log(df3[target].to_numpy())
print(X.shape, y.shape)

In [None]:
# Ordinary least squares fit of y on X.
from sklearn import linear_model

ols = linear_model.LinearRegression()
model = ols.fit(X=X, y=y)

# Fitted slope(s) and y-intercept of the linear model.
slope, intercept = model.coef_, model.intercept_
print(slope, intercept)

## The HFU 2 permeability  can be predicted from porosity, with the following linear model:
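The result of the regression is: $CPERM = e^{(44.87998396 \cdot CPOR - 4.690650768532301)}$ (the model was fitted to the natural logarithm of permeability, `np.log(CPERM)`, so the base is $e$, not 10).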

In [None]:
# Accuracy assessment: coefficient of determination (R^2), evaluated on the
# same X and y the model was fitted to.
model.score(X, y)

In [None]:
# 200 evenly spaced porosity values from 0 to 0.15, shaped (n_samples, n_features)
# as scikit-learn requires, and the model's prediction at each of them.
x_pred = np.linspace(0, 0.15, num=200).reshape(-1, len(features))
y_pred = model.predict(x_pred)

In [None]:
# Regression line and HFU2 data on the scale the model was fitted on.
plt.style.use('default')
plt.style.use('ggplot')

fig, ax = plt.subplots(figsize=(8, 5))

ax.plot(x_pred, y_pred, color='r', label='Regression model')
ax.scatter(X, y, edgecolor='k', facecolor='blue', alpha=0.7, label='HFU2 actual data')
# y holds np.log(CPERM), so the axis is the natural log of permeability,
# not permeability itself.
ax.set_ylabel('ln(Permeability (mD))', fontsize=10)
ax.set_xlabel('Porosity (vol/vol)', fontsize=10)
ax.axis([0.0, 0.16, -5.75, 5])  # [xmin, xmax, ymin, ymax]
ax.legend(facecolor='white', fontsize=11)
# %+.2f prints the intercept with its real sign; the earlier
# "- abs(intercept)" form showed a minus even for a positive intercept.
ax.text(0.05, 0.72, '$y = %.2f CPOR %+.2f $' % (model.coef_[0], model.intercept_), fontsize=17, transform=ax.transAxes)

fig.tight_layout()