<a href="https://colab.research.google.com/github/yasirabd/research-diagnostic-turbine/blob/main/Variable_Similarity_Based_Modeling_(VBM).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*Author: Yasir Abdur Rohman*<br>
**Property of PT Indonesia Power & Lab Getaran & Diagnosis Mesin Undip**

---

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import dot
from numpy.linalg import norm
from scipy.spatial import distance

In [None]:
# Authenticate the Colab user before requesting Google credentials.
from google.colab import auth
auth.authenticate_user()

import gspread
from oauth2client.client import GoogleCredentials

# Authorize a gspread client with the application-default credentials.
# NOTE(review): oauth2client is deprecated upstream; newer gspread releases expect
# google-auth credentials -- confirm against the versions installed in the runtime.
gc = gspread.authorize(GoogleCredentials.get_application_default())

# Open the spreadsheet titled 'Sample Data VBM' and use its `sheet1` worksheet.
worksheet = gc.open('Sample Data VBM').sheet1

# get_all_values gives a list of rows.
rows = worksheet.get_all_values()

In [None]:
# Load every sheet row into a frame; row 0 holds the column names, the rest are readings.
sheet_frame = pd.DataFrame.from_records(rows)

# Work on an explicit copy of the reading rows so the assignments below write to an
# independent frame instead of a slice of `sheet_frame`.
data = sheet_frame.iloc[1:].copy()
data.columns = sheet_frame.iloc[0]

# convert to numeric, one column at a time (one vectorized pd.to_numeric call per
# column instead of one call per row); cells that cannot be parsed become NaN
cols = data.columns
data[cols] = data[cols].apply(pd.to_numeric, errors='coerce')

data.head()

Unnamed: 0,Tag 1,Tag 2,Tag 3,Tag 4,Tag 5
1,420,289,2288,5062,4084
2,420,276,1830,5168,4140
3,392,261,2220,4929,4151
4,401,286,1990,5242,4126
5,381,274,1647,4833,3960


In [None]:
# Export the frame as a NumPy array and transpose it, so each frame column becomes
# a row of the state matrix and each frame row becomes a column.
state_matrix = np.transpose(data.to_numpy())

print(f"Shape state matrix: {state_matrix.shape}")
state_matrix

Shape state matrix: (5, 400)


array([[ 420,  420,  392, ...,  378,  410,  388],
       [ 289,  276,  261, ...,  265,  262,  256],
       [2288, 1830, 2220, ..., 2086, 1766, 1692],
       [5062, 5168, 4929, ..., 4908, 5355, 4632],
       [4084, 4140, 4151, ..., 4045, 4110, 4183]])

In [None]:
# Observation vector to be estimated (one value per row of the state matrix).
# A 1-D array is unchanged by transposition, so no `.T` is needed here.
current_actual = np.array([407, 251, 1890, 5295, 4175])
print(f"Shape current actual: {current_actual.shape}")
current_actual

Shape current actual: (5,)


array([ 407,  251, 1890, 5295, 4175])

In [None]:
# calculate similarity
def cosine_similarity(vector1, vector2):
    """Return the dot product of the two vectors divided by the product of their norms.

    The result is 1 when both vectors point in the same direction.
    """
    magnitude_product = norm(vector1) * norm(vector2)
    return dot(vector1, vector2) / magnitude_product

def euclidean_distance(vector1, vector2):
    """Return the Euclidean (L2) distance between two equally shaped vectors.

    Parameters
    ----------
    vector1, vector2 : array-like
        Numeric sequences of identical shape.

    Returns
    -------
    numpy.float64
        Square root of the summed squared differences (summed along the first axis).

    Raises
    ------
    ValueError
        If the inputs differ in shape. The previous zip-based loop silently ignored
        the surplus elements of the longer vector.
    """
    first = np.asarray(vector1, dtype=float)
    second = np.asarray(vector2, dtype=float)
    if first.shape != second.shape:
        raise ValueError(
            f"vectors must have the same shape, got {first.shape} and {second.shape}"
        )
    delta = first - second
    return np.sqrt(np.sum(delta * delta, axis=0))

def scipy_distance(vector1, vector2, dist='euclidean'):
    """Return the `scipy.spatial.distance` value between two vectors for a named metric.

    The returned values are distances (dissimilarities): identical vectors give 0.

    Parameters
    ----------
    vector1, vector2 : array-like
        1-D numeric vectors of equal length.
    dist : str, default 'euclidean'
        One of 'euclidean', 'braycurtis', 'correlation', 'canberra', 'chebyshev',
        'cityblock', 'minkowski', 'sqeuclidean' or 'cosine'. 'minkowski' is called
        with SciPy's default order.

    Returns
    -------
    scalar
        Whatever the selected SciPy function returns.

    Raises
    ------
    ValueError
        If `dist` is not one of the supported names. The previous if/elif chain fell
        through and returned None, which only failed later in the caller.
    """
    metrics = {
        'euclidean': distance.euclidean,
        'braycurtis': distance.braycurtis,
        'correlation': distance.correlation,
        'canberra': distance.canberra,
        'chebyshev': distance.chebyshev,
        'cityblock': distance.cityblock,
        'minkowski': distance.minkowski,
        'sqeuclidean': distance.sqeuclidean,
        'cosine': distance.cosine,
    }
    try:
        metric = metrics[dist]
    except (KeyError, TypeError):
        # TypeError covers unhashable values such as a list passed as `dist`.
        supported = ", ".join(sorted(metrics))
        raise ValueError(
            f"Unsupported distance {dist!r}; expected one of: {supported}"
        ) from None
    return metric(vector1, vector2)

In [None]:
# example: score one pair of vectors with every available measure
a = np.array([407, 251, 1890, 5295, 4175])
b = np.array([370, 255, 1620, 4775, 4231])

# Hand-written implementations defined in this notebook.
print(f"Cosine similarity: {cosine_similarity(a,b)}")
print(f"Euclidean similarity: {euclidean_distance(a,b)}")

# SciPy metrics as (printed label, metric name), in display order. SciPy returns
# distances, so the final "Cosine" line is the cosine distance, not the similarity.
scipy_examples = [
    ("Braycurtis", "braycurtis"),
    ("Correlation", "correlation"),
    ("Canberra", "canberra"),
    ("Chebyshev", "chebyshev"),
    ("Cityblock", "cityblock"),
    ("Minkowski", "minkowski"),
    ("Squared Euclidean", "sqeuclidean"),
    ("Cosine", "cosine"),
]
for label, metric in scipy_examples:
    print(f"{label} similarity: {scipy_distance(a, b, dist=metric)}")

Cosine similarity: 0.9981632316930458
Euclidean similarity: 589.7635119266026
Braycurtis similarity: 0.03811938630796338
Correlation similarity: 0.004387840336861126
Canberra similarity: 0.1907477013308671
Chebyshev similarity: 520
Cityblock similarity: 887
Minkowski similarity: 589.7635119266026
Squared Euclidean similarity: 347821.0
Cosine similarity: 0.0018367683069543395


Of the metrics above, `Cosine Similarity` gives the result closest to the GE VBM similarity example.

In [None]:
# example 2: compare the reference vector `a` with several candidates
a = np.array([407, 251, 1890, 5295, 4175])
b = np.array([370, 255, 1620, 4775, 4231])
c = np.array([371, 255, 1623, 4774, 4228])
d = np.array([409, 266, 2333, 5090, 4002])
e = np.array([407, 251, 1890, 5295, 4175])  # same values as `a`

candidates = {"b": b, "c": c, "d": d, "e": e}

# SciPy cosine distance for each candidate.
for name, candidate in candidates.items():
    print(f"Cosine similarity a and {name}: {scipy_distance(a, candidate, dist='cosine')}")
print(f"Bray-Curtis similarity a and e: {scipy_distance(a,e, dist='braycurtis')}")
# Subtracting the distance from 1 turns it into a score.
print(f"Bray-Curtis similarity a and b: {1-scipy_distance(a,b, dist='braycurtis')}")

print()

# with highest score 1
for name, candidate in candidates.items():
    print(f"Cosine similarity a and {name}: {cosine_similarity(a, candidate)}")
print(f"Correlation similarity: {scipy_distance(a,e, dist='correlation')}")
print(f"Canberra similarity: {scipy_distance(a,e, dist='canberra')}")


Cosine similarity a and b: 0.0018367683069543395
Cosine similarity a and c: 0.0018098931815617725
Cosine similarity a and d: 0.0026239179578928518
Cosine similarity a and e: 0.0
Bray-Curtis similarity a and e: 0.0
Bray-Curtis similarity a and b: 0.9618806136920366

Cosine similarity a and b: 0.9981632316930458
Cosine similarity a and c: 0.9981901068184382
Cosine similarity a and d: 0.9973760820421069
Cosine similarity a and e: 1.0
Correlation similarity: 0.0
Canberra similarity: 0.0


In [None]:
# Display the observation vector that the following cell scores against each state-matrix column.
current_actual

array([ 407,  251, 1890, 5295, 4175])

In [None]:
# calculate similarity: one score per state-matrix column, taken as
# 1 minus the Canberra distance to the current observation
sim_vec = [
    1 - scipy_distance(current_actual, observation, dist='canberra')
    for observation in state_matrix.T  # rows of the transpose are the columns
]

In [None]:
# Rank all scores from highest to lowest, keep the first ten positions, then put
# those positions back into ascending order.
sim_scores = np.asarray(sim_vec)
ranked_desc = np.argsort(sim_scores)[::-1]
top10 = np.sort(ranked_desc[:10])
sim_vec10 = sim_scores[top10]

print(top10)
print(sim_vec10)

[ 31  58  88 149 160 168 228 245 326 368]
[0.93760353 0.94298773 0.95397765 0.95366041 0.95034228 0.93906428
 0.94808046 0.9504988  0.9518535  0.94672577]


In [None]:
# Keep only the state-matrix columns selected in `top10`.
dynamic_matrix = np.take(state_matrix, top10, axis=1)
dynamic_matrix

array([[ 409,  402,  420,  392,  405,  412,  408,  410,  392,  412],
       [ 258,  249,  256,  254,  253,  241,  265,  247,  254,  253],
       [1956, 1989, 1941, 1867, 1833, 1971, 1833, 1991, 1910, 1900],
       [5245, 5271, 5317, 5153, 5011, 5187, 5292, 5357, 5355, 4901],
       [3977, 4019, 4133, 4191, 4172, 4202, 4109, 4225, 4281, 4159]])

In [None]:
# calculate weight: each selected score divided by the sum of the selected scores,
# so the weights add up to 1
total_similarity = np.sum(sim_vec10)
weight = np.asarray(sim_vec10) / total_similarity
weight, np.sum(weight)

(array([0.09895766, 0.09952593, 0.10068584, 0.10065236, 0.10030215,
        0.09911184, 0.10006343, 0.10031867, 0.10046165, 0.09992045]), 1.0)

In [None]:
# estimate value: weighted combination of the selected columns
# (matrix-vector product of `dynamic_matrix` with the weight vector)
estimate_value = dynamic_matrix @ weight

print(current_actual)
print(estimate_value)

[ 407  251 1890 5295 4175]
[ 406.18806805  253.00936326 1918.95209054 5208.97466523 4147.09903767]


In [None]:
from sklearn.metrics import mean_squared_error

# Compute the mean squared error once and derive the RMSE from it.
mse = mean_squared_error(current_actual, estimate_value)

print(f"MSE: {mse}")
print(f"RMSE: {np.sqrt(mse)}")

MSE: 1804.3484482823874
RMSE: 42.47762291233335


In [None]:
# Reference estimate whose absolute error against `current_actual` is printed next to
# this notebook's own estimate further down.
# NOTE(review): presumably the SmartSignal output for the same observation -- confirm
# where these hard-coded values come from.
smart_signal_estimate = np.array([406, 253, 1950, 5315, 4164])

In [None]:
# Absolute error per element: this notebook's estimate (left) versus the
# reference estimate (right), both measured against the actual observation.
vbm_abs_error = np.abs(current_actual - estimate_value)
reference_abs_error = np.abs(current_actual - smart_signal_estimate)
for vbm_err, reference_err in zip(vbm_abs_error, reference_abs_error):
    print(vbm_err, reference_err)

0.8119319465114359 1
2.0093632552983536 2
28.95209053797339 60
86.02533476591634 20
27.90096233335771 11
