In [2]:
# Import required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

We find that Eigenvector and Subgraph centrality are essentially perfectly correlated, so we keep only Eigenvector, since it is marginally cheaper to compute, and drop Subgraph.

In [2]:
# Read the normalised, standardised centrality table from the Excel workbook
normalised_centralities = pd.read_excel("~/MATH3888/Files/Datasets/normal_standardised.xlsx")

# Build the PCA input: discard the 'Unnamed: 0' column and the Subgraph
# measure (dropped because it is near-perfectly correlated with Eigenvector)
data = normalised_centralities.drop(columns=['Unnamed: 0','Subgraph'])

In [3]:
# Preview the table of centrality measures that will be passed to PCA
data

Unnamed: 0,Degree,Closeness,Eigenvector,Betweenness,Information
0,-0.255827,-1.004357,-0.340857,-2.392757,-0.871620
1,0.563683,0.530213,0.505454,0.313961,0.821233
2,-0.255827,0.254027,0.846720,0.743423,-0.220953
3,-0.951218,-0.101830,0.053197,-2.392757,-0.695312
4,0.605780,1.061621,1.081787,0.601960,0.853009
...,...,...,...,...,...
4390,-1.283990,-2.087709,-2.083078,-2.392757,-1.637294
4391,-0.531975,-1.902794,-2.125411,-0.205167,-0.942268
4392,0.115125,-1.194414,-0.895968,0.317415,0.164793
4393,-1.283990,-1.212059,-1.313679,0.015433,-1.583926


From the sklearn.decomposition package we use the PCA class to begin to determine the significance of each centrality measure. Since we are interested in 5 measures (Subgraph having been dropped), we set the number of components parameter to 5. We then fit the PCA model to our data.

In [4]:
# One component per centrality measure; fit() returns the estimator itself,
# so construction and fitting can be chained
pca = PCA(n_components=5).fit(data)
pca

In [5]:
# Eigenvalues of the component vectors, i.e. the amount of variance
# explained by each principal component
component_eigenvalues = pca.explained_variance_
component_eigenvalues

array([3.9615879 , 0.61651657, 0.30331731, 0.06619867, 0.05351746])

In [6]:
# Share of the total variance explained by each component vector,
# expressed as a fraction of 1 (not a percentage), shown to 3 d.p.
explained_variance = pca.explained_variance_ratio_
np.round(explained_variance, 3)

array([0.792, 0.123, 0.061, 0.013, 0.011])

In [7]:
# Eigenvectors (principal axes) of the components: one row per component,
# one column per centrality measure in `data`
vectors = pca.components_
vectors

array([[-0.46726302, -0.47557488, -0.44408134, -0.35869957, -0.47918701],
       [ 0.05405231, -0.2971279 , -0.43147515,  0.85005276,  0.00573143],
       [ 0.58893795, -0.2184966 , -0.52894292, -0.3851454 ,  0.42106263],
       [ 0.63759091, -0.30582864,  0.32058138,  0.01952789, -0.62991501],
       [ 0.15929077,  0.73774767, -0.48380566,  0.00515775, -0.44301276]])

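Since the first component explains 79% of the variance, we will only consider it. We now calculate the weight of each centrality measure contributing to the first principal component, taking each weight as the absolute value of the measure's loading divided by the sum of the absolute loadings of that component, so that the weights sum to 1.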

In [8]:
# Normalise the absolute loadings of each component (row) so that every
# row of weights sums to 1
abs_loadings = np.abs(vectors)
weights = abs_loadings / abs_loadings.sum(axis=1, keepdims=True)
weights

array([[0.21002418, 0.21376018, 0.19960454, 0.16122736, 0.21538374],
       [0.03299011, 0.1813481 , 0.26334518, 0.5188185 , 0.0034981 ],
       [0.27487256, 0.10197801, 0.24687132, 0.17975731, 0.19652081],
       [0.33321642, 0.15983152, 0.16754157, 0.01020562, 0.32920486],
       [0.08709103, 0.403358  , 0.26451711, 0.00281996, 0.2422139 ]])

In [9]:
# Keep only the weights of the first principal component (row 0);
# the [:1] slice preserves the 2-D shape of the array
weights[:1]

array([[0.21002418, 0.21376018, 0.19960454, 0.16122736, 0.21538374]])

Therefore our completed index is as follows:
$$ 0.21 \times \text{degree} + 0.21 \times \text{closeness} + 0.20 \times \text{eigenvector} + 0.16 \times \text{betweenness} + 0.22 \times \text{information} $$

In [3]:
# Hand check of the first-component weights: absolute loadings of the first
# principal component, copied by hand to 3 decimal places, normalised to sum to 1.
# NOTE(review): 0.475 is truncated from 0.47557488; rounding would give 0.476.
top = np.array([0.467,0.475,0.444,0.359,0.479])
res = top / top.sum()

# End the cell with the value so the normalised weights are actually displayed
res

array([0.20998201, 0.21357914, 0.19964029, 0.16142086, 0.2153777 ])