1. Load AminerData_v6.mat using scipy.io.loadmat
2. Load original layer Ai (first layer of multiplex network)
3. Format Ai into edgelist format for MultiVERSE
4. Take output matrices Fi from FASCINATE
5. Compute cosine similarity on each Fi to form Bi (second layer of multiplex network)
6. Create edgelist from Bi

In [None]:
"""
CREATING MULTIPLEX NETWORKS AND FORMATTING FOR MULTIVERSE
"""

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.io import loadmat
from sklearn.metrics.pairwise import cosine_similarity
import time

In [None]:
"""
LAYER A2 FORMATTING FOR PROPER MULTIVERSE INPUT
"""

In [2]:
# Load the MATLAB file into a dict keyed by variable name, then list the
# variables it contains.
aminer = loadmat('AminerData_v6.mat')
aminer.keys()

dict_keys(['__header__', '__version__', '__globals__', 'DO', 'DU', 'G', 'G_new'])

In [3]:
# import original AminerData
# Only layer 2 is extracted here; the A1 / A3 lines stay commented out so the
# notebook can be pointed at another layer.
# NOTE(review): the trailing [0][0][0] presumably unwraps the nested MATLAB
# struct/cell that loadmat returns for 'G' -- confirm against the .mat layout.
# A1 = aminer['G'][0,0][0][0][0] 
A2 = aminer['G'][0,1][0][0][0]
# A3 = aminer['G'][0,2][0][0][0]

In [4]:
# A1[0].shape, A3[0].shape  # representation of first node
# Shape of the first row of A2 (the representation of the first node).
A2[0].shape

(1, 899)

In [18]:
"""
edgelist format for MultiVERSE:
(layer, source, target, weight)
"""

class Edge():
    """Within-layer edges of one network layer, formatted for MultiVERSE.

    The constructor takes an adjacency matrix (anything ``np.nonzero``
    accepts as a 2-D input) and stores the (row, column) index pair of every
    nonzero entry in ``self.edges`` as an (n x 2) array.
    """

    def __init__(self, layer):
        # One row per nonzero entry of the layer: [source index, target index]
        self.edges = np.transpose(np.nonzero(layer))

    def edgelist_format(self, weight=1, save_as_txt=False, file_name=None,
                        layer_num=1):
        """
        Return the edges as MultiVERSE rows
        (layer_num, start node, finish node, weight).

        weight:      value written in the last column of every row
        save_as_txt: if True (and file_name is given) also saves the rows
                     to file_name + '.txt', space-delimited integers
        file_name:   name of the .txt file, without the extension; when it
                     is missing nothing is written even if save_as_txt is True
        layer_num:   layer id written in the first column of every row
                     (defaults to 1)

        Returns an (n x 4) numpy array with the same integer dtype as
        self.edges; an edge-less layer yields an array of shape (0, 4).

        e.g.
        1,2,3,1
        layer 1, edge from node 2 to 3, weight 1
        """
        edges = np.asarray(self.edges)
        n_edges = edges.shape[0]

        # Build the constant columns in the edges' own dtype so the result
        # stays an integer array, then glue all four columns together at once.
        new_edges = np.column_stack((
            np.full(n_edges, layer_num, dtype=edges.dtype),
            edges,
            np.full(n_edges, weight, dtype=edges.dtype),
        ))

        # save as input .txt file for MultiVERSE
        if save_as_txt and file_name:
            np.savetxt(file_name + '.txt', new_edges, fmt='%d', delimiter=' ')
        return new_edges

In [21]:
# Wrap the layer in Edge, which extracts its nonzero entries as (row, col) pairs.
# NOTE(review): this rebinds A2 from the raw layer to an Edge object, so the
# cell should not be re-run without first re-running the cell that loads A2.
A2 = Edge(A2)
A2.edges.shape

(4814, 2)

In [23]:
# Build the (layer, source, target, weight) rows for A2 with the default weight of 1.
A2_edges = A2.edgelist_format()

In [24]:
# Preview the first ten formatted edges.
A2_edges[:10]

array([[  1,   1,  22,   1],
       [  1,   1,  29,   1],
       [  1,   1,  48,   1],
       [  1,   1,  49,   1],
       [  1,   1,  57,   1],
       [  1,   1,  79,   1],
       [  1,   1,  86,   1],
       [  1,   1,  96,   1],
       [  1,   1, 100,   1],
       [  1,   1, 115,   1]], dtype=int32)

In [None]:
"""
LAYER B2 FORMATTING FOR PROPER MULTIVERSE INPUT
"""

In [None]:
# Read F2 from a comma-separated file and show its shape.
# Per the outline at the top of this notebook, the Fi matrices are the
# output matrices taken from FASCINATE.
F2 = np.genfromtxt('F2.csv', delimiter=',')
F2.shape

In [None]:
# Pairwise cosine similarity between all rows of F2, computed N rows at a
# time and then stitched back together column-wise.
N = 500  # rows of F2 compared against the full matrix per chunk
cos_sim_F2 = []

# Stepping with range(0, rows, N) yields only non-empty chunks, including
# when the row count is an exact multiple of N (cosine_similarity rejects an
# input with zero rows).
for i, start in enumerate(range(0, F2.shape[0], N)):
    tic = time.time()
    final_idx = min(start + N, F2.shape[0])
    # Each chunk has one row per row of F2 and one column per row in the slice
    cos_sim_i = cosine_similarity(F2, F2[start:final_idx])
    cos_sim_F2.append(cos_sim_i)
    toc = time.time()
    print(f"Iteration {i+1}: {toc-tic} seconds")

tic = time.time()
cos_sim_F2 = np.concatenate(cos_sim_F2, axis=1) # Concat all cos_sim_i
toc = time.time()
print(f"Concatenation took {toc-tic} seconds")

# Store the matrix under the key 'B2' so it can be read back as B2['B2'].
np.savez_compressed('B2', B2=cos_sim_F2)

In [None]:
# loading cosine similarity matrix for 2nd layer of AminerData
# np.load on an .npz file returns an archive indexed by the names the arrays
# were saved under; the access below requires an array stored as 'B2'.
#B1 = np.load('B1.npz')
B2 = np.load('B2.npz')
#B3 = np.load('B3.npz')

#B1['B1'].shape, B2['B2'].shape, B3['B3'].shape
B2['B2'].shape

In [None]:
mean1 = np.mean(B1['B1'])
median1 = np.median(B1['B1'])
std1 = np.std(B1['B1'])
threshold1 = mean1 + 3*std1

print("Cosine Similarities of B1")
print("")
print("mean:", mean1)
print("median:", median1)
print("STD:", std1)
print("99.9th %ile:", mean1 + 3*std1)
print("Number of edges:", len(B1['B1'][B1['B1'] > threshold1]))

plt.hist(B1['B1'][B1['B1'] > threshold1], bins='auto')
plt.title('Largest 0.1% of Similarities')
plt.plot()
plt.show()

In [None]:
# edges (matrix indices) for node similarity above threshold value
B1_edges = np.transpose(np.where(B1['B1'] > thresholdB1)) 
#B2_edges = np.transpose(np.where(B2['B2'] > thresholdB2)) 
type(B1_edges)

In [None]:
"""
edgelist format for MultiVERSE:
(layer, source, target, weight)
"""

iter = 0
B1_new_edges = []
for row in B1_edges:
    row = np.insert(row, 0, 1)
    row = list(np.insert(row, 3, 1))
    B1_new_edges.append(row)
    iter += 1

B1_new_edges = np.asarray(B1_new_edges)
B1_new_edges.shape

In [None]:
# Stack the two layers' edge lists into one multiplex edge list.
# A2_edges is the formatted edge list returned by A2.edgelist_format();
# B2_new_edges is expected to hold the formatted similarity-layer edge list.
layer2_multiplex_edges = np.vstack((A2_edges, B2_new_edges))
layer2_multiplex_edges.shape

In [None]:
# Display the combined edge list.
layer2_multiplex_edges

In [None]:
# Write one edge per line as space-separated integers.
np.savetxt('Aminer_layer2_multiplex_edges.txt', layer2_multiplex_edges, fmt='%d', delimiter=' ')  # save as input .txt file for MultiVERSE