In [33]:
import mxnet as mx
import numpy as np
import scipy as sp
import sys
import networkx as nx
import igraph as ig

In [65]:
class AutoEncoderModel:
    def __init__(self, dims, internal_act=None, output_act=None):
        self.data = mx.symbol.Variable('data')
        self.y = mx.symbol.Variable('label')
        self.fc1_weight = mx.symbol.Variable('fc1_weight')
        self.fc1_bias = mx.symbol.Variable('fc1_bias')
        self.fc2_weight = mx.symbol.Variable('fc2_weight')
        self.fc2_bias = mx.symbol.Variable('fc2_bias')
        x = mx.symbol.FullyConnected(data=self.data, weight=self.fc1_weight,
                                     bias=self.fc1_bias, num_hidden=dims[1])
        if (internal_act is not None):
            x = mx.symbol.Activation(data=x, act_type=internal_act)
            print("Internal activation: " + internal_act)
        self.layer1 = x
        x = mx.symbol.FullyConnected(data=x, weight=self.fc2_weight,
                                     bias=self.fc2_bias, num_hidden=dims[2])
        if (output_act is not None):
            x = mx.symbol.Activation(data=x, act_type=output_act)
            print("Output activation: " + output_act)
        self.layer2 = x
        self.loss = mx.symbol.LinearRegressionOutput(data=x, label=self.y)

    def fit(self, data, batch_size, params=None, learning_rate=0.005):
        data_iter = mx.io.NDArrayIter(data={'data':data}, label={'label':data},
                batch_size=batch_size, shuffle=True,
                last_batch_handle='roll_over')
        self.model = mx.mod.Module(symbol=self.loss, data_names=['data'], label_names = ['label'])
        self.model.bind(data_shapes=data_iter.provide_data, label_shapes=data_iter.provide_label)
        if (params is not None):
            self.model.set_params(arg_params=params, aux_params=None, force_init=True)
        print("Learning rate: " + str(learning_rate))
        self.model.fit(data_iter, optimizer_params={'learning_rate':learning_rate, 'momentum': 0.9},
                num_epoch=50, eval_metric='mse',
                batch_end_callback = mx.callback.Speedometer(batch_size, 2))

In [66]:
def train(data, num_dims, num_epoc, internal_act=None, output_act=None, learning_rate=0.005):
    int_epoc = 100
    params = None
    for i in range(num_epoc/int_epoc):
        model = AutoEncoderModel([data.shape[1], num_dims, data.shape[1]],
                                 internal_act, output_act)
        model.fit(data, int_epoc, params, learning_rate)

        params = model.model.get_params()[0]
        fc1_weight = params.get('fc1_weight').asnumpy()
        fc1_bias = params.get('fc1_bias').asnumpy()
        fc2_weight = params.get('fc2_weight').asnumpy()
        fc2_bias = params.get('fc2_bias').asnumpy()

        np_data = data.asnumpy()
        hidden = np.dot(np_data, fc1_weight.T) + fc1_bias
        output = np.dot(hidden, fc2_weight.T) + fc2_bias
        print("epoc " + str(i * int_epoc) + ": " + str(np.sum(np.square(output - np_data))))
        sys.stdout.flush()
    return params

## Run on a low-rank data

In [6]:
data1 = mx.ndarray.random_uniform(shape=[10000, 10])
data2 = mx.ndarray.random_uniform(shape=[10, 100])
data = mx.ndarray.dot(data1, data2)
print(data.shape)

(10000L, 100L)


## Run on real data

We compute the embedding on a graph with 81306 vertices and 1768149 vertices. To embed the graph into 10 dimensions, we start with the most densest columns and increase the number of columns to embed. When we increase the number of columns to embed, we use the parameters trained from the previous run (on the dataset with a smaller number of columns).

In [19]:
elg = nx.read_edgelist("/home/ubuntu/datasets/twitter_combined.txt")
spm = nx.to_scipy_sparse_matrix(elg, dtype='f')
colsum = np.ravel(spm.sum(axis=1))

### Compute the embedding on the densest 10 columns.

In [58]:
max10 = np.sort(np.ravel(colsum), axis=None)[len(colsum) - 10]
data10 = mx.ndarray.sparse.csr_matrix(spm[:,colsum >= max10])
print(mx.ndarray.sum(data10, axis=0))
print(data10.shape)


[ 2490.  3383.  2484.  2758.  2476.  1789.  2133.  3011.  3239.  2155.]
<NDArray 10 @cpu(0)>
(81306L, 10L)


In [60]:
params_linear10=train(data10, 10, 5000, internal_act=None)



epoc 0: 3093.22




epoc 100: 1652.24




epoc 200: 814.882




epoc 300: 511.462




epoc 400: 343.608




epoc 500: 258.648




epoc 600: 145.061




epoc 700: 96.793




epoc 800: 90.1161




epoc 900: 89.4603




epoc 1000: 89.1508




epoc 1100: 88.7256




epoc 1200: 88.0646




epoc 1300: 87.0461




epoc 1400: 85.4871




epoc 1500: 83.1234




epoc 1600: 79.6129




epoc 1700: 74.5397




epoc 1800: 67.513




epoc 1900: 58.3442




epoc 2000: 47.3315




epoc 2100: 35.433




epoc 2200: 24.1525




epoc 2300: 14.8976




epoc 2400: 8.34088




epoc 2500: 4.28864




epoc 2600: 2.05944




epoc 2700: 0.93874




epoc 2800: 0.412521




epoc 2900: 0.17679




epoc 3000: 0.0748465




epoc 3100: 0.031476




epoc 3200: 0.0136235




epoc 3300: 0.0065254




epoc 3400: 0.00382589




epoc 3500: 0.00253897




epoc 3600: 0.00194678




epoc 3700: 0.00166125




epoc 3800: 0.00147335




epoc 3900: 0.00138092




epoc 4000: 0.00130549




epoc 4100: 0.00120604




epoc 4200: 0.00112349




epoc 4300: 0.00107369




epoc 4400: 0.00103317




epoc 4500: 0.00100734




epoc 4600: 0.000975417




epoc 4700: 0.00092864




epoc 4800: 0.000900516




epoc 4900: 0.000881277


In [None]:
params_sigmoid10=train(data10, 10, 5000, internal_act='sigmoid', learning_rate=0.001)



Internal activation: sigmoid
epoc 0: 25502.4




Internal activation: sigmoid
epoc 100: 25582.6




Internal activation: sigmoid
epoc 200: 24295.8




Internal activation: sigmoid
epoc 300: 20895.3




Internal activation: sigmoid
epoc 400: 80275.3




Internal activation: sigmoid
epoc 500: 160455.0




Internal activation: sigmoid
epoc 600: 188911.0




Internal activation: sigmoid
epoc 700: 195222.0




Internal activation: sigmoid
epoc 800: 195682.0




Internal activation: sigmoid
epoc 900: 196449.0




Internal activation: sigmoid
epoc 1500: 310680.0




Internal activation: sigmoid
epoc 1600: 327468.0




Internal activation: sigmoid
epoc 1700: 341137.0




Internal activation: sigmoid
epoc 1800: 351887.0




Internal activation: sigmoid
epoc 1900: 360453.0




Internal activation: sigmoid
epoc 2000: 367825.0




Internal activation: sigmoid
epoc 2100: 374935.0




Internal activation: sigmoid
epoc 2200: 382565.0




Internal activation: sigmoid
epoc 2300: 391142.0




Internal activation: sigmoid
epoc 2400: 400780.0




Internal activation: sigmoid
epoc 2500: 411227.0




Internal activation: sigmoid
epoc 2600: 422099.0




Internal activation: sigmoid
epoc 2700: 432990.0




Internal activation: sigmoid


In [53]:
params_relu10=train(data10, 10, 5000, internal_act='relu')



epoc 0: 3573.51




epoc 100: 956.345




epoc 200: 846.814




epoc 300: 635.075




epoc 400: 348.802




epoc 500: 242.406




epoc 600: 133.64




epoc 700: 94.9056




epoc 800: 90.1886




epoc 900: 89.812




epoc 1000: 89.7259




epoc 1100: 89.621




epoc 1200: 89.4616




epoc 1300: 89.2094




epoc 1400: 88.8193




epoc 1500: 88.2182




epoc 1600: 87.2756




epoc 1700: 85.8358




epoc 1800: 83.6505




epoc 1900: 80.3919




epoc 2000: 75.6532




epoc 2100: 69.02




epoc 2200: 60.2623




epoc 2300: 49.541




epoc 2400: 37.7115




epoc 2500: 26.1873




epoc 2600: 16.4648




epoc 2700: 9.38381




epoc 2800: 4.89828




epoc 2900: 2.37953




epoc 3000: 1.0941




epoc 3100: 0.483896




epoc 3200: 0.208503




epoc 3300: 0.088386




epoc 3400: 0.03732




epoc 3500: 0.0160456




epoc 3600: 0.00749139




epoc 3700: 0.0040901




epoc 3800: 0.00261573




epoc 3900: 0.00197444




epoc 4000: 0.00167518




epoc 4100: 0.0014678




epoc 4200: 0.00137893




epoc 4300: 0.00131385




epoc 4400: 0.00120454




epoc 4500: 0.00116306




epoc 4600: 0.00111296




epoc 4700: 0.00108219




epoc 4800: 0.00105612




epoc 4900: 0.0010254


### Compute the embedding on the densest 30 columns.

In [20]:
max30 = np.sort(np.ravel(colsum), axis=None)[len(colsum) - 30]
sp_data30 = spm[:,colsum >= max30]
data30 = mx.ndarray.sparse.csr_matrix(sp_data30)
print(mx.ndarray.sum(data30, axis=0))
print(data30.shape)


[ 1256.  1377.  1467.  1251.  1521.  1229.  2490.  1695.  1291.  1387.
  1275.  1743.  1443.  1497.  3383.  2484.  2758.  1666.  1255.  2476.
  1789.  1358.  1269.  1395.  2133.  3011.  3239.  1568.  2155.  1509.]
<NDArray 30 @cpu(0)>
(81306L, 30L)


We want to start with SVD and see how well it performs.
One question we need to address is **what is the advantage of autoencoder over SVD**.

In [32]:
U, s, Vh = sp.sparse.linalg.svds(sp_data30, k=10)
res = np.dot(sp_data30.dot(Vh.T), Vh)
print("svd error: " + str(np.sum(np.square(res - sp_data30))))

svd error: 19096.5


In [57]:
params_linear30=train(data30, 5000, internal_act=None)



epoc 0: 36343.5




epoc 100: 28741.8




epoc 200: 24386.4




epoc 300: 20273.6




epoc 400: 19463.6




epoc 500: 19367.5




epoc 600: 19306.5




epoc 700: 19260.0




epoc 800: 19224.1




epoc 900: 19196.2




epoc 1000: 19174.4




epoc 1100: 19157.3




epoc 1200: 19143.6




epoc 1300: 19132.7




epoc 1400: 19123.9




epoc 1500: 19116.8




epoc 1600: 19111.0




epoc 1700: 19106.2




epoc 1800: 19102.3




epoc 1900: 19099.0




epoc 2000: 19096.3




epoc 2100: 19094.0




epoc 2200: 19092.1




epoc 2300: 19090.5




epoc 2400: 19089.1




epoc 2500: 19088.0




epoc 2600: 19087.0




epoc 2700: 19086.1




epoc 2800: 19085.4




epoc 2900: 19084.8




epoc 3000: 19084.2




epoc 3100: 19083.8




epoc 3200: 19083.4




epoc 3300: 19083.0




epoc 3400: 19082.8




epoc 3500: 19082.5




epoc 3600: 19082.3




epoc 3700: 19082.1




epoc 3800: 19081.9




epoc 3900: 19081.8




epoc 4000: 19081.6




epoc 4100: 19081.6




epoc 4200: 19081.5




epoc 4300: 19081.4




epoc 4400: 19081.3




epoc 4500: 19081.2




epoc 4600: 19081.2




epoc 4700: 19081.1




epoc 4800: 19081.1




epoc 4900: 19081.1


In [58]:
params_linear30=train(data30, 5000, internal_act='sigmoid')



epoc 0: 35963.1




epoc 100: 28377.2




epoc 200: 23608.4




epoc 300: 20077.8




epoc 400: 19401.5




epoc 500: 19321.8




epoc 600: 19272.9




epoc 700: 19235.8




epoc 800: 19206.8




epoc 900: 19184.0




epoc 1000: 19165.9




epoc 1100: 19151.3




epoc 1200: 19139.4




epoc 1300: 19129.8




epoc 1400: 19121.8




epoc 1500: 19115.2




epoc 1600: 19109.8




epoc 1700: 19105.3




epoc 1800: 19101.6




epoc 1900: 19098.4




epoc 2000: 19095.8




epoc 2100: 19093.6




epoc 2200: 19091.7




epoc 2300: 19090.2




epoc 2400: 19088.9




epoc 2500: 19087.7




epoc 2600: 19086.8




epoc 2700: 19086.0




epoc 2800: 19085.3




epoc 2900: 19084.7




epoc 3000: 19084.2




epoc 3100: 19083.7




epoc 3200: 19083.4




epoc 3300: 19083.0




epoc 3400: 19082.8




epoc 3500: 19082.6




epoc 3600: 19082.3




epoc 3700: 19082.2




epoc 3800: 19082.0




epoc 3900: 19081.9




epoc 4000: 19081.8




epoc 4100: 19081.6




epoc 4200: 19081.6




epoc 4300: 19081.5




epoc 4400: 19081.4




epoc 4500: 19081.4




epoc 4600: 19081.3




epoc 4700: 19081.3




epoc 4800: 19081.2




epoc 4900: 19081.1


### Compute the embedding on the densest 1000 columns.

In [49]:
max1000 = np.sort(np.ravel(colsum), axis=None)[len(colsum) - 1000]
sp_data1000 = spm[:,colsum >= max1000]
data1000 = mx.ndarray.sparse.csr_matrix(sp_data1000)
print(mx.ndarray.sum(data1000, axis=0))
print(data1000.shape)


[ 571.  319.  282. ...,  484.  304.  330.]
<NDArray 1004 @cpu(0)>
(81306L, 1004L)


In [56]:
U, s, Vh = sp.sparse.linalg.svds(sp_data1000, k=100)
res = np.dot(sp_data1000.dot(Vh.T), Vh)
print("svd error: " + str(np.sum(np.square(res - sp_data1000))))

svd error: 203856.0


In [54]:
params_linear1000=train(data1000, num_dims=100, num_epoc=5000, internal_act=None)



epoc 0: 429250.0




epoc 100: 427631.0




epoc 200: 423406.0




epoc 300: 413211.0




epoc 400: 398040.0




epoc 500: 386167.0




epoc 600: 379577.0




epoc 700: 375938.0




epoc 800: 372874.0




epoc 900: 369348.0




epoc 1000: 365236.0




epoc 1100: 360777.0




epoc 1200: 356297.0




epoc 1300: 352035.0




epoc 1400: 348084.0




epoc 1500: 344448.0




epoc 1600: 341119.0




epoc 1700: 338096.0




epoc 1800: 335370.0




epoc 1900: 332904.0




epoc 2000: 330645.0




epoc 2100: 328525.0




epoc 2200: 326488.0




epoc 2300: 324501.0




epoc 2400: 322552.0




epoc 2500: 320649.0




epoc 2600: 318803.0




epoc 2700: 317032.0




epoc 2800: 315343.0




epoc 2900: 313742.0




epoc 3000: 312225.0




epoc 3100: 310788.0




epoc 3200: 309420.0




epoc 3300: 308112.0




epoc 3400: 306851.0




epoc 3500: 305630.0




epoc 3600: 304442.0




epoc 3700: 303280.0




epoc 3800: 302139.0




epoc 3900: 301014.0




epoc 4000: 299901.0




epoc 4100: 298800.0




epoc 4200: 297706.0




epoc 4300: 296620.0




epoc 4400: 295540.0




epoc 4500: 294467.0




epoc 4600: 293402.0




epoc 4700: 292343.0




epoc 4800: 291292.0




epoc 4900: 290252.0


In [47]:
pref_matrix = [[0.9, 0.1], [0.1, 0.9]]
block_sizes = [70, 30]
g = ig.Graph.SBM(100, pref_matrix, block_sizes, directed=True)
sim_spm = g.get_adjacency()