In [1]:
import mxnet as mx
import numpy as np
import scipy as sp
import sys
import networkx as nx
import igraph as ig
import logging

In [102]:
class AutoEncoderModel:
    def __init__(self, dims, internal_act=None, output_act=None):
        self.data = mx.symbol.Variable('data')
        self.y = mx.symbol.Variable('label')
        self.fc1_weight = mx.symbol.Variable('fc1_weight')
        self.fc1_bias = mx.symbol.Variable('fc1_bias')
        self.fc2_weight = mx.symbol.Variable('fc2_weight')
        self.fc2_bias = mx.symbol.Variable('fc2_bias')
        x = mx.symbol.FullyConnected(data=self.data, weight=self.fc1_weight,
                                     bias=self.fc1_bias, num_hidden=dims[1])
        if (internal_act is not None):
            x = mx.symbol.Activation(data=x, act_type=internal_act)
            print("Internal activation: " + internal_act)
        self.layer1 = x
        x = mx.symbol.FullyConnected(data=x, weight=self.fc2_weight,
                                     bias=self.fc2_bias, num_hidden=dims[2])
        if (output_act is not None):
            x = mx.symbol.Activation(data=x, act_type=output_act)
            print("Output activation: " + output_act)
        self.layer2 = x
        self.loss = mx.symbol.LinearRegressionOutput(data=x, label=self.y)
        self.model = mx.mod.Module(symbol=self.loss, data_names=['data'], label_names = ['label'])

    def fit(self, data, batch_size, num_epoch, params=None, learning_rate=0.005, reinit_opt=True):
        data_iter = mx.io.NDArrayIter(data={'data':data}, label={'label':data},
                batch_size=batch_size, shuffle=True,
                last_batch_handle='roll_over')
        
        if (params is None):
            print("Learning rate: " + str(learning_rate))
            print("batch size: " + str(batch_size))
            print("internal #epochs: " + str(num_epoch))
            # allocate memory given the input data and label shapes
            self.model.bind(data_shapes=data_iter.provide_data, label_shapes=data_iter.provide_label)
            # initialize parameters by uniform random numbers
            self.model.init_params(initializer=mx.init.Uniform(scale=.1))
            # use SGD with learning rate 0.1 to train
            self.model.init_optimizer(optimizer='sgd',
                                      optimizer_params={'learning_rate': learning_rate,
                                                        'momentum': 0.9})
        else:
            self.model.set_params(arg_params=params, aux_params=None, force_init=True)
            if (reinit_opt):
                print("reinit optimizer. New learning rate: " + str(learning_rate))
                self.model.init_optimizer(optimizer='sgd',
                                          optimizer_params={'learning_rate': learning_rate,
                                                            'momentum': 0.9}, force_init=True)
        # use accuracy as the metric
        metric = mx.metric.create('acc')
        # train 5 epochs, i.e. going over the data iter one pass
        for epoch in range(num_epoch):
            data_iter.reset()
            metric.reset()
            for batch in data_iter:
                self.model.forward(batch, is_train=True)       # compute predictions
                self.model.update_metric(metric, batch.label)  # accumulate prediction accuracy
                self.model.backward()                          # compute gradients
                self.model.update()                            # update parameters
            #print('Epoch %d, Training %s' % (epoch, metric.get()))
        #self.model.fit(data_iter, optimizer_params={'learning_rate':learning_rate, 'momentum': 0.9},
        #        optimizer='sgd', num_epoch=50, eval_metric='mse', force_rebind=True,
        #        batch_end_callback = mx.callback.Speedometer(batch_size, 2))

In [103]:
def train(data, num_dims, num_epoc, internal_act=None, output_act=None, learning_rate=0.005, batch_size=50):
    int_epoc = 100
    params = None
    model = AutoEncoderModel([data.shape[1], num_dims, data.shape[1]],
                             internal_act, output_act)
    prev_val = None
    reinit_opt = True
    for i in range(num_epoc/int_epoc):
        model.fit(data, batch_size, int_epoc, params, learning_rate, reinit_opt=reinit_opt)
        reinit_opt = False

        params = model.model.get_params()[0]
        fc1_weight = params.get('fc1_weight').asnumpy()
        fc1_bias = params.get('fc1_bias').asnumpy()
        fc2_weight = params.get('fc2_weight').asnumpy()
        fc2_bias = params.get('fc2_bias').asnumpy()

        np_data = data.asnumpy()
        hidden = np.dot(np_data, fc1_weight.T) + fc1_bias
        output = np.dot(hidden, fc2_weight.T) + fc2_bias
        val = np.sum(np.square(output - np_data))
        print("epoc " + str(i * int_epoc) + ": " + str(val))
        if (prev_val is not None and prev_val < val):
            learning_rate = learning_rate / 2
            reinit_opt = True
        prev_val = val
        sys.stdout.flush()
    return params

## Run on a low-rank data

In [63]:
rand_data1 = mx.ndarray.random_uniform(shape=[1000, 10])
rand_data2 = mx.ndarray.random_uniform(shape=[10, 100])
rand_data = mx.ndarray.dot(rand_data1, rand_data2)
print("max: " + str(mx.ndarray.max(rand_data)))
rand_data = rand_data / mx.ndarray.max(rand_data)
print(rand_data.shape)

max: 
[ 6.10102654]
<NDArray 1 @cpu(0)>
(1000L, 100L)


In [107]:
np_rand_data = rand_data.asnumpy()
U, s, Vh = sp.sparse.linalg.svds(np_rand_data, k=10)
low_dim_data = np.dot(np_rand_data, Vh.T)
print(low_dim_data)
print(sum(low_dim_data[low_dim_data > 0]))
print(sum(low_dim_data[low_dim_data < 0]))
res = np.dot(low_dim_data, Vh)
print("svd error: " + str(np.sum(np.square(res - np_rand_data))))

[[ 0.01955516  0.03985536 -0.07798745 ..., -0.05676827 -0.22096498
  -4.35739613]
 [ 0.06220453  0.07652628  0.03788156 ...,  0.16164172  0.0135784
  -3.72151923]
 [-0.0311646   0.03257374 -0.3144455  ..., -0.03104994  0.25370428
  -4.25794554]
 ..., 
 [-0.02363796 -0.00566644 -0.08939761 ...,  0.00696465 -0.15669249
  -3.04979634]
 [-0.0161101  -0.05258452 -0.21494456 ...,  0.14610089 -0.06200607
  -3.78889537]
 [ 0.1601655   0.02495101 -0.0750294  ...,  0.10465452  0.2186821
  -2.50724936]]
502.355621757
-4607.53736694
svd error: 1.68328e-08


In [82]:
params_linear_r10=train(rand_data, 10, 5000, learning_rate=0.2, batch_size=50)

Learning rate: 0.2
batch size: 50
internal #epochs: 100
epoc 0: 157.689
epoc 100: 120.806
epoc 200: 76.3864
epoc 300: 48.0992
epoc 400: 30.3625
epoc 500: 17.6435
epoc 600: 9.88616
epoc 700: 5.48403
epoc 800: 2.71852
epoc 900: 1.1848
epoc 1000: 0.489893
epoc 1100: 0.204632
epoc 1200: 0.088024
epoc 1300: 0.0387866
epoc 1400: 0.0173438
epoc 1500: 0.00782017
epoc 1600: 0.00353995
epoc 1700: 0.00160629
epoc 1800: 0.000729828
epoc 1900: 0.000331956
epoc 2000: 0.00015116
epoc 2100: 6.90351e-05
epoc 2200: 3.17257e-05
epoc 2300: 1.48614e-05
epoc 2400: 7.34132e-06
epoc 2500: 4.08044e-06
epoc 2600: 2.64298e-06
epoc 2700: 2.08031e-06
epoc 2800: 1.73534e-06
epoc 2900: 1.63155e-06
epoc 3000: 1.55802e-06
epoc 3100: 1.51333e-06
epoc 3200: 1.48502e-06
epoc 3300: 1.41454e-06
epoc 3400: 1.40252e-06
epoc 3500: 1.37172e-06
epoc 3600: 1.357e-06
epoc 3700: 1.34649e-06
epoc 3800: 1.34212e-06
epoc 3900: 1.33692e-06
epoc 4000: 1.32821e-06
epoc 4100: 1.32531e-06
epoc 4200: 1.3017e-06
epoc 4300: 1.29849e-06
epoc 

In [108]:
params_sigmoid_r10=train(rand_data, 10, 5000, internal_act='sigmoid', learning_rate=0.0005, batch_size=100)

Internal activation: tanh
Learning rate: 0.0005
batch size: 100
internal #epochs: 100
epoc 0: 14712.6
epoc 100: 10943.1
epoc 200: 6606.42
epoc 300: 3053.52
epoc 400: 1170.41
epoc 500: 740.22
epoc 600: 1099.36
reinit optimizer. New learning rate: 0.00025
epoc 700: 1392.52
reinit optimizer. New learning rate: 0.000125
epoc 800: 1546.96
reinit optimizer. New learning rate: 6.25e-05
epoc 900: 1624.36
reinit optimizer. New learning rate: 3.125e-05
epoc 1000: 1662.95
reinit optimizer. New learning rate: 1.5625e-05
epoc 1100: 1682.19
reinit optimizer. New learning rate: 7.8125e-06
epoc 1200: 1691.79
reinit optimizer. New learning rate: 3.90625e-06
epoc 1300: 1696.7
reinit optimizer. New learning rate: 1.953125e-06
epoc 1400: 1698.88
reinit optimizer. New learning rate: 9.765625e-07
epoc 1500: 1699.7
reinit optimizer. New learning rate: 4.8828125e-07
epoc 1600: 1699.94
reinit optimizer. New learning rate: 2.44140625e-07
epoc 1700: 1699.99
reinit optimizer. New learning rate: 1.220703125e-07
ep

KeyboardInterrupt: 

## Run on real data

We compute the embedding on a graph with 81306 vertices and 1768149 vertices. To embed the graph into 10 dimensions, we start with the most densest columns and increase the number of columns to embed. When we increase the number of columns to embed, we use the parameters trained from the previous run (on the dataset with a smaller number of columns).

In [110]:
elg = nx.read_edgelist("/home/ubuntu/datasets/twitter_combined.txt")
spm = nx.to_scipy_sparse_matrix(elg, dtype='f')
colsum = np.ravel(spm.sum(axis=1))

### Compute the embedding on the densest 10 columns.

In [39]:
max10 = np.sort(np.ravel(colsum), axis=None)[len(colsum) - 10]
data10 = mx.ndarray.sparse.csr_matrix(spm[:,colsum >= max10])
print(mx.ndarray.sum(data10, axis=0))
print(data10.shape)


[ 2490.  3383.  2484.  2758.  2476.  1789.  2133.  3011.  3239.  2155.]
<NDArray 10 @cpu(0)>
(81306L, 10L)


In [114]:
np_data10 = data10.asnumpy()
U, s, Vh = sp.linalg.svd(np_data10, full_matrices=False)
low_dim_data = np.dot(np_data10, Vh.T)
print(low_dim_data.shape)
print(np.max(low_dim_data))
print(np.min(low_dim_data))
res = np.dot(low_dim_data, Vh)
print("svd error: " + str(np.sum(np.square(res - np_data10))))

(81306, 10)
1.21314
-2.3992
svd error: 6.78263e-09


In [40]:
params_linear10=train(data10, 10, 5000, internal_act=None)

Learning rate: 0.005
epoc 0: 3479.27




Learning rate: 0.005
epoc 100: 1024.56




Learning rate: 0.005
epoc 200: 831.03




Learning rate: 0.005
epoc 300: 574.544




Learning rate: 0.005
epoc 400: 356.311




Learning rate: 0.005
epoc 500: 273.701




Learning rate: 0.005
epoc 600: 157.228




Learning rate: 0.005
epoc 700: 99.5471




Learning rate: 0.005
epoc 800: 90.7048




Learning rate: 0.005
epoc 900: 89.9733




Learning rate: 0.005
epoc 1000: 89.8885




Learning rate: 0.005
epoc 1100: 89.8341




Learning rate: 0.005
epoc 1200: 89.7637




Learning rate: 0.005


KeyboardInterrupt: 

In [None]:
params_sigmoid10=train(data10, 10, 5000, internal_act='tanh', learning_rate=0.0001)

Internal activation: tanh
Learning rate: 0.0001
batch size: 50
internal #epochs: 100
epoc 0: 18296.1


In [None]:
params_relu10=train(data10, 10, 5000, internal_act='relu')

### Compute the embedding on the densest 30 columns.

In [None]:
max30 = np.sort(np.ravel(colsum), axis=None)[len(colsum) - 30]
sp_data30 = spm[:,colsum >= max30]
data30 = mx.ndarray.sparse.csr_matrix(sp_data30)
print(mx.ndarray.sum(data30, axis=0))
print(data30.shape)

We want to start with SVD and see how well it performs.
One question we need to address is **what is the advantage of autoencoder over SVD**.

In [None]:
U, s, Vh = sp.sparse.linalg.svds(sp_data30, k=10)
res = np.dot(sp_data30.dot(Vh.T), Vh)
print("svd error: " + str(np.sum(np.square(res - sp_data30))))

In [None]:
params_linear30=train(data30, 5000, internal_act=None)

In [None]:
params_linear30=train(data30, 5000, internal_act='sigmoid')

### Compute the embedding on the densest 1000 columns.

In [None]:
max1000 = np.sort(np.ravel(colsum), axis=None)[len(colsum) - 1000]
sp_data1000 = spm[:,colsum >= max1000]
data1000 = mx.ndarray.sparse.csr_matrix(sp_data1000)
print(mx.ndarray.sum(data1000, axis=0))
print(data1000.shape)

In [None]:
U, s, Vh = sp.sparse.linalg.svds(sp_data1000, k=100)
res = np.dot(sp_data1000.dot(Vh.T), Vh)
print("svd error: " + str(np.sum(np.square(res - sp_data1000))))

In [None]:
params_linear1000=train(data1000, num_dims=100, num_epoc=5000, internal_act=None)

In [None]:
pref_matrix = [[0.9, 0.1], [0.1, 0.9]]
block_sizes = [70, 30]
g = ig.Graph.SBM(100, pref_matrix, block_sizes, directed=True)
sim_spm = g.get_adjacency()