In [10]:
import mxnet as mx
import numpy as np
import sys
import networkx as nx

In [6]:
class AutoEncoderModel:
    def __init__(self, dims, internal_act=None):
        self.data = mx.symbol.Variable('data')
        self.y = mx.symbol.Variable('label')
        self.fc1_weight = mx.symbol.Variable('fc1_weight')
        self.fc1_bias = mx.symbol.Variable('fc1_bias')
        self.fc2_weight = mx.symbol.Variable('fc2_weight')
        self.fc2_bias = mx.symbol.Variable('fc2_bias')
        x = mx.symbol.FullyConnected(data=self.data, weight=self.fc1_weight,
                                     bias=self.fc1_bias, num_hidden=dims[1])
        self.layer1 = x
        if (internal_act is not None):
            x = mx.symbol.Activation(data=x, act_type=internal_act)
        x = mx.symbol.FullyConnected(data=x, weight=self.fc2_weight,
                                     bias=self.fc2_bias, num_hidden=dims[2])
        self.layer2 = x
        self.loss = mx.symbol.LinearRegressionOutput(data=x, label=self.y)

    def fit(self, data, batch_size, params=None):
        data_iter = mx.io.NDArrayIter(data={'data':data}, label={'label':data},
                batch_size=batch_size, shuffle=True,
                last_batch_handle='roll_over')
        self.model = mx.mod.Module(symbol=self.loss, data_names=['data'], label_names = ['label'])
        self.model.bind(data_shapes=data_iter.provide_data, label_shapes=data_iter.provide_label)
        if (params is not None):
            self.model.set_params(arg_params=params, aux_params=None, force_init=True)
        self.model.fit(data_iter, optimizer_params={'learning_rate':0.005, 'momentum': 0.9},
                num_epoch=50, eval_metric='mse',
                batch_end_callback = mx.callback.Speedometer(batch_size, 2))

In [7]:
def train(data, num_epoc, internal_act=None):
    int_epoc = 100
    params = None
    for i in range(num_epoc/int_epoc):
        model = AutoEncoderModel([data.shape[1], 10, data.shape[1]])
        model.fit(data, int_epoc, params)

        params = model.model.get_params()[0]
        fc1_weight = params.get('fc1_weight').asnumpy()
        fc1_bias = params.get('fc1_bias').asnumpy()
        fc2_weight = params.get('fc2_weight').asnumpy()
        fc2_bias = params.get('fc2_bias').asnumpy()

        np_data = data.asnumpy()
        hidden = np.dot(np_data, fc1_weight.T) + fc1_bias
        output = np.dot(hidden, fc2_weight.T) + fc2_bias
        print("epoc " + str(i * int_epoc) + ": " + str(np.sum(np.square(output - np_data))))
        sys.stdout.flush()
    return params

In [8]:
data1 = mx.ndarray.random_uniform(shape=[10000, 10])
data2 = mx.ndarray.random_uniform(shape=[10, 100])
data = mx.ndarray.dot(data1, data2)
print(data.shape)

(10000L, 100L)


## Run on real data

We compute the embedding on a graph with 81306 vertices and 1768149 vertices. To embed the graph into 10 dimensions, we start with the most densest columns and increase the number of columns to embed. When we increase the number of columns to embed, we use the parameters trained from the previous run (on the dataset with a smaller number of columns).

In [44]:
elg = nx.read_edgelist("/home/ubuntu/datasets/twitter_combined.txt")
spm = nx.to_scipy_sparse_matrix(elg)
colsum = np.ravel(spm.sum(axis=1))

### Compute the embedding on the densest 10 columns.

In [51]:
max10 = np.sort(np.ravel(colsum), axis=None)[len(colsum) - 10]
data10 = mx.ndarray.sparse.csr_matrix(spm[:,colsum >= max10])
print(mx.ndarray.sum(data, axis=0))
print(data.shape)


[ 2490.  3383.  2484.  2758.  2476.  1789.  2133.  3011.  3239.  2155.]
<NDArray 10 @cpu(0)>
(81306L, 10L)


In [60]:
params_linear10=train(data10, 5000, internal_act=None)



epoc 0: 3093.22




epoc 100: 1652.24




epoc 200: 814.882




epoc 300: 511.462




epoc 400: 343.608




epoc 500: 258.648




epoc 600: 145.061




epoc 700: 96.793




epoc 800: 90.1161




epoc 900: 89.4603




epoc 1000: 89.1508




epoc 1100: 88.7256




epoc 1200: 88.0646




epoc 1300: 87.0461




epoc 1400: 85.4871




epoc 1500: 83.1234




epoc 1600: 79.6129




epoc 1700: 74.5397




epoc 1800: 67.513




epoc 1900: 58.3442




epoc 2000: 47.3315




epoc 2100: 35.433




epoc 2200: 24.1525




epoc 2300: 14.8976




epoc 2400: 8.34088




epoc 2500: 4.28864




epoc 2600: 2.05944




epoc 2700: 0.93874




epoc 2800: 0.412521




epoc 2900: 0.17679




epoc 3000: 0.0748465




epoc 3100: 0.031476




epoc 3200: 0.0136235




epoc 3300: 0.0065254




epoc 3400: 0.00382589




epoc 3500: 0.00253897




epoc 3600: 0.00194678




epoc 3700: 0.00166125




epoc 3800: 0.00147335




epoc 3900: 0.00138092




epoc 4000: 0.00130549




epoc 4100: 0.00120604




epoc 4200: 0.00112349




epoc 4300: 0.00107369




epoc 4400: 0.00103317




epoc 4500: 0.00100734




epoc 4600: 0.000975417




epoc 4700: 0.00092864




epoc 4800: 0.000900516




epoc 4900: 0.000881277


In [52]:
params_sigmoid10=train(data10, 5000, internal_act='sigmod')



epoc 0: 3203.39




epoc 100: 955.346




epoc 200: 834.44




epoc 300: 581.168




epoc 400: 335.639




epoc 500: 230.148




epoc 600: 126.122




epoc 700: 93.4967




epoc 800: 89.6824




epoc 900: 89.1454




epoc 1000: 88.684




epoc 1100: 87.9974




epoc 1200: 86.936




epoc 1300: 85.3147




epoc 1400: 82.8662




epoc 1500: 79.2316




epoc 1600: 74.0054




epoc 1700: 66.7879




epoc 1800: 57.4339




epoc 1900: 46.2934




epoc 2000: 34.391




epoc 2100: 23.239




epoc 2200: 14.2056




epoc 2300: 7.88946




epoc 2400: 4.02984




epoc 2500: 1.92389




epoc 2600: 0.873445




epoc 2700: 0.382925




epoc 2800: 0.164155




epoc 2900: 0.0694609




epoc 3000: 0.0293272




epoc 3100: 0.0126352




epoc 3200: 0.00589858




epoc 3300: 0.0032963




epoc 3400: 0.00218487




epoc 3500: 0.00166357




epoc 3600: 0.00146754




epoc 3700: 0.00126344




epoc 3800: 0.00118147




epoc 3900: 0.00110551




epoc 4000: 0.00105736




epoc 4100: 0.00103853




epoc 4200: 0.001006




epoc 4300: 0.000992741




epoc 4400: 0.000968575




epoc 4500: 0.000948211




epoc 4600: 0.000939742




epoc 4700: 0.000920065




epoc 4800: 0.000907595




epoc 4900: 0.000887442


In [53]:
params_relu10=train(data10, 5000, internal_act='relu')



epoc 0: 3573.51




epoc 100: 956.345




epoc 200: 846.814




epoc 300: 635.075




epoc 400: 348.802




epoc 500: 242.406




epoc 600: 133.64




epoc 700: 94.9056




epoc 800: 90.1886




epoc 900: 89.812




epoc 1000: 89.7259




epoc 1100: 89.621




epoc 1200: 89.4616




epoc 1300: 89.2094




epoc 1400: 88.8193




epoc 1500: 88.2182




epoc 1600: 87.2756




epoc 1700: 85.8358




epoc 1800: 83.6505




epoc 1900: 80.3919




epoc 2000: 75.6532




epoc 2100: 69.02




epoc 2200: 60.2623




epoc 2300: 49.541




epoc 2400: 37.7115




epoc 2500: 26.1873




epoc 2600: 16.4648




epoc 2700: 9.38381




epoc 2800: 4.89828




epoc 2900: 2.37953




epoc 3000: 1.0941




epoc 3100: 0.483896




epoc 3200: 0.208503




epoc 3300: 0.088386




epoc 3400: 0.03732




epoc 3500: 0.0160456




epoc 3600: 0.00749139




epoc 3700: 0.0040901




epoc 3800: 0.00261573




epoc 3900: 0.00197444




epoc 4000: 0.00167518




epoc 4100: 0.0014678




epoc 4200: 0.00137893




epoc 4300: 0.00131385




epoc 4400: 0.00120454




epoc 4500: 0.00116306




epoc 4600: 0.00111296




epoc 4700: 0.00108219




epoc 4800: 0.00105612




epoc 4900: 0.0010254


### Compute the embedding on the densest 30 columns.

In [55]:
max30 = np.sort(np.ravel(colsum), axis=None)[len(colsum) - 30]
data30 = mx.ndarray.sparse.csr_matrix(spm[:,colsum >= max30])
print(mx.ndarray.sum(data30, axis=0))
print(data30.shape)


[ 1256.  1377.  1467.  1251.  1521.  1229.  2490.  1695.  1291.  1387.
  1275.  1743.  1443.  1497.  3383.  2484.  2758.  1666.  1255.  2476.
  1789.  1358.  1269.  1395.  2133.  3011.  3239.  1568.  2155.  1509.]
<NDArray 30 @cpu(0)>
(81306L, 30L)


In [57]:
params_linear30=train(data30, 5000, internal_act=None)



epoc 0: 36343.5




epoc 100: 28741.8




epoc 200: 24386.4




epoc 300: 20273.6




epoc 400: 19463.6




epoc 500: 19367.5




epoc 600: 19306.5




epoc 700: 19260.0




epoc 800: 19224.1




epoc 900: 19196.2




epoc 1000: 19174.4




epoc 1100: 19157.3




epoc 1200: 19143.6




epoc 1300: 19132.7




epoc 1400: 19123.9




epoc 1500: 19116.8




epoc 1600: 19111.0




epoc 1700: 19106.2




epoc 1800: 19102.3




epoc 1900: 19099.0




epoc 2000: 19096.3




epoc 2100: 19094.0




epoc 2200: 19092.1




epoc 2300: 19090.5




epoc 2400: 19089.1




epoc 2500: 19088.0




epoc 2600: 19087.0




epoc 2700: 19086.1




epoc 2800: 19085.4




epoc 2900: 19084.8




epoc 3000: 19084.2




epoc 3100: 19083.8




epoc 3200: 19083.4




epoc 3300: 19083.0




epoc 3400: 19082.8




epoc 3500: 19082.5




epoc 3600: 19082.3




epoc 3700: 19082.1




epoc 3800: 19081.9




epoc 3900: 19081.8




epoc 4000: 19081.6




epoc 4100: 19081.6




epoc 4200: 19081.5




epoc 4300: 19081.4




epoc 4400: 19081.3




epoc 4500: 19081.2




epoc 4600: 19081.2




epoc 4700: 19081.1




epoc 4800: 19081.1




epoc 4900: 19081.1


In [58]:
params_linear30=train(data30, 5000, internal_act='sigmod')



epoc 0: 35963.1




epoc 100: 28377.2




epoc 200: 23608.4




epoc 300: 20077.8




epoc 400: 19401.5




epoc 500: 19321.8




epoc 600: 19272.9




epoc 700: 19235.8




epoc 800: 19206.8




epoc 900: 19184.0




epoc 1000: 19165.9




epoc 1100: 19151.3




epoc 1200: 19139.4




epoc 1300: 19129.8




epoc 1400: 19121.8




epoc 1500: 19115.2




epoc 1600: 19109.8




epoc 1700: 19105.3




epoc 1800: 19101.6




epoc 1900: 19098.4




epoc 2000: 19095.8




epoc 2100: 19093.6




epoc 2200: 19091.7




epoc 2300: 19090.2




epoc 2400: 19088.9




epoc 2500: 19087.7




epoc 2600: 19086.8




epoc 2700: 19086.0




epoc 2800: 19085.3




epoc 2900: 19084.7




epoc 3000: 19084.2




epoc 3100: 19083.7




epoc 3200: 19083.4




epoc 3300: 19083.0




epoc 3400: 19082.8




epoc 3500: 19082.6




epoc 3600: 19082.3




epoc 3700: 19082.2




epoc 3800: 19082.0




epoc 3900: 19081.9




epoc 4000: 19081.8




epoc 4100: 19081.6




epoc 4200: 19081.6




epoc 4300: 19081.5




epoc 4400: 19081.4




epoc 4500: 19081.4




epoc 4600: 19081.3




epoc 4700: 19081.3




epoc 4800: 19081.2




epoc 4900: 19081.1
