In [1]:
import csv
import os
import time

import h5py
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.cluster import KMeans
from torch.nn import Linear
from torch.nn.parameter import Parameter
from torch.optim import Adam, SGD
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
# from evaluation import eva

# torch.cuda.set_device(3)

In [2]:

class AE(nn.Module):
    """Fully connected autoencoder with a three-layer encoder and decoder.

    The encoder maps ``n_input -> n_1 -> n_2 -> n_3 -> n_z`` and the decoder
    maps ``n_z -> n_d3 -> n_d2 -> n_d1 -> n_input``. ReLU follows every hidden
    layer; the bottleneck (``z_layer``) and the reconstruction layer
    (``x_bar_layer``) are left linear.

    This class has to be real code (not a string literal): ``dopretrain``
    instantiates ``AE`` by name.
    """

    # initially n_input,500,500,2000,10,2000,500,500,n_input
    def __init__(self, n_input, n_1, n_2, n_3, n_z, n_d3, n_d2, n_d1):
        """Create the layers.

        Args:
            n_input: Width of the input (and of the reconstruction).
            n_1, n_2, n_3: Widths of the three encoder hidden layers.
            n_z: Width of the latent code.
            n_d3, n_d2, n_d1: Widths of the three decoder hidden layers, in
                the order they are applied.
        """
        super(AE, self).__init__()
        # Encoder
        self.enc_1 = Linear(n_input, n_1)
        self.enc_2 = Linear(n_1, n_2)
        self.enc_3 = Linear(n_2, n_3)
        self.z_layer = Linear(n_3, n_z)

        # Decoder
        self.dec_3 = Linear(n_z, n_d3)
        self.dec_2 = Linear(n_d3, n_d2)
        self.dec_1 = Linear(n_d2, n_d1)
        self.x_bar_layer = Linear(n_d1, n_input)

    def forward(self, x):
        """Encode ``x`` and decode it again.

        Returns:
            tuple: ``(x_bar, z)`` where ``x_bar`` is the reconstruction of
            ``x`` and ``z`` is the latent code produced by ``z_layer``.
        """
        enc_h1 = F.relu(self.enc_1(x))
        enc_h2 = F.relu(self.enc_2(enc_h1))
        enc_h3 = F.relu(self.enc_3(enc_h2))
        z = self.z_layer(enc_h3)

        dec_h3 = F.relu(self.dec_3(z))
        dec_h2 = F.relu(self.dec_2(dec_h3))
        dec_h1 = F.relu(self.dec_1(dec_h2))
        x_bar = self.x_bar_layer(dec_h1)

        return x_bar, z


class LoadDataset(Dataset):
    """Map-style dataset that yields each sample together with its index."""

    def __init__(self, data):
        """Wrap ``data``; it must support ``len()`` and integer indexing."""
        # Stored under ``x`` because pretrain_ae reads ``dataset.x`` directly.
        self.x = data

    def __len__(self):
        """Return the number of samples in the wrapped container."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return ``(sample, index)``: a float32 tensor and an index tensor."""
        sample = torch.from_numpy(np.array(self.x[idx])).float()
        position = torch.from_numpy(np.array(idx))
        return sample, position


def adjust_learning_rate(optimizer, epoch, base_lr=0.001, decay=0.1, step=20):
    """Apply a step-decay learning-rate schedule to ``optimizer`` in place.

    The learning rate is ``base_lr * decay ** (epoch // step)`` and is written
    to the ``'lr'`` entry of every parameter group.

    Args:
        optimizer: Object exposing ``param_groups`` (a list of dicts).
        epoch: Current epoch index.
        base_lr: Learning rate used before the first decay step.
        decay: Multiplicative factor applied once per completed ``step`` epochs.
        step: Number of epochs between decays.
    """
    lr = base_lr * (decay ** (epoch // step))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


In [3]:
# def pretrain_ae(model, dataset, y):
def pretrain_ae(epochs, model, dataset, out_fname, _lr ):
    """Pretrain an autoencoder on a reconstruction (MSE) loss and save it.

    Trains with Adam on shuffled mini-batches of 256 samples. After every
    epoch the reconstruction loss over the whole of ``dataset.x`` is printed.
    Once the last epoch has finished, the model's ``state_dict`` is written to
    ``out_fname``; nothing is written when ``epochs`` is 0.

    Args:
        epochs: Number of passes over ``dataset``.
        model: Module whose forward pass returns ``(reconstruction, latent)``.
            Training runs on whatever device its parameters are on.
        dataset: Dataset yielding ``(sample, index)`` pairs and exposing the
            full data as ``dataset.x``.
        out_fname: Path handed to ``torch.save``.
        _lr: Adam learning rate.
    """
    train_loader = DataLoader(dataset, batch_size=256, shuffle=True)
    print(model)
    optimizer = Adam(model.parameters(), lr=_lr)

    # Follow the model's device instead of assuming CUDA is available.
    device = next(model.parameters()).device
    # The full data never changes, so convert/transfer it once, not per epoch.
    full_x = torch.as_tensor(dataset.x, dtype=torch.float32, device=device)

    for epoch in range(epochs):
        # adjust_learning_rate(optimizer, epoch)
        for x, _ in train_loader:
            x = x.to(device)

            x_bar, _ = model(x)
            loss = F.mse_loss(x_bar, x)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Per-epoch report on the complete dataset, without tracking gradients.
        with torch.no_grad():
            x_bar, _ = model(full_x)
            loss = F.mse_loss(x_bar, full_x)
            print('{} loss: {}'.format(epoch, loss))

    # Equivalent to saving on the final loop iteration.
    if epochs > 0:
        print('writing...')
        torch.save(model.state_dict(), out_fname)


In [4]:
def dopretrain(name, epochs, _n_1, _n_2, _n_3, _n_z, _n_d3, _n_d2, _n_d1, _lr=1e-3):
    """Load one embedding file, pretrain an ``AE`` on it and save the weights.

    Reads ``./mydata/ordered_<name>.txt`` with ``np.loadtxt`` and writes the
    trained ``state_dict`` to ``./pretrain/<name>.pkl`` (via ``pretrain_ae``).
    Start and end wall-clock times are printed.

    Args:
        name: Dataset tag used to build the input and output file names.
        epochs: Number of pretraining epochs.
        _n_1, _n_2, _n_3: Encoder hidden-layer widths.
        _n_z: Latent-code width.
        _n_d3, _n_d2, _n_d1: Decoder hidden-layer widths.
        _lr: Adam learning rate.

    Raises:
        ValueError: If the input file contains no data.
    """
    print('begin time:{}'.format(time.asctime()))    # time

    # Earlier experiments loaded 'dblp.txt' or './mydata/bio72.csv' here instead.
    in_fname = './mydata/ordered_'+ name + '.txt'
    out_fname = './pretrain/' + name + '.pkl'
    # Create the output directory up front so the save after training cannot
    # fail just because the folder is missing.
    os.makedirs(os.path.dirname(out_fname), exist_ok=True)

    print( '{} reading...'.format(in_fname) )
    # ndmin=2 keeps a (samples, features) layout even for a single-row or
    # single-column file, where loadtxt would otherwise return a 1-D array.
    x = np.loadtxt(in_fname, dtype=float, ndmin=2)
    if x.size == 0:
        raise ValueError('no data found in {}'.format(in_fname))
    _n_input = x.shape[1]
    dataset = LoadDataset(x)

    # Use CUDA when present, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = AE(
        n_input=_n_input, n_1=_n_1, n_2=_n_2, n_3=_n_3,
        n_z=_n_z, n_d3=_n_d3, n_d2=_n_d2, n_d1=_n_d1).to(device)

    print( 'pretraining...' )
    pretrain_ae(epochs, model, dataset, out_fname, _lr)
    print('end time:{}'.format(time.asctime()))    # time


In [5]:
# Pretrain one autoencoder per embedding file, all with the same architecture
# (500-500-2000 encoder, 10-d latent code, 2000-500-500 decoder).
names = ['dpwk', 'line', 'lle', 'n2v']
for name in names:
    dopretrain(
        name,
        epochs=50,
        _n_1=500, _n_2=500, _n_3=2000,
        _n_z=10,
        _n_d3=2000, _n_d2=500, _n_d1=500,
        _lr=1e-3,
    )

begin time:Fri Apr 16 13:44:11 2021
./mydata/ordered_dpwk.txt reading...
pretraining...
AE(
  (enc_1): Linear(in_features=128, out_features=500, bias=True)
  (enc_2): Linear(in_features=500, out_features=500, bias=True)
  (enc_3): Linear(in_features=500, out_features=2000, bias=True)
  (z_layer): Linear(in_features=2000, out_features=10, bias=True)
  (dec_3): Linear(in_features=10, out_features=2000, bias=True)
  (dec_2): Linear(in_features=2000, out_features=500, bias=True)
  (dec_1): Linear(in_features=500, out_features=500, bias=True)
  (x_bar_layer): Linear(in_features=500, out_features=128, bias=True)
)
0 loss: 0.08386076986789703
1 loss: 0.06812795996665955
2 loss: 0.06061800196766853
3 loss: 0.05757823958992958
4 loss: 0.05536290258169174
5 loss: 0.053814295679330826
6 loss: 0.052566058933734894
7 loss: 0.05150995030999184
8 loss: 0.050626110285520554
9 loss: 0.04965493828058243
10 loss: 0.04909583181142807
11 loss: 0.04842080548405647
12 loss: 0.048101361840963364
13 loss: 0.04

38 loss: 0.006099383812397718
39 loss: 0.006061929278075695
40 loss: 0.006003014277666807
41 loss: 0.0059440829791128635
42 loss: 0.005954446271061897
43 loss: 0.005918282084167004
44 loss: 0.005827981512993574
45 loss: 0.005762611981481314
46 loss: 0.00575792184099555
47 loss: 0.00572360772639513
48 loss: 0.005682235583662987
49 loss: 0.005712111946195364
writing...
end time:Fri Apr 16 13:47:36 2021
