In [1]:
'''
2022.09.13，复现tri-party deep network representation
github address: https://github.com/GRAND-Lab/TriDNR
packages:
    gensim==3.8.3
    scikit-learn==1.1.2
    numpy==1.23.3
    pandas==1.4.4
    scipy==1.9.1
'''

'\n2022.09.13，复现tri-party deep network representation\ngithub address: https://github.com/GRAND-Lab/TriDNR\n'

# Tri-party Deep Network Representation

In [2]:
'''
tri-party DNR is based on python package gensim and DeepWalk.
dataset: 
    docs.txt: title information of each node in the network, each line represents a node(paper). The first item in each line is the node ID.
    adjedges.txt: neighbor nodes of each node in a network. the first node ID is the pivot node, and the rest items are the neighbor nodes linking to the first node.
    labels.txt: class labels of a node. Each line represents a node ID and its class label.
'''

'\ntri-party DNR is based on python package gensim and DeepWalk.\ndataset: \n    docs.txt: title information of each node in the network, each line represents a node(paper). The first item in each line is the node ID.\n    adjedges.txt: neighbor nodes of each node in a network. the first node ID is the pivot node, and the rest items are the neighbor nodes linking to the first node.\n    labels.txt: class labels of a node. Each line represents a node ID and its class label.\n'

## Networkutils

In [15]:
from collections import namedtuple
from gensim.models.doc2vec import Doc2Vec
from gensim.models.word2vec import Word2Vec
from random import shuffle
from deepwalk import graph
import gensim
import random
import gensim.utils as ut

In [16]:
NetworkSentence = namedtuple('NetworkSentence', 'words tags labels index')
Result = namedtuple('Result', 'alg trainsize acc macro_f1 micro_f1')
AlgResult = namedtuple('AlgResult', 'alg trainsize numfeature mean std')

In [17]:
def readNetworkData(dir, stemmer=0):  # dir, directory of network dataset
    allindex = {}
    alldocs = []
    labelset = set()
    with open(dir+'/docs.txt', 'r', encoding='utf-8') as f1, open(dir+'/labels.txt', 'r', encoding='utf-8') as f2:
        for l1 in f1:
#             tokens = ut.to_unicode(l1.lower()).split()
            if stemmer == 1:
                l1 = gensim.parsing.stem_text(l1)  # step_text=lower() + step()
            else:
                l1 = l1.lower()
            tokens = ut.to_unicode(l1).split()  # to_unicode()转换成unicode编码
            
            words = tokens[1:]  # extract texts of document
            tags = [tokens[0]]  # ID of each document, for doc2vec model
            index = len(alldocs)
            allindex[tokens[0]] = index  # A mapping from documentID to index, start from 0
            
            l2 = f2.readline()
            tokens2 = gensim.utils.to_unicode(l2).split()
            labels = tokens2[1]  # class label
            labelset.add(labels)
            alldocs.append(NetworkSentence(words, tags, labels, index))
    return alldocs, allindex, list(labelset)

In [18]:
def trainDoc2Vec(doc_list=None, buildvoc=1, passes=20, dm=0, size=100, dm_mean=0, window=5,\
                hs=1, negative=5, min_count=1, workers=4):
    model = Doc2Vec(dm=dm, size=size, dm_mean=dm_mean, window=window, hs=hs, negative=negative,\
                   min_count=min_count, workers=workers) # PV-DBOW
    if buildvoc == 1:
        print('Building Vocabulary')
        model.build_vocab(doc_list)  # build vocabulate with words + nodeID
    
    for  epoch in range(passes):
        print('Iteration %d ....' % epoch)
        shuffle(doc_list)  # shuffling gets best results
        model.train(doc_list, total_examples=model.corpus_count, epochs=model.epochs)
    return model

In [19]:
def trainWord2Vec(doc_list=None, buildvoc=1, passes=20, sg=1, size=100, dm_mean=0, window=5, hs=1,negative=5,
                 min_count=1, workers=4):
    model = Word2Vec(size=size, sg=sg, window=window, hs=hs, negative=negative, min_count=min_count, workers=workers)
    if buildvoc == 1:
        print('Building Vocabulary')
        model.build_vocab(doc_list)  # build vocabulate with words + nodeID
    for epoch in range(passes):
        print('Iteration %d ...' % epoch)
        shuffle(doc_list)
        model.train(doc_list, total_examples=model.corpus_count, epochs=model.epochs)
    return model

In [20]:
def getdeepwalks(directory, number_walks=50, walk_length=10, seed=1):
    Graph = graph.load_adjacencylist(directory+'/adjedges.txt')
    print('Number of nodes: {}'.format(len(Graph.nodes())))
    num_walks = len(Graph.nodes()) * number_walks
    print('Number of walks: {}'.format(num_walks))
    
    print('Walking...')
    walks = graph.build_deepwalk_corpus(Graph, num_paths=number_walks, path_length=walk_length, alpha=0,
                                        rand=random.Random(seed))
    networksentence = []
    raw_walks = []
    for i, x in enumerate(walks):
        sentence = [gensim.utils.to_unicode(str(t)) for t in x]
        s = NetworkSentence(sentence, [sentence[0]], None, i) # label information is not used by random walk
        networksentence.append(s)
        raw_walks.append(sentence)
    return raw_walks, networksentence

In [21]:
def coraEdgeFileToAdjfile(edgfile, adjfile, nodec):
    edgeajd = {str(n): list() for n in range(ndoc)}
    with open(edgefile, 'r') as f:
        for l in f:
            tokens = l.split()
            degeadj[token[0]].append(tokens[1])
            
    wf = open(adjfile, 'w')
    for n in range(nodoc):
        edgestr = ' '.join(map(str, edgeadj[str(n)]))
        wf.write(str(n) + ' ' + edgestr + '\n')
    wf.close()

In [22]:
def cora10groupdataset():
    groupindex = {}
    groupmap = {}
    with open('data2/Cora/CoraHierarchyTree.txt', 'r') as f:
        for l in f:
            tokens = l.split('\t')
            if len(tokens) <= 2:
                continue
            elif len(tokens) == 3:
                currentindex = len(groupindex)
                groupindex[tokens[1]] = currentindex
            elif len(tokens) == 5:
                groupmap[tokens[3]] = currentindex
            elif len(tokens) == 6:
                groupmap[tokens[4]] = currentindex
            else:
                pass

## Evaluation

In [23]:
from sklearn.svm import LinearSVC

In [24]:
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing

from gensim.models.doc2vec import Doc2Vec

In [25]:
def evaluation(train_vec, test_vec, train_y, test_y, classifierStr='SVM', normalize=0):
    if classifierStr == 'KNN':
        print('Training NN classifier...')
        classifier = KNeighborsClassifier(n_neighbors=1)
    else:
        print('Training SVM classifier...')
        classifier = LinearSVC()
    if normalize == 1:
        print('Normalize data')
        allvec = list(train_vec)
        allvec.extend(test_vec)
        allvec_normalized = preprocessing.normalize(allvec, norm='l2', axis=1)
        train_vec = allvec_normalized[0:len(train_y)]
        test_vec = allvec_normalized[len(train_y):]
    
    # training
    classifier.fit(train_vec, train_y)
    y_pred = classifier.predict(test_vec)
    cm = confusion_matrix(test_y, y_pred)  # 混淆矩阵
    print(cm)
    acc = accuracy_score(test_y, y_pred)
    print(acc)
    macro_f1 = f1_score(test_y, y_pred, pos_label=None, average='macro')
    micro_f1 = f1_score(test_y, y_pred, pos_label=None, average='micro')
    
    percent = len(train_y) * 1.0/(len(train_y) + len(test_y))
    print('Classification method:' + classifierStr + '(train, test, Training_percent): (%d, %d, %f)' %
          (len(train_y), len(test_y), percent))
    print('Classification Accuracy=%f, macro_f1=%f, micro_f1=%f' % (acc, macro_f1, micro_f1))
    # print(metrics.classification_report(test_y, y_pred))
    return acc, macro_f1, micro_f1

In [36]:
def evaluationEmbedModelFromTrainTest(model, train, test, classifierStr='SVM', normalize=0):
    if isinstance(model, Doc2Vec):
        # model.docvecs函数将文档ID转换为doc2vec向量
        train_vecs = [model.docvecs[doc.tags[0]] for doc in train]  
        test_vecs = [model.docvecs[doc.tags[0]] for doc in test]
    else: # word2vec model
        train_vecs = [model.wv.get_vector(doc.tags[0]) for doc in train]
        test_vecs = [model.wv.get_vector(doc.tags[0]) for doc in test]
    train_y = [doc.labels for doc in train]
    test_y = [doc.labels for doc in test]
    print('train_y: , test_y: ', len(train_y), len(test_y))
    acc, macro_f1, micro_f1 = evaluation(train_vecs, test_vecs, train_y, test_y, classifierStr, normalize)
    
    return acc, macro_f1, micro_f1

## Tri-party DNR Model

In [27]:
from sklearn.model_selection import train_test_split

from gensim.models.doc2vec import Doc2Vec
from random import shuffle

In [28]:
class TriDNR:
    '''
    Tri-party Deep Network Representation, IJCAI-2016
    Read data from a from a directory which contains text, label, structure information, and initialize the TriDNR from
    Doc2Vec and DeepWalk Models, then iteratively update the model with text, label, and structure information.
    'directory'
        docs.txt -- text document for each node, one line for one node
        labels.txt -- class label for each node, noe line for one node
        adjedges.txt -- edge list for each node, one line for one node
    'train_size': percentage of training data in range 0-1, if train_size==0, it becomes pure unsupervised network representation
    'text_weight': weights for the text information, 0-1
    'size': the dimensionality of the feature vectors.
    'dm': defines doc2vec the training algorithm. dm=1, PV_DM; otherwise, PV-DBOW.
    'min_count': minimum number of counts for words.
    '''
    def __init__(self, directory=None, train_size=0.3, textweight=0.8, size=300, seed=1, workers=1, passes=10, dm=0,
                min_count=3):
        # Read the data
        alldocs, docindex, classlabels = readNetworkData(directory)
        print('%d document, %d classes, training ratio=%f' % (len(alldocs), len(classlabels), train_size))
        
        # Initialize Doc2Vec
        if train_size > 0:  # label information is available for learning
            print('Adding Label Information')
            train, test = train_test_split(alldocs, train_size=train_size, random_state=seed)
            '''
            add supervised information to training data, use label information for learning.
            Specifically, the doc2vec algorithm used the tags information as document IDs, and learn a vector
                representation for each tag(ID),.
            We add the class label into tags, so each class will acts as a ID and is used to learn the latent representation.
            '''
            alldata = train[:]
            for x in alldata:
                x.tags.append('Label'+x.labels)
            alldata.extend(test)
        else:  # no label information is availabel, pure unsupervised learning
            alldata = alldocs[:]
        
        d2v = trainDoc2Vec(alldata, workers=workers, size=size, dm=dm, passes=passes, min_count=min_count)
        
        raw_walks, netwalks = getdeepwalks(directory, number_walks=20, walk_length=8)
        w2v = trainWord2Vec(raw_walks, buildvoc=1, passes=passes, size=size, workers=workers)
        if train_size > 0:  # print out the initial results
            print('initialize Doc2Vec Model with supervised Information...')
            evaluationEmbedModelFromTrainTest(d2v, train, test, classifierStr='SVM')
            print('Initialize DeepWalk model')
            evaluationEmbedModelFromTrainTest(w2v, train, test, classifierStr='SVM')
        
        self.d2v = d2v
        self.w2v = w2v
        self.doctags = [doc.tags[0] for doc in alldocs]
        
        self.train(d2v, w2v, directory, alldata, passes=passes, weight=textweight)
        
        if textweight > 0.5:
            self.model = d2v
        else:
            self.model = w2v
        
    def setWeights(self, d2v_model, w2v_model, weight=1):
        if isinstance(d2v_model, Doc2Vec):
            print('Copy weights from Doc2Vec to Word2Vec')
            keys = w2v_model.wv.vocab.keys()
            for key in keys:
                if key not in self.doctags:
                    continue
                w2v_index = w2v_model.wv.vocab[key].index  # word2Vec index
                w2v_model.wv.syn0[w2v_index] = (1-weight) * w2v_model.wv.syn0[w2v_index] + \
                                weight * d2v_model.docvecs[key]

    def train(self, d2v, w2v, directory, alldata, passes=10, weight=0.9):
        raw_walks, walks = getdeepwalks(directory, number_walks=20, walk_length=10)
        for i in range(passes):
            print('Iterative Runing %d' % i)
            self.setWeights(d2v, w2v, weight=weight)
            # Train Word2Vec
            shuffle(raw_walks)
            print('Update W2V...')
            w2v.train(raw_walks, total_examples=w2v.corpus_count, epochs=w2v.epochs)
            self.setWeights(w2v, d2v, weight=(1-weight))

            print('Update D2V...')
            shuffle(alldata)  # shuffling to get best results
            d2v.train(alldata, total_examples=d2v.corpus_count, epochs=d2v.epochs)

## demo

In [29]:
'''
A Demo comparing several network representation algorithms:

Doc2Vec: paragraph vector model which only use text information.
DeepWalk: DeepWalk algorithm which only use structure information.
Doc2Vec + DeepWalk: simple combination of Doc2Vec and DeepWalk model.

Tri-party DNR: tri-party DNR model, published in IJCAI-2016.
'''

'\nA Demo comparing several network representation algorithms:\n\nDoc2Vec: paragraph vector model which only use text information.\nDeepWalk: DeepWalk algorithm which only use structure information.\nDoc2Vec + DeepWalk: simple combination of Doc2Vec and DeepWalk model.\n\nTri-party DNR: tri-party DNR model, published in IJCAI-2016.\n'

In [30]:
from sklearn.model_selection import train_test_split
import numpy as np

### set parameters

In [31]:
numFea = 100
cores = 4
train_size = 0.2  # percentage of training samples
random_state = 2
dm = 0
passes = 20

directory = 'tri-party data/M10'
alldocs, allsentence, classlabels = readNetworkData(directory)
print('%d document' % len(alldocs))
print('%d classes' % len(classlabels))
doc_list = alldocs[:]  # for reshuffling pass

10310 document
10 classes


### split_dataset

In [32]:
train, test = train_test_split(doc_list, train_size=train_size, random_state=random_state)

### baselines

In [33]:
# baselin 1, Doc2Vec model(PV-DM)
print('#############')
print('baseline 1, Doc2Vec Model dm=%d' % dm)
doc2vec_model = trainDoc2Vec(doc_list, workers=cores, size=numFea, dm=dm, passes=passes, min_count=3)

print('Classification Performance on Doc2Vec Model')
doc2vec_acc, doc2vec_macro_f1, doc2vec_micro_f1 = \
    evaluationEmbedModelFromTrainTest(doc2vec_model, train, test, classifierStr='SVM')
print('#############')

#############
baseline 1, Doc2Vec Model dm=0
Building Vocabulary




Iteration 0 ....
Iteration 1 ....
Iteration 2 ....
Iteration 3 ....
Iteration 4 ....
Iteration 5 ....
Iteration 6 ....
Iteration 7 ....
Iteration 8 ....
Iteration 9 ....
Iteration 10 ....
Iteration 11 ....
Iteration 12 ....
Iteration 13 ....
Iteration 14 ....
Iteration 15 ....
Iteration 16 ....
Iteration 17 ....
Iteration 18 ....
Iteration 19 ....
Classification Performance on Doc2Vec Model
train_y: , test_y:  2062 8248
Training SVM classifier...
[[316   4  22  19  58  31  54  93  31  22]
 [ 18  26  10   9  31   7  28  33   4  13]
 [ 15   6 730  63  21  30  61  38  33  23]
 [ 12   7  68 707  20  60  37  32  27  49]
 [ 28   7  28  23 713  18  48  38  20  26]
 [ 16   5  21  78  21 547  39  31  30  43]
 [ 52  10  40  37  25  46 493 144  69  26]
 [ 72   9  21   9  62  37 147 521  22  26]
 [ 17   7  36  71  28  31 106  40 422  31]
 [ 41   8  37  90  93  90  57  45  42 440]]
0.5959020368574199
Classification method:SVM(train, test, Training_percent): (2062, 8248, 0.200000)
Classification Acc

In [51]:
# baseline 2, DeepWalk model
print('#############')
print('baseline 2, DeepWalk model')
raw_walks, netwalks = getdeepwalks(directory, number_walks=20, walk_length=8)
deepwalk_model = trainWord2Vec(raw_walks, buildvoc=1, sg=1, passes=passes, size=numFea, workers=cores)
print('classification performance on DeepWalk model')
doc2vec_acc, doc2vec_macro_f1, doc2vec_micro_f1 = \
    evaluationEmbedModelFromTrainTest(deepwalk_model, train, test, classifierStr='SVM')
print('##############')

#############
baseline 2, DeepWalk model
Number of nodes: 10310
Number of walks: 206200
Walking...
Building Vocabulary
Iteration 0 ...
Iteration 1 ...
Iteration 2 ...
Iteration 3 ...
Iteration 4 ...
Iteration 5 ...
Iteration 6 ...
Iteration 7 ...
Iteration 8 ...
Iteration 9 ...
Iteration 10 ...
Iteration 11 ...
Iteration 12 ...
Iteration 13 ...
Iteration 14 ...
Iteration 15 ...
Iteration 16 ...
Iteration 17 ...
Iteration 18 ...
Iteration 19 ...


In [66]:
# baseline 3, D2V+DW
print('##############')
print('baseline 3, simple combination of DeepWalk + Doc2Vec')
d2v_train_vecs = [doc2vec_model.docvecs[doc.tags[0]] for doc in train]
d2v_test_vecs = [doc2vec_model.docvecs[doc.tags[0]] for doc in test]

dw_train_vecs = [deepwalk_model.wv.word_vec(doc.tags[0]) for doc in train]
dw_test_vecs = [deepwalk_model.wv.word_vec(doc.tags[0]) for doc in test]

train_y = [doc.labels for doc in train]
test_y = [doc.labels for doc in test]

# concanate two vectors
train_vecs = [np.append(l, dw_train_vecs[i]) for i,l in enumerate(d2v_train_vecs)]
test_vecs = [np.append(l, dw_test_vecs[i]) for i,l in enumerate(d2v_test_vecs)]

print('train_y: , test_y: ', len(train_y), len(test_y))
print('Classifcation Performance on Doc2Vec + DeepWalk')

acc, macro_f1, micro_f1 = evaluation(train_vecs, test_vecs, train_y, test_y, classifierStr='SVM')

  d2v_train_vecs = [doc2vec_model.docvecs[doc.tags[0]] for doc in train]
  d2v_test_vecs = [doc2vec_model.docvecs[doc.tags[0]] for doc in test]
  dw_train_vecs = [deepwalk_model.wv.word_vec(doc.tags[0]) for doc in train]
  dw_test_vecs = [deepwalk_model.wv.word_vec(doc.tags[0]) for doc in test]


##############
baseline 3, simple combination of DeepWalk + Doc2Vec
train_y: , test_y:  2062 8248
Classifcation Performance on Doc2Vec + DeepWalk
Training SVM classifier...
[[359  15   8   7  13  16  85 129  12   6]
 [ 26  31   5   8   2   0  56  36   7   8]
 [  5  13 764  48  22  32  15  14  74  33]
 [ 20   5  54 687  23  57  38  27  55  53]
 [ 12   3  23   7 769  25  14  16  30  50]
 [ 11   4  33  70  39 537  20  21  44  52]
 [ 70  24  23  16  12  30 531 176  43  17]
 [ 95  26  11  10  22  22 185 520  19  16]
 [  8  10  46  41  50  41  48  31 469  45]
 [ 28  11  41  84  83 101  26  33  63 473]]
0.6231813773035888
Classification method:SVM(train, test, Training_percent): (2062, 8248, 0.200000)
Classification Accuracy=0.623181, macro_f1=0.582696, micro_f1=0.623181




### tri-dnr method

In [34]:
tridnr_model = TriDNR(directory, size=numFea, dm=0, textweight=0.8, train_size=train_size, seed=random_state,
                     passes=10)
evaluationEmbedModelFromTrainTest(tridnr_model.model, train, test, classifierStr='SVM')

10310 document, 10 classes, training ratio=0.200000
Adding Label Information
Building Vocabulary




Iteration 0 ....
Iteration 1 ....
Iteration 2 ....
Iteration 3 ....
Iteration 4 ....
Iteration 5 ....
Iteration 6 ....
Iteration 7 ....
Iteration 8 ....
Iteration 9 ....
Number of nodes: 10310
Number of walks: 206200
Walking...
Building Vocabulary
Iteration 0 ...
Iteration 1 ...
Iteration 2 ...
Iteration 3 ...
Iteration 4 ...
Iteration 5 ...
Iteration 6 ...
Iteration 7 ...
Iteration 8 ...
Iteration 9 ...
initialize Doc2Vec Model with supervised Information...
train_y: , test_y:  2062 8248
Training SVM classifier...
[[387  15  23  11  19  15  63  72  22  23]
 [ 27  46  13  11  10   9  24  19   7  13]
 [ 13  28 710  99  10  11  46  38  45  20]
 [ 15  11  73 692  18  59  48  30  21  52]
 [ 33   3  23  22 698  15  45  41  30  39]
 [ 27   2  28  74  18 500  30  60  41  51]
 [ 30  18  27  42  17  46 573 113  54  22]
 [ 62   8  18  20  33  36 128 570  32  19]
 [ 16  11  36  45  28  33  91  28 471  30]
 [ 38  18  28  70  65  69  54  43  22 536]]
0.6283947623666344
Classification method:SVM(tra



[[  2   2  11   9   8  20   2 564  19  13]
 [  0   0   6   7   1  11   2 143   1   8]
 [  0   7 691  75  48 104   6   1  37  51]
 [  1   2  76 656  50 110   4   3  42  75]
 [  0   3  61  66 656  63   6   1  45  48]
 [  0   5  54 119  53 418  11   5  46 120]
 [  1   6  34  35  22  54   7 726  30  27]
 [  0   6  22  19  24  28   2 782  21  22]
 [  0   8  65  81  87 166   4   3 283  92]
 [  1   7  79 172  92 197   8   4  61 322]]
0.4627788554801164
Classification method:SVM(train, test, Training_percent): (2062, 8248, 0.200000)
Classification Accuracy=0.462779, macro_f1=0.361170, micro_f1=0.462779
Number of nodes: 10310
Number of walks: 206200
Walking...
Iterative Runing 0
Copy weights from Doc2Vec to Word2Vec


  w2v_model.wv.syn0[w2v_index] = (1-weight) * w2v_model.wv.syn0[w2v_index] + \


Update W2V...
Update D2V...
Iterative Runing 1
Copy weights from Doc2Vec to Word2Vec
Update W2V...
Update D2V...
Iterative Runing 2
Copy weights from Doc2Vec to Word2Vec
Update W2V...
Update D2V...
Iterative Runing 3
Copy weights from Doc2Vec to Word2Vec
Update W2V...
Update D2V...
Iterative Runing 4
Copy weights from Doc2Vec to Word2Vec
Update W2V...
Update D2V...
Iterative Runing 5
Copy weights from Doc2Vec to Word2Vec
Update W2V...
Update D2V...
Iterative Runing 6
Copy weights from Doc2Vec to Word2Vec
Update W2V...
Update D2V...
Iterative Runing 7
Copy weights from Doc2Vec to Word2Vec
Update W2V...
Update D2V...
Iterative Runing 8
Copy weights from Doc2Vec to Word2Vec
Update W2V...
Update D2V...
Iterative Runing 9
Copy weights from Doc2Vec to Word2Vec
Update W2V...
Update D2V...
