In [23]:
import time
import difflib
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

import dgl
import dgl.function as fn
from dgl import DGLGraph

import networkx as nx

In [48]:
#来自DGLGraph tutorial
class NodeApplyModule(nn.Module):
    def __init__(self, in_feats, out_feats, activation):
        super(NodeApplyModule, self).__init__()
        self.linear = nn.Linear(in_feats, out_feats)
        self.activation = activation

    def forward(self, node):
        h = self.linear(node.data['h'])
        if self.activation is not None:
            h = self.activation(h)
        return {'h' : h}

gcn_msg = fn.copy_src(src='h', out='m')
gcn_reduce = fn.sum(msg='m', out='h')

class GCN(nn.Module):
    def __init__(self, in_feats, out_feats, activation):
        super(GCN, self).__init__()
        self.apply_mod = NodeApplyModule(in_feats, out_feats, activation)

    def forward(self, g, feature):
        g.ndata['h'] = feature
        g.update_all(gcn_msg, gcn_reduce)
        g.apply_nodes(func=self.apply_mod)
        return g.ndata.pop('h')
    
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.gcn1 = GCN(768, 32, F.relu)
        self.gcn2 = GCN(32, 2, None)

    def forward(self, g, features):
        x = self.gcn1(g, features)
        x = self.gcn2(g, x)
        return x

In [None]:
#读取DealWithData中golden数据
data_golden = pd.read_csv( './DataSet/book/golden/claims_golden.txt' , sep='\t' )
data_golden['encode'] = torch.load('./DataSet/book/golden/claims_golden_encode.pt')
GoldenLabel = pd.read_table("./DataSet/book/book_golden.txt" , sep='\t' , header=None , names=['isbn','author'])

In [None]:
#随意点划分训练集和测试集，直觉要根据'isbn'划分
data_train = pd.DataFrame(data_golden.drop(data_golden.index,inplace=False))
data_test = pd.DataFrame(data_golden.drop(data_golden.index,inplace=False))
for i in range(0,int(len(GoldenLabel)/2+1)):
    data_trainSlice = data_golden[data_golden['isbn']==GoldenLabel.loc[i]['isbn']]
    data_train = data_train.append(data_trainSlice)
for i in range(int(len(GoldenLabel)/2+1),len(GoldenLabel)):
    data_testSlice = data_golden[data_golden['isbn']==GoldenLabel.loc[i]['isbn']]
    data_test = data_test.append(data_testSlice)
data_train.reset_index(drop=True,inplace=True)
data_test.reset_index(drop=True,inplace=True)

In [26]:
#构建图结构函数，根据训练集和测试集连边
#此处可优化复杂度？n**2/2 -> kn
#此处可根据'book_name'相似度进一步加边
def generate_DGLGraph(df):
    edge_norm = []
    g = DGLGraph()
    g.add_nodes(df.shape[0])
    for i in range(0,len(df)):
        for j in range(i+1,len(df)):
            if (df.loc[i]["source"]==df.loc[j]["source"]):
                g.add_edge(i,j)
                #edge_norm.append(1.0)
                g.add_edge(j,i)
                #edge_norm.append(1.0)
            elif (df.loc[i]["isbn"]==df.loc[j]["isbn"]):
                str1 = df.loc[i]['author']
                str2 = df.loc[j]['author']
                #print(str1,str2,difflib.SequenceMatcher(None,str1,str2).quick_ratio())
                if ( difflib.SequenceMatcher(None,str1,str2).quick_ratio()>0.8 ):
                    g.add_edge(i,j)
                    #edge_norm.append(1.0)
                    g.add_edge(j,i)
                    #edge_norm.append(1.0)
    #edge_norm = torch.Tensor(edge_norm).unsqueeze(1)
    #g.edata.update({ 'norm': edge_norm })
    return g

graph_train = generate_DGLGraph(data_train)
graph_test = generate_DGLGraph(data_test)

In [62]:
#存储图结构
import pickle
file = open('./DataSet/book/golden/graph_train.pickle', 'wb')
pickle.dump(graph_train, file)
file.close()
file = open('./DataSet/book/golden/graph_test.pickle', 'wb')
pickle.dump(graph_test, file)
file.close()

In [63]:
#加载图结构
with open('./DataSet/book/golden/graph_train.pickle', 'rb') as file:
    graph_train =pickle.load(file)
with open('./DataSet/book/golden/graph_test.pickle', 'rb') as file:
    graph_test =pickle.load(file)

In [28]:
#存储和加载图结构的失败尝试
'''
graph_train_netx = graph_train.to_networkx()
graph_test_netx = graph_test.to_networkx()
nx.write_gexf(graph_train_netx,'./DataSet/book/golden/train_graph.gexf')
nx.write_gexf(graph_test_netx,'./DataSet/book/golden/test_graph.gexf')

graph_train_netx = nx.read_gexf('./DataSet/book/golden/train_graph.gexf')
graph_test_netx = nx.read_gexf('./DataSet/book/golden/test_graph.gexf')

graph_train2 = DGLGraph(graph_train_netx)
#graph_train2.from_networkx(graph_train_netx)
graph_test2 = DGLGraph(graph_test_netx)
#graph_test2.from_networkx(graph_test_netx)
'''

In [31]:
def extract_featureNlabel(df):
    a = torch.zeros(1,768)
    c = torch.zeros(1)
    for _,row in df.iterrows():
        b = row['encode'].reshape([-1,768])
        a = torch.cat((a,b),0)
        if(row['label']):
            d = torch.ones(1)
        else:
            d = torch.zeros(1)
        c = torch.cat((c,d),-1)
    return a[1:,:],c[1:].long()
train_feature,train_label = extract_featureNlabel(data_train)
test_feature,test_label = extract_featureNlabel(data_test)

In [None]:
logp

In [53]:
net = Net()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-5)
dur = []
for epoch in range(50):
    #if epoch >= 3:
    t0 = time.time()

    logits = net(graph_train, train_feature)
    logp = F.log_softmax(logits, 1)
    loss = F.nll_loss(logp, train_label)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    #if epoch >=3:
    dur.append(time.time() - t0)

    print("Epoch {:05d} | Loss {:.4f} | Time(s) {:.4f}".format(
            epoch, loss.item(), np.mean(dur)))
    print(logp,'\n')

Epoch 00000 | Loss 2115.1045 | Time(s) 0.3333
tensor([[     0.0000,   -474.9843],
        [     0.0000,   -490.0583],
        [     0.0000,    -38.5609],
        ...,
        [     0.0000, -10261.2266],
        [     0.0000,  -6876.0869],
        [     0.0000,  -6765.7183]], grad_fn=<LogSoftmaxBackward>) 

Epoch 00001 | Loss 1957.4723 | Time(s) 0.3261
tensor([[    0.0000,  -435.8128],
        [    0.0000,  -449.5630],
        [    0.0000,   -35.7825],
        ...,
        [    0.0000, -9625.3809],
        [    0.0000, -6446.9688],
        [    0.0000, -6246.3188]], grad_fn=<LogSoftmaxBackward>) 

Epoch 00002 | Loss 1799.9904 | Time(s) 0.3169
tensor([[    0.0000,  -397.2705],
        [    0.0000,  -409.6799],
        [    0.0000,   -32.9936],
        ...,
        [    0.0000, -8982.5068],
        [    0.0000, -6018.7417],
        [    0.0000, -5709.8628]], grad_fn=<LogSoftmaxBackward>) 

Epoch 00003 | Loss 1646.5574 | Time(s) 0.3152
tensor([[    0.0000,  -361.9127],
        [    0.0000,

Epoch 00028 | Loss 133.3254 | Time(s) 0.3092
tensor([[-8.6090e+01,  0.0000e+00],
        [-8.9952e+01,  0.0000e+00],
        [-1.8235e+00, -1.7610e-01],
        ...,
        [-7.1224e+02,  0.0000e+00],
        [-4.6374e+02,  0.0000e+00],
        [-1.0699e+03,  0.0000e+00]], grad_fn=<LogSoftmaxBackward>) 

Epoch 00029 | Loss 109.1926 | Time(s) 0.3092
tensor([[-7.3531e+01,  0.0000e+00],
        [-7.7008e+01,  0.0000e+00],
        [-1.1450e+00, -3.8305e-01],
        ...,
        [-5.0501e+02,  0.0000e+00],
        [-3.2068e+02,  0.0000e+00],
        [-8.9904e+02,  0.0000e+00]], grad_fn=<LogSoftmaxBackward>) 

Epoch 00030 | Loss 93.1483 | Time(s) 0.3098
tensor([[-6.0620e+01,  0.0000e+00],
        [-6.3702e+01,  0.0000e+00],
        [-6.2151e-01, -7.7031e-01],
        ...,
        [-2.9174e+02,  0.0000e+00],
        [-1.7348e+02,  0.0000e+00],
        [-7.2336e+02,  0.0000e+00]], grad_fn=<LogSoftmaxBackward>) 

Epoch 00031 | Loss 79.3669 | Time(s) 0.3097
tensor([[-4.7625e+01,  0.0000e+00],


In [None]:
#玩具数据集用以检验图结构生成的准确率。
#测试发现difflib的相似度量是字符级的
'''
dataframe = pd.DataFrame([
        ["a", "111222", "computer Science", "bruce"],
        ["b", "111222", "computer Science", "Bruce Lee"],
        ["c", "111222", "computer Science", "mike ,john"],
        ["a", "111223", "Hassdsdsaad", "kkl"],
        ["d", "111223", "Hassdsdaaad", "kkkl"],
        ["c", "111224", "asdfgh", "zxcr"]
    ],
    columns=["source", "isbn", "name", "author"]
)
g = generate_DGLGraph(dataframe)
'''