In [1]:
import time
import difflib
import pickle
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

import dgl
import dgl.function as fn
from dgl import DGLGraph

import networkx as nx
from sklearn.metrics import accuracy_score

In [2]:
# Adapted from the DGLGraph tutorial.
class NodeApplyModule(nn.Module):
    """Node-wise update: a linear projection followed by an optional activation.

    Args:
        in_feats: Size of each input node feature.
        out_feats: Size of each output node feature.
        activation: Callable applied to the projected features, or ``None``
            to return the raw linear output.
    """

    def __init__(self, in_feats, out_feats, activation):
        super().__init__()
        # A 1x1 nn.Conv1d was tried here as an alternative to the linear layer.
        self.linear = nn.Linear(in_feats, out_feats)
        self.activation = activation

    def forward(self, node):
        """Transform ``node.data['h']`` and return it under the key ``'h'``."""
        projected = self.linear(node.data['h'])
        if self.activation is None:
            return {'h': projected}
        return {'h': self.activation(projected)}

# Message function: every edge forwards its source node's 'h' feature as message 'm'.
gcn_msg = fn.copy_src(src='h', out='m')
# Reduce function: every node sums its incoming 'm' messages into its new 'h' feature.
gcn_reduce = fn.sum(msg='m', out='h')

class GCN(nn.Module):
    """One graph-convolution layer: message passing followed by a node update.

    Args:
        in_feats: Size of each input node feature.
        out_feats: Size of each output node feature.
        activation: Activation handed to the node update module (may be ``None``).
    """

    def __init__(self, in_feats, out_feats, activation):
        super().__init__()
        self.apply_mod = NodeApplyModule(in_feats, out_feats, activation)

    def forward(self, g, feature):
        """Run one round of propagation on graph ``g`` and return the new node features.

        The features are staged on the graph under ``'h'`` and removed again
        before returning, so no ``'h'`` entry is left behind in ``g.ndata``.
        """
        # Stage the input where the module-level message function reads it.
        g.ndata['h'] = feature
        # Propagate along the edges with the gcn_msg / gcn_reduce pair.
        g.update_all(gcn_msg, gcn_reduce)
        # Apply the linear layer (+ activation) to every node.
        g.apply_nodes(self.apply_mod)
        updated = g.ndata.pop('h')
        return updated
    
class Net(nn.Module):
    """Two stacked GCN layers (ReLU) followed by a linear binary classifier.

    Args:
        in_feats: Size of each input node feature (default 768).
        hidden_feats: Output size of the first GCN layer (default 384).
        out_feats: Output size of the second GCN layer (default 192).

    The defaults reproduce the original fixed 768 -> 384 -> 192 -> 2 architecture.
    """

    def __init__(self, in_feats=768, hidden_feats=384, out_feats=192):
        super(Net, self).__init__()
        self.gcn1 = GCN(in_feats, hidden_feats, F.relu)
        self.gcn2 = GCN(hidden_feats, out_feats, F.relu)
        self.fc = nn.Linear(out_feats, 2)

    def forward(self, g, features):
        """Return the raw two-class logits, one row per node of ``g``."""
        x = self.gcn1(g, features)
        x = self.gcn2(g, x)
        x = self.fc(x)
        return x

    def predict(self, g, features):
        """Return the predicted class (0 or 1) of every node as an int64 tensor.

        Class 0 is chosen only when its probability is strictly greater than
        that of class 1; ties (and NaN rows) resolve to class 1, exactly as in
        the original row-by-row implementation.
        """
        # Inference only: do not record an autograd graph.
        with torch.no_grad():
            # dim=1 is what the deprecated implicit-dim softmax chose for 2-D input.
            prob = F.softmax(self.forward(g, features), dim=1)
            picks_zero = prob[:, 0] > prob[:, 1]
            # Vectorised comparison; the result stays int64 even for zero nodes.
            return (~picks_zero).long()
# Model instance that the training loop further down optimises.
net = Net()

In [3]:
# Load the golden data produced by DealWithData.
# Claims table: tab-separated text file.
data_golden = pd.read_csv( './DataSet/book/golden/claims_golden.txt' , sep='\t' )
# Attach the saved encodings as the 'encode' column.
# NOTE(review): torch.load unpickles the file — only load files you trust.
# Presumably one 768-dim encoding per claim row — TODO confirm.
data_golden['encode'] = torch.load('./DataSet/book/golden/claims_golden_encode.pt')
# Golden labels: tab-separated, no header row; columns are named 'isbn' and 'author'.
GoldenLabel = pd.read_table("./DataSet/book/book_golden.txt" , sep='\t' , header=None , names=['isbn','author'])

In [5]:
# Rough train/test split; intuitively it should be done by 'isbn' -> changed to not splitting the dataset directly and using masks instead.
'''
data_train = pd.DataFrame(data_golden.drop(data_golden.index,inplace=False))
data_test = pd.DataFrame(data_golden.drop(data_golden.index,inplace=False))
for i in range(0,int(len(GoldenLabel)/2+1)):
    data_trainSlice = data_golden[data_golden['isbn']==GoldenLabel.loc[i]['isbn']]
    data_train = data_train.append(data_trainSlice)
for i in range(int(len(GoldenLabel)/2+1),len(GoldenLabel)):
    data_testSlice = data_golden[data_golden['isbn']==GoldenLabel.loc[i]['isbn']]
    data_test = data_test.append(data_testSlice)
data_train.reset_index(drop=True,inplace=True)
data_test.reset_index(drop=True,inplace=True)
'''

In [67]:
def divide_dataset(dfw, dfk, test_ratio, seed=None):
    """Split the rows of ``dfw`` into train/test boolean masks, grouped by ISBN.

    A random ``test_ratio`` fraction of the rows of ``dfk`` is drawn without
    replacement; every row of ``dfw`` whose ``'isbn'`` matches one of the drawn
    rows goes to the test set, all remaining rows go to the training set.

    Args:
        dfw: DataFrame with an ``'isbn'`` column; the masks index its rows.
        dfk: DataFrame with an ``'isbn'`` column to sample the test ISBNs from.
        test_ratio: Fraction in ``[0, 1]`` of ``dfk`` rows to draw for testing.
        seed: Optional seed for a private random generator. With ``None`` the
            global NumPy random state is used.

    Returns:
        ``(train_mask, test_mask)``: two ``torch.bool`` tensors of length
        ``len(dfw)`` that are exact complements of each other.

    Raises:
        ValueError: If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError("test_ratio must be between 0 and 1, got {!r}".format(test_ratio))

    n_keys = len(dfk)
    # The original body read an undefined global `ratio` here instead of the parameter.
    n_test = int(n_keys * test_ratio)
    sampler = np.random if seed is None else np.random.RandomState(seed)
    picked = sampler.choice(n_keys, n_test, replace=False)

    # `picked` holds row positions, so index positionally rather than by label.
    test_isbns = dfk['isbn'].iloc[picked].tolist()

    in_test = dfw['isbn'].isin(test_isbns).values
    test_mask = torch.tensor(in_test, dtype=torch.bool)
    train_mask = ~test_mask
    return train_mask, test_mask
# Draw half of the GoldenLabel rows for testing; both masks index the rows of data_golden.
train_mask,test_mask = divide_dataset(data_golden,GoldenLabel,0.5)

In [24]:
# Graph construction: edges used to be built separately for the training and
# test sets -> now built over the whole dataset.
# TODO: can the complexity be reduced from n**2/2 to k*n?
# TODO: further edges could be added based on 'book_name' similarity.
def generate_DGLGraph(df, author_sim_threshold=0.8):
    """Build a graph with one node per row of ``df`` and bidirectional edges.

    Rows ``i < j`` are connected (``i -> j`` and ``j -> i``) when either

    * they have the same ``'source'``, or
    * they have the same ``'isbn'`` and the ``difflib`` ``quick_ratio`` of
      their ``'author'`` strings is strictly greater than
      ``author_sim_threshold``.

    Args:
        df: DataFrame with ``'source'``, ``'isbn'`` and ``'author'`` columns.
            Rows are addressed by position, so node ``i`` is the i-th row.
        author_sim_threshold: Similarity an author pair must exceed to be
            linked (default 0.8, the original hard-coded value).

    Returns:
        A ``DGLGraph`` with ``len(df)`` nodes; edges are added in the same
        order as the pairwise scan visits them.
    """
    # Pull the columns out once: building a row Series with df.loc[i] inside
    # the double loop dominated the running time.
    sources = df["source"].tolist()
    isbns = df["isbn"].tolist()
    authors = df["author"].tolist()
    n_rows = len(df)

    src, dst = [], []
    for i in range(n_rows):
        for j in range(i + 1, n_rows):
            if sources[i] == sources[j]:
                linked = True
            elif isbns[i] == isbns[j]:
                similarity = difflib.SequenceMatcher(None, authors[i], authors[j]).quick_ratio()
                linked = similarity > author_sim_threshold
            else:
                linked = False
            if linked:
                # Record both directions, forward edge first.
                src += [i, j]
                dst += [j, i]

    g = DGLGraph()
    g.add_nodes(n_rows)
    if src:
        # One batched insertion instead of one add_edge call per edge.
        g.add_edges(src, dst)
    return g

# Build a single graph over every row of data_golden; the per-split graphs below are no longer built.
graph_whole = generate_DGLGraph(data_golden)
#graph_train = generate_DGLGraph(data_train)
#graph_test = generate_DGLGraph(data_test)

In [25]:
# Save the graph structure to disk.
# The context manager closes the file even if pickling raises.
with open('./DataSet/book/golden/graph_whole.pickle', 'wb') as file:
    pickle.dump(graph_whole, file)
#file = open('./DataSet/book/golden/graph_train.pickle', 'wb')
#pickle.dump(graph_train, file)
#file.close()
#file = open('./DataSet/book/golden/graph_test.pickle', 'wb')
#pickle.dump(graph_test, file)
#file.close()

In [63]:
# Load the graph structure back from disk.
# NOTE(review): pickle.load can execute arbitrary code — only load files you produced yourself.
with open('./DataSet/book/golden/graph_whole.pickle', 'rb') as file:
    graph_whole =pickle.load(file)
#with open('./DataSet/book/golden/graph_train.pickle', 'rb') as file:
#    graph_train =pickle.load(file)
#with open('./DataSet/book/golden/graph_test.pickle', 'rb') as file:
#    graph_test =pickle.load(file)

In [None]:
# Failed attempt at saving and loading the graph structure.
'''
graph_train_netx = graph_train.to_networkx()
graph_test_netx = graph_test.to_networkx()
nx.write_gexf(graph_train_netx,'./DataSet/book/golden/train_graph.gexf')
nx.write_gexf(graph_test_netx,'./DataSet/book/golden/test_graph.gexf')

graph_train_netx = nx.read_gexf('./DataSet/book/golden/train_graph.gexf')
graph_test_netx = nx.read_gexf('./DataSet/book/golden/test_graph.gexf')

graph_train2 = DGLGraph(graph_train_netx)
#graph_train2.from_networkx(graph_train_netx)
graph_test2 = DGLGraph(graph_test_netx)
#graph_test2.from_networkx(graph_test_netx)
'''

In [61]:
def extract_featureNlabel(df, feat_dim=768):
    """Stack the per-row encodings and binarise the labels of ``df``.

    Args:
        df: DataFrame with an ``'encode'`` column of torch tensors (each is
            reshaped to ``(-1, feat_dim)``) and a ``'label'`` column whose
            truthiness decides the class.
        feat_dim: Width of one encoding (default 768, the original fixed value).

    Returns:
        ``(features, labels)``: ``features`` is the row-wise concatenation of
        all reshaped encodings; ``labels`` is an int64 tensor with 1 for truthy
        and 0 for falsy labels. For an empty ``df`` the shapes are
        ``(0, feat_dim)`` and ``(0,)``.
    """
    chunks = [encoding.reshape([-1, feat_dim]) for encoding in df['encode']]
    # Concatenate once instead of growing the tensor row by row (quadratic copying).
    # The leading empty float32 block replaces the old dummy first row: it keeps
    # the result dtype as before and makes the empty-DataFrame case work.
    features = torch.cat([torch.zeros(0, feat_dim)] + chunks, 0)
    labels = torch.tensor([1 if flag else 0 for flag in df['label']], dtype=torch.long)
    return features, labels
# Features and labels for every row of data_golden; train/test rows are selected later through the masks.
whole_feature,whole_label = extract_featureNlabel(data_golden)
#train_feature,train_label = extract_featureNlabel(data_train)
#test_feature,test_label = extract_featureNlabel(data_test)

In [None]:
# Training configuration.
NUM_EPOCHS = 50
LEARNING_RATE = 1e-4

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=LEARNING_RATE)
dur = []  # wall-clock seconds per epoch
for epoch in range(NUM_EPOCHS):

    t0 = time.time()

    # Full-graph forward pass; only the nodes selected by train_mask contribute
    # to the loss. Despite its name, pred_prob holds the raw logits that
    # CrossEntropyLoss expects. Calling the module (not .forward) keeps
    # nn.Module hooks working.
    pred_prob = net(graph_whole, whole_feature)
    loss = criterion(pred_prob[train_mask], whole_label[train_mask])
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Evaluate with the freshly updated weights; no gradients are needed here.
    with torch.no_grad():
        pred_label = net.predict(graph_whole, whole_feature)
    # accuracy_score takes (y_true, y_pred).
    train_accu = accuracy_score(whole_label[train_mask], pred_label[train_mask])
    test_accu = accuracy_score(whole_label[test_mask], pred_label[test_mask])

    dur.append(time.time() - t0)
    # NOTE(review): '{:4f}' means minimum width 4 with default precision;
    # '{:.4f}' was probably intended — left unchanged to keep the log format.
    print("Epoch {:05d} | Loss {:.4f} | Time(s) {:.4f} | Train_Accu {:4f} | Test_Accu {:4f}".format(
        epoch, loss.item(), np.mean(dur), train_accu,test_accu))



Epoch 00000 | Loss 2.1384 | Time(s) 2.8671 | Train_Accu 0.445899 | Test_Accu 0.398068
Epoch 00001 | Loss 122.8407 | Time(s) 2.8757 | Train_Accu 0.445899 | Test_Accu 0.398068
Epoch 00002 | Loss 80.7766 | Time(s) 2.8903 | Train_Accu 0.632635 | Test_Accu 0.579710
Epoch 00003 | Loss 5.3182 | Time(s) 2.8895 | Train_Accu 0.561082 | Test_Accu 0.609662
Epoch 00004 | Loss 55.0678 | Time(s) 2.9035 | Train_Accu 0.561082 | Test_Accu 0.608696
Epoch 00005 | Loss 81.9156 | Time(s) 2.9230 | Train_Accu 0.561082 | Test_Accu 0.608696
Epoch 00006 | Loss 84.1792 | Time(s) 2.9162 | Train_Accu 0.561082 | Test_Accu 0.609662
Epoch 00007 | Loss 69.6552 | Time(s) 2.9148 | Train_Accu 0.562827 | Test_Accu 0.612560
Epoch 00008 | Loss 42.2779 | Time(s) 2.9155 | Train_Accu 0.724258 | Test_Accu 0.730435
Epoch 00009 | Loss 6.2826 | Time(s) 2.9131 | Train_Accu 0.445899 | Test_Accu 0.398068
Epoch 00010 | Loss 34.8739 | Time(s) 2.8912 | Train_Accu 0.445026 | Test_Accu 0.398068
Epoch 00011 | Loss 48.8157 | Time(s) 2.8950 |

In [None]:
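# Toy dataset for checking that the graph structure is generated correctly.
# Testing showed that difflib's similarity measure is character-level.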
'''
dataframe = pd.DataFrame([
        ["a", "111222", "computer Science", "bruce"],
        ["b", "111222", "computer Science", "Bruce Lee"],
        ["c", "111222", "computer Science", "mike ,john"],
        ["a", "111223", "Hassdsdsaad", "kkl"],
        ["d", "111223", "Hassdsdaaad", "kkkl"],
        ["c", "111224", "asdfgh", "zxcr"]
    ],
    columns=["source", "isbn", "name", "author"]
)
g = generate_DGLGraph(dataframe)
'''