In [2]:
import time
import difflib
import pickle
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

import dgl
import dgl.function as fn
from dgl import DGLGraph

from sklearn.metrics import accuracy_score

In [10]:
# Adapted from the DGLGraph tutorial; in_feats / out_feats have to be adjusted
# to the feature length and to the number of classes.
class NodeApplyModule(nn.Module):
    """Per-node update: a linear layer followed by an optional activation.

    Args:
        in_feats: size of each incoming node feature vector.
        out_feats: size of each produced node feature vector.
        activation: callable applied to the linear output, or ``None`` to
            return the raw linear output.
    """

    def __init__(self, in_feats, out_feats, activation):
        super(NodeApplyModule, self).__init__()
        self.linear = nn.Linear(in_feats, out_feats)
        self.activation = activation

    def forward(self, node):
        """Transform ``node.data['h']`` and return it as ``{'h': tensor}``."""
        h = self.linear(node.data['h'])
        if self.activation is not None:
            # Plain call instead of ``activation(h, inplace=True)``: the keyword
            # is only understood by a few functional activations (F.relu, ...),
            # so e.g. torch.tanh or an nn.Module activation used to raise a
            # TypeError. The values produced for F.relu are unchanged.
            h = self.activation(h)
        return {'h' : h}

# Message function: copy the source node's 'h' feature into the message field 'm'.
gcn_msg = fn.copy_src(src='h', out='m')
# Reduce function: sum the incoming 'm' messages and store the result as the node's 'h'.
gcn_reduce = fn.sum(msg='m', out='h')

class GCN(nn.Module):
    """One graph-convolution layer: aggregate neighbour features, then update.

    The aggregation uses the module-level ``gcn_msg`` / ``gcn_reduce`` functions;
    the per-node update is delegated to a ``NodeApplyModule``.
    """

    def __init__(self, in_feats, out_feats, activation):
        super().__init__()
        self.apply_mod = NodeApplyModule(in_feats, out_feats, activation)

    def forward(self, g, feature):
        """Run the layer on graph ``g`` with node features ``feature``.

        The features are stored temporarily under ``g.ndata['h']`` and removed
        again before returning, so the graph carries no leftover 'h' field.
        """
        g.ndata['h'] = feature
        # Step 1: message passing over all edges.
        g.update_all(gcn_msg, gcn_reduce)
        # Step 2: per-node transformation of the aggregated features.
        g.apply_nodes(func=self.apply_mod)
        updated = g.ndata.pop('h')
        return updated
    
class Net(nn.Module):
    """Binary claim classifier: three linear projections, one GCN layer, one head.

    The input feature matrix is split column-wise into three 768-wide segments
    (so it must be 2304 columns wide), each segment is projected to 96
    dimensions, the projections are concatenated (288) and passed through one
    GCN layer (96) and a final linear layer producing 2 logits per node.

    Earlier experiments kept in the notebook history used a 605-wide, a
    768-wide and a 2304-wide input fed directly into stacked GCN layers.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.fc1_1 = nn.Linear(768,96)
        self.fc1_2 = nn.Linear(768,96)
        self.fc1_3 = nn.Linear(768,96)
        self.gcn2 = GCN(288,96,F.relu)
        self.fc4 = nn.Linear(96,2)

    def forward(self, g, features):
        """Return the (num_nodes, 2) logits for graph ``g`` and ``features``."""
        x1 = self.fc1_1(features[:,0:768])
        x2 = self.fc1_2(features[:,768:-768])
        x3 = self.fc1_3(features[:,-768:])
        x = torch.cat((x1,x2,x3),1)
        x = self.gcn2(g,x)
        x = self.fc4(x)
        return x

    def predict(self, pred_prob):
        """Turn per-node logits into hard 0/1 labels.

        Args:
            pred_prob: 2-D tensor of logits, one row per node, class scores in
                columns 0 and 1.

        Returns:
            int64 tensor with one label per row: 0 where class 0 is strictly
            more probable than class 1, otherwise 1 (ties resolve to 1).
        """
        # dim=1 is what the deprecated implicit-dim call resolved to for 2-D
        # input; stating it explicitly removes the deprecation warning.
        pred = F.softmax(pred_prob, dim=1)
        # Vectorised form of "0 if t[0] > t[1] else 1" for every row t.
        return (~(pred[:, 0] > pred[:, 1])).long()
# Instantiate the model, the loss and the optimizer used by the training cell.
net = Net()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)

In [3]:
# Load the golden data produced by DealWithData.
data_golden = pd.read_csv( './DataSet/book/golden/claims_golden2.txt' , sep='\t' )
# NOTE(review): torch.load unpickles the file, so only load files from a trusted source.
data_golden['encode'] = torch.load('./DataSet/book/golden/claims_golden_encode_v2.pt')
#data_golden['encode'] = torch.load('./DataSet/book/golden/claims_golden_encode.pt')
# Ground-truth table: one (isbn, author) pair per line, no header row.
GoldenLabel = pd.read_table("./DataSet/book/book_golden.txt" , sep='\t' , header=None , names=['isbn','author'])

In [17]:
# Split the dataset with boolean masks instead of physically separating the rows.
def divide_dataset(dfw,dfk,test_ratio):
    """Build complementary train/test masks over the rows of ``dfw``.

    ``int(len(dfk) * test_ratio)`` rows of ``dfk`` are drawn at random without
    replacement; every row of ``dfw`` whose 'isbn' matches the 'isbn' of a
    drawn row belongs to the test side, all other rows to the train side.

    Args:
        dfw: DataFrame with an 'isbn' column; the masks follow its row order.
        dfk: DataFrame with an 'isbn' column from which test ISBNs are drawn.
        test_ratio: fraction of ``dfk`` rows to draw, between 0 and 1.

    Returns:
        ``(train_mask, test_mask)``: two ``torch.bool`` tensors of length
        ``len(dfw)`` with ``train_mask == ~test_mask``.
    """
    # Same RNG call as before, so a seeded run draws the same rows.
    drawn = np.random.choice(len(dfk), int(len(dfk)*test_ratio), replace=False)
    # Positional access: the drawn numbers are row positions, so this also
    # works when dfk / dfw do not carry a default 0..n-1 index.
    test_isbns = set(dfk['isbn'].iloc[drawn])
    in_test = dfw['isbn'].isin(test_isbns)
    test_mask = torch.tensor(in_test.tolist(), dtype=torch.bool)
    train_mask = ~test_mask
    return train_mask,test_mask
# test_ratio=0.6: 60% of the GoldenLabel rows are drawn for the test side.
# NOTE(review): no random seed is set in the visible cells, so the split presumably differs between runs.
train_mask,test_mask = divide_dataset(data_golden,GoldenLabel,0.6)

In [None]:
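# Graph-construction function. Edges used to be built separately for the train and test sets -> now built over the whole dataset.
# Can the complexity be reduced here? n**2/2 -> kn^?
# Edges could additionally be added here based on 'book_name' similarity.
# graph_whole3 is the version with self-loops added: 124468 -> 126807

# After switching to Jaccard similarity the edge count went 124446 -> 124468; little effect?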

def sim_Jaccard (str1,str2) :
    """Jaccard similarity between the word sets of two author strings.

    Each string is lower-cased, the characters ``; , . : & / '`` and the
    markers ``(author)`` / ``(joint author)`` are replaced by spaces, and the
    remainder is split on whitespace into a set of tokens.

    Returns:
        ``|A & B| / |A | B|`` as a float in [0, 1]. When neither string
        contains a token the result is 0.0 (this used to raise
        ZeroDivisionError).
    """
    def tokens(text):
        cleaned = text.lower()
        for mark in (';', ',', '.', ':', '&', '/', '\''):
            cleaned = cleaned.replace(mark, ' ')
        # Role markers are stripped after the punctuation, as before.
        cleaned = cleaned.replace('(author)',' ').replace('(joint author)',' ')
        return set(cleaned.split())

    set1 = tokens(str1)
    set2 = tokens(str2)
    union = set1 | set2
    if not union:
        # Nothing to compare: treat as "no evidence of similarity".
        return 0.0
    return len(set1 & set2) / len(union)

def generate_DGLGraph(df):
    """Build the claim graph: one node per row of ``df``.

    Edges are added in this order:
      1. a self-loop on every node;
      2. both directions between every pair of rows sharing the same 'source';
      3. both directions between every pair of rows sharing the same 'isbn'
         whose 'author' strings have a Jaccard similarity >= 0.8.

    Args:
        df: DataFrame with 'source', 'isbn' and 'author' columns. The index
            labels are used as node ids, so they are assumed to be the
            integers 0..len(df)-1 -- TODO confirm for filtered frames.

    Returns:
        The populated ``DGLGraph``.
    """
    g = DGLGraph()
    g.add_nodes(df.shape[0])

    # 1. Self-loops.
    for node in df.index.tolist():
        g.add_edge(node, node)

    # 2. Same-source edges. Iterating the de-duplicated values directly
    #    replaces Series.iteritems(), which was removed in pandas 2.0.
    for value in df['source'].drop_duplicates():
        # Node ids are read once per group instead of materialising a row
        # with .iloc[i] for every pair.
        nodes = df.index[df['source'] == value].tolist()
        for i in range(len(nodes)):
            for j in range(i + 1, len(nodes)):
                g.add_edge(nodes[i], nodes[j])
                g.add_edge(nodes[j], nodes[i])

    # 3. Same-isbn edges between sufficiently similar author strings.
    for value in df['isbn'].drop_duplicates():
        df_slice = df[df['isbn'] == value]
        nodes = df_slice.index.tolist()
        authors = df_slice['author'].tolist()
        for i in range(len(nodes)):
            for j in range(i + 1, len(nodes)):
                if sim_Jaccard(authors[i], authors[j]) >= 0.8:
                    g.add_edge(nodes[i], nodes[j])
                    g.add_edge(nodes[j], nodes[i])

    return g


# Build one graph over the whole golden dataset (the per-split graphs below are disabled).
graph_whole = generate_DGLGraph(data_golden)
#graph_train = generate_DGLGraph(data_train)
#graph_test = generate_DGLGraph(data_test)

In [None]:
# Abandoned earlier version of the graph-construction function, kept only as
# an inert string literal (it is never executed).
'''
构建图结构函数，该版本弃置
def generate_DGLGraph(df):
    #edge_norm = []
    g = DGLGraph()
    g.add_nodes(df.shape[0])
    for i in range(0,len(df)):
        g.add_edge(i,i)
    for i in range(0,len(df)):
        for j in range(i+1,len(df)):
            if (df.loc[i]["source"]==df.loc[j]["source"]):
                g.add_edge(i,j)
                #edge_norm.append(1.0)
                g.add_edge(j,i)
                #edge_norm.append(1.0)
            elif (df.loc[i]["isbn"]==df.loc[j]["isbn"]):
                str1 = df.loc[i]['author']
                str2 = df.loc[j]['author']
                #print(str1,str2,difflib.SequenceMatcher(None,str1,str2).quick_ratio())
                #if ( difflib.SequenceMatcher(None,str1,str2).quick_ratio()>0.8 ):
                if ( sim_Jaccard(str1,str2)>0.8 ):
                    g.add_edge(i,j)
                    #edge_norm.append(1.0)
                    g.add_edge(j,i)
                    #edge_norm.append(1.0)
    #edge_norm = torch.Tensor(edge_norm).unsqueeze(1)
    #g.edata.update({ 'norm': edge_norm })
    return g
'''

In [None]:
# Store the graph structure.
# The context manager closes the file even if pickle.dump raises, and matches
# the style of the loading cell.
with open('./DataSet/book/golden/graph_whole3.pickle', 'wb') as file:
    pickle.dump(graph_whole, file)

In [5]:
# Load the graph structure.
# NOTE(review): pickle.load can execute arbitrary code; only load files written by this notebook.
with open('./DataSet/book/golden/graph_whole3.pickle', 'rb') as file:
    graph_whole =pickle.load(file)
#with open('./DataSet/book/golden/graph_train.pickle', 'rb') as file:
#    graph_train =pickle.load(file)
#with open('./DataSet/book/golden/graph_test.pickle', 'rb') as file:
#    graph_test =pickle.load(file)

In [None]:
# Failed attempt at storing and loading the graph structure (via networkx / GEXF).
# Kept only as an inert string literal; it is never executed.
'''
graph_train_netx = graph_train.to_networkx()
graph_test_netx = graph_test.to_networkx()
nx.write_gexf(graph_train_netx,'./DataSet/book/golden/train_graph.gexf')
nx.write_gexf(graph_test_netx,'./DataSet/book/golden/test_graph.gexf')

graph_train_netx = nx.read_gexf('./DataSet/book/golden/train_graph.gexf')
graph_test_netx = nx.read_gexf('./DataSet/book/golden/test_graph.gexf')

graph_train2 = DGLGraph(graph_train_netx)
#graph_train2.from_networkx(graph_train_netx)
graph_test2 = DGLGraph(graph_test_netx)
#graph_test2.from_networkx(graph_test_netx)
'''

In [4]:
def extract_featureNlabel(df):
    """Stack the per-row encodings and labels of ``df`` into two tensors.

    Args:
        df: DataFrame with an 'encode' column holding one tensor per row and a
            'label' column whose truthiness marks the positive class.

    Returns:
        ``(features, labels)``: ``features`` has one flattened encoding per
        row (shape ``(len(df), encoding_size)``, dtype of the encodings);
        ``labels`` is an int64 tensor of 0/1 values. For an empty frame the
        result is a ``(0, 0)`` feature tensor and an empty label tensor.
    """
    # Collect first and stack once: the previous torch.cat inside the loop
    # re-copied the growing tensor on every row, and the df.loc[0] lookup used
    # to size it failed on frames without an index label 0.
    encodings = [enc.reshape(-1) for enc in df['encode']]
    labels = torch.tensor([1 if flag else 0 for flag in df['label']], dtype=torch.long)
    if not encodings:
        return torch.zeros((0, 0)), labels
    return torch.stack(encodings), labels
# Extract features and labels for the whole golden dataset.
# The disabled lines are alternatives: a pre-computed TF-IDF feature file, and per-split extraction.
#_,whole_label = extract_featureNlabel(data_golden)
#whole_feature = torch.load('./DataSet/book/golden/claims_golden_encode_tfidf.pt')
whole_feature,whole_label = extract_featureNlabel(data_golden)
#train_feature,train_label = extract_featureNlabel(data_train)
#test_feature,test_label = extract_featureNlabel(data_test)

Result comparison

a:1\*768(bert)

b:1\*3\*768(bert_v2)

c:1\*605(tfidf)

train:test=1:1

| method | len | lr | epoch | loss | accu |
|:----: | :----: | :----: | :----: | :----: | :----: |
| a | 768 | 1e-5 | 200 | 1.xx | 0.6 | 
| b | 2304 | 1e-5 | 200 | 5.xx | 0.7 |
| c | 605 | 1e-5 | 200 | 0.5x | 0.7 |

In [18]:
# Full-batch training: every epoch runs the model on the whole graph, computes
# the loss on the training rows only and reports train/test accuracy.
dur = []
for epoch in range(100):

    t0 = time.time()

    # Call the module itself rather than .forward() so that nn.Module hooks run.
    pred_prob = net(graph_whole, whole_feature)
    loss = criterion(pred_prob[train_mask],whole_label[train_mask])

    # Metrics need no gradients.
    with torch.no_grad():
        pred_label = net.predict(pred_prob)
    # accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the
    # reported numbers are the same as with the previous swapped order.
    train_accu = accuracy_score(whole_label[train_mask],pred_label[train_mask])
    test_accu = accuracy_score(whole_label[test_mask],pred_label[test_mask])

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # "Time(s)" is the running mean of the per-epoch wall-clock time.
    dur.append(time.time() - t0)
    print("Epoch {:05d} | Loss {:.4f} | Time(s) {:.4f} | Train_Accu {:4f} | Test_Accu {:4f}".format(
        epoch, loss.item(), np.mean(dur), train_accu,test_accu))



Epoch 00000 | Loss 0.9127 | Time(s) 0.5376 | Train_Accu 0.673885 | Test_Accu 0.742837
Epoch 00001 | Loss 1.4365 | Time(s) 0.5191 | Train_Accu 0.653503 | Test_Accu 0.654011
Epoch 00002 | Loss 0.9632 | Time(s) 0.5196 | Train_Accu 0.676433 | Test_Accu 0.739255
Epoch 00003 | Loss 0.8208 | Time(s) 0.5146 | Train_Accu 0.692994 | Test_Accu 0.684814
Epoch 00004 | Loss 0.7858 | Time(s) 0.5118 | Train_Accu 0.694268 | Test_Accu 0.713467
Epoch 00005 | Loss 0.7928 | Time(s) 0.5105 | Train_Accu 0.719745 | Test_Accu 0.748567
Epoch 00006 | Loss 0.8344 | Time(s) 0.5101 | Train_Accu 0.704459 | Test_Accu 0.680516
Epoch 00007 | Loss 0.8699 | Time(s) 0.5099 | Train_Accu 0.695541 | Test_Accu 0.738539
Epoch 00008 | Loss 1.0205 | Time(s) 0.5096 | Train_Accu 0.698089 | Test_Accu 0.670487
Epoch 00009 | Loss 0.7886 | Time(s) 0.5086 | Train_Accu 0.712102 | Test_Accu 0.738539
Epoch 00010 | Loss 0.6978 | Time(s) 0.5097 | Train_Accu 0.732484 | Test_Accu 0.699140
Epoch 00011 | Loss 0.6121 | Time(s) 0.5092 | Train_Acc

Epoch 00096 | Loss 0.4126 | Time(s) 0.5120 | Train_Accu 0.836943 | Test_Accu 0.740688
Epoch 00097 | Loss 0.3411 | Time(s) 0.5119 | Train_Accu 0.858599 | Test_Accu 0.729943
Epoch 00098 | Loss 0.3012 | Time(s) 0.5120 | Train_Accu 0.867516 | Test_Accu 0.744986
Epoch 00099 | Loss 0.2968 | Time(s) 0.5119 | Train_Accu 0.867516 | Test_Accu 0.749284


In [None]:
# Toy dataset used to check that the graph construction is correct.
# Testing showed that difflib's similarity measure works at character level.
'''
dataframe = pd.DataFrame([
        ["a", "111222", "computer Science", "bruce"],
        ["b", "111222", "computer Science", "Bruce Lee"],
        ["c", "111222", "computer Science", "mike ,john"],
        ["a", "111223", "Hassdsdsaad", "kkl"],
        ["d", "111223", "Hassdsdaaad", "kkkl"],
        ["c", "111224", "asdfgh", "zxcr"]
    ],
    columns=["source", "isbn", "name", "author"]
)
g = generate_DGLGraph(dataframe)
'''

In [19]:
def add_confidence(df,prob,col_name='fact_confidence'):
    """Attach the class-1 score of every row as a new column of ``df``.

    Args:
        df: DataFrame to annotate; it is modified in place.
        prob: indexable of per-row score pairs; ``prob[i][1]`` is stored for
            the i-th row of ``df`` (by position).
        col_name: name of the column to create or overwrite.

    Returns:
        The same ``df`` object, for call chaining.
    """
    # One positional column assignment. The previous per-row df.loc[i, col]
    # treated i as an index label, so on a frame without a 0..n-1 index it
    # appended new rows instead of filling the existing ones.
    df[col_name] = [float(prob[i][1]) for i in range(len(df))]
    return df

# NOTE: the graph-construction cell defines a function with the same name; this
# copy lets the evaluation cell run on its own.
def sim_Jaccard (str1,str2) :
    """Jaccard similarity between the word sets of two author strings.

    Each string is lower-cased, the characters ``; , . : & / '`` and the
    markers ``(author)`` / ``(joint author)`` are replaced by spaces, and the
    remainder is split on whitespace into a set of tokens.

    Returns:
        ``|A & B| / |A | B|`` as a float in [0, 1]. When neither string
        contains a token the result is 0.0 (this used to raise
        ZeroDivisionError).
    """
    def tokens(text):
        cleaned = text.lower()
        for mark in (';', ',', '.', ':', '&', '/', '\''):
            cleaned = cleaned.replace(mark, ' ')
        # Role markers are stripped after the punctuation, as before.
        cleaned = cleaned.replace('(author)',' ').replace('(joint author)',' ')
        return set(cleaned.split())

    set1 = tokens(str1)
    set2 = tokens(str2)
    union = set1 | set2
    if not union:
        # Nothing to compare: treat as "no evidence of similarity".
        return 0.0
    return len(set1 & set2) / len(union)

def MV(df,indexK='isbn',answer='author',withWeight=False,weight='confidence'):
    """(Weighted) majority vote over the candidate answers of every key.

    For each distinct value of ``df[indexK]`` the rows are visited in order.
    A row's answer is merged into the first already-seen answer whose
    ``sim_Jaccard`` similarity is >= 0.8, otherwise it opens a new candidate.
    Each row contributes 1 vote, or ``float(row[weight])`` when ``withWeight``
    is true. The candidate with the highest total wins; ties go to the
    candidate that appeared first.

    Args:
        df: claims table containing the ``indexK`` and ``answer`` columns (and
            ``weight`` when ``withWeight`` is true).
        indexK: column identifying the object being voted on.
        answer: column holding the claimed value.
        withWeight: use ``weight`` as the vote strength instead of 1.
        weight: name of the weight column.

    Returns:
        DataFrame with columns ``[indexK, answer]``, one row per key that has
        at least one claim.
    """
    winners = []
    for indexV in df[indexK].unique():
        data_slice = df[df[indexK]==indexV]
        vote_dict = {}
        for _,row in data_slice.iterrows():
            vote = float(row[weight]) if withWeight else 1
            for key in vote_dict:
                if sim_Jaccard(key,row[answer]) >= 0.8:
                    vote_dict[key] += vote
                    break
            else:
                # No similar candidate seen yet: start a new one.
                vote_dict[row[answer]] = vote
        if not vote_dict:
            # e.g. a NaN key: `df[indexK] == NaN` selects no rows.
            continue
        # max() keeps the first of several equally strong candidates, like the
        # stable descending sort it replaces.
        best = max(vote_dict, key=vote_dict.get)
        winners.append({indexK:indexV,answer:best})
    # Built once from the collected records; DataFrame.append was removed in
    # pandas 2.0 and copied the frame on every call.
    return pd.DataFrame(winners, columns=[indexK,answer])

def JudgeAccu(label,pred,pred_col='author'):
    """Score predictions against the ground truth.

    Args:
        label: ground-truth DataFrame indexed by key, with a ``pred_col`` column.
        pred: prediction DataFrame indexed by key, with a ``pred_col`` column.
        pred_col: name of the compared column.

    Returns:
        ``(strict, soft)``, both averaged over ``len(pred)``:
        ``strict`` counts a prediction as 1 when its ``sim_Jaccard`` similarity
        to the truth is >= 0.9 and as 0 otherwise; ``soft`` counts 1 in that
        case and the similarity itself otherwise. Keys missing from ``label``
        score 0 in both. An empty ``pred`` yields ``(0.0, 0.0)`` (this used to
        raise ZeroDivisionError).

    Side effects:
        Prints every key without ground truth and every prediction below the
        0.9 threshold together with its similarity.
    """
    if len(pred) == 0:
        return 0.0, 0.0
    score1 = 0
    score2 = 0
    for index,row in pred.iterrows():
        if index not in label.index:
            print(index,'no answer')
            continue
        truth = label.loc[index][pred_col]
        # Computed once; it was previously recomputed for the print and the sum.
        sim = sim_Jaccard(row[pred_col],truth)
        if sim >= 0.9:
            score1 += 1
            score2 += 1
        else:
            print(row[pred_col],"vs",truth,sim)
            score2 += sim
    return score1/len(pred),score2/len(pred)

# Attach the model's class-1 probability to every claim. dim=1 normalises over
# the two class logits of each row; it is what the deprecated implicit-dim call
# resolved to for this 2-D input, now stated explicitly.
data_withConfidence = add_confidence(data_golden,F.softmax(pred_prob, dim=1))

# Confidence-weighted majority vote on the test rows, written to disk.
df_mv = MV(data_withConfidence[test_mask.numpy()],withWeight=True,weight='fact_confidence')
df_mv.to_csv( './DataSet/book/golden/GCNResult.txt' , sep='\t' , index=False )

# Reload truth and predictions indexed by their first column, then score.
label = pd.read_csv('./DataSet/book/book_golden.txt',sep='\t',low_memory=False,names=['isbn','author'],header=None,index_col=0)
pred = pd.read_csv('./DataSet/book/golden/GCNResult.txt',sep='\t',low_memory=False,index_col=0)

print(JudgeAccu(label,pred))



Meadors, Todd; Schmidt, Cheryl Ann vs meadors, todd ;  schmidt, cheryl a.;   0.6666666666666666
C Bala Kumar, Paul Kline, Tim Thompson vs kumar, c bala;  kline, paul j.;  thompson, tim j.;   0.875
By (author) Hoos, Holger H. By (author) St&uuml;tzle, Thomas vs hoos, holger h.;   stutzle, thomas;   0.4444444444444444
Loshin, Peter vs loshin, pete ;   0.3333333333333333
By (author) Harmon, Paul vs harmon, paul ;   0.6666666666666666
Edited by Jones, Karen Sparck Edited by Willett, Peter vs jones, karen sparck;  willett, peter ;   0.7142857142857143
Fortier, Paul J.; Michel, Howard vs fortier, paul j.;  michel, howard e.;   0.8333333333333334
Guy Steele vs steele, guy l.;   0.6666666666666666
C. B. Jenssen, T. Kvamdal, H. I. Andersson vs jenssen, c. b.;  kvamdal, t. ;  andersson, h. i.;  Ecer, A.;  Periaux, J.;  Satofuka, N.;  Fox, P.; 0.5
George F. Coulouris , Jean Dollimore , Tim Kindberg vs coulouris, george ;  dollimore, jean ;  kindberg, tim ;   0.8571428571428571
Nick Rozanski, E&oa

In [20]:
# Baseline for comparison: plain (unweighted) majority vote on the same test rows.
label = pd.read_csv(
    './DataSet/book/book_golden.txt',
    sep='\t',
    low_memory=False,
    names=['isbn','author'],
    header=None,
    index_col=0,
)

df_mv = MV(
    data_withConfidence[test_mask.numpy()],
    withWeight=False,
)
df_mv.to_csv(
    './DataSet/book/golden/MVResult.txt',
    sep='\t',
    index=False,
)
pred = pd.read_csv(
    './DataSet/book/golden/MVResult.txt',
    sep='\t',
    low_memory=False,
    index_col=0,
)

print(JudgeAccu(label,pred))

Meadors, Todd; Schmidt, Cheryl Ann vs meadors, todd ;  schmidt, cheryl a.;   0.6666666666666666
C Bala Kumar, Paul Kline, Tim Thompson vs kumar, c bala;  kline, paul j.;  thompson, tim j.;   0.875
Hoos, Holger vs hoos, holger h.;   stutzle, thomas;   0.4
Loshin, Peter vs loshin, pete ;   0.3333333333333333
Eberhart, Russell vs kennedy, james ;  eberhart, russell c.;   0.4
Edited by Jones, Karen Sparck Edited by Willett, Peter vs jones, karen sparck;  willett, peter ;   0.7142857142857143
Widom, Jennifer vs widom, jennifer ;  ceri, stefano; 0.5
Fortier, Paul J.; Michel, Howard vs fortier, paul j.;  michel, howard e.;   0.8333333333333334
Guy Steele vs steele, guy l.;   0.6666666666666666
Jenssen, C. B. vs jenssen, c. b.;  kvamdal, t. ;  andersson, h. i.;  Ecer, A.;  Periaux, J.;  Satofuka, N.;  Fox, P.; 0.1875
Dowd, Mark; McDonald, John vs dowd, mark ;  mcdonald, john ;  schuh, justin ;   0.6666666666666666
George F. Coulouris , Jean Dollimore , Tim Kindberg vs coulouris, george ;  doll

In [None]:
# Number of rows in the most recently loaded prediction file.
len(pred)