In [2]:
import time
import difflib
import pickle
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

import dgl
import dgl.function as fn
from dgl import DGLGraph

import networkx as nx
from sklearn.metrics import accuracy_score

In [26]:
# Adapted from the DGLGraph tutorial; in_feats / out_feats must be adjusted to
# the feature length and the number of classes.
class NodeApplyModule(nn.Module):
    """Node-wise update: a linear layer followed by an optional activation.

    Args:
        in_feats: size of the incoming node feature 'h'.
        out_feats: size of the produced node feature 'h'.
        activation: callable applied to the linear output, or None to skip it.
    """

    def __init__(self, in_feats, out_feats, activation):
        super(NodeApplyModule, self).__init__()
        self.linear = nn.Linear(in_feats, out_feats)
        #self.conv = nn.Conv1d(in_feats,out_feats,1)
        self.activation = activation

    def forward(self, node):
        """Read node.data['h'], transform it and return it as {'h': tensor}."""
        #h = self.conv(node.data['h'])
        h = self.linear(node.data['h'])
        if self.activation is not None:
            # Call with the tensor only: passing inplace=True restricted the
            # module to activations that accept that keyword (F.relu does,
            # torch.tanh or an nn.Module activation does not).
            h = self.activation(h)
        return {'h': h}

# Message function from dgl.function: copies the source node's 'h' field into
# the message field 'm'.
gcn_msg = fn.copy_src(src='h', out='m')
# Reduce function from dgl.function: averages the incoming 'm' messages and
# stores the result in the node field 'h'.
gcn_reduce = fn.mean(msg='m', out='h')

class GCN(nn.Module):
    """Single graph layer: message passing followed by a node-wise update."""

    def __init__(self, in_feats, out_feats, activation):
        super().__init__()
        # Linear + activation applied to each node after aggregation.
        self.apply_mod = NodeApplyModule(in_feats, out_feats, activation)

    def forward(self, g, feature):
        """Run one propagation step on graph `g` and return the new 'h' features."""
        # Store the input on the graph so the message function can read it.
        g.ndata['h'] = feature
        # Propagate with the module-level gcn_msg / gcn_reduce functions.
        g.update_all(gcn_msg, gcn_reduce)
        # Apply the node-wise module to every node.
        g.apply_nodes(func=self.apply_mod)
        # Remove 'h' from the graph and hand it back to the caller.
        updated = g.ndata.pop('h')
        return updated
    
class Net(nn.Module):
    """Three-branch linear classifier over a concatenated feature vector.

    The input row is split into three consecutive slices of `in_feats`
    columns; each slice goes through its own linear layer, the three outputs
    are concatenated and mapped to two class logits.

    Args:
        in_feats: width of each of the three input slices (default 768).
        hidden_feats: output width of each branch (default 96).
    """

    def __init__(self, in_feats=768, hidden_feats=96):
        super(Net, self).__init__()
        self.in_feats = in_feats
        # Attribute names and creation order are kept so that saved
        # state_dicts and seeded initialisation stay the same.
        self.fc1_1 = nn.Linear(in_feats, hidden_feats)
        self.fc1_2 = nn.Linear(in_feats, hidden_feats)
        self.fc1_3 = nn.Linear(in_feats, hidden_feats)
        # Graph layers are currently disabled:
        #self.gcn2 = GCN(288,96,F.relu)
        #self.gcn3 = GCN(96,96,F.relu)
        self.fc4 = nn.Linear(3 * hidden_feats, 2)

    def forward(self, g, features):
        """Return the (n_rows, 2) logits for `features`.

        `g` is not used while the GCN layers are disabled; it is kept so the
        call signature does not change when they are re-enabled.
        """
        width = self.in_feats
        x1 = self.fc1_1(features[:, 0:width])
        x2 = self.fc1_2(features[:, width:width * 2])
        x3 = self.fc1_3(features[:, width * 2:width * 3])
        x = torch.cat((x1, x2, x3), 1)
        #x = self.gcn2(g,x)
        #x = self.gcn3(g,x)
        x = self.fc4(x)
        return x

    def predict(self, pred_prob):
        """Turn (n_rows, 2) logits into a CPU int64 tensor of 0/1 labels.

        A row is labelled 0 only when class 0 is strictly more probable than
        class 1; ties go to class 1, as in the original per-row loop.
        """
        # dim=1 is stated explicitly; calling softmax without it relies on the
        # deprecated implicit-dimension choice.
        pred = F.softmax(pred_prob, dim=1)
        class0_wins = pred[:, 0] > pred[:, 1]
        return (~class0_wins).long().cpu()
# Instantiate the classifier.
net = Net()
# Cross-entropy loss between the model's logits and the integer class labels.
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters with learning rate 1e-4.
optimizer = torch.optim.Adam(net.parameters(), lr=1e-4)

In [4]:
# Load the golden data prepared in DealWithData (presumably a separate notebook -- TODO confirm).
data_golden = pd.read_csv( './DataSet/book/golden/claims_golden2.txt' , sep='\t' )
# Attach the precomputed encodings loaded from the .pt file as the 'encode' column.
data_golden['encode'] = torch.load('./DataSet/book/golden/claims_golden_encode_v2.pt')
#data_golden['encode'] = torch.load('./DataSet/book/golden/claims_golden_encode.pt')
# Golden (isbn, author) labels; the file has no header row, so column names are given here.
GoldenLabel = pd.read_table("./DataSet/book/book_golden.txt" , sep='\t' , header=None , names=['isbn','author'])

In [None]:
# Casual train/test split; intuition says to split by 'isbn' -> replaced: the
# dataset is no longer split physically, masks are used instead.
# The code below is disabled (kept inside a string literal).
'''
data_train = pd.DataFrame(data_golden.drop(data_golden.index,inplace=False))
data_test = pd.DataFrame(data_golden.drop(data_golden.index,inplace=False))
for i in range(0,int(len(GoldenLabel)/2+1)):
    data_trainSlice = data_golden[data_golden['isbn']==GoldenLabel.loc[i]['isbn']]
    data_train = data_train.append(data_trainSlice)
for i in range(int(len(GoldenLabel)/2+1),len(GoldenLabel)):
    data_testSlice = data_golden[data_golden['isbn']==GoldenLabel.loc[i]['isbn']]
    data_test = data_test.append(data_testSlice)
data_train.reset_index(drop=True,inplace=True)
data_test.reset_index(drop=True,inplace=True)
'''

In [5]:
# Split the dataset with boolean masks instead of physically separating rows.
def divide_dataset(dfw, dfk, test_ratio, seed=None):
    """Build complementary train/test masks over `dfw`, split by 'isbn'.

    int(len(dfk) * test_ratio) rows of `dfk` are drawn without replacement;
    every row of `dfw` whose 'isbn' equals the 'isbn' of a drawn row goes to
    the test split, all other rows go to the train split.

    Args:
        dfw: DataFrame with an 'isbn' column; the masks follow its row order.
        dfk: DataFrame with an 'isbn' column listing the keys to sample from.
        test_ratio: fraction of `dfk` rows to draw for the test split.
        seed: optional seed for a private RandomState; when None the global
            numpy RNG is used, as before.

    Returns:
        (train_mask, test_mask): two torch.bool tensors of length len(dfw).
    """
    n_keys = len(dfk)
    n_test = int(n_keys * test_ratio)
    if seed is None:
        picked = np.random.choice(n_keys, n_test, replace=False)
    else:
        picked = np.random.RandomState(seed).choice(n_keys, n_test, replace=False)

    # Positional lookup: the draw is over row positions, so this must not
    # depend on dfk having a 0..n-1 index.
    test_isbns = list(set(dfk['isbn'].iloc[picked]))

    # Vectorised membership test replaces the row-by-row loop and the
    # uninitialised torch.Tensor(size=...) buffers.
    in_test = np.asarray(dfw['isbn'].isin(test_isbns).values, dtype=bool)
    test_mask = torch.tensor(in_test)
    train_mask = ~test_mask
    return train_mask, test_mask
# The third argument is test_ratio: 0.8 of the golden-label rows are drawn for the test split.
# NOTE(review): the results table further down says train:test=1:1 -- confirm which split produced the reported numbers.
train_mask,test_mask = divide_dataset(data_golden,GoldenLabel,0.8)

In [None]:
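# Graph-construction function. Edges were originally built separately for the
# training and test sets -> now they are built over the whole dataset.
# Could the complexity be improved here? n**2/2 -> kn
# Further edges could be added based on 'book_name' similarity.

# After switching to Jaccard similarity the edge count went 124446 -> 124468; little impact?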

def sim_Jaccard(str1, str2):
    """Token-level Jaccard similarity of two strings.

    Both strings are lower-cased, the characters ; , . : & / ' are replaced by
    spaces, the markers '(author)' and '(joint author)' are removed, and the
    rest is split on whitespace into a token set.

    Args:
        str1, str2: the strings to compare.

    Returns:
        |tokens1 & tokens2| / |tokens1 | tokens2| as a float, or 0.0 when
        neither string contains a token (this used to raise ZeroDivisionError).
    """
    # Single characters that are blanked out before splitting.
    separators = str.maketrans({ch: ' ' for ch in ";,.:&/'"})

    def tokens(text):
        cleaned = text.lower().translate(separators)
        cleaned = cleaned.replace('(author)', ' ').replace('(joint author)', ' ')
        return set(cleaned.split())

    set1, set2 = tokens(str1), tokens(str2)
    union = set1 | set2
    if not union:
        # Two token-less strings carry no evidence of a match.
        return 0.0
    return len(set1 & set2) / len(union)

def generate_DGLGraph(df, author_threshold=0.8):
    """Build a DGLGraph with one node per row of `df` and bidirectional edges.

    Two rows i < j are connected (i->j and j->i) when
      * they have the same 'source', or otherwise
      * they have the same 'isbn' and sim_Jaccard of their 'author' strings is
        greater than `author_threshold`.

    Args:
        df: DataFrame with 'source', 'isbn' and 'author' columns; node ids are
            row positions.
        author_threshold: Jaccard similarity above which two claims about the
            same isbn are linked (default 0.8, the previous hard-coded value).

    Returns:
        The constructed DGLGraph.
    """
    n_rows = len(df)
    # Pull the columns out once: df.loc[i][col] inside the O(n^2) loop built a
    # whole row Series on every access. Plain lists are also positional, so
    # the function no longer needs a 0..n-1 index.
    sources = df["source"].tolist()
    isbns = df["isbn"].tolist()
    authors = df["author"].tolist()

    src, dst = [], []
    for i in range(n_rows):
        for j in range(i + 1, n_rows):
            if sources[i] == sources[j]:
                linked = True
            elif isbns[i] == isbns[j]:
                linked = sim_Jaccard(authors[i], authors[j]) > author_threshold
            else:
                linked = False
            if linked:
                # Same order as before: (i, j) followed by (j, i).
                src.extend((i, j))
                dst.extend((j, i))

    g = DGLGraph()
    g.add_nodes(n_rows)
    if src:
        # One batched insertion instead of one add_edge call per edge.
        g.add_edges(src, dst)
    return g

# Build a single graph over the whole golden dataset; the separate train/test graphs are disabled.
graph_whole = generate_DGLGraph(data_golden)
#graph_train = generate_DGLGraph(data_train)
#graph_test = generate_DGLGraph(data_test)

In [None]:
# Store the graph structure.
# A context manager closes the file even if pickle.dump raises.
with open('./DataSet/book/golden/graph_whole2.pickle', 'wb') as file:
    pickle.dump(graph_whole, file)
#file = open('./DataSet/book/golden/graph_train.pickle', 'wb')
#pickle.dump(graph_train, file)
#file.close()
#file = open('./DataSet/book/golden/graph_test.pickle', 'wb')
#pickle.dump(graph_test, file)
#file.close()

In [6]:
# Load the graph structure.
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this notebook.
with open('./DataSet/book/golden/graph_whole2.pickle', 'rb') as file:
    graph_whole =pickle.load(file)
#with open('./DataSet/book/golden/graph_train.pickle', 'rb') as file:
#    graph_train =pickle.load(file)
#with open('./DataSet/book/golden/graph_test.pickle', 'rb') as file:
#    graph_test =pickle.load(file)

In [None]:
# Failed attempt at storing and loading the graph structure (via networkx / GEXF).
# The code below is disabled (kept inside a string literal).
'''
graph_train_netx = graph_train.to_networkx()
graph_test_netx = graph_test.to_networkx()
nx.write_gexf(graph_train_netx,'./DataSet/book/golden/train_graph.gexf')
nx.write_gexf(graph_test_netx,'./DataSet/book/golden/test_graph.gexf')

graph_train_netx = nx.read_gexf('./DataSet/book/golden/train_graph.gexf')
graph_test_netx = nx.read_gexf('./DataSet/book/golden/test_graph.gexf')

graph_train2 = DGLGraph(graph_train_netx)
#graph_train2.from_networkx(graph_train_netx)
graph_test2 = DGLGraph(graph_test_netx)
#graph_test2.from_networkx(graph_test_netx)
'''

In [7]:
def extract_featureNlabel(df):
    """Stack the per-row encodings and labels of `df` into two tensors.

    Args:
        df: DataFrame with an 'encode' column holding one torch tensor per row
            and a 'label' column whose truthiness gives the class.

    Returns:
        (features, labels): `features` has one row per DataFrame row (each
        encoding flattened with reshape([1, -1])) and keeps the dtype of the
        encodings; `labels` is an int64 tensor with 1 for truthy labels and 0
        otherwise. An empty frame yields a (0, 0) feature tensor and an empty
        label tensor.
    """
    if len(df) == 0:
        # The old code read df.loc[0] and failed here.
        return torch.zeros(0, 0), torch.zeros(0).long()

    # Collect all rows first and concatenate once; growing the tensor with
    # torch.cat inside the loop copied everything on every iteration.
    rows = [encoding.reshape([1, -1]) for encoding in df['encode']]
    features = torch.cat(rows, 0)
    labels = torch.tensor([1 if flag else 0 for flag in df['label']],
                          dtype=torch.long)
    return features, labels
#_,whole_label = extract_featureNlabel(data_golden)
#whole_feature = torch.load('./DataSet/book/golden/claims_golden_encode_tfidf.pt')
# Build the feature matrix and label vector for the whole golden dataset.
whole_feature,whole_label = extract_featureNlabel(data_golden)
#train_feature,train_label = extract_featureNlabel(data_train)
#test_feature,test_label = extract_featureNlabel(data_test)

结果比对 (Results comparison)

a:1\*768(bert)

b:1\*3\*768(bert_v2)

c:1\*605(tfidf)

train:test=1:1

| method | len | lr | epoch | loss | accu |
|:----: | :----: | :----: | :----: | :----: | :----: |
| a | 768 | 1e-5 | 200 | 1.xx | 0.6 | 
| b | 2304 | 1e-5 | 200 | 5.xx | 0.7 |
| c | 605 | 1e-5 | 200 | 0.5x | 0.7 |

In [27]:
# Full-batch training: every epoch runs the model on all rows, computes the
# loss on the training mask only, and reports accuracy on both masks.
dur = []
for epoch in range(100):

    t0 = time.time()

    # Call the module itself rather than .forward() so nn.Module hooks run.
    pred_prob = net(graph_whole, whole_feature)
    loss = criterion(pred_prob[train_mask],whole_label[train_mask])

    #pred_prob = net.forward(graph_train, train_feature)
    #loss = criterion(pred_prob,train_label)
    #pred_label = net.predict(graph_whole, whole_feature)
    # The metrics need no gradients, so keep them out of the autograd graph.
    with torch.no_grad():
        pred_label = net.predict(pred_prob)
    # accuracy_score expects (y_true, y_pred); the value is the same either
    # way, but the conventional order keeps other metrics safe to swap in.
    train_accu = accuracy_score(whole_label[train_mask],pred_label[train_mask])
    test_accu = accuracy_score(whole_label[test_mask],pred_label[test_mask])

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    dur.append(time.time() - t0)
    # Time(s) is the running mean over all epochs so far.
    print("Epoch {:05d} | Loss {:.4f} | Time(s) {:.4f} | Train_Accu {:4f} | Test_Accu {:4f}".format(
        epoch, loss.item(), np.mean(dur), train_accu,test_accu))



Epoch 00000 | Loss 0.7364 | Time(s) 0.1652 | Train_Accu 0.523385 | Test_Accu 0.444573
Epoch 00001 | Loss 0.6816 | Time(s) 0.1610 | Train_Accu 0.567929 | Test_Accu 0.559469
Epoch 00002 | Loss 0.7011 | Time(s) 0.1586 | Train_Accu 0.545657 | Test_Accu 0.588915
Epoch 00003 | Loss 0.6578 | Time(s) 0.1535 | Train_Accu 0.594655 | Test_Accu 0.583718
Epoch 00004 | Loss 0.6308 | Time(s) 0.1542 | Train_Accu 0.641425 | Test_Accu 0.516744
Epoch 00005 | Loss 0.6363 | Time(s) 0.1549 | Train_Accu 0.625835 | Test_Accu 0.484988
Epoch 00006 | Loss 0.6348 | Time(s) 0.1531 | Train_Accu 0.628062 | Test_Accu 0.472286
Epoch 00007 | Loss 0.6139 | Time(s) 0.1522 | Train_Accu 0.663697 | Test_Accu 0.498845
Epoch 00008 | Loss 0.5931 | Time(s) 0.1530 | Train_Accu 0.681514 | Test_Accu 0.544457
Epoch 00009 | Loss 0.5867 | Time(s) 0.1529 | Train_Accu 0.685969 | Test_Accu 0.595266
Epoch 00010 | Loss 0.5873 | Time(s) 0.1539 | Train_Accu 0.679287 | Test_Accu 0.623557
Epoch 00011 | Loss 0.5809 | Time(s) 0.1540 | Train_Acc

Epoch 00096 | Loss 0.2762 | Time(s) 0.1666 | Train_Accu 0.919822 | Test_Accu 0.652425
Epoch 00097 | Loss 0.2743 | Time(s) 0.1664 | Train_Accu 0.919822 | Test_Accu 0.653580
Epoch 00098 | Loss 0.2724 | Time(s) 0.1664 | Train_Accu 0.919822 | Test_Accu 0.653580
Epoch 00099 | Loss 0.2706 | Time(s) 0.1664 | Train_Accu 0.919822 | Test_Accu 0.655312


In [None]:
# Toy dataset for checking that graph construction is correct.
# Testing showed that difflib's similarity measure works at character level.
# The code below is disabled (kept inside a string literal).
'''
dataframe = pd.DataFrame([
        ["a", "111222", "computer Science", "bruce"],
        ["b", "111222", "computer Science", "Bruce Lee"],
        ["c", "111222", "computer Science", "mike ,john"],
        ["a", "111223", "Hassdsdsaad", "kkl"],
        ["d", "111223", "Hassdsdaaad", "kkkl"],
        ["c", "111224", "asdfgh", "zxcr"]
    ],
    columns=["source", "isbn", "name", "author"]
)
g = generate_DGLGraph(dataframe)
'''

In [28]:
def add_confidence(df,prob,col_name='fact_confidence'):
    """Add a column holding the second entry of each row of `prob`.

    Args:
        df: DataFrame to annotate; it is modified in place and also returned.
        prob: indexable of per-row pairs (e.g. an (n, 2) tensor or a list of
            lists); row i of `prob` belongs to the i-th row of `df` by position.
        col_name: name of the column to create or overwrite.

    Returns:
        The same DataFrame object, with `col_name` filled with floats.
    """
    # Assign the whole column by position. The previous df.loc[i, col_name]
    # was label-based, so a frame whose index is not 0..n-1 (for example a
    # filtered frame) silently gained extra rows instead of being filled.
    df[col_name] = [float(prob[i][1]) for i in range(len(df))]
    return df

def sim_Jaccard(str1, str2):
    """Token-level Jaccard similarity of two strings.

    Both strings are lower-cased, the characters ; , . : & / ' are replaced by
    spaces, the markers '(author)' and '(joint author)' are removed, and the
    rest is split on whitespace into a token set.

    Args:
        str1, str2: the strings to compare.

    Returns:
        |tokens1 & tokens2| / |tokens1 | tokens2| as a float, or 0.0 when
        neither string contains a token (this used to raise ZeroDivisionError).
    """
    # Single characters that are blanked out before splitting.
    separators = str.maketrans({ch: ' ' for ch in ";,.:&/'"})

    def tokens(text):
        cleaned = text.lower().translate(separators)
        cleaned = cleaned.replace('(author)', ' ').replace('(joint author)', ' ')
        return set(cleaned.split())

    set1, set2 = tokens(str1), tokens(str2)
    union = set1 | set2
    if not union:
        # Two token-less strings carry no evidence of a match.
        return 0.0
    return len(set1 & set2) / len(union)

def MV(df,indexK='isbn',answer='author',withWeight=False,weight='confidence'):
    """(Weighted) majority vote over the answers given for each key.

    Rows are grouped by `indexK` in order of first appearance. Within a group,
    answers whose sim_Jaccard similarity to an already seen answer is >= 0.8
    add their vote to that answer; other answers open a new candidate. The
    candidate with the largest total wins (the earliest one on ties).

    Args:
        df: DataFrame with the `indexK` and `answer` columns, plus `weight`
            when `withWeight` is True.
        indexK: column identifying the object being voted on.
        answer: column holding the claimed value.
        withWeight: if True each row votes with float(row[weight]), else with 1.
        weight: name of the weight column.

    Returns:
        DataFrame with columns [indexK, answer], one row per distinct key.
        Rows whose key is NaN are skipped.
    """
    records = []
    # groupby(sort=False) yields the groups in order of first appearance, like
    # unique() did, and never produces an empty slice.
    for indexV, data_slice in df.groupby(indexK, sort=False):
        vote_dict = {}
        for _, row in data_slice.iterrows():
            ballot = float(row[weight]) if withWeight else 1
            for key in vote_dict:
                if sim_Jaccard(key, row[answer]) >= 0.8:
                    vote_dict[key] += ballot
                    break
            else:
                # No sufficiently similar candidate yet: start a new one.
                vote_dict[row[answer]] = ballot
        # max() returns the first maximal item, matching the stable
        # descending sort used previously.
        winner = max(vote_dict.items(), key=lambda item: item[1])[0]
        records.append({indexK: indexV, answer: winner})
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    return pd.DataFrame(records, columns=[indexK, answer])

def JudgeAccu(label,pred,pred_col='author'):
    """Fraction of predictions that match the golden label.

    A prediction counts as correct when its index value is present in
    `label.index` and sim_Jaccard of the predicted and golden `pred_col`
    values is >= 0.8. Predictions without a golden entry are reported with a
    'no answer' line and count as wrong.

    Args:
        label: DataFrame of golden answers, indexed by key.
        pred: DataFrame of predictions, indexed by the same kind of key.
        pred_col: column compared in both frames.

    Returns:
        Number of correct predictions divided by len(pred); 0.0 when `pred`
        is empty (this used to raise ZeroDivisionError).
    """
    if len(pred) == 0:
        return 0.0
    score = 0
    for index, row in pred.iterrows():
        if index not in label.index:
            print(index,'no answer')
        elif sim_Jaccard(row[pred_col], label.loc[index, pred_col]) >= 0.8:
            score += 1
        #else:
        #    print(row[pred_col],"vs",label.loc[index][pred_col])
    return score/len(pred)

# Attach a confidence to every claim: add_confidence stores the second softmax
# column (class 1). dim=1 is given explicitly because calling softmax without
# it relies on the deprecated implicit-dimension choice, and detach() keeps
# the stored values out of the autograd graph.
data_withConfidence = add_confidence(data_golden,F.softmax(pred_prob.detach(), dim=1))

# Confidence-weighted vote over the test-split claims, saved to disk.
df_mv = MV(data_withConfidence[test_mask.numpy()],withWeight=True,weight='fact_confidence')
df_mv.to_csv( './DataSet/book/golden/GCNResult.txt' , sep='\t' , index=False )

# Reload the golden labels and the saved result, both indexed by their first column, and score them.
label = pd.read_csv('./DataSet/book/book_golden.txt',sep='\t',low_memory=False,names=['isbn','author'],header=None,index_col=0)
pred = pd.read_csv('./DataSet/book/golden/GCNResult.txt',sep='\t',low_memory=False,index_col=0)

print(JudgeAccu(label,pred))



0.8625


In [14]:
# Baseline: unweighted majority vote over the same test-split claims, saved to disk.
df_mv = MV(data_withConfidence[test_mask.numpy()],withWeight=False)
df_mv.to_csv( './DataSet/book/golden/MVResult.txt' , sep='\t' , index=False )

# Reload the golden labels and the saved result, both indexed by their first column, and score them.
label = pd.read_csv('./DataSet/book/book_golden.txt',sep='\t',low_memory=False,names=['isbn','author'],header=None,index_col=0)
pred = pd.read_csv('./DataSet/book/golden/MVResult.txt',sep='\t',low_memory=False,index_col=0)

print(JudgeAccu(label,pred))

0.7875


In [25]:
# Number of rows in the most recently loaded prediction file.
len(pred)

80