In [13]:
import time
import difflib
import pickle
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

import dgl
import dgl.function as fn
from dgl import DGLGraph

import networkx as nx
from sklearn.metrics import accuracy_score

In [None]:
'''
    net1: 无自环
        self.fc1_1 = nn.Linear(768,192)
        self.fc1_2 = nn.Linear(768,192)
        self.fc1_3 = nn.Linear(768,192)
        self.fc2 = nn.Linear(576,2)
    50 epoch, train 0.75, test 0.63
    
    net2: 无自环
        self.fc1_1 = nn.Linear(768,192)
        self.fc1_2 = nn.Linear(768,192)
        self.fc1_3 = nn.Linear(768,192)
        self.gcn2 = GCN(576,384,F.relu)
        self.fc3 = nn.Linear(384,2)
    100 epoch, train 0.85, test 0.65
    
    net3: 无自环
        self.fc1_1 = nn.Linear(768,192)
        self.fc1_2 = nn.Linear(768,192)
        self.fc1_3 = nn.Linear(768,192)
        self.gcn2 = GCN(576,384,F.relu)
        self.gcn3 = GCN(384,64,F.relu)
        self.fc4 = nn.Linear(64,2)
    100 epoch, train 0.75, test 0.60
'''


In [14]:
# Adapted from the DGL graph tutorial; in_feats / out_feats must be chosen to match
# the input feature length and the desired output width.
class NodeApplyModule(nn.Module):
    """Per-node update: a linear projection followed by an optional activation.

    Args:
        in_feats: size of each node's incoming 'h' feature.
        out_feats: size of the produced 'h' feature.
        activation: callable applied to the projected features, or None to
            return the raw linear output.
    """

    def __init__(self, in_feats, out_feats, activation):
        super(NodeApplyModule, self).__init__()
        self.linear = nn.Linear(in_feats, out_feats)
        self.activation = activation

    def forward(self, node):
        """Read ``node.data['h']`` and return ``{'h': new_features}``."""
        h = self.linear(node.data['h'])
        if self.activation is not None:
            # Call the activation with the tensor only. The previous
            # ``inplace=True`` keyword made every activation that does not
            # accept it (torch.tanh, torch.sigmoid, ...) raise a TypeError.
            h = self.activation(h)
        return {'h': h}

# DGL built-in message function: the source node's 'h' field is sent along each
# edge as message field 'm'.
# NOTE(review): `fn.copy_src` is presumably renamed/deprecated in newer DGL
# releases (`fn.copy_u`) — confirm against the installed DGL version.
gcn_msg = fn.copy_src(src='h', out='m')
# DGL built-in reduce function: incoming 'm' messages are summed into node field 'h'.
gcn_reduce = fn.sum(msg='m', out='h')

class GCN(nn.Module):
    """A single graph layer: message passing with gcn_msg / gcn_reduce, then a
    NodeApplyModule (linear + activation) applied to every node.

    Args:
        in_feats: width of the input node features.
        out_feats: width of the output node features.
        activation: activation handed to NodeApplyModule (may be None).
    """

    def __init__(self, in_feats, out_feats, activation):
        super().__init__()
        self.apply_mod = NodeApplyModule(in_feats, out_feats, activation)

    def forward(self, g, feature):
        """Run one propagation step on graph ``g`` and return the new node features."""
        # Stage the inputs on the graph under the 'h' field.
        g.ndata['h'] = feature
        # Propagate and aggregate messages, then transform every node.
        g.update_all(gcn_msg, gcn_reduce)
        g.apply_nodes(func=self.apply_mod)
        # Remove the temporary field from the graph and hand it back.
        updated = g.ndata.pop('h')
        return updated
    
class Net(nn.Module):
    """Binary node classifier: three parallel linear projections, two GCN layers
    and a final linear layer that yields one logit per node.

    The input feature matrix is read as three consecutive 768-wide column
    segments (columns 0:768, 768:1536 and 1536:2304). Each segment is projected
    to 96 dimensions, the projections are concatenated (288), passed through
    GCN(288 -> 64) and GCN(64 -> 64), and reduced to a single logit by fc4.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.fc1_1 = nn.Linear(768, 96)
        self.fc1_2 = nn.Linear(768, 96)
        self.fc1_3 = nn.Linear(768, 96)
        self.gcn2 = GCN(288, 64, F.relu)
        self.gcn3 = GCN(64, 64, F.relu)
        self.fc4 = nn.Linear(64, 1)

    def forward(self, g, features):
        """Return one raw logit per node.

        Args:
            g: graph passed through to the GCN layers.
            features: 2-D tensor whose first 3*768 columns hold the three
                768-wide segments.

        Returns:
            1-D tensor of logits with one entry per row of ``features``.
        """
        x_1 = self.fc1_1(features[:, 0:768])
        x_2 = self.fc1_2(features[:, 768:768 * 2])
        x_3 = self.fc1_3(features[:, 768 * 2:768 * 3])
        x = torch.cat((x_1, x_2, x_3), 1)
        x = self.gcn2(g, x)
        x = self.gcn3(g, x)
        x = self.fc4(x)
        # Drop only the trailing size-1 logit axis. A bare squeeze() would also
        # collapse the node axis when there is a single node, producing a 0-d
        # tensor that cannot be mask-indexed or iterated.
        return x.squeeze(-1)

    def predict(self, pred_prob):
        """Turn logits into hard 0/1 predictions.

        Args:
            pred_prob: tensor of raw logits (despite the name, not probabilities).

        Returns:
            1-D float32 CPU tensor holding 1.0 where sigmoid(logit) > 0.5 and
            0.0 elsewhere.
        """
        # Thresholding needs no gradients; no_grad avoids building a graph.
        with torch.no_grad():
            # torch.sigmoid replaces the deprecated F.sigmoid.
            hard = torch.sigmoid(pred_prob) > 0.5
            return hard.float().reshape(-1).cpu()

In [22]:
# Instantiate the model, the loss and the optimizer used by the training loop.
net = Net()
# BCEWithLogitsLoss takes raw logits, which is what Net.forward returns
# (the un-activated fc4 output).
criterion = nn.BCEWithLogitsLoss()
# NOTE(review): the optimizer is bound to the parameters of this `net` instance;
# if `net` is rebound afterwards (e.g. by torch.load) the optimizer must be
# rebuilt, otherwise it keeps updating the old object.
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)

In [16]:
# Load the golden records and the truth labels produced by DealWithData.
# This version of the graph had edges between records of the same source but a
# different big_cate pruned, which reportedly gave better results.
GoldenLabel = pd.read_csv('./DataSet/vldbBook/book_truth.txt', sep='\t', low_memory=False)
GoldenLabel = GoldenLabel.rename(columns={'isbn_10': 'isbn', 'authors_truth': 'author'})

data_golden = pd.read_csv('./DataSet/vldbBook/data_golden/data_golden.txt', sep='\t')
# Attach the saved encodings as a new 'encode' column.
data_golden['encode'] = torch.load('./DataSet/vldbBook/data_golden/bertEncodeFull.pt')

# NOTE(review): pickle.load / torch.load execute code stored in the file —
# only load files from a trusted source.
with open('./DataSet/vldbBook/data_golden/graph.pickle', 'rb') as file:
    graph_whole = pickle.load(file)

In [21]:
# Split the dataset with boolean masks instead of physically separating the rows.
def divide_dataset(dfw, dfk, test_ratio, seed=None):
    """Build train/test masks over ``dfw`` by holding out a share of the keys in ``dfk``.

    ``int(len(dfk) * test_ratio)`` rows of ``dfk`` are drawn without
    replacement; every row of ``dfw`` whose 'isbn' equals the 'isbn' of a drawn
    row goes to the test split and all remaining rows go to the train split.

    Args:
        dfw: DataFrame with an 'isbn' column; the masks are aligned with its
            row order.
        dfk: DataFrame with an 'isbn' column listing the keys to sample from.
        test_ratio: fraction of ``dfk`` rows to hold out.
        seed: optional integer seed for a private RandomState. With the default
            None the global NumPy RNG is used, as before.

    Returns:
        (train_mask, test_mask): two torch.bool tensors of length ``len(dfw)``
        that are element-wise complements of each other.
    """
    n_keys = len(dfk)
    n_test = int(n_keys * test_ratio)
    if seed is None:
        picked = np.random.choice(n_keys, n_test, replace=False)
    else:
        picked = np.random.RandomState(seed).choice(n_keys, n_test, replace=False)

    # Rows are selected by position (.iloc) so a non-default index on dfk/dfw
    # cannot break the lookup. Missing ISBNs never define a test key.
    test_isbns = set(dfk['isbn'].iloc[picked].dropna())

    # Vectorised membership test replaces the per-row Python loop.
    in_test = dfw['isbn'].isin(test_isbns).values
    test_mask = torch.tensor(in_test, dtype=torch.bool)
    train_mask = ~test_mask
    return train_mask, test_mask
# Hold out the records belonging to a randomly drawn 50% of the truth-file rows
# (matched on 'isbn') as the test split; everything else is the train split.
train_mask,test_mask = divide_dataset(data_golden,GoldenLabel,0.5)

In [8]:
# Extract the feature matrix and the label vector from the dataframe.
def extract_featureNlabel(df):
    """Stack the per-row encodings and labels of ``df`` into two tensors.

    Args:
        df: DataFrame with an 'encode' column holding one tensor per row and a
            'label' column whose truthiness gives the class (truthy -> 1,
            falsy -> 0).

    Returns:
        (features, labels): ``features`` has one row per DataFrame row, each
        encoding flattened to a single row; ``labels`` is a 1-D int64 tensor.
        An empty DataFrame yields a (0, 0) feature tensor and an empty label
        tensor.
    """
    if len(df) == 0:
        return torch.zeros(0, 0), torch.zeros(0).long()

    # Collect the pieces first and concatenate once: calling torch.cat inside
    # the loop re-copies the growing tensor on every row (quadratic cost), and
    # the old df.loc[0] probe assumed that an index label 0 exists.
    rows = [encoding.reshape([1, -1]) for encoding in df['encode']]
    flags = [1 if flag else 0 for flag in df['label']]
    return torch.cat(rows, 0), torch.tensor(flags, dtype=torch.long)

whole_feature,whole_label = extract_featureNlabel(data_golden)
# Store the features and labels with pickle so later runs can reuse them.
# `with` guarantees the handles are closed even if pickle.dump raises.
with open('./DataSet/vldbBook/data_golden/whole_feature.pickle', 'wb') as file:
    pickle.dump(whole_feature, file)
with open('./DataSet/vldbBook/data_golden/whole_label.pickle', 'wb') as file:
    pickle.dump(whole_label, file)

In [18]:
# Load the previously extracted features and labels back from their pickle files.
# NOTE(review): pickle.load executes code stored in the file — only load files
# from a trusted source.
with open('./DataSet/vldbBook/data_golden/whole_feature.pickle', 'rb') as file:
    whole_feature =pickle.load(file)
with open('./DataSet/vldbBook/data_golden/whole_label.pickle', 'rb') as file:
    whole_label =pickle.load(file)

In [13]:
# Round-trip the whole model object through disk (the module itself is pickled,
# not just a state_dict).
# NOTE(review): `net` is rebound to a new object here, while `optimizer` was
# built from the previous instance's parameters — rebuild the optimizer if
# training continues after this cell.
# NOTE(review): newer PyTorch releases presumably need weights_only=False to
# load a fully pickled module — confirm against the installed version.
torch.save(net,'./DataSet/vldbBook/data_golden/net.pkl')
net = torch.load('./DataSet/vldbBook/data_golden/net.pkl')

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [23]:
# Full-batch training: every epoch runs the whole graph through the network,
# optimises the loss on the training mask and reports accuracy on both masks.
# NOTE(review): the recorded loss values jump by orders of magnitude between
# epochs; the unnormalised sum aggregation in the GCN layers may be a factor —
# worth confirming.
dur = []
for epoch in range(50):

    t0 = time.time()

    # Call the module itself rather than .forward() so registered hooks run.
    pred_prob = net(graph_whole, whole_feature)
    loss = criterion(pred_prob[train_mask], whole_label[train_mask].float())

    # Accuracy is computed from the pre-update logits of this epoch.
    pred_label = net.predict(pred_prob)
    # accuracy_score takes (y_true, y_pred); the value is symmetric, but the
    # documented order keeps the call unambiguous.
    train_accu = accuracy_score(whole_label[train_mask], pred_label[train_mask])
    test_accu = accuracy_score(whole_label[test_mask], pred_label[test_mask])

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    dur.append(time.time() - t0)
    # '{:.4f}' (four decimals) replaces the '{:4f}' typo, which only set a
    # minimum field width.
    print("Epoch {:03d} | Loss {:.4f} | Time(s) {:.4f} | Train_Accu {:.4f} | Test_Accu {:.4f}".format(
        epoch, loss.item(), np.mean(dur), train_accu, test_accu))

Epoch 000 | Loss 47.5400 | Time(s) 2.1318 | Train_Accu 0.510790 | Test_Accu 0.527438
Epoch 001 | Loss 1299.9746 | Time(s) 2.1430 | Train_Accu 0.509153 | Test_Accu 0.486559
Epoch 002 | Loss 369.4326 | Time(s) 2.1457 | Train_Accu 0.509153 | Test_Accu 0.486559
Epoch 003 | Loss 39.4337 | Time(s) 2.1722 | Train_Accu 0.521804 | Test_Accu 0.538769
Epoch 004 | Loss 51.2407 | Time(s) 2.1667 | Train_Accu 0.510790 | Test_Accu 0.529660
Epoch 005 | Loss 16.3463 | Time(s) 2.1648 | Train_Accu 0.450960 | Test_Accu 0.505221
Epoch 006 | Loss 11.5075 | Time(s) 2.1611 | Train_Accu 0.534306 | Test_Accu 0.554543
Epoch 007 | Loss 138.4835 | Time(s) 2.1623 | Train_Accu 0.509153 | Test_Accu 0.486559
Epoch 008 | Loss 23.9365 | Time(s) 2.1602 | Train_Accu 0.524334 | Test_Accu 0.541213
Epoch 009 | Loss 40.0762 | Time(s) 2.1545 | Train_Accu 0.521506 | Test_Accu 0.537880
Epoch 010 | Loss 20.6567 | Time(s) 2.1558 | Train_Accu 0.527757 | Test_Accu 0.543657
Epoch 011 | Loss 56.7509 | Time(s) 2.1548 | Train_Accu 0.5091

In [24]:
def add_confidence(df,prob,col_name='fact_confidence'):
    """Attach one confidence value per row to ``df`` (modified in place).

    Args:
        df: DataFrame to annotate.
        prob: indexable sequence (list, array, 1-D tensor) with at least
            ``len(df)`` entries; entry ``i`` belongs to the i-th row of ``df``
            by position.
        col_name: name of the column to create or overwrite.

    Returns:
        The same ``df`` object, now carrying ``col_name`` as a float column.
    """
    # Assign by position in one step. The old per-row df.loc[i, col] write
    # treated i as an index *label*, which silently appended new rows whenever
    # df did not have a default 0..n-1 index.
    df[col_name] = [float(prob[i]) for i in range(len(df))]
    return df

def _author_tokens(text):
    """Lower-case ``text``, blank out separators and role markers, return the token set."""
    cleaned = text.lower()
    # Same replacements, in the same order, as the original chained calls.
    for noise in (';', ',', '.', ':', '&', '/', '\'', '(author)', '(joint author)'):
        cleaned = cleaned.replace(noise, ' ')
    return set(cleaned.split())


def sim_Jaccard(str1, str2):
    """Jaccard similarity between the normalised token sets of two strings.

    Both strings are lower-cased, the characters ; , . : & / ' and the markers
    "(author)" / "(joint author)" are replaced by spaces, and the remaining
    whitespace-separated tokens are compared as sets.

    Returns:
        |A & B| / |A | B| as a float in [0, 1]. When neither string contains
        any token the two are treated as identical and 1.0 is returned
        (previously this case raised ZeroDivisionError).
    """
    set1 = _author_tokens(str1)
    set2 = _author_tokens(str2)
    union = set1 | set2
    if not union:
        return 1.0
    return len(set1 & set2) / len(union)

def MV(df,indexK='isbn',answer='author',withWeight=False,weight='confidence'):
    """(Weighted) majority vote over near-duplicate answers, one winner per key.

    For every distinct value of ``df[indexK]`` the rows sharing that key vote
    for their ``answer``. An answer whose sim_Jaccard with an already seen
    answer is >= 0.8 adds its vote to that answer; otherwise it opens a new
    candidate. The candidate with the highest total wins (the earliest
    candidate wins ties).

    Args:
        df: DataFrame containing the ``indexK`` and ``answer`` columns (and
            ``weight`` when ``withWeight`` is True).
        indexK: column that groups the rows.
        answer: column holding the candidate answers.
        withWeight: if True each row votes with ``float(row[weight])``,
            otherwise with 1.
        weight: column holding the vote weights.

    Returns:
        DataFrame with columns ``[indexK, answer]`` and one row per key.
    """
    winners = []
    for indexV in df[indexK].unique():
        data_slice = df[df[indexK] == indexV]
        vote_dict = {}
        for _, row in data_slice.iterrows():
            vote = float(row[weight]) if withWeight else 1
            for key in vote_dict:
                if sim_Jaccard(key, row[answer]) >= 0.8:
                    vote_dict[key] += vote
                    break
            else:
                # No sufficiently similar candidate yet: open a new one.
                vote_dict[row[answer]] = vote
        if not vote_dict:
            # A key that matches no row (e.g. NaN never equals itself) has no winner.
            continue
        best_answer = max(vote_dict.items(), key=lambda item: item[1])[0]
        winners.append({indexK: indexV, answer: best_answer})
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame on every call.
    return pd.DataFrame(winners, columns=[indexK, answer])

def JudgeAccu(label,pred,pred_col='author'):
    """Fraction of predictions that match the reference answers.

    Rows are paired through the DataFrame index. A prediction counts as
    correct when its sim_Jaccard with the reference value in ``pred_col`` is
    >= 0.8. Predictions whose index is missing from ``label`` are reported on
    stdout and count as wrong.

    Args:
        label: reference DataFrame, indexed by the same key as ``pred``.
        pred: predictions DataFrame.
        pred_col: column compared in both frames.

    Returns:
        Number of correct predictions divided by ``len(pred)``; 0.0 when
        ``pred`` is empty (previously a ZeroDivisionError).
    """
    if len(pred) == 0:
        return 0.0
    score = 0
    for index, row in pred.iterrows():
        if index not in label.index:
            print(index,'no answer')
        elif sim_Jaccard(row[pred_col], label.loc[index, pred_col]) >= 0.8:
            score += 1
    return score / len(pred)

# Attach sigmoid probabilities as per-record confidences. add_confidence writes
# into data_golden itself, so data_withConfidence is the same object.
# torch.sigmoid replaces the deprecated F.sigmoid; detach() drops the autograd
# graph, which is not needed for evaluation.
# NOTE(review): pred_prob comes from the last training iteration, i.e. it was
# computed before the final optimizer step — confirm this is intended.
data_withConfidence = add_confidence(data_golden, torch.sigmoid(pred_prob.detach()))

# Confidence-weighted vote over the test records only, persisted as TSV.
df_mv = MV(data_withConfidence[test_mask.numpy()],withWeight=True,weight='fact_confidence')
df_mv.to_csv( './DataSet/vldbBook/GCNResult.txt' , sep='\t' , index=False )

# Re-read truth and predictions with their first column as the index so that
# JudgeAccu can pair them through the index.
label = pd.read_csv('./DataSet/vldbBook/book_truth.txt',sep='\t',low_memory=False,index_col=0)
label = label.rename(columns={'isbn_10':'isbn','authors_truth':'author'})
pred = pd.read_csv('./DataSet/vldbBook/GCNResult.txt',sep='\t',low_memory=False,index_col=0)

print(JudgeAccu(label,pred))

0.4691358024691358
