In [12]:
import numpy as np
from numpy.linalg import norm
import math
import csv
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore', category=PendingDeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

In [13]:
def sim_Jaccard (str1,str2) :
    set1 = set( str1.lower().replace(';',' ').replace(',',' ').replace('.',' ').replace(':',' ').replace('&',' ').
               replace('/',' ').replace('\'',' ').replace('(author)',' ').replace('(joint author)',' ').split() )
    set2 = set( str2.lower().replace(';',' ').replace(',',' ').replace('.',' ').replace(':',' ').replace('&',' ').
               replace('/',' ').replace('\'',' ').replace('(author)',' ').replace('(joint author)',' ').split() )
    return len(set1&set2)/len(set1|set2)

In [41]:
def sigmoid(x):
    return  1 / ( 1 + math.exp(-x) )

#imp(str1->str2)
def imp_Jaccard (str1,str2) :
    set1 = set( str1.lower().replace(';',' ').replace(',',' ').replace('.',' ').replace(':',' ').replace('&',' ').
               replace('/',' ').replace('\'',' ').replace('(author)',' ').replace('(joint author)',' ').split() )
    set2 = set( str2.lower().replace(';',' ').replace(',',' ').replace('.',' ').replace(':',' ').replace('&',' ').
               replace('/',' ').replace('\'',' ').replace('(author)',' ').replace('(joint author)',' ').split() )
    imp_rate = len(set1&set2)/len(set1)
    return imp_rate-0.5

class TruthFinder(object):
    
    def __init__(self,implication,dampening_factor=0.3,influence_related=0.5,source_col='source',key_col='isbn',ans_col='author'):
        assert(0 < dampening_factor < 1)
        assert(0 <= influence_related <= 1)
        self.implication = implication
        self.dampening_factor = dampening_factor
        self.influence_related = influence_related
        self.source_col = source_col
        self.key_col = key_col
        self.ans_col = ans_col
        
    def train(self,dataframe,max_iterations=10,
              threshold=1e-4,initial_trustworthiness=0.9):
        dataframe["trustworthiness"]=\
            np.ones(len(dataframe.index))*initial_trustworthiness
        dataframe["fact_confidence"] = np.zeros(len(dataframe.index))
        for i in range(max_iterations):
            print('iteration',i)
            t1 = dataframe.drop_duplicates( self.source_col )["trustworthiness"]
            dataframe = self.iteration(dataframe)
            t2 = dataframe.drop_duplicates( self.source_col )["trustworthiness"]
            if self.stop_condition(t1,t2,threshold*len(dataframe)):
                return dataframe
        return dataframe
        
    def iteration(self,df):
        df = self.update_fact_confidence(df)
        df = self.update_website_trustworthiness(df)
        return df
    
    def stop_condition(self,t1,t2,threshold):
        return norm(t2-t1)<threshold
        
    def update_fact_confidence(self,df):
        for object_ in df[ self.key_col ].unique():
            indices = df[ self.key_col ] == object_
            d = df.loc[indices]
            d = self.calculate_confidence(d)
            s = self.adjust_confidence(d)
            df.loc[indices] = self.compute_fact_confidence(d)
        return df
    
    def calculate_confidence(self,df):
        #Eq 3,5
        truthworthiness_score = lambda x: -math.log(1.0-x)
        for i,row in df.iterrows():
            ts = df.loc[df[ self.ans_col ]==row[ self.ans_col ],"trustworthiness"]
            try:
                v = sum(truthworthiness_score(t) for t in ts)
            except ValueError:
                print(i,row)
                print(df.loc[df[self.key_col]==row[self.key_col]])
                
            df.set_value(i,"fact_confidence",v)
        return df
    
    def adjust_confidence(self,df):
        #Eq 6
        update = {}
        for i,row1 in df.iterrows():
            f1 = row1[ self.ans_col ]
            s = 0
            for j,row2 in df.drop_duplicates( self.ans_col ).iterrows():
                f2 = row2[ self.ans_col ]
                if f1==f2:
                    continue
                s += row2["fact_confidence"] * self.implication(f2,f1)
            update[i] = self.influence_related * s + row1["fact_confidence"]
        for i,row1 in df.iterrows():
            df.set_value(i,"fact_confidence",update[i])
        return df
    
    def compute_fact_confidence(self,df):
        #Eq 8
        f = lambda x: sigmoid(self.dampening_factor*x)
        for i,row in df.iterrows():
            df.set_value(i,"fact_confidence",f(row["fact_confidence"]))
        return df
    
    def update_website_trustworthiness(self,df):
        #Eq 1
        for website in df[ self.source_col ].unique():
            indices = df[ self.source_col ]==website
            cs = df.loc[indices,"fact_confidence"]
            df.loc[indices,"trustworthiness"] = sum(cs)/len(cs)
        return df

In [42]:
dataframe = pd.read_csv( './DataSet/book/golden/claims_golden.txt' , sep='\t' )
finder = TruthFinder(imp_Jaccard,dampening_factor=0.3,influence_related=0.5)
dataframe = finder.train(dataframe)

iteration 0
iteration 1
iteration 2
1455 source                                                 ER Books
isbn                                                 0201889544
name               The C++ Programming Language (Third Edition)
author                                       Stroustrup, Bjarne
encode                                                      NaN
label                                                      True
trustworthiness                                               1
fact_confidence                                               1
Name: 1455, dtype: object
                                       source        isbn  \
1453                             Frugal Media  0201889544   
1454     Rainbow Bear Books and Miscellaneous  0201889544   
1455                                 ER Books  0201889544   
1456                             Sandra Gudac  0201889544   
1457                               Cobain LLC  0201889544   
1458                          Cellar of Books  02018895

1458 source                          Cellar of Books
isbn                                 0201889544
name               The C++ Programming Language
author                       Stroustrup, Bjarne
encode                                      NaN
label                                      True
trustworthiness                               1
fact_confidence                               1
Name: 1458, dtype: object
                                       source        isbn  \
1453                             Frugal Media  0201889544   
1454     Rainbow Bear Books and Miscellaneous  0201889544   
1455                                 ER Books  0201889544   
1456                             Sandra Gudac  0201889544   
1457                               Cobain LLC  0201889544   
1458                          Cellar of Books  0201889544   
1459                         BridgeTown Books  0201889544   
1460                          Powell's  Books  0201889544   
1461                               x

1494         1.000000  
1484 source                   Books2Anywhere.com
isbn                             0201889544
name               C++ Programming Language
author                   Stroustrup, Bjarne
encode                                  NaN
label                                  True
trustworthiness                    0.993047
fact_confidence                           1
Name: 1484, dtype: object
                                       source        isbn  \
1453                             Frugal Media  0201889544   
1454     Rainbow Bear Books and Miscellaneous  0201889544   
1455                                 ER Books  0201889544   
1456                             Sandra Gudac  0201889544   
1457                               Cobain LLC  0201889544   
1458                          Cellar of Books  0201889544   
1459                         BridgeTown Books  0201889544   
1460                          Powell's  Books  0201889544   
1461                               xpresstex

1203 source                                             The Library Store
isbn                                                      0321228103
name               Open Source .NET Development: Programming With...
author                                                  Nantz, Brian
encode                                                           NaN
label                                                           True
trustworthiness                                                    1
fact_confidence                                                    1
Name: 1203, dtype: object
                                    source        isbn  \
1203                     The Library Store  0321228103   
1204                           eCampus.com  0321228103   
1205                            Orca Books  0321228103   
1206                             Indoo.com  0321228103   
1207                                Caiman  0321228103   
1208                       textbookxdotcom  0321228103   
1209       

UnboundLocalError: local variable 'v' referenced before assignment

In [None]:
#1.一些文章中提到答案完全一样的问题对于提升结果没有帮助
#2.存在一些fact_confidence变为1的情况，可能是同一问题下的答案同质化
#3.暂且搁置该段代码

def RemoveUnhelpfulRows(df,key_col='isbn',ans_col='author'):
    i = 0
    for isbn in df[key_col].unique():
        indices = df[key_col]==isbn
        if len(df.loc[indices][ans_col].unique())==1:
            df.drop(index=df.loc[indices].index,inplace=True)
    return df
df_t = RemoveUnhelpfulRows(dataframe)

In [17]:
def PickHighest(df,indexK='isbn',answer='author',weight='fact_confidence'):
    df_ph = pd.DataFrame(columns=[indexK,answer])
    for indexV in df[indexK].unique():
        df_slice = df[ df[indexK]==indexV ]
        df_slice = df_slice.sort_values( by=weight, ascending=False )
        df_ph = df_ph.append({indexK:df_slice.iloc[0][indexK],answer:df_slice.iloc[0][answer]},ignore_index=True)
    return df_ph

In [36]:
df_ph = PickHighest(dataframe)
df_ph.set_index('isbn',inplace=True)
label = pd.read_csv('./DataSet/book/book_golden.txt',sep='\t',low_memory=False,names=['isbn','author'],header=None,index_col=0)
#TruthFinder_result.to_csv( './DataSet/vldbBook/TruthFinderResult.txt' , sep='\t' , index=False )

In [37]:
JudgeAccu(label,df_ph)

Dennis Suhanovs, Press Certification, Certification Press vs suhanovs, dennis ;
By (author) Hoos, Holger H. By (author) St&uuml;tzle, Thomas vs hoos, holger h.;   stutzle, thomas;  
John Strassner vs strassner, john c.;  
David B. Makofske, Kevin C. Almeroth vs makofske, david ;  almeroth, kevin ;  
Edited by Lacroix, Zo&eacute; Edited by Critchlow, Terence vs lacroix, zoe ;  critchlow, terence;
Loshin, Peter vs loshin, pete ;  
Puder, Arno; R&ouml;mer, Kay; Pilhofer, Frank vs puder, arno ;  romer, kay ;  pilhofer, frank ;  
By (author) Eberhart, Russell C. By (author) Shi, Yuhui By (author) Kennedy, James vs kennedy, james ;  eberhart, russell c.;  
Edited by Jones, Karen Sparck Edited by Willett, Peter vs jones, karen sparck;  willett, peter ;  
By (author) Duffy, Michael D vs duffy, michael d.;  
Etzel, Michael vs etzel, michael ;  dickinson, karen ;  
Guy Steele vs steele, guy l.;  
Shaw, Paul vs shaw, paul d.;  
By (author) McManus, John vs mcmanus, john ;  
McManus, Jeffrey P.; G

0.9294113756613755

In [28]:
def MV(df,indexK='isbn',answer='author',withWeight=False,weight='confidence'):
    df_mv = pd.DataFrame(columns=[indexK,answer])
    for indexV in df[indexK].unique():
        data_slice = df[df[indexK]==indexV]
        vote_dict = {}
        for index,row in data_slice.iterrows():
            flag = False
            for key in vote_dict.keys():
                if ( sim_Jaccard(key,row[answer])>=0.8 ):
                    flag = True
                    if(not withWeight):
                        vote_dict[key] += 1
                    else:
                        vote_dict[key] += float(row[weight])
                    break
            if (not flag):
                if(not withWeight):
                    vote_dict[row[answer]] = 1
                else:
                    vote_dict[row[answer]] = float(row[weight])
        vote_list = sorted(vote_dict.items(), key=lambda d:d[1],reverse=True)
        #print({indexK:indexV,answer:vote_list[0][0]})
        df_mv = df_mv.append({indexK:indexV,answer:vote_list[0][0]},ignore_index=True)
    return df_mv

In [38]:
df_mv = MV(dataframe,withWeight=True,weight='fact_confidence')
df_mv.set_index('isbn',inplace=True)
#df_mv.to_csv( './DataSet/vldbBook/TruthFinderResult_mv.txt' , sep='\t' , index=False )

In [39]:
JudgeAccu(label,df_mv)

Conklin, Art; Davis, Roger; Williams, Dwayne; Cothren, Chuck vs conklin, wm. arthur;  white, gregory b.;  cothren, chuck ;  williams, dwayne ;  davis, roger l.;
Dennis Suhanovs, Press Certification, Certification Press vs suhanovs, dennis ;
White, Gregory B. vs white, gregory ;  
Meadors, Todd; Schmidt, Cheryl Ann vs meadors, todd ;  schmidt, cheryl a.;  
Hoos, Holger vs hoos, holger h.;   stutzle, thomas;  
John Strassner vs strassner, john c.;  
Zoe Lacroix vs lacroix, zoe ;  critchlow, terence;
Loshin, Peter vs loshin, pete ;  
Eberhart, Russell vs kennedy, james ;  eberhart, russell c.;  
Edited by Jones, Karen Sparck Edited by Willett, Peter vs jones, karen sparck;  willett, peter ;  
Erbschloe vs erbschloe, michael ;  
Etzel, Michael vs etzel, michael ;  dickinson, karen ;  
Guy Steele vs steele, guy l.;  
Shaw, Paul vs shaw, paul d.;  
Jenssen, C. B. vs jenssen, c. b.;  kvamdal, t. ;  andersson, h. i.;  Ecer, A.;  Periaux, J.;  Satofuka, N.;  Fox, P.;
Dowd, Mark; McDonald, John 

0.9121369047619048

In [31]:
def JudgeAccu(label,pred,pred_col='author'):
    score = 0
    for index,row in pred.iterrows():
        if not(index in label.index):
            print(index,'no answer')
            score += 0 
        elif sim_Jaccard(row[pred_col],label.loc[index][pred_col])>=0.8:
            score +=1
        else:
            print(row[pred_col],"vs",label.loc[index][pred_col])
            score += sim_Jaccard(row[pred_col],label.loc[index][pred_col])
    return score/len(pred)

In [None]:
label = pd.read_csv('./DataSet/vldbBook/book_truth.txt',sep='\t',low_memory=False,index_col=0)
label.rename(columns={'isbn_10':'isbn','authors_truth':'author'},inplace=True)
pred = pd.read_csv('./DataSet/vldbBook/TruthFinderResult_mv.txt',sep='\t',low_memory=False,index_col=0)
print(JudgeAccu(label,pred))