In [1]:
import pandas as pd
import os.path as op
import numpy as np
import seaborn as sns

In [2]:
# Base folder for input/output files; a relative path, so it is resolved
# against the kernel's working directory.
DATA_DIR = '../data'
# Sub-folder of DATA_DIR holding the CSV files of the data set used below.
DATA_SET = 'population'

In [3]:
# Read both input tables of the selected data set; the generator is consumed
# in order, so truths.csv is loaded first and claims.csv second.
truths, claims = (
    pd.read_csv(op.join(DATA_DIR, DATA_SET, csv_name))
    for csv_name in ('truths.csv', 'claims.csv')
)

In [4]:
def majority_vote(claims):
    """perform truth discovery using majority voting

    For every object, the value that appears most often among its claims is
    selected; the per-object choice is delegated to ``elect``.

    Parameters
    ----------
    claims: pd.DataFrame
        a data frame that has columns [source_id, object_id, value];
        any other columns are ignored and the frame itself is not modified

    Returns
    -------
    discovered_truths: pd.DataFrame
        a data frame that has [object_id, value], one row per distinct
        object_id; it has no rows when ``claims`` holds no usable claim
    """
    c_df = claims[['source_id', 'object_id', 'value']]
    # groupby skips rows without a key anyway; dropping them up front lets the
    # "nothing to vote on" case be detected before apply() is reached
    c_df = c_df.dropna(subset=['object_id'])
    if c_df.empty:
        # apply() over zero groups yields a frame rather than a series, which
        # cannot be reshaped into the advertised [object_id, value] layout
        return c_df[['object_id', 'value']].reset_index(drop=True)
    # select the voted-on columns explicitly so that apply() never operates on
    # the grouping column (deprecated behaviour in recent pandas releases)
    winners = c_df.groupby('object_id')[['source_id', 'value']].apply(elect)
    return winners.rename('value').reset_index()

def elect(x):
    """pick the winning value for one object by counting its claims

    Parameters
    ----------
    x: pd.DataFrame
        the claims to count; only its ``value`` column is read

    Returns
    -------
    discovered_truth: scalar
        the entry of ``value`` that occurs most often in ``x``
    """
    votes_per_value = x.value.value_counts()
    # idxmax yields the label (i.e. the claimed value) with the largest count
    return votes_per_value.idxmax()

In [5]:
# Run majority voting over the loaded claims table.
discovered_truths = majority_vote(claims=claims)

In [6]:
# Persist the voting result next to the input files of the data set.
# NOTE(review): to_csv defaults to index=True, so the row index is written as
# an extra unnamed leading column - confirm downstream readers expect that.
discovered_truths.to_csv(
    path_or_buf=op.join(DATA_DIR, DATA_SET, 'discovered_truths_majority_vote.csv'),
)

# Evaluation

In [7]:
from utils import accuracy

In [8]:
# Compare the majority-vote result with the loaded truths table using the
# accuracy helper imported from utils; the returned value is the cell output.
accuracy(truths, discovered_truths)

0.6943521594684385