In [1]:
import json
import pandas as pd
from os import listdir
from os.path import isfile, join

In [2]:
# Discard duplicate evaluations when the second does not provide more information.
def createResultsList(resultsDir):
    """Return the result files in ``resultsDir``, one per evaluation id.

    Every regular file directly inside ``resultsDir`` is inspected. The
    evaluation id is the second ``#``-separated field of the file's first
    line. When several files share an id, the one with the most lines after
    the first line is kept; on equal line counts the file listed later wins.

    Parameters
    ----------
    resultsDir : str
        Directory containing the annotation result files.

    Returns
    -------
    list of str
        Paths (``resultsDir`` joined with the file name) of the kept files,
        in directory-listing order.

    Raises
    ------
    IndexError
        If the first line of a file contains no ``#`` separator.
    """
    resultFiles = [join(resultsDir, name)
                   for name in listdir(resultsDir)
                   if isfile(join(resultsDir, name))]

    # Best candidate seen so far for each evaluation id.
    bestById = {}

    for filePath in resultFiles:
        # The context manager closes the file even if parsing fails.
        with open(filePath, 'r') as handle:
            firstLine = handle.readline()
            numLines = sum(1 for _ in handle)

        resultId = firstLine.split('#')[1]

        known = bestById.get(resultId)
        if known is None or numLines >= known["lines"]:
            bestById[resultId] = {"file": filePath, "lines": numLines}

    # Filter afterwards instead of removing from the list while iterating
    # over it: removing during iteration makes the loop skip the following
    # entry, so some duplicates were never compared.
    keptFiles = {entry["file"] for entry in bestById.values()}
    return [path for path in resultFiles if path in keptFiles]

In [3]:
# Read the evaluation data into a dataframe.
def readEvaluationData(evaluationFileName,
                       evaluationDir="../annotation/claim-justification_annotator/public/"):
    """Load one evaluation JSON file into a DataFrame.

    Parameters
    ----------
    evaluationFileName : str
        Name of the JSON file inside ``evaluationDir``.
    evaluationDir : str, optional
        Directory holding the evaluation files. Defaults to the annotator's
        ``public`` folder used by this notebook.

    Returns
    -------
    pandas.DataFrame
        The parsed content of the file, as produced by ``pd.read_json``.

    Raises
    ------
    FileNotFoundError
        If the file does not exist.
    ValueError
        If pandas cannot parse the content as JSON.
    """
    evaluationPath = join(evaluationDir, evaluationFileName)

    # The file is parsed once only; the previous extra json.loads() pass
    # produced a value that was never used. Opening the file ourselves keeps
    # the FileNotFoundError for a missing file.
    with open(evaluationPath, 'r', encoding='utf-8') as evaluationJson:
        return pd.read_json(evaluationJson)

In [4]:
# Find json_file_id when given simple id
def findJsonFileId(claimId,evaluationData):
    """Return the ``json_file_id`` of the row whose ``id`` equals ``claimId``.

    ``evaluationData`` is a DataFrame with ``id`` and ``json_file_id``
    columns. ``Series.item()`` raises ``ValueError`` unless exactly one row
    matches.
    """
    idMatches = evaluationData['id'] == claimId
    return evaluationData.loc[idMatches, "json_file_id"].item()

In [5]:
# Find claim and justification given json_file_id
def findClamJustification(jsonFileId, evaluationData):
    """Return ``(claim, justification)`` for the row with ``jsonFileId``.

    ``evaluationData`` is a DataFrame with ``json_file_id``, ``claim`` and
    ``justification`` columns. ``Series.item()`` raises ``ValueError``
    unless exactly one row matches.
    """
    matchingRow = evaluationData[evaluationData['json_file_id'] == jsonFileId]
    claimText = matchingRow["claim"].item()
    justificationText = matchingRow["justification"].item()
    return claimText, justificationText

In [6]:
def aggregateAnnotationResults(resultsList):
    """Count the labels assigned to each claim across all result files.

    For every file in ``resultsList`` the evaluation file name is taken from
    the third ``#``-separated field of the first line. The second line is
    skipped and each following line is split on commas: column 0 is the
    claim id and columns 1-4 are the ``"true"`` flags for ``distortion``,
    ``emphasis``, ``unfounded`` and ``unclear``.

    Parameters
    ----------
    resultsList : list of str
        Paths of the result files to aggregate.

    Returns
    -------
    results : dict
        ``json_file_id -> {"distortion": n, "emphasis": n, "unfounded": n,
        "unclear": n}`` vote counts.
    basicInfo : dict
        ``json_file_id -> {"claim": ..., "justification": ...}``.

    Notes
    -----
    Rows with no label, or with more than one label, are reported with
    ``print`` and not counted. Blank lines are ignored.
    """
    # Order of the label flags in columns 1-4 of a data line.
    labelNames = ("distortion", "emphasis", "unfounded", "unclear")

    # Keep all annotation results in the results object.
    results = {}

    # Keep basic information (claim, justification).
    basicInfo = {}

    # Several result files can refer to the same evaluation file; load each
    # evaluation file only once.
    evaluationCache = {}

    # For each kept evaluation.
    for file in resultsList:
        # The context manager closes the file even if a later step raises.
        with open(file, 'r') as resultsFile:
            resultsLines = resultsFile.readlines()

        currEvaluation = resultsLines[0].split('#')[2].replace("\n",'')
        if currEvaluation not in evaluationCache:
            evaluationCache[currEvaluation] = readEvaluationData(currEvaluation)
        evaluationData = evaluationCache[currEvaluation]

        # For each data line.
        for line in resultsLines[2:]:

            # A blank line has no claim id to convert.
            if not line.strip():
                continue

            dataList = [field.replace('"', '').strip() for field in line.split(',')]

            resultId = dataList[0]
            jsonFileId = findJsonFileId(int(resultId),evaluationData)

            # Inspect every flag independently. An if/elif chain stops at the
            # first "true" and can never detect a row with several labels.
            assignedLabels = [name
                              for name, flag in zip(labelNames, dataList[1:5])
                              if flag == "true"]

            if not assignedLabels:
                print("Error. Claim is Unlabeld.")
                continue

            if len(assignedLabels) > 1:
                print("Error. Claim should be assigned only one label.")
                continue

            label = assignedLabels[0]

            # Create the counters the first time a claim is encountered.
            if jsonFileId not in results:
                results[jsonFileId] = {name: 0 for name in labelNames}
                basicInfo[jsonFileId] = {"claim": None,
                                         "justification": None}

            results[jsonFileId][label] += 1

            basic = basicInfo[jsonFileId]
            basic["claim"], basic["justification"] = findClamJustification(jsonFileId,evaluationData)

    return results, basicInfo

In [7]:
# Directory holding the raw annotation result files.
annotationResultsDir = "../data/Annotation_Results/"

# Keep one result file per evaluation, then count the votes per claim.
resultsList = createResultsList(annotationResultsDir)
results, basicInfo = aggregateAnnotationResults(resultsList)

Error. Claim is Unlabeld.
Error. Claim is Unlabeld.
Error. Claim is Unlabeld.
Error. Claim is Unlabeld.
Error. Claim is Unlabeld.


In [8]:
def sumOfVotes(result):
    """Return the total number of votes stored in ``result``.

    ``result`` maps each label to its vote count; an empty mapping gives 0.
    """
    totalVotes = 0
    for voteCount in result.values():
        totalVotes = totalVotes + voteCount
    return totalVotes

In [9]:
def isTie(result):
    """Tell whether the vote in ``result`` has no single winning label.

    Parameters
    ----------
    result : dict
        Maps each label to its vote count.

    Returns
    -------
    bool
        ``True`` when the highest vote count is shared by two or more
        labels, or when ``result`` is empty; ``False`` when exactly one
        label holds the highest count.

    Notes
    -----
    The answer does not depend on the order of the labels. The earlier
    version removed zero counts from a list while iterating over it, which
    left some zeros behind: ``2, 2, 0, 0`` was reported as no tie while
    ``2, 0, 2, 0`` was reported as a tie.
    """
    voteCounts = list(result.values())

    # Nothing to compare: there is no winner.
    if not voteCounts:
        return True

    return voteCounts.count(max(voteCounts)) >= 2

In [10]:
def removeUncertainties(results, minimumVotes=3):
    """Split ``results`` into resolved and unresolved claims.

    A claim is unresolved when it received fewer than ``minimumVotes``
    votes or when ``isTie`` reports a tie for it.

    Parameters
    ----------
    results : dict
        Maps each claim key to its label vote counts. This dictionary is
        modified in place: unresolved entries are removed from it.
    minimumVotes : int, optional
        Smallest total number of votes a claim needs to be kept. Defaults
        to 3.

    Returns
    -------
    results : dict
        The same dictionary object that was passed in, now holding only
        the resolved claims.
    unResolvedData : dict
        The entries that were removed.
    """
    unResolvedData = {}

    # Iterate over a snapshot of the keys because entries are popped from
    # ``results`` inside the loop.
    for key in list(results.keys()):
        votes = results[key]
        if sumOfVotes(votes) < minimumVotes or isTie(votes):
            unResolvedData[key] = results.pop(key)

    return results, unResolvedData

In [11]:
# print(json.dumps(annotationResults, indent=4, sort_keys=True))

In [12]:
# Separate the claims with a usable majority from the unresolved ones.
# Note: removeUncertainties pops the unresolved entries out of `results`
# itself, so `annotationResults` and `results` are the same dictionary.
annotationResults, unresolvedData = removeUncertainties(results)

In [13]:
# Column names of the final table.
uniqueId = "json_file_id"
claim = "claim"
just  = "justification"
label = "justification_label"

resultsColumns = [uniqueId, claim, just, label]

# One row per resolved claim, labelled with its most voted label.
dataList = []
for key, votes in annotationResults.items():

    # max() returns the first label in dictionary order when counts are equal.
    majorityLabel = max(votes, key=votes.get)

    claimInfo = basicInfo[key]
    rowValues = (key, claimInfo[claim], claimInfo[just], majorityLabel)
    dataList.append(dict(zip(resultsColumns, rowValues)))

data = pd.DataFrame(dataList, columns=resultsColumns)
display(data)

Unnamed: 0,json_file_id,claim,justification,justification_label
0,10540.json,When did the decline of coal start? It started...,"Surovell said the decline of coal ""started whe...",distortion
1,4148.json,"Since 2000, nearly 12 million Americans have s...","So where does this leave us?On Sanders side, h...",emphasis
2,8705.json,Most of the (Affordable Care Act) has already ...,With all the talk about problems with the heal...,distortion
3,7057.json,Says Mitt Romney wants to get rid of Planned P...,The Planned Parenthood Action Fund aid said th...,distortion
4,9727.json,What (the Obama administration is) going to co...,"Capito said ""What (Obama is) going to come out...",unfounded
5,5352.json,Says a U.S. Supreme Court justice suggested th...,Rubio mentioned a female Supreme Court Judge a...,distortion
6,1926.json,We have a director of homeland security who ca...,"Just two weeks ago, American hotels were the t...",unfounded
7,11418.json,African-American youth unemployment is 51 perc...,"During the debate, Sanders said that ""African-...",unfounded
8,3652.json,President Barack Obama took exactly none of hi...,If Boehner had offered a more measured assessm...,unfounded
9,4930.json,Says the paperback edition of Mitt Romneys boo...,Perry's right that Romney's comments about hea...,distortion
