In [1]:
import json
import pandas as pd
from os import listdir
from os.path import isfile, join

In [2]:
# Discard duplicate evaluations when the second does not provide more information.
def createResultsList(resultsDir):
    resultFiles = [join(resultsDir,file) for file in listdir(resultsDir) if isfile(join(resultsDir, file))]
    
    results = {}
    
    for filePath in resultFiles:

        if "Forms_per_Annotator" in filePath:
            resultFiles.remove(filePath);
            continue

        file = open(filePath, 'r')
        firstLine = file.readline()
        
        resultId = firstLine.split('#')[1]
        numLines = sum(1 for line in file)
        
        if (resultId in results):
            if (numLines >= results[resultId]["lines"]):
            
                resultFiles.remove(results[resultId]["file"])
                results[resultId]["file"]  = filePath
                results[resultId]["lines"] = numLines
            
            else:
                resultFiles.remove(filePath);
                
        else:            
            results[resultId] = {"file": filePath, "lines": numLines}
        
        file.close()
        
    return resultFiles

In [3]:
# Read data of evaluatin into a dataframe.
def readEvaluationData(evaluationFileName):
            
    with open(join("../annotation/claim-justification_annotator/public/",evaluationFileName), 'r') as evaluationJson:
        evaluationData = json.loads(evaluationJson.read())
    
    dataFrame = pd.read_json(join("../annotation/claim-justification_annotator/public/",evaluationFileName))

    return dataFrame

In [4]:
# Find json_file_id when given simple id
def findJsonFileId(claimId,evaluationData):
    
    valueRow = evaluationData.loc[evaluationData['id'] == claimId]
    return valueRow["json_file_id"].item()

In [5]:
# Find claim and justification given json_file_id
def findClaimJustification(jsonFileId, evaluationData):
    
    valueRow = evaluationData.loc[evaluationData['json_file_id'] == jsonFileId]
    return valueRow["claim"].item(), valueRow["justification"].item()

In [6]:
def aggregateAnnotationResults(resultsList):
    
    # Keep all annotation results in the results object.    
    results = {}
    
    # Keep basic information (claim, justification)
    basicInfo = {}

    # For each kept evaluation.    
    for file in resultsList:
        resultsFile = open(file, 'r')
        resultsLines = resultsFile.readlines()

        currEvaluation = resultsLines[0].split('#')[2].replace("\n",'')
        evaluationData = readEvaluationData(currEvaluation)
        
        # For each data line.        
        for line in resultsLines[2:]:

            dataList = line.split(',')

            for i in range (0,len(dataList)):
                dataList[i] = dataList[i].replace('"', '').strip()

            resultId = dataList[0]
            jsonFileId = findJsonFileId(int(resultId),evaluationData)

            # Check if claim has been ecountered in the past.          
            if jsonFileId in results:
                result = results[jsonFileId]
                basic  = basicInfo[jsonFileId]
            else:
                result = {"distortion": 0,
                          "emphasis": 0,
                          "unfounded": 0,
                          "unclear": 0}
                
                basic = {"claim": None,
                         "justification": None}

            # Find assigned label and check for anomalies occured during the process.            
            annotationCounter = 0
            if (dataList[1] == "true"):
                label = "distortion" 
                annotationCounter += 1
            elif (dataList[2] == "true"):
                label = "emphasis"
                annotationCounter += 1
            elif (dataList[3] == "true"):
                label = "unfounded"
                annotationCounter += 1
            elif (dataList[4] == "true"):
                label = "unclear"
                annotationCounter += 1
            else:
                print("Error. Claim is Unlabeld.")
                continue
            
            if (annotationCounter > 1):
                print("Error. Claim should be assigned only one label.")
                continue
                
            result[label] = result[label] + 1
            basic["claim"], basic["justification"] = findClaimJustification(jsonFileId,evaluationData)
            
            results[jsonFileId]   = result
            basicInfo[jsonFileId] = basic
    
    return results, basicInfo

In [7]:
def sumOfVotes(result):
    return sum(result.values())

In [8]:
def isTie(result):
    values = list(result.values())
    
    for value in values:
        if not value:
            values.remove(value)
    
    if len(values) >= 2:
        testValue = values[0]
        for value in values[1:]:
            if value != testValue:
                return False
    
    return True

In [9]:
def majorityVote(results, key):
    
    vote = max(results[key], key=results[key].get)
    if vote == "unclear":
        return None
    else:
        return vote

In [10]:
def removeUncertainties(results):

    minimumVotes = 3
    
    unResolvedData = {}
    for key in list(results.keys()):        
        if (sumOfVotes(results[key]) < minimumVotes) or (isTie(results[key])):
            unResolvedData[key] = results[key]
            results.pop(key)
    
    return results, unResolvedData

In [11]:
resultsList = createResultsList("../data/Annotation_Results/")
results, basicInfo = aggregateAnnotationResults(resultsList)

Error. Claim is Unlabeld.


In [12]:
annotationResults, unresolvedData = removeUncertainties(results)

In [13]:
uniqueId = "json_file_id"
claim = "claim"
just  = "justification"
label = "justification_label"



dataList = []
for key in annotationResults:
    
    majorityLabel = majorityVote(annotationResults, key)
    if majorityLabel is not None:
        dataList.append({uniqueId: key, 
                         claim: basicInfo[key][claim],
                         just:  basicInfo[key][just],
                         label: majorityLabel})


resultsColumns = [uniqueId, claim, just, label]
data = pd.DataFrame(dataList,columns=resultsColumns)
display(data)

Unnamed: 0,json_file_id,claim,justification,justification_label
0,10540.json,When did the decline of coal start? It started...,"Surovell said the decline of coal ""started whe...",distortion
1,4148.json,"Since 2000, nearly 12 million Americans have s...","So where does this leave us?On Sanders side, h...",emphasis
2,8705.json,Most of the (Affordable Care Act) has already ...,With all the talk about problems with the heal...,distortion
3,10683.json,"In this last election in November, ... 63 perc...","Sanders said that ""in this last election in No...",emphasis
4,7057.json,Says Mitt Romney wants to get rid of Planned P...,The Planned Parenthood Action Fund aid said th...,distortion
...,...,...,...,...
169,9547.json,David Perdue said he'd raise taxes.,Basing the statement alone on Perdues meeting ...,unfounded
170,12697.json,Says no one from the Bush family attended the ...,Priebus said its not a big deal a Bush isnt at...,unfounded
171,12499.json,Crime is rising.,Speaking generally about the state of the coun...,emphasis
172,4928.json,Weve got a personal tax system thats so compli...,"Added together, this is likely a bit lower tha...",emphasis


In [14]:
data["justification_label"].value_counts()

distortion    71
unfounded     56
emphasis      47
Name: justification_label, dtype: int64

In [15]:
# for key in unresolvedData:
#     if isTie(unresolvedData[key]):
#         print(key)
#         print(json.dumps(unresolvedData[key], indent=4, sort_keys=True))
print("Remaining Unresolved Data Are: ", len(unresolvedData.keys()))

Remaining Unresolved Data Are:  5


In [16]:
data.to_csv('../data/Justification_Data/Justification_Data.tsv', sep="\t")