In [1]:
import re
import json
import pandas as pd

In [2]:
# Column names for the tsv file, which ships without a header row.
columnNames = [
    'id', 'json_file_id', 'label', 'claim', 'topics', 'originator', 'title', 'state', 'party',
    'c1', 'c2', 'c3', 'c4', 'c5',
    'location', 'justification',
]

# Load the tab-separated data into a dataframe and attach the column names.
trainData = pd.read_csv('../data/LIAR-PLUS/dataset/tsv/test2.tsv', sep='\t', header=None)
trainData.columns = columnNames

In [3]:
# Read the forbidden words (one per line) into a list.
# Blank lines are skipped on purpose: an empty string is a substring of every
# text, so a single '' entry would make any `word in text` check match
# everything.
with open('../data/LIAR-PLUS/forbidden_words.txt') as forbiddenWordsFile:
    # Only the line terminator is removed; other whitespace is kept as written.
    strippedLines = (line.rstrip('\n') for line in forbiddenWordsFile)
    forbiddenWords = [word for word in strippedLines if word]

In [4]:
# Transform truth values from six classes to two. 'True' and 'False'
def transformTruthValues(dataFrame, trueLabels=(), falseLabels=()):
    """Collapse the 'label' column to 'true'/'false' and split the rows by class.

    The 'label' column of ``dataFrame`` is overwritten in place: rows whose
    label is in ``trueLabels`` become 'true', every other row becomes 'false'.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame with a 'label' column. Any index is accepted.
    trueLabels : iterable
        Label values that count as true.
    falseLabels : iterable
        Accepted for symmetry with ``trueLabels`` but not consulted: every
        label that is not in ``trueLabels`` is treated as false.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(trueData, falseData)``: independent copies of the true and false
        rows, each with the same columns as ``dataFrame`` and a fresh
        0..n-1 index.
    """
    isTrue = dataFrame['label'].isin(list(trueLabels))

    # Rewrite the labels on the caller's frame before splitting, so the
    # returned frames carry the binary labels as well.
    dataFrame['label'] = isTrue.map({True: 'true', False: 'false'})

    # Select by boolean mask rather than by position, so frames whose index
    # is not 0..n-1 are handled correctly.
    trueData  = dataFrame[isTrue].reset_index(drop=True)
    falseData = dataFrame[~isTrue].reset_index(drop=True)

    return trueData, falseData

In [5]:
# Fix dataframe ids to start from 0
def fixIds(dataFrame):
    """Renumber the 'id' column to 0, 1, 2, ... in row order.

    The column is overwritten in place on ``dataFrame`` and the same frame is
    returned so the call can be chained. Numbering is positional, so it stays
    sequential even when the index has gaps or duplicate labels.
    """
    dataFrame['id'] = range(len(dataFrame))

    return dataFrame

In [6]:
# Collapse the six truth labels into two classes, 'true' and 'false'.
trueLabels  = ['mostly-true', 'true']
falseLabels = ['pants-fire', 'false', 'barely-true', 'half-true']

trueData, falseData = transformTruthValues(trainData, trueLabels, falseLabels)

# Discard every row that has a missing value in any column.
trueData  = trueData.dropna()
falseData = falseData.dropna()

In [7]:
# Justification texts, stringified and stripped exactly once.
justificationTexts = falseData['justification'].map(lambda value: str(value).strip())

# Approximate sentence count: the number of '.', '!', '?' and newline
# delimiters in the text (pieces produced by the split, minus one).
sentenceCounts = justificationTexts.map(lambda text: len(re.split(r'[.!?\n]', text)) - 1)

totalJustifications = len(falseData)
totalSentences = int(sentenceCounts.sum())

print("Total number of justifications is: " + str(totalJustifications))
print("Total number of sentences is: " + str(totalSentences))
if totalJustifications:
    print("Average number of sentences per justification is: " + str(totalSentences/totalJustifications))
else:
    # Nothing to average over; avoid a ZeroDivisionError on an empty frame.
    print("Average number of sentences per justification is: n/a")

# Keep only justifications with at least 4 counted sentences that contain none
# of the forbidden words (plain substring match). Building one boolean mask
# avoids dropping rows from the frame while it is being iterated.
isLongEnough = (sentenceCounts >= 4).astype(bool)
hasForbiddenWord = justificationTexts.map(
    lambda text: any(word in text for word in forbiddenWords)
).astype(bool)
falseData = falseData[isLongEnough & ~hasForbiddenWord]

newJustifications = len(falseData)

print("New number of justifications is: " + str(newJustifications))

Total number of justifications is: 529
Total number of sentences is: 2278
Average number of sentences per justification is: 4.306238185255198
New number of justifications is: 288


In [8]:
# Lift the pandas display limits so that dataframes are shown in full, not truncated.
for displayOption in ('display.max_rows', 'display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(displayOption, None)

In [9]:
# Write the first 5 batches of 50 justifications each to evaluation1.json ... evaluation5.json.
start = 0
step  = 50

for i in range(1, 6):
    # Take the batch by position and copy it, so that renumbering the ids
    # does not write into a slice of falseData.
    data = fixIds(falseData.iloc[start:start + step].copy())
    # Transform data to json string
    evalData = data.to_json(orient="records")
    # Re-serialise with indentation and sorted keys; the context manager
    # closes the file even if the write fails.
    with open("evaluation" + str(i) + ".json", "w") as evaluation:
        evaluation.write(json.dumps(json.loads(evalData), indent=4, sort_keys=True))

    start = start + step