In [None]:
import numpy as np
import pandas as pd
import json
import random
import nltk
from unidecode import unidecode
import string

In [None]:
class DataProcessor:
    """Load the raw claim/evidence JSON files and build preprocessed DataFrames.

    Two preprocessing variants are produced from the same raw data:

    * ``t1`` -- text cleaned by ``process_sentence_t1`` (stop words removed,
      lemmatized, then stemmed);
    * ``t2`` -- text cleaned by ``process_sentence_t2`` (lemmatized only).

    Attributes set by ``__init__``:
        evdn_origin, tran_origin, deva_origin, test_origin: raw JSON dicts.
        retain_evidence_list: set of evidence indices referenced by the
            training or development claims.
        evdn_df_part_t1 / evdn_df_part_t2: retained evidence only, indexed by
            ``evdn_index`` with one ``evidence`` column.
        evdn_df_full_t1 / evdn_df_full_t2: every evidence entry, same layout.
        tran_df_t1, deva_df_t1, test_df_t1 (and the ``_t2`` equivalents):
            preprocessed claim frames.
        sentence_t1 / sentence_t2: every processed sentence, in load order.
        corpus_t1 / corpus_t2: set of tokens seen in the processed sentences.
        summary_statistics: tuple returned by ``show_statistics``.
    """

    # Characters kept by the sentence cleaners; every other character is dropped.
    _VALID_CHARS = frozenset(" " + string.digits + string.ascii_letters)

    # Returned whenever a sentence is not a string or is empty after cleaning.
    _EMPTY_SENTENCE = "nan nan"

    _EVDN_COLUMNS = ["evdn_index", "evidence"]
    _LABELLED_COLUMNS = ["claim_index", "claim", "evidence_index", "evidence_text", "label"]
    _UNLABELLED_COLUMNS = ["claim_index", "claim"]

    def __init__(self, evdn_file, tran_file, deva_file, devb_file, test_file):
        """Read all input files and run both preprocessing pipelines.

        Args:
            evdn_file: path to the evidence JSON (``"evidence-<n>" -> text``).
            tran_file: path to the labelled training claims JSON.
            deva_file: path to the labelled development claims JSON.
            devb_file: accepted for interface compatibility; it is not read.
            test_file: path to the unlabelled test claims JSON.
        """
        print("Loading data started...")

        self.corpus_t1 = set()
        self.sentence_t1 = []
        self.corpus_t2 = set()
        self.sentence_t2 = []

        self.evdn_origin = self.read(evdn_file)
        self.tran_origin = self.read(tran_file)
        self.deva_origin = self.read(deva_file)
        self.test_origin = self.read(test_file)

        # A set gives O(1) membership checks while filtering the evidence.
        self.retain_evidence_list = set(self.retain_evidence(self.tran_origin, self.deva_origin))

        # The retained-evidence frame must exist before the labelled claim
        # frames are built, because they look up their evidence text in it.
        print("   1/10 Loading evidence (part) for t1 started...")
        self.evdn_df_part_t1 = self.preprocess_evdn_t1_part(data=self.evdn_origin)
        print("        Loading evidence (part) for t1 finished.")
        print("   2/10 Loading evidence (full) for t1 started...")
        self.evdn_df_full_t1 = self.preprocess_evdn_t1_full(data=self.evdn_origin)
        print("        Loading evidence (full) for t1 finished.")
        print("   3/10 Loading training data for t1 started...")
        self.tran_df_t1 = self.preprocess_data_t1(data=self.tran_origin, test=False)
        print("        Loading training data for t1 finished.")
        print("   4/10 Loading devalopment data for t1 started...")
        self.deva_df_t1 = self.preprocess_data_t1(data=self.deva_origin, test=False)
        print("        Loading devalopment data for t1 finished.")
        print("   5/10 Loading testing data for t1 started...")
        self.test_df_t1 = self.preprocess_data_t1(data=self.test_origin, test=True)
        print("        Loading testing data for t1 finished.")

        print("   6/10 Loading evidence (part) for t2 started...")
        self.evdn_df_part_t2 = self.preprocess_evdn_t2_part(data=self.evdn_origin)
        print("        Loading evidence (part) for t2 finished.")
        print("   7/10 Loading evidence (full) for t2 started...")
        # Stored under its own attribute so the retained-only frame survives.
        self.evdn_df_full_t2 = self.preprocess_evdn_t2_full(data=self.evdn_origin)
        print("        Loading evidence (full) for t2 finished.")
        print("   8/10 Loading training data for t2 started...")
        self.tran_df_t2 = self.preprocess_data_t2(data=self.tran_origin, test=False)
        print("        Loading training data for t2 finished.")
        print("   9/10 Loading devalopment data for t2 started...")
        self.deva_df_t2 = self.preprocess_data_t2(data=self.deva_origin, test=False)
        print("        Loading devalopment data for t2 finished.")
        print("  10/10 Loading testing data for t2 started...")
        self.test_df_t2 = self.preprocess_data_t2(data=self.test_origin, test=True)
        print("        Loading testing data for t2 finished.")
        print()

        self.summary_statistics = self.show_statistics()

    def read(self, file_name):
        """Return the parsed content of the UTF-8 JSON file ``file_name``."""
        with open(file_name, 'r', encoding='utf8') as data:
            return json.load(data)

    def write(self, y_pred_final, evidence_index_list, output_file='test-claims-predictions.json'):
        """Write test-set predictions as JSON.

        The output has the same structure as the raw test claims, with a
        ``claim_label`` and an ``evidences`` entry added to every claim.
        ``self.test_origin`` itself is left untouched.

        Args:
            y_pred_final: predicted label per test claim, in the row order of
                ``self.test_df_t1``.
            evidence_index_list: per test claim, an iterable of evidence
                indices (same order as ``y_pred_final``).
            output_file: destination path; defaults to the original
                hard-coded file name.

        Raises:
            ValueError: if the number of predictions or evidence lists does
                not match the number of test claims.
        """
        claim_indices = list(self.test_df_t1["claim_index"])
        if len(y_pred_final) != len(claim_indices) or len(evidence_index_list) != len(claim_indices):
            raise ValueError(
                "Expected " + str(len(claim_indices)) + " predictions and evidence lists, got "
                + str(len(y_pred_final)) + " and " + str(len(evidence_index_list)) + "."
            )

        # Copy one level deep: only top-level keys of each claim are added.
        final_dict = {key: dict(value) for key, value in self.test_origin.items()}
        for claim_index, label, evidence_indices in zip(claim_indices, y_pred_final, evidence_index_list):
            claim_key = "claim-" + str(claim_index)
            final_dict[claim_key]["claim_label"] = label
            final_dict[claim_key]["evidences"] = ["evidence-" + str(i) for i in evidence_indices]

        with open(output_file, 'w', encoding='utf8') as json_file:
            json.dump(final_dict, json_file)

    @staticmethod
    def _print_spread(values, subject):
        """Print the maximum, minimum and mean of ``values`` for ``subject``."""
        print("          Maximum number of " + subject + " = " + str(round(np.max(values), 0)) + ".")
        print("          Minimum number of " + subject + " = " + str(round(np.min(values), 0)) + ".")
        print("          Average number of " + subject + " = " + str(round(np.mean(values), 2)) + ".")

    def show_statistics(self):
        """Print a summary of the loaded data and return the raw length lists.

        Token counts come from ``nltk.tokenize.word_tokenize``. Evidence
        counts are taken on the processed t1 text; claim counts on the raw
        ``claim_text``.

        Returns:
            A 3-tuple of lists:
            (token counts of full + retained evidence,
             evidence counts per training + development claim,
             token counts of training + development + test claims).
        """
        print("Loading data finished:")

        def token_count(text):
            return len(nltk.tokenize.word_tokenize(text))

        evdn_len_full = [token_count(s) for s in self.evdn_df_full_t1["evidence"]]
        evdn_len_part = [token_count(s) for s in self.evdn_df_part_t1["evidence"]]
        evdn_text_leng_list = evdn_len_full + evdn_len_part

        tran_evdn_leng_list = [len(value["evidences"]) for value in self.tran_origin.values()]
        deva_evdn_leng_list = [len(value["evidences"]) for value in self.deva_origin.values()]
        data_evdn_leng_list = tran_evdn_leng_list + deva_evdn_leng_list

        tran_token_leng_list = [token_count(v["claim_text"]) for v in self.tran_origin.values()]
        deva_token_leng_list = [token_count(v["claim_text"]) for v in self.deva_origin.values()]
        test_token_leng_list = [token_count(v["claim_text"]) for v in self.test_origin.values()]
        data_text_leng_list = tran_token_leng_list + deva_token_leng_list + test_token_leng_list

        print("Data Summary:")
        print("   1. Evidence")
        print("      (1) Full set of evidence")
        print("          Original number of evidence = " + str(len(self.evdn_df_full_t1)) + ".")
        self._print_spread(evdn_len_full, "tokens in evidence text")
        print("      (2) Retained set of evidence")
        print("          Retained number of evidence = " + str(len(self.evdn_df_part_t1)) + ".")
        self._print_spread(evdn_len_part, "tokens in text")
        print("   2. Data Sets")
        print("      (1) Training Data")
        print("          Sample size of training data = " + str(len(self.tran_origin)) + ".")
        self._print_spread(tran_token_leng_list, "tokens in claim text")
        self._print_spread(tran_evdn_leng_list, "evidence for training data")
        print("      (2) Developing Data")
        print("          Sample size of developing data = " + str(len(self.deva_origin)) + ".")
        self._print_spread(deva_token_leng_list, "tokens in claim text")
        self._print_spread(deva_evdn_leng_list, "evidence for developing data")
        print("      (3) Testing Data")
        print("          Sample size of testing test = " + str(len(self.test_origin)) + ".")
        self._print_spread(test_token_leng_list, "tokens in claim text")

        return evdn_text_leng_list, data_evdn_leng_list, data_text_leng_list

    def retain_evidence(self, tran, deva):
        """Return the evidence indices referenced by ``tran`` then ``deva``.

        Indices are the integer suffix of each ``"evidence-<n>"`` id, in
        encounter order; duplicates are kept.
        """
        retain_evidence_list = []
        for claims in (tran, deva):
            for value in claims.values():
                for evdn in value["evidences"]:
                    retain_evidence_list.append(int(evdn.split("-")[1]))
        return retain_evidence_list

    def _cached(self, attr, factory):
        """Return ``self.<attr>``, creating it with ``factory()`` on first use.

        Keeps the NLTK resources from being rebuilt for every sentence.
        """
        value = getattr(self, attr, None)
        if value is None:
            value = factory()
            setattr(self, attr, value)
        return value

    def _keep_valid_chars(self, sentence):
        """Drop every character that is not a space, ASCII digit or ASCII letter."""
        return "".join(char for char in sentence if char in self._VALID_CHARS)

    def _join_words(self, words):
        """Join ``words`` with spaces; return the placeholder when nothing is left."""
        # NOTE(review): the character filter has already removed every
        # non-ASCII character, so unidecode has nothing left to transliterate
        # here. Kept so the output stays unchanged.
        sentence_final = unidecode(" ".join(words)).strip()
        return sentence_final if sentence_final != "" else self._EMPTY_SENTENCE

    def process_sentence_t1(self, sentence):
        """Clean ``sentence`` for the t1 pipeline.

        Steps: keep only spaces/ASCII alphanumerics, lower-case, tokenize,
        drop English stop words and punctuation tokens, lemmatize, then stem.

        Returns:
            The cleaned sentence, or ``"nan nan"`` if ``sentence`` is not a
            string or nothing remains after cleaning.
        """
        if not isinstance(sentence, str):
            return self._EMPTY_SENTENCE

        stop_words = self._cached(
            "_stop_words",
            lambda: set(nltk.corpus.stopwords.words('english')) | set(string.punctuation),
        )
        lemmatizer = self._cached("_lemmatizer", lambda: nltk.stem.wordnet.WordNetLemmatizer())
        stemmer = self._cached("_stemmer", lambda: nltk.stem.PorterStemmer())

        tokens = nltk.tokenize.word_tokenize(self._keep_valid_chars(sentence).lower())
        words = [stemmer.stem(lemmatizer.lemmatize(token)) for token in tokens if token not in stop_words]
        return self._join_words(words)

    def process_sentence_t2(self, sentence):
        """Clean ``sentence`` for the t2 pipeline.

        Steps: keep only spaces/ASCII alphanumerics, lower-case, tokenize and
        lemmatize. Stop words are kept and no stemming is applied.

        Returns:
            The cleaned sentence, or ``"nan nan"`` if ``sentence`` is not a
            string or nothing remains after cleaning.
        """
        if not isinstance(sentence, str):
            return self._EMPTY_SENTENCE

        lemmatizer = self._cached("_lemmatizer", lambda: nltk.stem.wordnet.WordNetLemmatizer())
        tokens = nltk.tokenize.word_tokenize(self._keep_valid_chars(sentence).lower())
        return self._join_words([lemmatizer.lemmatize(token) for token in tokens])

    def _preprocess_data(self, data, test, process_sentence, sentences, corpus, evidence_df):
        """Build a claim DataFrame shared by the t1 and t2 pipelines.

        Args:
            data: raw claims dict (``"claim-<n>" -> {...}``).
            test: True for unlabelled data (only index and claim are kept).
            process_sentence: sentence cleaner applied to each claim text.
            sentences: list that receives every processed claim text.
            corpus: set that receives every token of the processed claims.
            evidence_df: evidence frame (indexed by evidence index) used to
                look up the evidence text of labelled claims.
        """
        rows = []
        for key, value in data.items():
            claim_index = int(key.split("-")[1])

            claim_text = process_sentence(value["claim_text"])
            sentences.append(claim_text)
            corpus.update(nltk.tokenize.word_tokenize(claim_text))

            if test:
                rows.append([claim_index, claim_text])
                continue

            evidence_indices = [int(evdn.split("-")[1]) for evdn in value["evidences"]]
            # Both strings keep a trailing space after every element.
            evidence_index_string = "".join(str(index) + " " for index in evidence_indices)
            evidence_string = "".join(
                evidence_df.loc[index, "evidence"] + " " for index in evidence_indices
            )
            rows.append([claim_index, claim_text, evidence_index_string, evidence_string, value["claim_label"]])

        columns = self._UNLABELLED_COLUMNS if test else self._LABELLED_COLUMNS
        # Passing columns to the constructor also works when there are no rows.
        return pd.DataFrame(rows, columns=columns)

    def preprocess_data_t1(self, data, test):
        """Return the t1 claim frame for ``data``.

        Columns are ``claim_index, claim`` when ``test`` is True, otherwise
        ``claim_index, claim, evidence_index, evidence_text, label``.
        Labelled data requires ``self.evdn_df_part_t1`` to be set. Processed
        claims are also recorded in ``sentence_t1`` and ``corpus_t1``.
        """
        evidence_df = None if test else self.evdn_df_part_t1
        return self._preprocess_data(
            data, test, self.process_sentence_t1, self.sentence_t1, self.corpus_t1, evidence_df
        )

    def preprocess_data_t2(self, data, test):
        """Return the t2 claim frame for ``data``.

        Same layout as ``preprocess_data_t1``. Labelled data requires
        ``self.evdn_df_part_t2`` to be set. Processed claims are also
        recorded in ``sentence_t2`` and ``corpus_t2``.
        """
        evidence_df = None if test else self.evdn_df_part_t2
        return self._preprocess_data(
            data, test, self.process_sentence_t2, self.sentence_t2, self.corpus_t2, evidence_df
        )

    def _preprocess_evdn(self, data, process_sentence, sentences, corpus, retained_only):
        """Build an evidence DataFrame shared by the four evidence loaders.

        Args:
            data: raw evidence dict (``"evidence-<n>" -> text``).
            process_sentence: sentence cleaner applied to each evidence text.
            sentences: list that receives every processed evidence text.
            corpus: set that receives every lower-cased, whitespace-split
                token of the processed evidence.
            retained_only: when True, skip evidence whose index is not in
                ``self.retain_evidence_list``.

        Returns:
            A frame indexed by ``evdn_index`` with a single ``evidence`` column.
        """
        rows = []
        for key, value in data.items():
            evdn_index = int(key.split("-")[1])
            if retained_only and evdn_index not in self.retain_evidence_list:
                continue
            evdn_text = process_sentence(value)
            sentences.append(evdn_text)
            corpus.update(word.lower() for word in evdn_text.split())
            rows.append([evdn_index, evdn_text])

        # Passing columns to the constructor also works when there are no rows.
        return pd.DataFrame(rows, columns=self._EVDN_COLUMNS).set_index("evdn_index")

    def preprocess_evdn_t1_full(self, data):
        """Return every evidence entry cleaned with the t1 pipeline."""
        return self._preprocess_evdn(
            data, self.process_sentence_t1, self.sentence_t1, self.corpus_t1, retained_only=False
        )

    def preprocess_evdn_t2_full(self, data):
        """Return every evidence entry cleaned with the t2 pipeline."""
        return self._preprocess_evdn(
            data, self.process_sentence_t2, self.sentence_t2, self.corpus_t2, retained_only=False
        )

    def preprocess_evdn_t1_part(self, data):
        """Return the retained evidence entries cleaned with the t1 pipeline."""
        return self._preprocess_evdn(
            data, self.process_sentence_t1, self.sentence_t1, self.corpus_t1, retained_only=True
        )

    def preprocess_evdn_t2_part(self, data):
        """Return the retained evidence entries cleaned with the t2 pipeline."""
        return self._preprocess_evdn(
            data, self.process_sentence_t2, self.sentence_t2, self.corpus_t2, retained_only=True
        )
    

In [None]:
## Data processor
# Raw input files, keyed by the DataProcessor constructor argument they feed.
raw_data_files = {
    "evdn_file": "data_raw/evidence.json",
    "tran_file": "data_raw/train-claims.json",
    "deva_file": "data_raw/dev-claims.json",
    "devb_file": "data_raw/dev-claims-baseline.json",
    "test_file": "data_raw/test-claims-unlabelled.json",
}

# Construction reads the files and runs the whole preprocessing.
data_processor = DataProcessor(**raw_data_files)

In [None]:
# Export the t1 frames.
data_processor.evdn_df_full_t1.to_csv("data_processed/evdn_full_df_t1.csv")
data_processor.evdn_df_part_t1.to_csv("data_processed/evdn_part_df_t1.csv")
data_processor.tran_df_t1.to_csv("data_processed/tran_df_t1.csv")
data_processor.deva_df_t1.to_csv("data_processed/deva_df_t1.csv")
data_processor.test_df_t1.to_csv("data_processed/test_df_t1.csv")

# Export the t2 frames. The t2 evidence files should hold the t2 evidence
# frames; they are only used when the processor exposes a dedicated full t2
# frame. Otherwise fall back to the t1 evidence frames so the export still
# succeeds.
if hasattr(data_processor, "evdn_df_full_t2"):
    evdn_full_t2 = data_processor.evdn_df_full_t2
    evdn_part_t2 = data_processor.evdn_df_part_t2
else:
    evdn_full_t2 = data_processor.evdn_df_full_t1
    evdn_part_t2 = data_processor.evdn_df_part_t1

evdn_full_t2.to_csv("data_processed/evdn_full_df_t2.csv")
evdn_part_t2.to_csv("data_processed/evdn_part_df_t2.csv")
data_processor.tran_df_t2.to_csv("data_processed/tran_df_t2.csv")
data_processor.deva_df_t2.to_csv("data_processed/deva_df_t2.csv")
data_processor.test_df_t2.to_csv("data_processed/test_df_t2.csv")