# Read Data

In [5]:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import pickle
import re
import json
import zipfile
import logging
import os
import unicodedata
################################################################
# ENVIRON
################################################################
# This is server-1
server1_homepath = "/home/ubuntu/workspace/codelab/"
server2_homepath = "/home/ubuntu/workspace/codelab/"
gpu_homepath = "/home/shawn/workspace/research/codelab/"
jun_homepath = "/home/junw/workspace/Fact_Checking/"

# choose from the server1, server2, gpu, jun.
SERVERNAME = 'gpu'
HOMEPATH = {'server1':server1_homepath, 'server2':server2_homepath, 'gpu':gpu_homepath, 'jun':jun_homepath}[SERVERNAME]

TASKNAME = 'preprocess'

## get the path of the wiki files
DATAPATH = HOMEPATH + "data"
INTERMEDIATE_DATAPATH = HOMEPATH + "automatic_fact_verification/intermediate_data/final/"
################################################################
# ENVIRON
################################################################

def scanFile(path):
    files_list=[]
    for dirpath,dirnames,filenames in os.walk(path):
        for special_file in filenames:
            if special_file.endswith(".txt"):
                files_list.append(os.path.join(dirpath,special_file))                                        
    return files_list

def is_number(s):
    try:
        int(s)
        return True
    except ValueError:
        pass

def preprocessed_title(title):
    title = unicodedata.normalize('NFC', title)
    title = title.replace("-COLON-"," -COLON-")
    # replaced = re.sub('_-LRB-.*', '', title)
    title = re.sub('–', '-', title)
    return title.replace('_',' ').strip()

def preprocessed_sentence(sent):
    new_sent = sent[0] + " , "
    for token in sent[2:]:
        new_sent += token + " "
    new_sent = preprocessed_claim_sentence(new_sent)
    return new_sent
    
def preprocessed_claim_sentence(claim):
    claim = unicodedata.normalize('NFC', claim)
    claim = claim.replace(':','-COLON-')
    claim = claim.replace('-COLON-',' -COLON-')
    claim = claim.replace('(','-LRB-')
    claim = claim.replace(')','-RRB-')
    claim = claim.replace("_"," ").replace("-LRB-","-LRB- ").replace("-RRB-"," -RRB")
    claim = re.sub('–', '-', claim)
    claim = claim.replace("`","'")
    claim = claim.replace("  "," ")
    # replaced = re.sub('_-LRB-.*', '', title)
    return claim.strip()

def get_training_devset_test(path):
    logger.info('Start merge training and devset!')
    with open(path + "/train.json",'r') as f:
        train_dict = json.load(f)
        # devset data
    with open(path + "/devset.json",'r') as f:
        devset_dict = json.load(f)
    with open(path + "/test-unlabelled.json",'r') as f:
        test_dict = json.load(f)
    return train_dict, devset_dict, test_dict


def prepare_data():
    train_dict, devset_dict, test_dict = get_training_devset_test(DATAPATH)
    wiki_sentence = create_wiki_sentence(DATAPATH, firstTime=True)
    wiki = create_wiki(DATAPATH, wiki_sentence, firstTime=True)
    wiki_title = create_wiki_title(DATAPATH, wiki, firstTime=True)
    return train_dict, devset_dict, test_dict, wiki, wiki_sentence, wiki_title

if __name__ == '__main__':    
    filename = SERVERNAME + '-' + TASKNAME + '.log'
    try:
        os.remove(filename)
    except OSError:
        pass
    # K = 10
    # K_list = [100]
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    handler = logging.FileHandler(filename)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.info('------------------------Start the task: {}------------------------'.format(TASKNAME))
    #

In [6]:
from collections import defaultdict
class Wiki:
    def __init__(self, path, firstTime, INTERMEDIATE_DATAPATH):
        if firstTime:
            file_list = scanFile(path)
            repeated_cnt = 0
            error_cnt = 0
            self.wiki = defaultdict(dict)
            self.wiki_titles = defaultdict(dict)
           # self.wiki_title_dict = {}
            for file in file_list:
                with open(file, 'rb') as f:
                    for line in f:
                        line = unicodedata.normalize('NFC', line.decode('utf-8')[:-1])
                        line_split = line.split(" ")
                        title = line_split[0]
                        try:
                            if is_number(line_split[1]):
                                if (not title in self.wiki.keys()):
                                    self.wiki[title] = {}
                                if int(line_split[1]) in self.wiki[title]:
                                    repeated_cnt += 1
                                else:
                                    self.wiki[title][int(line_split[1])] = preprocessed_sentence(line_split)
                                
                                ori_title = re.sub('_-LRB.*RRB-',"",title)
                                ori_title = preprocessed_claim_sentence(ori_title)
                                if ori_title not in self.wiki_titles.keys():
                                    self.wiki_titles[ori_title] = []
                                if title not in self.wiki_titles[ori_title]:
                                    self.wiki_titles[ori_title].append(title)
                        
                        except ValueError as e:
                            error_cnt += 1
            with open(INTERMEDIATE_DATAPATH + 'wiki.pkl', 'wb') as fp:
                pickle.dump(self.wiki, fp)
            with open(INTERMEDIATE_DATAPATH + 'wiki_titles.pkl', 'wb') as fp:
                pickle.dump(self.wiki_titles, fp)
            #with open(INTERMEDIATE_DATAPATH + 'wiki_title_dict.pkl', 'wb') as fp:
                #pickle.dump(self.wiki_title_dict, fp)
        else:
            with open(INTERMEDIATE_DATAPATH + 'wiki.pkl', 'rb') as fp:
                self.wiki = pickle.load(fp)
            with open(INTERMEDIATE_DATAPATH + 'wiki_titles.pkl', 'rb') as fp:
                self.wiki_titles = pickle.load(fp)
            #with open(INTERMEDIATE_DATAPATH + 'wiki_title_dict.pkl', 'rb') as fp:
                #self.wiki_title_dict = pickle.load(fp)   
            
            
    def wiki(self):
        return self.wiki
    
    def single_sent(self,sent):
        sent[0] = unicodedata.normalize('NFC',sent[0])
        return self.wiki[sent[0]][sent[1]]
    
    def single_doc(self,title):
        doc = ""
        for i,sent in enumerate(self.wiki[title].values()):
            if i != 0:
                string =""
                for i in sent.split(",")[1:]:
                    string += i +","
                doc += string[:-1]
            else:
                doc += sent + " "
        return doc[:-1]
    
    def multi_docs(self,titles):
        docs = []
        for title in titles:
            docs.append(self.single_doc(title))
        return docs
    
    def multi_sents(self,sents):
        docs = ""
        for sent in sents:
            docs += self.single_sent(sent) +" "
        return docs[:-1]
    
    def alltitles(self):
        return list(self.wiki_titles.keys())
    
    def dertitles(self,title):
        return self.wiki_titles[title]

In [7]:
train,dev,test = get_training_devset_test(DATAPATH)
wiki = Wiki(DATAPATH, False ,INTERMEDIATE_DATAPATH )

# Document Retrieval

In [8]:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import preprocess
import re
import json
import os
import sys
from collections import defaultdict
import nltk
from collections import Counter
from math import log, sqrt
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import unicodedata
import logging
from nltk.stem import WordNetLemmatizer
import csv
from tqdm import tqdm
stemmer = nltk.stem.PorterStemmer()
nltk.download('stopwords')
nltk.download('wordnet')
stopWords = set(stopwords.words('english'))
stopWords.add("'s")
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /home/shawn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/shawn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
class InvertedIndex:
    def __init__(self, vocab, doc_term_freqs):
        self.vocab = vocab
        self.doc_len = [0] * len(doc_term_freqs)
        self.doc_term_freqs = [[] for i in range(len(vocab))]
        self.doc_ids = [[] for i in range(len(vocab))]
        self.doc_freqs = [0] * len(vocab)
        self.total_num_docs = 0
        self.max_doc_len = 0
        for docid, term_freqs in enumerate(doc_term_freqs):
            doc_len = sum(term_freqs.values())
            self.max_doc_len = max(doc_len, self.max_doc_len)
            self.doc_len[docid] = doc_len
            self.total_num_docs += 1
            for term, freq in term_freqs.items():
                term_id = vocab[term]
                self.doc_ids[term_id].append(docid)
                self.doc_term_freqs[term_id].append(freq)
                self.doc_freqs[term_id] += 1

    def num_terms(self):
        return len(self.doc_ids)

    def num_docs(self):
        return self.total_num_docs

    def docids(self, term):
        term_id = self.vocab[term]
        return self.doc_ids[term_id]

    def freqs(self, term):
        term_id = self.vocab[term]
        return self.doc_term_freqs[term_id]

    def f_t(self, term):
        term_id = self.vocab[term]
        return self.doc_freqs[term_id]
    
def processed(docs, lower = False, stem = False):
# processed_docs stores the list of processed docs
    processed_docs = []
    # vocab contains (term, term id) pairs
    vocab = {}
    # total_tokens stores the total number of tokens
    total_tokens = 0
    for raw_doc in docs:
        # norm_doc stores the normalized tokens of a doc
        norm_doc = []
        if stem == True:
            tokenized_sentence = raw_doc.split(" ")
        else:
            tokenized_sentence = nltk.word_tokenize(raw_doc)##tokenize
        for token in tokenized_sentence:
            if lower == True:
                token = token.lower()
            if stem == True:
                token = stemmer.stem(token.lower())
            if not (token in vocab.keys()):
                vocab[token] = len(vocab) ##add into the vocab,len(vocab) will be the id
            norm_doc.append(token)
            total_tokens += 1
        processed_docs.append(norm_doc)
        
    doc_term_freqs = []
    for doc in processed_docs:
        doc_term_freqs.append(Counter(doc))
        
    invindex = InvertedIndex(vocab, doc_term_freqs)
    
    return invindex
    #return processed_docs,vocab,doc_term_freqs,invindex

def query_doc(query, index, k = 20 , lower = False, stem = False):
    # scores stores doc ids and their scores
    scores = Counter()
    query = nltk.word_tokenize(query)
    query = list(set(query))
    for word in query:
        if lower == True:
            word = word.lower()
        if stem == True:
            word = stemmer.stem(word.lower())
        if word in index.vocab:  #The word will not be counted if the word doesn't exist in the vocab,
            # 这个word在多少个document中出现过。
            for i in range(0,len(index.docids(word))):
                doc_id = index.docids(word)[i]
                # index.doc_len[doc_id] 本身标题有多长
                scores[doc_id] = scores[doc_id] + 1/index.doc_len[doc_id] #update the score

    return scores.most_common(k)

def TB_search(query, index = invindex, doc = wiki.alltitles(), low = False, stem = False):
    """
    按照query进行查询匹配的结果。
    """
    query = preprocessed_claim_sentence(query)
    results = query_doc(query,index,lower = low,stem = stem)
    titles = []
    for i in results:
        doc_id = i[0]
        t = doc[doc_id]
        if (t in query) & (t.lower() not in stopWords):
            titles.append(t)
    titles = sorted(titles, key = lambda i:len(i), reverse = True)
    new_titles = []
    for title in titles:
        if title in query:
            query = query.replace(title,"")
            new_titles.append(title)
    new_results = []
    for i in new_titles:
        for j in wiki.dertitles(i):
            new_results.append(j)
    return new_results,new_titles

def extract_title(evi):
    titles =[]
    for j in evi:
        title = re.sub('_-LRB.*RRB-',"",j).replace("_"," ")
        if not title in titles:
            titles.append(title)
    return titles
def remove_stopwords(evi):
    new_evi = []
    for j in evi:
        title = re.sub('_-LRB.*RRB-',"",j).lower()
        if not title in stopWords:
            new_evi.append(j)
    return new_evi
def doc_eval(setname,predict,prn = False):
    wrong = []
    miss = 0
    for key in setname:
        for evi in setname[key]['evidence']:
            title = unicodedata.normalize('NFC', evi[0])
            if title not in predict[key]:
                if key not in wrong:
                    wrong.append(key)
                    miss += 1
    if prn == True:
        print("Total Miss Matched : ", 1 - miss/len(setname))
        for i in wrong:
            print("==============Claim id: ", i , " ============")
            print("Claim : ", setname[i]['claim'])
            print("Target evidence : ", setname[i]['evidence'])
            print("Guess document : ", predict[i])
    return miss,wrong
def title_based(setname,invindex,wiki = wiki, lower = False, stem = False):
    title_based_doc = {}
    titles = {}
    for key in tqdm(setname):
        title_based_doc[key] = {}
        result,title = TB_search(setname[key]['claim'],low = lower, stem = stem)
        title_based_doc[key] = result
        titles[key] = title
    return title_based_doc,titles

NameError: name 'invindex' is not defined

In [None]:
invindex = processed(wiki.alltitles())

In [24]:
TB_dev, titles_dev = title_based(dev,invindex)

NameError: name 'title_based' is not defined

In [None]:
missed, wrong_list = doc_eval(dev,TB_dev,False)

In [5]:
#####这一步，读取pyluence的结果，最好可以打包完直接从这边直接跑。
import pickle
with open(INTERMEDIATE_DATAPATH+"/merged_devset_dict",'rb') as fp:
    merged_devset_dict = pickle.load(fp)
cont_dev ={}
for key in merged_devset_dict:
    cont_dev[key] = merged_devset_dict[key]['matched']

In [None]:
def query_sim(query, index, k=20 , th = 0):
    # scores stores doc ids and their scores
    scores = Counter()
    query = preprocessed_claim_sentence(query)
    query = nltk.word_tokenize(query) 
    for word in query:
        word = stemmer.stem(word.lower())
        if word in index.vocab:  #The word will not be counted if the word doesn't exist in the vocab,
            for i in range(0,len(index.docids(word))):
                doc_id = index.docids(word)[i]
                dts = 1/sqrt(index.doc_len[doc_id]) * log(1 + index.freqs(word)[i]) * log(index.num_docs() / index.f_t(word)) #calculate the tf-idf score
                scores[doc_id] = scores[doc_id] + dts #update the score  
    result = []
    if th == 0:
        return(scores.most_common(k))
    else:
        for i in scores.most_common(k):
            if (i[1] >= th) & (len(result)<3):
                result.append(i)
            else:
                return(result)
    return(result)

def sent_eval(target,guess):
    sen_num = 0
    sen_wrong = []
    for i in guess:
        t = target[i]['evidence']
        for e in t:
            e[0] = unicodedata.normalize('NFC',e[0])
            if not e in guess[i]:
                if not i in sen_wrong:
                    sen_wrong.append(i)
                    sen_num +=1
    total_len = 0
    for i in guess:
        total_len+=len(guess[i])
    print("Sentence Selection Result")
    print("Recall : ",1-sen_num/len(guess))
    print("Average length : ",total_len/len(guess))

def getDoc(evidence,wiki=wiki):
    docs = []
    sent_id = []
    for doc_title in evidence:
        for doc in wiki.wiki[doc_title]:
            string = ""
            for i in wiki.wiki[doc_title][doc].split(",")[1:]:
                string += i + " ,"
            docs.append(string[:-1])
            sent_id.append([doc_title,doc])
    return docs,sent_id

def sentSearch(query,docs,sent_id,wiki = wiki, k = 20, th = 0,  score = False):
    #docs = []
    #sent_id = []
    #for doc_title in evidence:
    #    for doc in wiki.wiki[doc_title]:
    #        string = ""
    #        for i in wiki.wiki[doc_title][doc].split(",")[1:]:
    #            string += i + " ,"
    #        docs.append(string[:-1])
    #        sent_id.append([doc_title,doc]) 
    index = processed(docs, lower = True, stem = True)
    result = query_sim(query,index,k ,th)
    docs = []
    scores = []
    for i in result:
        docs.append(sent_id[i[0]])
        scores.append((title,i[1]))
    if score:
        return(docs,scores)
    else:
        return(docs)
    
def sent_selection_title(TB_docs,setname,wiki=wiki,topk=20,th=0):
    filt_results_doc = {}
    for key in tqdm(setname):
        docs,sent_id = getDoc(TB_docs[key],wiki)
        t = sentSearch(setname[key]['claim'],docs,sent_id)
        filt_results_doc[key] = t
    return filt_results_doc

def sent_selection_cont(cont_docs,titles,setname,wiki=wiki,topk=30,th=0.5):
    filt_results_doc = {}
    for key in tqdm(cont_docs):
        docs = []
        sent_id = []
        cont_title = title_filter_for_cont(cont_docs[key],titles[key])
        for sent in cont_title:
            string = ""
            for i in wiki.single_sent(sent).split(",")[1:]:
                string += i + " ,"
            docs.append(string[:-1])
            sent_id.append(sent)
        t = sentSearch(setname[key]['claim'],docs,sent_id,k=topk,th=th)
        filt_results_doc[key] = t
    return filt_results_doc
    
def title_filter_for_cont(cont_docs,titles, wiki = wiki):
    sent = []
    for doc_title in cont_docs:
        doc_title = unicodedata.normalize('NFC',doc_title)
        for sent_id in wiki.wiki[doc_title]:
            doc = wiki.wiki[doc_title][sent_id]
            for title in titles:
                if title.lower() in doc.lower():
                    sent.append([doc_title,sent_id])
    return sent

def merged_result(TB_docs,cont_docs):
    for i in TB_docs:
        for j in cont_docs[i]:
            if j not in TB_docs[i]:
                TB_docs[i].append(j)
    return TB_docs

def sentent_selection(TB_docs,cont_docs,setname,titles,wiki=wiki,k = 30, th = 0.5):
    TB_title = sent_selection_title(TB_docs,setname)
    cont_title = sent_selection_cont(cont_docs,titles,setname,topk = k, th = th)
    final_title = merged_result(TB_title,cont_title)
    return TB_title

def output_senten_result(result,setname,path):
    output,claim_id = getoutput(result,setname)
    with open(path, 'wt') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        tsv_writer.writerow("TEST!")
        for i in output:
            tsv_writer.writerow(i)
            
def getoutput(result,setname,wiki=wiki):
    output = []
    claim_id = []
    for i in result:
        for evi in result[i]:
            example = []
            example.append(unicodedata.normalize('NFC',setname[i]['claim']))
            example.append(wiki.single_sent(evi))
            output.append(example)
            claim_id.append([i,evi])
    return output,claim_id

In [10]:
merged = sentent_selection(TB_dev,cont_dev,dev,titles_dev)

NameError: name 'sentent_selection' is not defined

In [None]:
sent_eval(dev,merged)

In [12]:
output_senten_result(merged,dev,INTERMEDIATE_DATAPATH+"/test.tsv")
output_ss_dev, claim_id_dev = getoutput(merged,dev)

NameError: name 'output_senten_result' is not defined

In [13]:
import json
with open('/home/shawn/workspace/research/final_codelab/data/final/sentence/claim_id_dev.json') as fp:
    claim_id_dev = json.load(fp)

In [None]:
gpu_homepath = "/home/shawn/workspace/research/final_codelab/"
SERVERNAME = 'gpu'
HOMEPATH = {'server1':server1_homepath, 'server2':server2_homepath, 'gpu':gpu_homepath, 'jun':jun_homepath}[SERVERNAME]
DATAPATH = HOMEPATH + 'data/'
INTERMEDIATE_DATAPATH = DATAPATH + "intermediate/"

with open(INTERMEDIATE_DATAPATH + 'analysis_output_ss_dev.pkl', 'wb') as fp:
    pickle.dump(output_ss_dev, fp)

with open(INTERMEDIATE_DATAPATH + 'analysis_claim_id_dev.pkl', 'wb') as fp:
    pickle.dump(claim_id_dev, fp)

# Read Sentent Selection Result

In [10]:
def alleval(predicted,actual,gs=False):
    NEI = "NOT ENOUGH INFO"
    correct_label = num_instances = 0
    evidence_prec = num_eprec = 0
    evidence_recall = num_erec = 0
    doc_prec = num_dprec = 0
    doc_rec = num_drec = 0

    for ident, arecord in actual.items():
        precord = predicted[ident]

        alabel = arecord['label'].upper()
        plabel = precord['label'].upper()
        if alabel == plabel:
            correct_label += 1
        num_instances += 1

        if alabel != NEI:
            prec = prec_hits = 0
            rec = rec_hits = 0

            aes = arecord['evidence']
            pes = precord['evidence'][:5]
            for pe in pes:
                if pe in aes:
                    prec += 1
                prec_hits += 1

            for ae in aes:
                if ae in pes:
                    rec += 1
                rec_hits += 1

            ads = set(map(lambda ds: ds[0], aes))
            last_pd = None
            dp = ndp = 0
            for pe in pes:
                if not last_pd or pe[0] != last_pd:
                    if pe[0] in ads:
                        dp += 1
                    ndp += 1
                last_pd = pe[0]

            pds = set(map(lambda ds: ds[0], pes))
            dr = ndr = 0
            for ae in ads:
                if ae in pds:
                    dr += 1
                ndr += 1

            if prec_hits > 0:
                evidence_prec += float(prec) / prec_hits
                num_eprec += 1

            if ndp > 0:
                doc_prec += float(dp) / ndp
                num_dprec += 1

            assert rec_hits > 0
            evidence_recall += float(rec) / rec_hits
            num_erec += 1

            assert ndr > 0
            doc_rec += float(dr) / ndr
            num_drec += 1
    accuracy = correct_label / float(num_instances)
    precision = evidence_prec / float(num_eprec) if num_eprec != 0 else 0
    recall = evidence_recall / float(num_erec) if num_erec != 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
    doc_precision = doc_prec / float(num_dprec) if num_dprec != 0 else 0
    doc_recall = doc_rec / float(num_drec) if num_drec != 0 else 0
    doc_f1 = 2 * doc_precision * doc_recall / (doc_precision + doc_recall) if doc_precision + doc_recall > 0 else 0
            
    doc=[doc_precision,doc_recall,doc_f1]
    sent = [precision,recall,f1]
    if gs == True:
        return doc,sent,accuracy 
    else:
        
        print('Label Accuracy', '\t\t%.2f%%' % (100 * accuracy))
        print('Sentence Precision', '\t%.2f%%' % (100 * precision))
        print('Sentence Recall', '\t%.2f%%' % (100 * recall))
        print('Sentence F1', '\t\t%.2f%%' % (100 * f1))
        print('Document Precision', '\t%.2f%%' % (100 * doc_precision))
        print('Document Recall', '\t%.2f%%' % (100 * doc_recall))
        print('Document F1', '\t\t%.2f%%' % (100 * doc_f1))
def read_tsv_result(path):
    result = []
    with open(path,'r') as f:
        csv_reader = csv.reader(f, delimiter='\t')
        for row in csv_reader:
            result.append(row)
    return result
def evi_list2str(evi):
    string = evi[0] + " " + str(evi[1])
    return(unicodedata.normalize('NFC',string))
def get_sent_sel_result(model_output,index, target, th=0.999,k = 5):
    result = {}
    result_for_claim = {}
    for i in target:
        result_for_claim[i] = Counter()
    for i,res in enumerate(model_output):
        ind = index[i][0]
        title = evi_list2str(index[i][1])
        result_for_claim[ind][title] = float(res[1])    
    for i in target:
        result[i] = {}
        result[i]['claim'] = target[i]['claim']
        result[i]['label'] = "SUPPORTS"
        result[i]['evidence'] = []
        titles = result_for_claim[i].most_common(k)
        for inx in titles:
            if len(result[i]['evidence']) < 1 :
                title = [unicodedata.normalize('NFD',inx[0].split(" ")[0]),int(inx[0].split(" ")[1])]
                result[i]['evidence'].append(title)
            else:
                if float(inx[1])>th:
                    #a = unicodedata.normalize('NFD',inx[0])
                    #title = [a.split(" ")[0],int(a.split(" ")[1])]
                    title = [unicodedata.normalize('NFD',inx[0].split(" ")[0]),int(inx[0].split(" ")[1])]
                    result[i]['evidence'].append(title)
    return result
def grid_search_sent(result,index,setname):
    th = {}
    num = 0.9
    max_f1 = 0
    th = 0
    while num< 1:
        after_model = get_sent_sel_result(result,index,setname,th=num)
        doc,sent,acc=alleval(after_model,setname,gs=True)
        if sent[2]>max_f1:
            max_f1 = sent[2]
            th = num
        num += 0.001
    return th
def getoutput_final(result,setname,wiki=wiki):
    output = []
    claim_id = []
    for key in result:
        example = []
        example.append(unicodedata.normalize('NFC',setname[key]['claim']))
        example.append(wiki.multi_sents(result[key]['evidence']))
        output.append(example)
        claim_id.append(key)
    return output,claim_id
def output_final_test(result,setname,path):
    output,claim_id = getoutput_final(result,setname)
    with open(path, 'wt') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        tsv_writer.writerow("TEST!")
        for i in output:
            tsv_writer.writerow(i)

In [14]:
##结果在这里，有空分析一下结果
import csv
# import Counter
sent_sel_result_list = read_tsv_result(INTERMEDIATE_DATAPATH+"/test_results.tsv")
th = grid_search_sent(sent_sel_result_list,claim_id_dev,dev)
sent_sel_result_dev = get_sent_sel_result(sent_sel_result_list,claim_id_dev,dev,th=th)
alleval(sent_sel_result_dev,dev)

Label Accuracy 		33.33%
Sentence Precision 	7.69%
Sentence Recall 	6.86%
Sentence F1 		7.25%
Document Precision 	30.05%
Document Recall 	28.62%
Document F1 		29.32%


In [53]:
################################################################
# ENVIRON
################################################################
# This is server-1
server1_homepath = "/home/ubuntu/workspace/codelab/"
server2_homepath = "/home/ubuntu/workspace/codelab/"
gpu_homepath = "/home/shawn/workspace/research/final_codelab/"
jun_homepath = "/home/junw/workspace/codelab/"

# choose from the server1, server2, gpu, jun.
SERVERNAME = 'gpu'
HOMEPATH = {'server1':server1_homepath, 'server2':server2_homepath, 'gpu':gpu_homepath, 'jun':jun_homepath}[SERVERNAME]

TASKNAME = 'preprocess'

## get the path of the wiki files
DATAPATH = HOMEPATH + "data/"
ORGINAL_DATAPATH = DATAPATH +"orignal/"
INTERMEDIATE_DATAPATH = DATAPATH + "intermediate/"
FINAL_DATAPATH =  DATAPATH + "final/"
################################################################
# ENVIRON
################################################################

import prepare
from prepare import prepare_data
from prepare import get_training_devset_test
train_dict, devset_dict, test_dict = get_training_devset_test(ORGINAL_DATAPATH)

In [65]:
# shawn写入test数据
output_list = []
_dict = {}
for _list in claim_id_dev:
    claim_id = _list[0]
    claim = dev[claim_id]['claim']
    claim = preprocessed_claim_sentence(claim)
    claim = unicodedata.normalize('NFC', claim)
    evidence_sent_id = _list[1]
    evidence_sent = wiki.single_sent(evidence_sent_id)
    try:
        _dict[claim_id] += evidence_sent
    except KeyError:
        _dict[claim_id] = ""

for key in _dict.keys():
    claim = dev[key]['claim']
    output_list.append([claim, _dict[key]])
with open('/home/shawn/workspace/research/final_codelab/shawn/input/classification/devset/test.tsv', 'wt') as fp:
    tsv_writer = csv.writer(fp, delimiter='\t')
    tsv_writer.writerow("TEST!")
    for _list in output_list:
        tsv_writer.writerow(_list)
print(len(_dict))

5001


In [66]:
output_list

[['A Floppy disk is lined with turnips.',
  "Floppy disk , Floppy disks are read and written by a floppy disk drive -LRB- FDD -RRB .Floppy disk , By the late 2000s , computers were rarely manufactured with installed floppy disk drives ; 3 1/2 - inch floppy disks can be used with an external USB floppy disk drive , but USB drives for 5 1/4 - inch , 8-inch , and non-standard diskettes are rare to non-existent .Floppy disk , While floppy disk drives still have some limited uses , especially with legacy industrial computer equipment , they have been superseded by data storage methods with much greater capacity , such as USB flash drives , flash storage cards , portable external hard disk drives , optical discs , ROM cartridges and storage available through computer networks .Floppy disk , Floppy disks , initially as 8 inch media and later in 5 1/4 - inch -LRB- 133 mm -RRB and 3 1/2 - inch -LRB- 90 mm -RRB sizes , were a ubiquitous form of data storage and exchange from the mid-1970s into t

In [60]:
len(test_dict)

14997

In [None]:
alleval(sent_sel_result_dev,dev)

In [72]:
output_final, cli = getoutput_final(sent_sel_result_dev,dev)
output_final_test(sent_sel_result_dev,dev,INTERMEDIATE_DATAPATH+"/final_test.tsv")

# Final Evaluation and Output

In [17]:
def getLabel(res):
    res_new = []
    for i in res:
        res_new.append(float(i))
    num = res_new.index(max(res_new))
    if num==0:
        return "REFUTES"
    if num ==1:
        return "SUPPORTS"
    if num == 2:
        return "NOT ENOUGH INFO"
def get_final_result(sen_sel_result,final_result,index):
    for i,res in enumerate(final_result):
        claim_id = index[i]
        sen_sel_result[claim_id]['label'] = getLabel(res)
    return sen_sel_result

def final_output(result,path):
    new_final = {}
    for i in result:
        new_final[i] = {}
        new_final[i]['claim'] = unicodedata.normalize('NFD', result[i]['claim'])
        new_final[i]['label'] = unicodedata.normalize('NFD', result[i]['label'])
        new_evi = []
        for evi in result[i]['evidence']:
            new_evi.append([unicodedata.normalize('NFD', evi[0]),evi[1]])
        new_final[i]['evidence'] = new_evi
    with open(path,"w") as f:
        json.dump(new_final,f)
    return new_final

In [79]:
print(cli[10])
print(cli[100])
print(cli[200])
print(cli[4000])
print(cli[5000])

140846
105419
23845
186884
7956


In [2]:
import json
with open('/home/shawn/workspace/research/final_codelab/shawn/output/classification/cli', 'rb') as fp:
    cli = json.load(fp)
print(cli[10])
print(cli[100])
print(cli[200])
print(cli[4000])
print(cli[5000])

174604
104820
63320
161575
53895


In [88]:
final_result_tsv

[['0.12731124', '7.755902e-05', '0.8726112'],
 ['0.00086266577', '4.842491e-05', '0.9990889'],
 ['0.00051106303', '6.6758264e-05', '0.9994222'],
 ['0.0004047261', '7.506598e-05', '0.9995202'],
 ['0.9964862', '0.00014299824', '0.0033707588'],
 ['0.9993067', '0.00023139577', '0.00046201516'],
 ['0.0003728904', '8.090318e-05', '0.99954623'],
 ['0.00067050004', '6.0943767e-05', '0.9992686'],
 ['0.18087487', '0.070350714', '0.7487744'],
 ['0.000827886', '5.3159798e-05', '0.999119'],
 ['0.32642215', '0.0020119194', '0.67156595'],
 ['0.01868939', '0.00015213908', '0.9811585'],
 ['0.0025502692', '6.8139256e-05', '0.9973816'],
 ['0.99886703', '0.0001742063', '0.0009587937'],
 ['0.9991779', '0.00031644487', '0.0005055685'],
 ['0.999273', '0.00016055942', '0.00056640693'],
 ['0.001316979', '0.00017390208', '0.9985091'],
 ['0.058573373', '0.6502667', '0.29115993'],
 ['0.0037085628', '0.0016899332', '0.9946015'],
 ['0.26130626', '0.12341291', '0.6152808'],
 ['0.00093832705', '0.0001581195', '0.9989

In [18]:
##结果在这里
# final_result_tsv = read_tsv_result(INTERMEDIATE_DATAPATH+"/final_results.tsv")
# final_result_tsv = read_tsv_result("/home/shawn/workspace/research/final_codelab/shawn/output/classification" + "/output_classification_devset_test_results.tsv")
# final_result_tsv = read_tsv_result("/home/shawn/workspace/research/final_codelab/shawn/output/classification" + "/output_classification_add_nei_filtered_by_model_test_results.tsv")
final_result_tsv = read_tsv_result("/home/shawn/workspace/research/final_codelab/shawn/output/classification/random_forest" + "test_results.tsv")
final = get_final_result(sent_sel_result_dev,final_result_tsv,cli)
alleval(final,dev)

Label Accuracy 		51.95%
Sentence Precision 	7.69%
Sentence Recall 	6.86%
Sentence F1 		7.25%
Document Precision 	30.05%
Document Recall 	28.62%
Document F1 		29.32%


In [None]:
new_final = final_output(final,HOMEPATH+"final_data/final_results.json")

# Test Set

In [None]:
TB_test, titles_test = title_based(test,invindex)

In [None]:
with open(INTERMEDIATE_DATAPATH+"/title_based_test_set.json","w",encoding='utf-8') as f:
    json.dump(TB_test,f)
with open(INTERMEDIATE_DATAPATH+"/title_test_set.json","w",encoding='utf-8') as f:
    json.dump(titles_test,f)

In [None]:
with open(INTERMEDIATE_DATAPATH+"/merged_test100",'rb') as fp:
    merged_test_dict = pickle.load(fp)
cont_test ={}
for key in merged_test_dict:
    titles = []
    for title in merged_test_dict[key]['matched']:
        titles.append(title[0])
    cont_test[key] = titles    

In [None]:
merged_test = sentent_selection(TB_test,cont_test,test,titles_test)

In [None]:
totle_len = 0
for key in merged_test:
    totle_len+= len(merged_test[key])
print(totle_len/len(merged_test))

In [None]:
output_senten_result(merged_test,test,INTERMEDIATE_DATAPATH+"/test.tsv")
output_ss_test, claim_id_test = getoutput(merged_test,test)

In [None]:
merged_test

In [None]:
sent_sel_result_list_test = read_tsv_result(INTERMEDIATE_DATAPATH+"/testset/test_results.tsv")
sent_sel_result_test = get_sent_sel_result(sent_sel_result_list_test,claim_id_test,test,th=0.5)

In [None]:
totle_len = 0
for i in sent_sel_result_test:
    totle_len += len(sent_sel_result_test[i]['evidence'])
print(totle_len/len(test))

In [None]:
l = 0
for i in sent_sel_result_test:
    if len(sent_sel_result_test[i]['evidence']) == 0:
        l+=1

In [None]:
print(l)

In [None]:
len(sent_sel_result_list_test)

In [None]:
len(output_ss_test)

In [None]:
sent_sel_result_test['121548']

In [None]:
wiki

In [None]:
claim_id_test[8000]

In [None]:
nup = 0
for i in sent_sel_result_list_test[80000:801000]:
    if float(i[1]) >0.5:
        nup+=1
print(nup)

In [None]:
num = 0
for i in sent_sel_result_list_test:
    if float(i[1])>0.5:
        num+=1
num/len(test)

In [None]:
i

In [None]:
output_test, cli_test = getoutput_final(sent_sel_result_test,test)
output_final_test(sent_sel_result_test,test,INTERMEDIATE_DATAPATH+"/final_test.tsv")

In [None]:
final_result_tsv_test = read_tsv_result(INTERMEDIATE_DATAPATH+"/testset/final_test_results.tsv")

In [None]:
final_test = get_final_result(sent_sel_result_test,final_result_tsv_test,cli_test)

In [None]:
label = {}
for i,res in enumerate(final_result_tsv_test):
    claim_id = cli_test[i]
    label[claim_id] = getLabel(res)

In [None]:
new_final_test = final_output(final_test,HOMEPATH+"final_data/testoutput.json")

In [None]:
s =[]
r = []
nei = []
for i in final_test:
    if final_test[i]['label'] == "SUPPORTS":
        s.append(i)
    if final_test[i]['label'] == "REFUTES":
        r.append(i)
    if final_test[i]['label'] == "NOT ENOUGH INFO":
        nei.append(i)

In [None]:
for i in 

In [None]:
final_result_tsv_test[8000:8100]

In [None]:
output_test[8000:8100]

In [None]:
cli_test[8000:8100]

In [None]:
final_test['20807']

In [None]:
print(s,r,nei)

In [None]:
final_test

In [None]:
getLabel(final_result_tsv_test[1])

In [None]:
a = []
s = []
nei = []
for i in final_result_tsv_test:
    a = getLabel(i)
    if a == "SUPPORTS":
        s.append(i)
    if a == "REFUTES":
        r.append(i)
    if a == "NOT ENOUGH INFO":
        nei.append(i)

In [None]:
len(nei)

In [None]:
r

In [None]:
nei

In [None]:
new_final_test['134617']

In [None]:
HOMEPATH

In [None]:
NEI = 0
for i in new_final_test:
    if new_final_test[i]['label'] == "NOT ENOUGH INFO":
        NEI+=1

In [None]:
print(NEI)

In [None]:
final_test['110000']

In [None]:
final_test.keys()