In [2]:
from nltk.corpus import gazetteers, names
from nltk import word_tokenize
#from Corpus import WikiCorpus

class DocSelection:
    """Retrieve candidate documents for a claim via an inverted index of title tokens.

    The index maps each lower-cased title token to the set of corpus titles
    whose normalized title contains that token. A claim selects the titles
    that contain every one of its non stop-word tokens.
    """

    def __init__(self, corpus):
        """Load the lookup resources and index the titles found in ``corpus``.

        Args:
            corpus: iterable of raw document titles that also supports
                ``len()`` (the visible caller passes the corpus dict, whose
                keys are the raw titles).
        """
        self.places = set(gazetteers.words())
        self.people = set(names.words())
        self.stop_words = self.load_stop_words()
        self.corpus = corpus
        self.title_index = self.build_index()

    def build_index(self):
        """Build the inverted index ``lower-cased title token -> {titles}``.

        The stored values are the titles exactly as they appear in
        ``self.corpus``: the results of ``select_docs`` are used by the
        training code as keys into the corpus dict, which is keyed by the
        raw (un-normalized) title.

        Returns:
            dict mapping each token (str) to a set of corpus titles.
        """
        index = {}
        for num, title in enumerate(self.corpus, start=1):
            if num % 10000 == 0:
                print("processed {} titles, total {}".format(num, len(self.corpus)))
            # NOTE: WikiCorpus is defined in a later cell; it must exist before
            # a DocSelection is instantiated.
            title_tokens, _other_txt, _title_str = WikiCorpus.normalize_title(title, rflag=True)
            for title_token in title_tokens:
                # .lower() must be *called*; storing the bound method made every
                # later string lookup miss.
                index.setdefault(title_token.lower(), set()).add(title)

        print("title index is built sucessfully!!!!!")
        return index

    def load_stop_words(self):
        """Read the ``stoplist`` file (one word per line) into a set."""
        with open("stoplist") as f:
            return {word.rstrip("\n") for word in f}

    def _claim_tokens(self, claim):
        """Tokenize ``claim``, drop stop words and lower-case the rest.

        A token is dropped when either its original or its lower-cased form
        is in the stop list, so capitalised stop words ("The") are removed too.
        """
        tokens = []
        for token in word_tokenize(claim):
            lowered = token.lower()
            if token in self.stop_words or lowered in self.stop_words:
                continue
            tokens.append(lowered)
        return tokens

    def select_docs(self, claim):
        """Return the titles whose tokens include every content token of ``claim``.

        Args:
            claim: the claim text.

        Returns:
            A new set of corpus titles; empty when the claim has no content
            tokens or when any token does not occur in a title. The returned
            set never aliases a set stored in the index.
        """
        claim_tokens = self._claim_tokens(claim)
        if not claim_tokens:
            return set()

        selected = None
        for claim_token in claim_tokens:
            postings = self.title_index.get(claim_token)
            if not postings:
                # A token no title contains makes the intersection empty.
                return set()
            selected = set(postings) if selected is None else selected & postings
            if not selected:
                return set()
        return selected

    def select_docs2(self, claim):
        """Return the lower-cased, stop-word-filtered tokens of ``claim``."""
        return self._claim_tokens(claim)





In [8]:
from nltk import word_tokenize
import nltk
# from Corpus import WikiCorpus
import numpy as np
#select all sentences that are related to the claim
class SentenceRank:
    """Rank the sentences of candidate documents by relevance to a claim.

    Two scorers are provided: cosine similarity of averaged word embeddings
    (``select_sentence_by_embedding``) and stemmed token overlap
    (``select_and_score_sentences``). Both keep the ``sentence_num``
    best-scoring sentences.
    """

    # Width of the word vectors handled by this class.
    EMBEDDING_DIM = 300

    # Sentences longer than this many characters are never ranked.
    MAX_SENTENCE_CHARS = 400

    # Vector used for every token missing from the embedding table, kept as
    # whitespace-separated text (EMBEDDING_DIM values).
    _UNKNOWN_WORD = (
        '0.22418134 -0.28881392 0.13854356 0.00365387 -0.12870757 0.10243822 0.061626635 0.07318011 -0.061350107 -1.3477012 '
        '0.42037755 -0.063593924 -0.09683349 0.18086134 0.23704372 0.014126852 0.170096 -1.1491593 0.31497982 0.06622181 '
        '0.024687296 0.076693475 0.13851812 0.021302193 -0.06640582 -0.010336159 0.13523154 -0.042144544 -0.11938788 0.006948221 '
        '0.13333307 -0.18276379 0.052385733 0.008943111 -0.23957317 0.08500333 -0.006894406 0.0015864656 0.063391194 0.19177166 '
        '-0.13113557 -0.11295479 -0.14276934 0.03413971 -0.034278486 -0.051366422 0.18891625 -0.16673574 -0.057783455 0.036823478 '
        '0.08078679 0.022949161 0.033298038 0.011784158 0.05643189 -0.042776518 0.011959623 0.011552498 -0.0007971594 0.11300405 '
        '-0.031369694 -0.0061559738 -0.009043574 -0.415336 -0.18870236 0.13708843 0.005911723 -0.113035575 -0.030096142 -0.23908928 '
        '-0.05354085 -0.044904727 -0.20228513 0.0065645403 -0.09578946 -0.07391877 -0.06487607 0.111740574 -0.048649278 -0.16565254 '
        '-0.052037314 -0.078968436 0.13684988 0.0757494 -0.006275573 0.28693774 0.52017444 -0.0877165 -0.33010918 -0.1359622 '
        '0.114895485 -0.09744406 0.06269521 0.12118575 -0.08026362 0.35256687 -0.060017522 -0.04889904 -0.06828978 0.088740796 '
        '0.003964443 -0.0766291 0.1263925 0.07809314 -0.023164088 -0.5680669 -0.037892066 -0.1350967 -0.11351585 -0.111434504 '
        '-0.0905027 0.25174105 -0.14841858 0.034635577 -0.07334565 0.06320108 -0.038343467 -0.05413284 0.042197507 -0.090380974 '
        '-0.070528865 -0.009174437 0.009069661 0.1405178 0.02958134 -0.036431845 -0.08625681 0.042951006 0.08230793 0.0903314 '
        '-0.12279937 -0.013899368 0.048119213 0.08678239 -0.14450377 -0.04424887 0.018319942 0.015026873 -0.100526 0.06021201 '
        '0.74059093 -0.0016333034 -0.24960588 -0.023739101 0.016396184 0.11928964 0.13950661 -0.031624354 -0.01645025 0.14079992 '
        '-0.0002824564 -0.08052984 -0.0021310581 -0.025350995 0.086938225 0.14308536 0.17146006 -0.13943303 0.048792403 0.09274929 '
        '-0.053167373 0.031103406 0.012354865 0.21057427 0.32618305 0.18015954 -0.15881181 0.15322933 -0.22558987 -0.04200665 '
        '0.0084689725 0.038156632 0.15188617 0.13274793 0.113756925 -0.095273495 -0.049490947 -0.10265804 -0.27064866 -0.034567792 '
        '-0.018810693 -0.0010360252 0.10340131 0.13883452 0.21131058 -0.01981019 0.1833468 -0.10751636 -0.03128868 0.02518242 '
        '0.23232952 0.042052146 0.11731903 -0.15506615 0.0063580726 -0.15429358 0.1511722 0.12745973 0.2576985 -0.25486213 '
        '-0.0709463 0.17983761 0.054027 -0.09884228 -0.24595179 -0.093028545 -0.028203879 0.094398156 0.09233813 0.029291354 '
        '0.13110267 0.15682974 -0.016919162 0.23927948 -0.1343307 -0.22422817 0.14634751 -0.064993896 0.4703685 -0.027190214 '
        '0.06224946 -0.091360025 0.21490277 -0.19562101 -0.10032754 -0.09056772 -0.06203493 -0.18876675 -0.10963594 -0.27734384 '
        '0.12616494 -0.02217992 -0.16058226 -0.080475815 0.026953284 0.110732645 0.014894041 0.09416802 0.14299914 -0.1594008 '
        '-0.066080004 -0.007995227 -0.11668856 -0.13081996 -0.09237365 0.14741232 0.09180138 0.081735 0.3211204 -0.0036552632 '
        '-0.047030564 -0.02311798 0.048961394 0.08669574 -0.06766279 -0.50028914 -0.048515294 0.14144728 -0.032994404 -0.11954345 '
        '-0.14929578 -0.2388355 -0.019883996 -0.15917352 -0.052084364 0.2801028 -0.0029121689 -0.054581646 -0.47385484 0.17112483 '
        '-0.12066923 -0.042173345 0.1395337 0.26115036 0.012869649 0.009291686 -0.0026459037 -0.075331464 0.017840583 -0.26869613 '
        '-0.21820338 -0.17084768 -0.1022808 -0.055290595 0.13513643 0.12362477 -0.10980586 0.13980341 -0.20233242 0.08813751 '
        '0.3849736 -0.10653763 -0.06199595 0.028849555 0.03230154 0.023856193 0.069950655 0.19310954 -0.077677034 -0.144811'
    )
    # Parsed once at class creation instead of on every similarity call.
    _UNKNOWN_WORD_VEC = np.array(_UNKNOWN_WORD.split(), dtype='float32')

    def __init__(self, corpus_dic, embedding_matrix={}, sentence_num = 15):
        """Store the corpus and obtain the word-embedding table.

        Args:
            corpus_dic: mapping ``title -> {sentence_id -> sentence text}``.
            embedding_matrix: mapping ``token -> vector``; when empty the
                table is read from disk by ``load_embedding``. The default
                dict is never mutated or stored.
            sentence_num: maximum number of sentences the selectors keep.
        """
        self.corpus_dic = corpus_dic
        self.stop_words = self.load_stop_list()
        self.sentence_num = sentence_num
        if len(embedding_matrix) > 0:
            self.embedding = embedding_matrix
        else:
            self.embedding = self.load_embedding()

    def load_stop_list(self):
        """Read the ``stoplist`` file (one word per line) into a set."""
        with open("stoplist") as f:
            return {word.rstrip("\n") for word in f}

    @staticmethod
    def _is_number(text):
        """Return True when ``float(text)`` succeeds."""
        try:
            float(text)
        except ValueError:
            return False
        return True

    def load_embedding(self):
        """Read ``glove.840B.300d.txt`` into a ``token -> float32 vector`` dict.

        A token may itself contain spaces: the token extends over the leading
        fields of a line until the first field that parses as a number.
        """
        embedding = {}
        # `with` guarantees the handle is closed and, unlike try/finally around
        # open(), does not raise NameError when the file cannot be opened.
        with open('glove.840B.300d.txt', encoding='utf8') as f:
            for i, line in enumerate(f, start=1):
                values = line.split()
                if values:
                    word = values[0]
                    index = 1
                    for value in values[1:]:
                        if self._is_number(value):
                            break
                        word += ' ' + value
                        index += 1
                    embedding[word] = np.asarray(values[index:], dtype='float32')
                if i % 10000 == 0:
                    print("processed %d" % i)
        print("building embedding index finished!")
        return embedding

    def get_glove_embedding(self, word):
        """Return the stored vector of ``word`` (KeyError when unknown)."""
        return self.embedding[word]

    def _sentence_embedding(self, text):
        """Average the vectors of the tokens of ``text``.

        Unknown tokens contribute the fixed unknown-word vector. A text
        without tokens yields the zero vector instead of dividing by zero.
        """
        tokens = word_tokenize(text)
        v = np.zeros(self.EMBEDDING_DIM)
        if not tokens:
            return v
        for token in tokens:
            if token not in self.embedding:
                v += self._UNKNOWN_WORD_VEC
            else:
                v += self.embedding[token]
        return v / len(tokens)

    def sentence_similarity(self, claim, sentence):
        """Cosine similarity of the averaged embeddings of two texts.

        Returns 0.0 when either averaged vector has zero norm (for example an
        empty text), where the cosine is undefined.
        """
        v_claim = self._sentence_embedding(claim)
        v_sentence = self._sentence_embedding(sentence)
        denominator = np.linalg.norm(v_claim) * np.linalg.norm(v_sentence)
        if denominator == 0:
            return 0.0
        return np.dot(v_claim, v_sentence) / denominator

    def _candidate_sentences(self, selected_docs):
        """Yield ``(doc, sentence_id, sentence)`` for every rankable sentence.

        Unknown documents are skipped via ``.get`` so that a defaultdict
        corpus is not polluted with empty entries.
        """
        for doc in selected_docs:
            doc_sentences = self.corpus_dic.get(doc)
            if not doc_sentences:
                continue
            for sentence_id, sentence in doc_sentences.items():
                if len(sentence) > self.MAX_SENTENCE_CHARS:
                    continue
                yield doc, sentence_id, sentence

    def _top_scored(self, scored):
        """Return the ``sentence_num`` best ``(score, item)`` pairs, best first.

        ``sorted`` is stable, so among equal scores the earliest-seen item wins.
        """
        ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)
        return ranked[:max(self.sentence_num, 0)]

    def select_sentence_by_embedding(self, selected_docs, claim):
        """Pick the sentences most similar to ``claim`` by embedding cosine.

        Args:
            selected_docs: iterable of corpus titles to search.
            claim: the claim text.

        Returns:
            List of at most ``sentence_num`` ``(doc, sentence_id)`` tuples in
            descending order of similarity.
        """
        scored = []
        for doc, sentence_id, sentence in self._candidate_sentences(selected_docs):
            similarity = self.sentence_similarity(claim, sentence)
            if np.isnan(similarity):
                # nan cannot be ordered; such sentences were never kept before either.
                continue
            scored.append((similarity, (doc, sentence_id)))
        return [sentence for _score, sentence in self._top_scored(scored)]

    def _normalized_tokens(self, tokens, stemmer):
        """Drop stop words, lower-case, and stem the purely alphabetic tokens."""
        normalized = []
        for token in tokens:
            lowered = token.lower()
            if token in self.stop_words or lowered in self.stop_words:
                continue
            normalized.append(stemmer.stem(lowered) if lowered.isalpha() else lowered)
        return normalized

    def select_and_score_sentences(self, selected_docs, claim):
        """Pick the sentences sharing the most (stemmed) tokens with ``claim``.

        The score of a sentence is the number of its tokens that occur in the
        claim divided by the number of claim tokens; repeated sentence tokens
        are counted each time, so the score may exceed 1.

        Args:
            selected_docs: iterable of corpus titles to search.
            claim: the claim text.

        Returns:
            dict ``score -> [{doc: sentence_id}, ...]`` holding at most
            ``sentence_num`` sentences in total; empty when the claim has no
            content tokens.
        """
        stemmer = nltk.stem.PorterStemmer()
        claim_tokens = self._normalized_tokens(word_tokenize(claim), stemmer)
        if not claim_tokens:
            # Nothing to match against; also avoids dividing by zero below.
            return {}
        claim_len = len(claim_tokens)
        claim_vocabulary = set(claim_tokens)

        scored = []
        for doc, sentence_id, sentence in self._candidate_sentences(selected_docs):
            # Corpus sentences are split on whitespace, the claim is tokenized.
            sentence_tokens = self._normalized_tokens(sentence.split(), stemmer)
            match_token_num = sum(1 for token in sentence_tokens if token in claim_vocabulary)
            scored.append((match_token_num / claim_len, {doc: sentence_id}))

        sentences_score = {}
        for match_ratio, sentence in self._top_scored(scored):
            sentences_score.setdefault(match_ratio, []).append(sentence)
        return sentences_score







In [9]:
import numpy as np
import zipfile
import json
import collections
import nltk
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from os.path import join as pjoin
from nltk.corpus import gazetteers, names
from unicodedata import normalize
from nltk import word_tokenize
from random import shuffle
from sklearn.utils.class_weight import compute_class_weight
from collections import defaultdict

class WikiCorpus:
    def __init__(self, tokenizer):
        """Create an empty corpus wrapper around ``tokenizer``.

        Nothing is read from the wiki archive here; only the NLTK word lists
        and the stop list are loaded.

        Args:
            tokenizer: object stored as ``self.tokenizer``; other methods read
                its ``word_index`` and call its ``texts_to_sequences``.
        """
        # Corpus state, filled lazily by the loader methods.
        self.corpus_dic = None
        self.titles = []

        # Vocabulary bookkeeping.
        self.vocabulary = {}
        self.term_id = 0

        # Model-input settings.
        self.tokenizer = tokenizer
        self.embedding_matrix = None
        self.max_sequence_len = -1

        # Lookup resources used by the title features.
        self.places = set(gazetteers.words())
        self.people = set(names.words())
        self.stop = self.load_stoplist()

    def load_stoplist(self,stopfile="stoplist"):
        stop = set()
        with open(stopfile) as f:
            for line in f:
                word = line.rstrip("\n")
                stop.add(word)
        return stop

    def _load_corpus_for_doc_IR(self):
        titles = []
        myzip = zipfile.ZipFile('wiki-pages-text.zip')
        ziplist = myzip.namelist()

        ## loop each txt file
        for i in range(1, len(ziplist)):
            fileobj = myzip.open(ziplist[i])
            print("start read " + str(i) + "file:" + ziplist[i])
            #    loop each line for txt
            for line in fileobj:
                # must use utf-8 to store different languages
                # remove "/n" at each line
                myline = line.decode('utf-8').strip()
                # use first 2 blanks to cut the string
                line_list = myline.split(' ', 2)
                if not self.is_number(line_list[1]):
                    continue
                title = line_list[0]
                titles.append(title)

        return titles


    def _load_corpus_for_nli(self):
        self.term_id = 1

        corpus = collections.defaultdict(dict)
        myzip = zipfile.ZipFile('wiki-pages-text.zip')
        ziplist = myzip.namelist()

        ## loop each txt file
        for i in range(1, len(ziplist)):
            fileobj = myzip.open(ziplist[i])
            print("start read " + str(i) + "file:" + ziplist[i])
            #    loop each line for txt
            for line in fileobj:
                # must use utf-8 to store different languages
                # remove "/n" at each line
                myline = line.decode('utf-8').strip()
                #time.sleep(2)
                # use first 2 blanks to cut the string
                line_list = myline.split(' ', 2)
                if not self.is_number(line_list[1]):
                    continue

                title_origin = line_list[0]


                title = title_origin.replace("-LRB-", "(_")
                title = title.replace("-RRB-", "_)")
                title = title.replace("_"," ")

                tokens = title.split(' ')
                for token in tokens:
                    if token not in self.vocabulary:
                        #self.vocabulary[token] = self.term_id
                        self.term_id += 1

                _, title = WikiCorpus.normalize_title(title_origin)


                text_decode = line_list[2]
                text = text_decode.replace('-LRB-', '(')
                text = text.replace('-RRB-',')')
                text = text.replace('-LSB-', '[')
                text = text.replace('-RSB-', ']')
                corpus[title_origin][line_list[1]] = title + ' # ' + text
                tokens = text.split(' ')
                for token in tokens:

                    if token not in self.vocabulary:
                        #self.vocabulary[token] = self.term_id
                        self.term_id += 1



        myzip.close()
        self.corpus_dic = corpus



        #del self.corpus_dic
    def get_voc_size(self):
        return len(self.tokenizer.word_index)

    def get_corpus(self):
        if self.corpus_dic is None or len(self.corpus_dic) == 0:
            self._load_corpus_for_nli()
        return self.corpus_dic

    def get_titles(self):
        if len(self.titles) == 0:
            self._load_corpus_for_nli()
        return self.titles

    def get_vocabulary(self):
        return self.vocabulary

    @staticmethod
    def normalize_title(title, rflag=False):
        other_txt = ""
        title = title.replace("_", " ").replace("-COLON-", ":")

        if title.find("-LRB-") > -1:
            other_txt = title[title.find("-LRB-"):]
            other_txt = other_txt.replace("-LRB-", "(").replace("-RRB-", ")")
            main_txt = title[:title.find("-LRB-")].rstrip(" ")
            title_str = main_txt
            main_txt = main_txt.split()
        else:
            main_txt = title.split()
            title_str = title
        if rflag:
            return main_txt, other_txt, title_str
        else:
            return main_txt, title_str

    def generate_instance_features(self, title, claim, start_pos=0):
        features = dict()
        title_tokens, other_txt = self.normalize_title(title, rflag=True)
        features["other_txt"] = (other_txt == "")
        features["rinc"] = ((other_txt != "") and (other_txt in claim))
        features["start"] = start_pos
        features["start0"] = (start_pos == 0)
        features["lend"] = len(title_tokens)
        features["lend1"] = (features["lend"] == 1)
        features["cap1"] = title_tokens[0][0].isupper()
        features["stop1"] = (title_tokens[0].lower() in self.stop)
        features["people1"] = (title_tokens[0] in self.people)
        features["places1"] = (title_tokens[0] in self.places)
        features["capany"] = False
        features["capall"] = True
        features["stopany"] = False
        features["stopall"] = True
        features["peopleany"] = False
        features["peopleall"] = True
        features["placesany"] = False
        features["placesall"] = True
        for token in title_tokens:
            features["capany"] = (features["capany"] or token[0].isupper())
            features["capall"] = (features["capall"] and token[0].isupper())
            features["stopany"] = (features["stopany"] or token.lower() in self.stop)
            features["stopall"] = (features["stopall"] and token.lower() in self.stop)
            features["peopleany"] = (features["peopleany"] or token in self.people)
            features["peopleall"] = (features["peopleall"] and token in self.people)
            features["placesany"] = (features["placesany"] or token in self.places)
            features["placesall"] = (features["placesall"] and token in self.places)
        return features


    def generate_doc_retrival_data(self, sample_size=30000):
        titles = self._load_corpus_for_doc_IR()
        shuffle(titles)
        num_instance = 0

        with open('train.json', 'r') as f:
            data = json.load(f)
            num_claim = len(data.keys)
            train_x = np.zeros(shape=(num_claim*2, 18), dtype=np.float32)
            train_y = np.zeros(shape=(num_claim*2), dtype=np.float32)
            for key in data:
                label = data[key]['label']
                if label == "NOT ENOUGH INFO":
                    continue

                claim = data[key]['claim']
                evidences = []
                for evidence in data[key]['evidence']:
                    evidence_norm = normalize("NFC", evidence[0])
                    evidences.append(evidence_norm)



        with open('devset.json', 'r') as f:
            data = json.load(f)
            for key in data:

                for evidence in data[key]['evidence']:
                    evidence_norm = normalize("NFC", evidence[0])
        pass

    def texts_to_sequences(self, texts, need_tokenize):
        sequences = []
        for text in texts:
            if not need_tokenize:
                tokens = text.split()
            else:
                tokens = word_tokenize(text)
            sequence = []
            for token in tokens:
                sequence.append(self.vocabulary[token])
            sequences.append(sequence)
        return sequences


    def get_preprocessed_data(self, texts, need_tokenize = False):
        sequences = self.texts_to_sequences(texts, need_tokenize)
        padded_sequences = pad_sequences(sequences, maxlen=self.max_sequence_len, padding='post')
        return padded_sequences

    @staticmethod
    def label_to_vec(labels):
        vec = []
        for label in labels:
            if label == 'SUPPORTS':
                vec.append([1.0, 0.0, 0.0])
            if label == 'REFUTES':
                vec.append([0.0, 1.0, 0.0])
            if label == 'NOT ENOUGH INFO':
                vec.append([0.0, 0.0, 1.0])
        return np.array(vec)

    def generate_sequence(self, texts):
        sequences = self.tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=100, padding='post')

    def load_test_data(self):
        with open('test-unlabelled.json', 'r') as f:
            data = json.load(f)
            for key in data:
                tokens = nltk.word_tokenize(data[key]['claim'])
                for token in tokens:
                    if token not in self.vocabulary:
                        #self.vocabulary[token] = self.term_id
                        self.term_id += 1


    def generate_training_data(self, word_embeddings):
        """Build the padded train/dev inputs and labels for the inference model.

        Two kinds of (claim, sentence) pairs are produced from ``train.json``
        and ``devset.json``:

        * up to two retrieved sentences per claim that are not part of the
          claim's evidence, labelled 'NOT ENOUGH INFO';
        * every evidence sentence found in the corpus, labelled with the
          claim's own label.

        The training pairs are shuffled; the dev pairs keep their order.

        Args:
            word_embeddings: token -> vector table handed to ``SentenceRank``.

        Returns:
            ``(train_sequences, train_y, dev_sequences, dev_y, d_class_weights)``
            where each ``*_sequences`` is ``[claim sequences, sentence
            sequences]``, each ``*_y`` is a one-hot label array and
            ``d_class_weights`` maps class index to its balanced weight.

        Raises:
            ValueError: when no training pair could be built.
        """
        from Doc_Retrieval import DocSelection
        from Sentence_Rank import SentenceRank

        self._load_corpus_for_nli()

        with open('train.json', 'r') as f:
            train_data = json.load(f)
        with open('devset.json', 'r') as f:
            dev_data = json.load(f)

        doc_selection = DocSelection(self.corpus_dic)
        sentence_selection = SentenceRank(self.corpus_dic, word_embeddings)

        train_samples = self._sample_not_enough_info(train_data, doc_selection, sentence_selection)
        dev_samples = self._sample_not_enough_info(dev_data, doc_selection, sentence_selection)

        # Drop the local references to the retrieval helpers before building the sequences.
        del doc_selection
        del sentence_selection
        del word_embeddings

        train_samples += self._collect_evidence_samples(train_data)
        dev_samples += self._collect_evidence_samples(dev_data)

        # Release the corpus but keep the attribute, so get_corpus() can
        # reload it later instead of failing with AttributeError.
        self.corpus_dic = None

        if not train_samples:
            raise ValueError("no training samples could be generated")

        shuffle(train_samples)
        hypo_train_data_x, premise_train_data_x, train_text_y = zip(*train_samples)

        hypo_train_sequences = self.generate_sequence(list(hypo_train_data_x))
        premise_train_sequences = self.generate_sequence(list(premise_train_data_x))

        train_sequences = [hypo_train_sequences, premise_train_sequences]
        train_y = WikiCorpus.label_to_vec(list(train_text_y))

        y_integers = np.argmax(train_y, axis=1)
        classes = np.unique(y_integers)
        # Keyword arguments: recent scikit-learn releases no longer accept
        # `classes` and `y` positionally.
        class_weights = compute_class_weight('balanced', classes=classes, y=y_integers)
        # Key each weight by its class id; enumerate() would mislabel the
        # weights whenever a class is absent from the training data.
        d_class_weights = dict(zip(classes.tolist(), class_weights.tolist()))

        print('SUPPORTS :', train_text_y.count('SUPPORTS'), '| NOT ENOUGH INFO :', train_text_y.count('NOT ENOUGH INFO'), '| REFUTES :',train_text_y.count('REFUTES'))

        hypo_dev_data_x = [sample[0] for sample in dev_samples]
        premise_dev_data_x = [sample[1] for sample in dev_samples]
        dev_text_y = [sample[2] for sample in dev_samples]

        hypo_dev_sequences = self.generate_sequence(hypo_dev_data_x)
        premise_dev_sequences = self.generate_sequence(premise_dev_data_x)

        dev_sequences = [hypo_dev_sequences, premise_dev_sequences]
        dev_y = WikiCorpus.label_to_vec(dev_text_y)

        return train_sequences, train_y, dev_sequences, dev_y, d_class_weights

    def _sample_not_enough_info(self, data, doc_selection, sentence_selection, per_claim=2):
        """Pair each claim with retrieved non-evidence sentences as 'NOT ENOUGH INFO' samples.

        Returns a list of ``(claim, sentence text, 'NOT ENOUGH INFO')`` with at
        most ``per_claim`` entries per claim.
        """
        samples = []
        for key in data:
            claim = data[key]['claim']

            # Every evidence sentence of the claim (not only the last one) must
            # be excluded from the negative samples.
            evidence_docs = []
            for evidence in data[key]['evidence']:
                evidence_norm = normalize("NFC", evidence[0])
                if evidence_norm not in self.corpus_dic:
                    continue
                evidence_docs.append((evidence_norm, str(evidence[1])))

            selected_docs = doc_selection.select_docs(claim)
            if len(selected_docs) == 0:
                print("0000000000000")
            selected_sentences = sentence_selection.select_sentence_by_embedding(selected_docs, claim)

            taken = 0
            for sentence in selected_sentences:
                if taken == per_claim:
                    break
                if sentence not in evidence_docs:
                    premise = self.corpus_dic[sentence[0]][str(sentence[1])]
                    samples.append((claim, premise, 'NOT ENOUGH INFO'))
                    taken += 1
        return samples

    def _collect_evidence_samples(self, data):
        """Pair each claim with its evidence sentences, labelled with the claim's label.

        Evidence whose document or sentence id is missing from the corpus is skipped.
        """
        samples = []
        for key in data:
            claim = data[key]['claim']
            label = data[key]['label']
            for evidence in data[key]['evidence']:
                evidence_norm = normalize("NFC", evidence[0])
                if evidence_norm not in self.corpus_dic:
                    continue
                premise = self.corpus_dic[evidence_norm].get(str(evidence[1]))
                if premise is None:
                    continue
                samples.append((claim, premise, label))
        return samples
    @staticmethod
    def is_number( str):
        try:
            float(str)
            return True
        except ValueError:
            pass
        return False

    def get_embedding_matrix(self):

        embeddings_index = {}
        word_index = self.get_vocabulary()
        print("build vocabulary successfully! ", len(word_index))
        #embedding_matrix = np.zeros((len(word_index) + 1, 300))
        embedding_matrix = np.zeros((len(self.tokenizer.word_index) + 1, 300))
        try:
            f = open('glove.840B.300d.txt', encoding='utf8')
            i = 0
            for line in f:
                values = line.split()
                word = values[0]
                index = 1
                for value in values[1:]:
                    if not self.is_number(value):
                        word += ' '+value
                        index += 1
                    else:
                        break

                coefs = np.asarray(values[index:], dtype='float32')
                embeddings_index[word] = coefs
                i += 1
                if i%10000 == 0:
                    print("processed %d" %i)
            print("building embedding index finished!")
            """
            for word in word_index:
                index = word_index[word]
                embedding_vector = embeddings_index.get(word)
                if embedding_vector is not None:
                    # words not found in embedding index will be all-zeros.
                    embedding_matrix[index] = embedding_vector
                    
            """

            for word, i in self.tokenizer.word_index.items():
                embedding_vector = embeddings_index.get(word)
                if embedding_vector is not None:
                    # words not found in embedding index will be all-zeros.
                    embedding_matrix[i+1] = embedding_vector

            return embedding_matrix

        finally:
            f.close()



class SNLICorpus:
    """Loads the SNLI 1.0 ``.jsonl`` splits and converts them into model inputs.

    One ``Tokenizer`` instance is shared by every method. It is fitted by
    ``get_tokenizer`` or ``generate_training_data``; ``get_embedding_matrix``
    sizes its matrix from ``tokenizer.word_index``, so it only produces a useful
    matrix when called after one of those two methods.
    """

    def __init__(self):
        # Directory containing snli_1.0_<tier>.jsonl; '' resolves to the working directory.
        self.TEXT_DATA_DIR = ''
        self.train_x = None
        self.train_y = None
        self.dev_x = None
        self.dev_y = None
        self.test_x = None
        self.test_y = None
        self.tokenizer = Tokenizer()

    def _load_data(self, tier):
        """Read one split and return ``(premises, hypotheses, labels)``.

        Args:
            tier: Split name inserted into the file name
                ``snli_1.0_<tier>.jsonl`` (callers use 'train', 'dev', 'test').

        Returns:
            Three parallel lists of strings. Records whose ``gold_label`` is
            ``'-'`` are dropped.
        """
        premise = []
        hypothesis = []
        label = []

        path = pjoin(self.TEXT_DATA_DIR, 'snli_1.0_' + tier + '.jsonl')
        with open(path, encoding='utf8') as f:
            # Iterate lazily rather than loading every line with readlines().
            for line in f:
                if not line.strip():
                    continue  # a blank line would make json.loads raise
                d = json.loads(line)
                if d['gold_label'] != '-':
                    premise.append(d['sentence1'])
                    hypothesis.append(d['sentence2'])
                    label.append(d['gold_label'])

        print('# of', tier, 'samples :', len(label), end=' | ')
        print('Entailment :', label.count('entailment'), '| Neutral :', label.count('neutral'), '| Contradiction :',
              label.count('contradiction'))
        return premise, hypothesis, label

    def _PadSeq(self, text):
        """Convert texts to index sequences and post-pad them to length 200."""
        sequences = self.tokenizer.texts_to_sequences(text)
        return pad_sequences(sequences, maxlen=200, padding='post')

    def _labelToVec(self, labels):
        """One-hot encode labels as [entailment, contradiction, neutral].

        Raises:
            ValueError: If a label is not one of the three known classes.
        """
        one_hot = {
            'entailment': [1.0, 0.0, 0.0],
            'contradiction': [0.0, 1.0, 0.0],
            'neutral': [0.0, 0.0, 1.0],
        }
        vec = []
        for label in labels:
            if label not in one_hot:
                raise ValueError('Unknown label %s' % (label))
            vec.append(list(one_hot[label]))
        return np.array(vec)

    def _is_number(self, str):
        """Return True if ``float(str)`` succeeds, False on ``ValueError``."""
        try:
            float(str)
        except ValueError:
            return False
        return True

    def get_embedding_matrix(self, glove_path='glove.840B.300d.txt', embedding_dim=300):
        """Load GloVe vectors and build the embedding matrix for the tokenizer.

        Each line of the file is expected to be ``<word> <v1> ... <vN>``. A word
        may contain spaces, so leading fields are appended to the word until the
        first field that parses as a float.

        Args:
            glove_path: Path of the vectors file (defaults to the previously
                hard-coded file name).
            embedding_dim: Width of the returned matrix (defaults to 300).

        Returns:
            ``(embedding_matrix, embeddings_index)``: the matrix has shape
            ``(len(word_index) + 1, embedding_dim)`` with row ``i`` holding the
            vector of the word whose tokenizer index is ``i`` (all-zero when the
            word is absent from the file); ``embeddings_index`` maps every word
            read from the file to its float32 vector.

        Raises:
            OSError: If ``glove_path`` cannot be opened.
        """
        embeddings_index = {}
        # `with` guarantees the file is closed; the earlier try/finally raised
        # NameError on an unbound `f` when open() itself failed.
        with open(glove_path, encoding='utf8') as f:
            for line_no, line in enumerate(f, start=1):
                values = line.split()
                if values:  # tolerate blank lines instead of failing on values[0]
                    word = values[0]
                    index = 1
                    for value in values[1:]:
                        if not self._is_number(value):
                            word += ' ' + value
                            index += 1
                        else:
                            break
                    embeddings_index[word] = np.asarray(values[index:], dtype='float32')
                if line_no % 10000 == 0:
                    print("processed %d" % line_no)
        print("building embedding index finished!")

        embedding_matrix = np.zeros((len(self.tokenizer.word_index) + 1, embedding_dim))
        # Keras tokenizer indices start at 1 (0 is reserved for padding), so the
        # row for a word is its index itself. The former `i + 1` shifted every
        # vector by one row and ran past the end of the matrix for the last word.
        for word, i in self.tokenizer.word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                embedding_matrix[i] = embedding_vector
        return embedding_matrix, embeddings_index

    def get_voc_size(self):
        """Return the number of distinct words known to the tokenizer."""
        return len(self.tokenizer.word_index)

    def _load_splits_and_fit(self):
        """Load train/dev/test and fit the tokenizer on all premises and hypotheses."""
        train = self._load_data('train')
        dev = self._load_data('dev')
        test = self._load_data('test')
        self.tokenizer.fit_on_texts(train[0] + train[1] + dev[0] + dev[1] + test[0] + test[1])
        return train, dev, test

    def get_tokenizer(self):
        """Fit the shared tokenizer on every split and return it."""
        self._load_splits_and_fit()
        return self.tokenizer

    def generate_training_data(self):
        """Fit the tokenizer and build padded inputs and one-hot labels for each split.

        Returns:
            ``(train_x, train_y, dev_x, dev_y, test_x, test_y)`` where every
            ``*_x`` is ``[padded_premises, padded_hypotheses]``. The same values
            are also stored on the instance.
        """
        train, dev, test = self._load_splits_and_fit()

        self.train_y = self._labelToVec(train[2])
        self.train_x = [self._PadSeq(train[0]), self._PadSeq(train[1])]
        self.dev_y = self._labelToVec(dev[2])
        self.dev_x = [self._PadSeq(dev[0]), self._PadSeq(dev[1])]
        self.test_y = self._labelToVec(test[2])
        self.test_x = [self._PadSeq(test[0]), self._PadSeq(test[1])]
        return self.train_x, self.train_y, self.dev_x, self.dev_y, self.test_x, self.test_y



# Scratch check: reorder three parallel lists together so that aligned items stay aligned.
a=["asda","cdc","cdscd"]
b=["cdcd","sdwde","xsas"]
c=["a","b","c"]

# Zip the lists into row tuples, shuffle the rows in place, then unzip again.
# NOTE(review): `shuffle` is not imported in this cell; presumably random.shuffle — confirm.
d=list(zip(a,b,c))
shuffle(d)
# zip(*d) yields tuples, so a, b and c are rebound to tuples here (hence list(...) below).
a,b,c = zip(*d)


print(d)
print(list(a),list(b),list(c))

[('asda', 'cdcd', 'a'), ('cdc', 'sdwde', 'b'), ('cdscd', 'xsas', 'c')]
['asda', 'cdc', 'cdscd'] ['cdcd', 'sdwde', 'xsas'] ['a', 'b', 'c']


In [10]:
from keras.models import Model
from keras.layers import Input, Dense, LSTM, GlobalAvgPool1D, CuDNNLSTM,Permute
from keras.layers import GlobalMaxPool1D, Embedding, Bidirectional,Dot, Lambda, Softmax
from keras.layers import Concatenate, Subtract, Multiply, Dropout,Layer
import keras.backend as K
import tensorflow as tf
class ESIM:
    """Sentence-pair classifier assembled from three blocks (ESIM layout).

    The constructor creates the two integer inputs (``premise`` and
    ``hypothesis``) and immediately builds the Keras ``Model`` into
    ``self.model``: a shared embedding + BiLSTM encoder, a soft-alignment
    (local inference) block, and a BiLSTM + pooling + dense composition block.
    The model is returned uncompiled; ``learning_rate`` is only stored.
    """

    def __init__(self, n_classes, max_sequence_length, embedding_matrix, voc_size, learning_rate=0.0004, use_gpu=False):
        """Store the hyper-parameters and build the model.

        Args:
            n_classes: Size of the final softmax layer.
            max_sequence_length: Length of both input sequences.
            embedding_matrix: Initial weights of the embedding layer; must have
                ``voc_size + 1`` rows and 300 columns to match the layer.
            voc_size: Vocabulary size (the embedding layer uses ``voc_size + 1`` rows).
            learning_rate: Stored on the instance; not used by any method of this class.
            use_gpu: Select ``CuDNNLSTM`` instead of ``LSTM`` in the input encoder.
        """
        self._max_sequence_length = max_sequence_length
        self._learning_rate = learning_rate
        self._n_classes = n_classes
        self._embedding_matrix = embedding_matrix
        self._voc_size = voc_size
        self._use_gpu = use_gpu

        self._inputEncodingBlock = None
        self._localInferenceBlock = None
        self._compositionBlock = None

        self._premise = Input(name='premise', shape=(self._max_sequence_length,), dtype='int32')
        self._hypothesis = Input(name='hypothesis', shape=(self._max_sequence_length,), dtype='int32')
        self.model = self.build_ESIM_model()

    def _input_encoding_block(self):
        """Embed both inputs and encode them with one shared bidirectional LSTM.

        Returns:
            ``(a_bar, b_bar)``: encoded premise and hypothesis sequences.
        """
        # One embedding layer is applied to both inputs, so its weights are shared.
        embedding_layer = Embedding(self._voc_size + 1,
                                    300,weights=[self._embedding_matrix],
                                    input_length=self._max_sequence_length,
                                    trainable=True, mask_zero=True)

        premise_embedded_sequences = embedding_layer(self._premise)
        hypothesis_embedded_sequences = embedding_layer(self._hypothesis)

        # NOTE(review): the GPU branch has no dropout while the CPU branch uses 0.5,
        # and the embedding above emits a mask — confirm CuDNNLSTM accepts that.
        if self._use_gpu:
            encoding_layer = Bidirectional(CuDNNLSTM(300,return_sequences=True))
        else:
            encoding_layer = Bidirectional(LSTM(300, dropout=0.5, return_sequences=True))

        a_bar = encoding_layer(premise_embedded_sequences)
        b_bar = encoding_layer(hypothesis_embedded_sequences)

        return a_bar, b_bar

    def _local_inference_block(self, a_bar, b_bar):
        """Soft-align the two encoded sequences and build the enhanced features.

        Returns:
            ``(m_a, m_b)``: for each side, the concatenation of the encoding, its
            aligned counterpart, their difference and their element-wise product.
        """
        # Dot product over the feature axis: one score per (premise step, hypothesis step).
        attention_weights = Dot(axes=-1)([a_bar, b_bar])

        # Normalise over premise steps: weights used to summarise a_bar per hypothesis step.
        weight_b = Softmax(axis=1)(attention_weights)

        # Normalise over hypothesis steps, then swap axes so the contraction below is on axis 1.
        weight_a = Permute((2,1))(Softmax(axis=2)(attention_weights))

        b_aligned = Dot(axes=1)([weight_b, a_bar])

        a_aligned = Dot(axes=1)([weight_a, b_bar])

        m_a = Concatenate()([a_bar, a_aligned, Subtract()([a_bar, a_aligned]), Multiply()([a_bar, a_aligned])])

        m_b = Concatenate()([b_bar, b_aligned, Subtract()([b_bar, b_aligned]), Multiply()([b_bar, b_aligned])])
        return m_a, m_b


    def _inference_composition_block(self, m_a, m_b):
        """Compose the enhanced features, pool them and classify.

        Returns:
            The softmax output tensor with ``n_classes`` units.
        """
        y_a = Bidirectional(LSTM(300, return_sequences=True))(m_a)
        y_b = Bidirectional(LSTM(300, return_sequences=True))(m_b)

        # NOTE(review): the two mask-aware pooling layers defined below are not used;
        # the model pools with the stock GlobalMaxPool1D / GlobalAvgPool1D layers.
        class GlobalAvgPool1DMasked(Layer):
            def __init__(self, **kwargs):
                super(GlobalAvgPool1DMasked, self).__init__(**kwargs)
                # Set after the base initialiser, which resets this flag.
                self.supports_masking = True

            def compute_mask(self, inputs, mask=None):
                return None

            def call(self, inputs, mask=None):
                if mask is not None:
                    mask = K.cast(mask, K.floatx())
                    mask = K.repeat(mask, inputs.shape[-1])
                    mask = tf.transpose(mask, [0, 2, 1])
                    inputs = inputs * mask
                    return K.sum(inputs, axis=1) / K.sum(mask, axis=1)
                else:
                    print('not mask average!')
                    # Plain average over the time axis. The base Layer.call returns
                    # its input unchanged, which would contradict compute_output_shape.
                    return K.mean(inputs, axis=1)

            def compute_output_shape(self, input_shape):
                return (input_shape[0], input_shape[2])

        class GlobalMaxPool1DMasked(GlobalMaxPool1D):
            def __init__(self, **kwargs):
                super(GlobalMaxPool1DMasked, self).__init__(**kwargs)
                # Set after the base initialiser, which resets this flag.
                self.supports_masking = True

            def compute_mask(self, inputs, mask=None):
                return None

            def call(self, inputs, mask=None):
                return super(GlobalMaxPool1DMasked, self).call(inputs)

        max_pooling_a = GlobalMaxPool1D()(y_a)
        avg_pooling_a = GlobalAvgPool1D()(y_a)

        max_pooling_b = GlobalMaxPool1D()(y_b)
        avg_pooling_b = GlobalAvgPool1D()(y_b)

        y = Concatenate()([avg_pooling_a, max_pooling_a, avg_pooling_b, max_pooling_b])
        y = Dense(1024, activation='tanh')(y)
        y = Dropout(0.5)(y)
        y = Dense(1024, activation='tanh')(y)
        y = Dropout(0.5)(y)
        y = Dense(self._n_classes, activation='softmax')(y)
        return y

    def build_ESIM_model(self):
        """Wire the three blocks together and return the (uncompiled) Keras model."""
        a_bar, b_bar = self._input_encoding_block()
        m_a, m_b = self._local_inference_block(a_bar, b_bar)
        y = self._inference_composition_block(m_a, m_b)
        model = Model(inputs=[self._premise, self._hypothesis], outputs=[y])

        # summary() prints the table itself and returns None, so wrapping it in
        # print() only appended a stray "None" line.
        model.summary()
        return model

In [11]:
# Build the SNLI helper, load the GloVe vectors and fit the shared tokenizer.
# NOTE(review): get_embedding_matrix() sizes its matrix from tokenizer.word_index, but the
# tokenizer is only fitted inside get_tokenizer(), which runs afterwards. If word_index is
# still empty at this point, `embedding_matrix` is a single all-zero row and only
# `word_embedding` (the word -> vector dict) is meaningful — confirm the intended order.
corpus_snli = SNLICorpus()
embedding_matrix, word_embedding = corpus_snli.get_embedding_matrix()
tokenizer = corpus_snli.get_tokenizer()

processed 10000
processed 20000
processed 30000
processed 40000
processed 50000
processed 60000
processed 70000
processed 80000
processed 90000
processed 100000
processed 110000
processed 120000
processed 130000
processed 140000
processed 150000
processed 160000
processed 170000
processed 180000
processed 190000
processed 200000
processed 210000
processed 220000
processed 230000
processed 240000
processed 250000
processed 260000
processed 270000
processed 280000
processed 290000
processed 300000
processed 310000
processed 320000
processed 330000
processed 340000
processed 350000
processed 360000
processed 370000
processed 380000
processed 390000
processed 400000
processed 410000
processed 420000
processed 430000
processed 440000
processed 450000
processed 460000
processed 470000
processed 480000
processed 490000
processed 500000
processed 510000
processed 520000
processed 530000
processed 540000
processed 550000
processed 560000
processed 570000
processed 580000
processed 590000
proces

In [12]:
# Build the wiki reader around the fitted tokenizer and keep only what get_corpus() returns.
corpus = WikiCorpus(tokenizer).get_corpus()

start read 1file:wiki-pages-text/wiki-009.txt
start read 2file:wiki-pages-text/wiki-021.txt
start read 3file:wiki-pages-text/wiki-035.txt
start read 4file:wiki-pages-text/wiki-034.txt
start read 5file:wiki-pages-text/wiki-020.txt
start read 6file:wiki-pages-text/wiki-008.txt
start read 7file:wiki-pages-text/wiki-036.txt
start read 8file:wiki-pages-text/wiki-022.txt
start read 9file:wiki-pages-text/wiki-023.txt
start read 10file:wiki-pages-text/wiki-037.txt
start read 11file:wiki-pages-text/wiki-033.txt
start read 12file:wiki-pages-text/wiki-027.txt
start read 13file:wiki-pages-text/wiki-026.txt
start read 14file:wiki-pages-text/wiki-032.txt
start read 15file:wiki-pages-text/wiki-024.txt
start read 16file:wiki-pages-text/wiki-030.txt
start read 17file:wiki-pages-text/wiki-018.txt
start read 18file:wiki-pages-text/wiki-019.txt
start read 19file:wiki-pages-text/wiki-031.txt
start read 20file:wiki-pages-text/wiki-025.txt
start read 21file:wiki-pages-text/wiki-042.txt
start read 22file:wiki

In [None]:
from collections import defaultdict
# Run document selection (and sentence ranking) for every dev-set claim and
# write the results to prediction.json.
prediction = defaultdict(dict)
num = 0
doc_selection = DocSelection(corpus)
sentence_selection = SentenceRank(corpus, word_embedding)

# Read the whole dev set first so the input file is not held open during the loop.
with open('devset.json', 'r') as f:
    data = json.load(f)

for key, entry in data.items():
    claim = entry['claim']
    selected_docs = doc_selection.select_docs(claim)
    # NOTE(review): the ranked sentences are computed but never stored in the
    # prediction — confirm whether 'evidence' should hold these instead of the titles.
    selected_sentences = sentence_selection.select_sentence_by_embedding(selected_docs, claim)
    prediction[key]['claim'] = claim
    # select_docs returns a set, which json.dump cannot serialise; store a sorted
    # list so the file can be written and its content is deterministic.
    prediction[key]['evidence'] = sorted(selected_docs)
    num += 1
    if num %50 == 0:
        print(num)

with open('prediction.json', 'w') as f:
    json.dump(prediction, f)

processed 10000 titles, total 5396106
processed 20000 titles, total 5396106
processed 30000 titles, total 5396106
processed 40000 titles, total 5396106
processed 50000 titles, total 5396106
processed 60000 titles, total 5396106
processed 70000 titles, total 5396106
processed 80000 titles, total 5396106
processed 90000 titles, total 5396106
processed 100000 titles, total 5396106
processed 110000 titles, total 5396106
processed 120000 titles, total 5396106
processed 130000 titles, total 5396106
processed 140000 titles, total 5396106
processed 150000 titles, total 5396106
processed 160000 titles, total 5396106
processed 170000 titles, total 5396106
processed 180000 titles, total 5396106
processed 190000 titles, total 5396106
processed 200000 titles, total 5396106
processed 210000 titles, total 5396106
processed 220000 titles, total 5396106
processed 230000 titles, total 5396106
processed 240000 titles, total 5396106
processed 250000 titles, total 5396106
processed 260000 titles, total 539

processed 2090000 titles, total 5396106
processed 2100000 titles, total 5396106
processed 2110000 titles, total 5396106
processed 2120000 titles, total 5396106
processed 2130000 titles, total 5396106
processed 2140000 titles, total 5396106
processed 2150000 titles, total 5396106
processed 2160000 titles, total 5396106
processed 2170000 titles, total 5396106
processed 2180000 titles, total 5396106
processed 2190000 titles, total 5396106


In [None]:
import pickle
# Persist the title index built by DocSelection so it does not have to be rebuilt.
# Three fixes: the file is opened for binary *writing* ("wb", not "rb"), the open
# handle is passed to pickle.dump (the name `wb` was undefined), and the attribute
# is `title_index` (`title_` does not exist on DocSelection).
with open("title_index", "wb") as index_file:
    pickle.dump(doc_selection.title_index, index_file)