In [2]:
import sys, argparse, os, ujson as json, time, nltk, random
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import StanfordNERTagger
from nltk.tag.stanford import CoreNLPNERTagger
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation
from tqdm import tqdm
sys.stdout.flush()

In [2]:
def display_topics(model, questions, feature_names, no_top_words):
    ret = []
    for rand_words_line in questions:
        rand_words = rand_words_line.split()
        max_val = {}
        null_sentence = ""
        for topic_idx, topic in enumerate(model.components_):
            topic_sorted = topic.argsort()[:-len(topic) - 1:-1]
            sentence = " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]])
            if(null_sentence == ""):
                null_sentence = sentence
            for idx, x in enumerate(topic_sorted):
                if(feature_names[idx] in rand_words):
                    max_val[x] = sentence

        if(len(max_val) == 0):
            ret.append(null_sentence)
        for k in reversed(sorted(max_val.keys())):
            ret.append(max_val[k])
            break
    return ret

def call_strat(documents, ques, no_features, no_topics, no_top_words):
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(documents)
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
    tf = tf_vectorizer.fit_transform(documents)
    tf_feature_names = tf_vectorizer.get_feature_names()

    lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)
    return display_topics(lda, ques, tf_feature_names, no_top_words)

def parse_documents(raw_input, no_features, no_topics, no_top_words):
    json_data=open(raw_input)
    data = json.load(json_data)['data']
    json_data.close()
    output_dataset = {'qids': [], 'questions': [], 'answers': [], 'contexts': [], 'qid2cid': []}
    article_topics = []
    for article in tqdm(data):
        docs = []
        ques = []
        for paragraph in article['paragraphs']:
            docs.append(paragraph['context'])
            for qa in paragraph['qas']:
                ques.append(qa['question'])
        article_topics.append(call_strat(docs, ques, no_features, no_topics, no_top_words))
    return article_topics

def main(raw_input):
    no_features = 1000
    no_topics = 20
    no_top_words = 5
    return parse_documents(raw_input, no_features, no_topics, no_top_words)

topic_models_dev = main("./data/datasets_without_topics_new/dev-v1.1.json")
topic_models_train = main("./data/datasets_without_topics_new/train-v1.1.json")

100%|██████████| 48/48 [00:42<00:00,  1.12it/s]
100%|██████████| 442/442 [06:09<00:00,  1.20it/s]


In [3]:
def tokenize(texts):
    wordnet_lemmatizer = WordNetLemmatizer()
    outputs = []
    print("Lenght of texts:", len(texts))
    for text in tqdm(texts):
        chars_l = []
        lemma_l = []
        ner_tags = []
        offsets_l = []
        #tokens = [token for token in nltk.wordpunct_tokenize(text)]
        #tokens = [token for token in nltk.word_tokenize(text)]
        #text = "Who was Beyonce's duet with in ''Beautiful Liar''? video school campaign music concert"
        tokens = [token.replace("``", '"').replace("''", '"').replace('\n', ' ').replace('"', '\"') for token in nltk.word_tokenize(text)]
        words = tokens
        tags_l = nltk.pos_tag(words)
        pos_tags = [x[1] for x in tags_l]
        
        for word in words:
            chars_l.append(word[0])
            lemma_l.append(wordnet_lemmatizer.lemmatize(word.lower()))
        
        for chunk in nltk.ne_chunk(nltk.pos_tag(words)):
            if hasattr(chunk, 'label'):
                for i in range(len(chunk)):
                    ner_tags.append(chunk.label().replace('ORGANIZATION', 'ORG'))
            else:
                ner_tags.append('')
        
        cs = get_char_word_loc_mapping(text, words)
        span = []
        last_span = -1
        if(bool(cs) == False):
            #print("Tokens empty", text)
            print("Tokens empty")
            chars_l = []
            lemma_l = []
            ner_tags = []
            offsets_l = []
            tokens = [token for token in nltk.wordpunct_tokenize(text)]
            
            words = tokens
            tags_l = nltk.pos_tag(words)
            pos_tags = [x[1] for x in tags_l]

            for word in words:
                chars_l.append(word[0])
                lemma_l.append(wordnet_lemmatizer.lemmatize(word.lower()))

            for chunk in nltk.ne_chunk(nltk.pos_tag(words)):
                if hasattr(chunk, 'label'):
                    for i in range(len(chunk)):
                        ner_tags.append(chunk.label().replace('ORGANIZATION', 'ORG'))
                else:
                    ner_tags.append('')

            cs = get_char_word_loc_mapping(text, words)
            span = []
            last_span = -1
            if(bool(cs) == False):
                print("Still empty", text)
            else:
                print("Not empty")
                for key, val in cs.items():
                    (i, j) = val
                    if((last_span == -1) or (last_span <= key)):
                        span.append((key, key + len(i)))
                        last_span = key + len(i)
        else:
            for key, val in cs.items():
                (i, j) = val
                if((last_span == -1) or (last_span <= key)):
                    span.append((key, key + len(i)))
                    last_span = key + len(i)
        
        output = {'words':words,'chars':chars_l,'offsets':span,'pos':pos_tags,'lemma':lemma_l,'ner': ner_tags, 'cs': cs}
        outputs.append(output)
    return outputs

def find_answer(offsets, begin_offset, end_offset):
    start = [i for i, tok in enumerate(offsets) if tok[0] == begin_offset]
    end = [i for i, tok in enumerate(offsets) if tok[1] == end_offset]
    if(len(start) == 1 and len(end) == 1):
        return start[0], end[0]
    else:
        return None

def process_dataset(data):
    cnt = 10
    print("Length of process_data_ctx", len(data['contexts']))
    print("Length of process_data_q", len(data['questions']))
    q_tokens = tokenize(data['questions'])
    c_tokens = tokenize(data['contexts'])
    abc = []
    f = open('out1.txt', 'w')
    for idx in range(len(data['qids'])):
        marker = c_tokens[data['qid2cid'][idx]]['offsets']
        ans_tokens = []
        if len(data['answers']) > 0:
            for ans_idx, ans in enumerate(data['answers'][idx]):
                found = find_answer(marker, ans['answer_start'], ans['answer_start'] + len(ans['text']))
                if found:
                    ans_tokens.append(found)
                else:
                    abc.append(1)
                    f.write(str(len(abc)) + " Not found for " + str(ans_idx) + str(data['contexts'][data['qid2cid'][idx]][ans['answer_start']: (ans['answer_start'] + len(ans['text']))]) + "\t" + str(ans['answer_start']) + " " + str(ans['answer_start'] + len(ans['text'])) + "\n")
                    f.write(json.dumps(c_tokens[data['qid2cid'][idx]]['cs']) + "\n")
                    f.write(json.dumps(marker) + "\n\n")
        yield {
           'id':data['qids'][idx], 'question': q_tokens[idx]['words'],
           'ques_char': q_tokens[idx]['chars'],
           'ctxt': c_tokens[data['qid2cid'][idx]]['words'],
           'ctxt_char': c_tokens[data['qid2cid'][idx]]['chars'],
           'ans_pos': c_tokens[data['qid2cid'][idx]]['offsets'],
           'ans': ans_tokens, 'qlemma': q_tokens[idx]['lemma'],
           'ques_pos': q_tokens[idx]['pos'], 'qner': q_tokens[idx]['ner'],
           'ctxt_lemma': c_tokens[data['qid2cid'][idx]]['lemma'],
           'ctxt_pos': c_tokens[data['qid2cid'][idx]]['pos'],
           'ctxt_ner': c_tokens[data['qid2cid'][idx]]['ner'],
        }

def get_char_word_loc_mapping(context, context_tokens):
    acc = ''
    current_token_idx = 0
    mapping = dict()

    for char_idx, char in enumerate(context):
        if char != u' ' and char != u'\n' and char!=chr(8201) and char !=chr(12288) and char!=chr(8239):
            acc += char
            context_token = context_tokens[current_token_idx]
            if acc == context_token:
                syn_start = char_idx - len(acc) + 1
                for char_loc in range(syn_start, char_idx+1):
                    mapping[char_loc] = (acc, current_token_idx)
                acc = ''
                current_token_idx += 1

    if current_token_idx != len(context_tokens):
        return None
    else:
        return mapping

topic_models_dev = []
topic_models_train = []
for args_split, topic_models in [("dev-v1.1", topic_models_dev), ("train-v1.1", topic_models_train)]:
    t0 = time.time()
    in_file = os.path.join("./data/datasets_without_topics_new", args_split + '.json')
    json_data=open(in_file)
    data = json.load(json_data)['data']
    json_data.close()
    print("Length of data1", len(data))
    output_dataset = {'qids': [], 'questions': [], 'answers': [], 'contexts': [], 'qid2cid': []}
    for article_idx, article in enumerate(data):
        article_q_cnt = 0
        for paragraph_idx, paragraph in enumerate(article['paragraphs']):
            output_dataset['contexts'].append(paragraph['context'])
            for qa_idx, qa in enumerate(paragraph['qas']):
                output_dataset['qids'].append(qa['id'])
                #output_dataset['questions'].append(qa['question'] + " " + topic_models[article_idx][article_q_cnt])
                output_dataset['questions'].append(qa['question'])# + " " + topic_models[article_idx][article_q_cnt])
                output_dataset['qid2cid'].append(len(output_dataset['contexts']) - 1)
                if 'answers' in qa:
                    output_dataset['answers'].append(qa['answers'])
                article_q_cnt = article_q_cnt + 1
    dataset = output_dataset
    out_file = os.path.join("./data/datasets_without_topics_new", args_split + "-processed" + ".txt")
    print('Writing to %s' % out_file, file=sys.stderr)
    with open(out_file, 'w') as f:
        for ex in process_dataset(dataset):
            f.write(json.dumps(ex) + '\n')
    print('Time: %.4f (s)' % (time.time() - t0))


Writing to ./data/datasets_without_topics_new/dev-v1.1-processed.txt
  0%|          | 0/10570 [00:00<?, ?it/s]

Length of data1 48
Length of process_data_ctx 2067
Length of process_data_q 10570
Lenght of texts: 10570


100%|██████████| 10570/10570 [00:48<00:00, 219.76it/s]
  0%|          | 3/2067 [00:00<01:22, 24.89it/s]

Lenght of texts: 2067


 75%|███████▍  | 1541/2067 [01:15<00:25, 20.53it/s]

Tokens empty
Not empty


100%|██████████| 2067/2067 [01:40<00:00, 20.49it/s]


Time: 150.9741 (s)


Writing to ./data/datasets_without_topics_new/train-v1.1-processed.txt
  0%|          | 0/87599 [00:00<?, ?it/s]

Length of data1 442
Length of process_data_ctx 18896
Length of process_data_q 87599
Lenght of texts: 87599


  0%|          | 383/87599 [00:01<07:07, 204.02it/s]

Tokens empty
Not empty
Tokens empty
Not empty
Tokens empty
Not empty


  1%|          | 469/87599 [00:02<07:10, 202.47it/s]

Tokens empty
Not empty
Tokens empty
Not empty
Tokens empty
Not empty
Tokens empty
Not empty


 78%|███████▊  | 68210/87599 [04:54<01:23, 231.58it/s]

Tokens empty
Not empty


 96%|█████████▌| 83992/87599 [06:01<00:15, 232.56it/s]

Tokens empty
Not empty


100%|██████████| 87599/87599 [06:15<00:00, 233.13it/s]
  0%|          | 2/18896 [00:00<21:32, 14.62it/s]

Lenght of texts: 18896


 60%|██████    | 11376/18896 [08:19<05:30, 22.76it/s]

Tokens empty
Not empty


100%|██████████| 18896/18896 [14:30<00:00, 21.70it/s]


Time: 1257.8500 (s)
