In [1]:
import os
import sys
import random
import pandas as pd
import numpy as np
from pandas import DataFrame
from gensim import corpora, models, similarities
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from stop_words import get_stop_words

In [17]:
# Load the FiQA training tables (tab-separated) and the vocabulary table (CSV)
# from the sibling ../data directory. Later cells read the 'qid' / 'docid'
# columns of these frames and the 'token' / 'word' columns of `vocabulary`.
train_doc_question = pd.read_csv('../data/FiQA_train_question_doc_final.tsv', sep='\t')
train_question = pd.read_csv('../data/FiQA_train_question_final.tsv', sep='\t')
train_doc = pd.read_csv('../data/FiQA_train_doc_final.tsv', sep='\t')
vocabulary = pd.read_csv('../data/vocabulary.csv')

In [21]:
# Lookup table keyed by the 'token' column with the 'word' column as value.
# NOTE(review): text_to_id() indexes this dict with stemmed word strings, so
# 'token' presumably holds the string and 'word' the integer id — confirm
# against vocabulary.csv.
dictionary = dict(zip(vocabulary['token'], vocabulary['word']))
# Number of distinct keys in the lookup table.
vocab_size = len(dictionary)
# Special ids for the padding and <GO> separator symbols, placed just past
# vocab_size. NOTE(review): assumes real ids never exceed vocab_size — confirm.
VOCAB_PAD_ID = vocab_size + 1
VOCAB_GO_ID = vocab_size + 2

In [2]:
def splitWordByLibrary(documents):
    """Tokenize, stop-word-filter and stem every document.

    Each document is lower-cased, split into runs of word characters,
    stripped of English stop words and reduced to its Porter stems.

    Args:
        documents: iterable of strings (anything exposing ``.lower()``).

    Returns:
        list[list[str]]: one list of stemmed tokens per input document,
        in the same order as ``documents``.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    # A set makes the per-token stop-word test O(1); the library hands back a
    # list, which would be scanned linearly for every single token.
    en_stop = set(get_stop_words('en'))
    p_stemmer = PorterStemmer()

    texts = []
    for document in documents:
        # clean and tokenize document string
        tokens = tokenizer.tokenize(document.lower())

        # drop stop words, then stem what is left
        texts.append([p_stemmer.stem(token) for token in tokens if token not in en_stop])

    return texts

In [3]:
def padding(lis, pad, size):
    """Return ``lis`` adjusted to exactly ``size`` elements.

    Shorter sequences are right-padded with ``pad``; longer ones are
    truncated. The input is never modified: the previous implementation
    extended the caller's list in place (``+=``) when padding, which silently
    corrupted the source data and made repeated calls non-idempotent.

    Args:
        lis: sequence to pad or truncate.
        pad: filler value appended when ``lis`` is too short.
        size: target length.

    Returns:
        A new sequence of length ``size`` (a list when padding was needed,
        otherwise a slice of ``lis``).
    """
    missing = size - len(lis)
    if missing > 0:
        # Build a fresh list instead of extending the argument in place.
        return list(lis) + [pad] * missing
    return lis[0:size]
    

In [4]:
def pack_question_n_utterance(q, doc, VOCAB_PAD_ID, VOCAB_GO_ID, q_length = 20, doc_length = 99):
    """Join a question and a document into one fixed-length sequence.

    Both parts are brought to their target length with ``padding`` and then
    concatenated with a single ``VOCAB_GO_ID`` separator between them.

    Args:
        q: question sequence.
        doc: document sequence.
        VOCAB_PAD_ID: filler value used when a part is too short.
        VOCAB_GO_ID: separator placed between question and document.
        q_length: target length of the question part.
        doc_length: target length of the document part.

    Returns:
        Sequence of length ``q_length + 1 + doc_length``.

    Raises:
        AssertionError: if either part does not have its target length
            after padding.
    """
    padded_question = padding(q, VOCAB_PAD_ID, q_length)
    padded_document = padding(doc, VOCAB_PAD_ID, doc_length)

    assert len(padded_question) == q_length, "question should be pad to q_length"
    assert len(padded_document) == doc_length, "doc should be pad to doc_length"

    separator = [VOCAB_GO_ID]
    return padded_question + separator + padded_document

In [5]:
def preprocess_data(data, VOCAB_PAD_ID, VOCAB_GO_ID, question_max_length = 20, doc_max_length = 99):
    """Turn labelled question/doc items into fixed-length model inputs.

    Args:
        data: iterable of mappings with ``'question'``, ``'doc'`` and
            ``'label'`` keys.
        VOCAB_PAD_ID: filler value used for padding.
        VOCAB_GO_ID: separator placed between question and document.
        question_max_length: length every question is padded/truncated to.
        doc_max_length: length every document is padded/truncated to.

    Returns:
        list of ``[x, label]`` pairs, where ``x`` has length
        ``question_max_length + 1 + doc_max_length``.

    Raises:
        AssertionError: if a packed sequence has the wrong length or lacks
            the ``VOCAB_GO_ID`` separator.
    """
    expected_length = doc_max_length + question_max_length + 1
    result = []
    for o in data:
        # Forward the requested lengths: previously the packer always used its
        # own defaults, so any non-default max length failed the check below.
        x = pack_question_n_utterance(
            o['question'], o['doc'], VOCAB_PAD_ID, VOCAB_GO_ID,
            q_length=question_max_length, doc_length=doc_max_length)
        y_ = o['label']
        assert len(x) == expected_length, "Wrong length afer padding"
        assert VOCAB_GO_ID in x, "<GO> must be in input x"
        result.append([x, y_])

    return result

In [6]:
def text_to_id(texts, dictionary):
    """Replace every word of every text by its entry in ``dictionary``.

    Args:
        texts: iterable of word sequences.
        dictionary: mapping looked up with each word.

    Returns:
        list[list]: for each text, the list of looked-up values, in order.

    Raises:
        KeyError: if a word is not a key of ``dictionary``.
    """
    return [[dictionary[word] for word in text] for text in texts]

In [7]:
def create_input(questions, docs, dictionary, isPositive = 1):
    """Build labelled question/doc items encoded as id sequences.

    Args:
        questions: list of tokenized questions.
        docs: list of tokenized documents, aligned with ``questions``.
        dictionary: mapping used by ``text_to_id`` to encode each word.
        isPositive: label stored on every item.

    Returns:
        list[dict]: one ``{'question', 'doc', 'label'}`` item per question.

    Raises:
        KeyError: if a word is missing from ``dictionary``.
        IndexError: if ``docs`` is shorter than ``questions``.
    """
    questions_id = text_to_id(questions, dictionary)
    docs_id = text_to_id(docs, dictionary)
    # Store the encoded sequences. The previous version computed the ids but
    # then stored the raw token lists, so downstream padding mixed word
    # strings with integer pad ids.
    return [
        {'question': questions_id[i], 'doc': docs_id[i], 'label': isPositive}
        for i in range(len(questions))
    ]

In [8]:
def create_negative_input(questions, docs, nums):
    """Draw ``nums`` questions and ``nums`` documents independently at random.

    The two samples are drawn separately, so the i-th question and the i-th
    document are a random pairing. Nothing here checks that a drawn pair is
    not also a true question/answer pair.

    Args:
        questions: sequence to sample questions from.
        docs: sequence to sample documents from.
        nums: number of items to draw from each sequence.

    Returns:
        list: ``[negative_questions, negative_docs]``, each a list of
        ``nums`` items sampled without replacement.

    Raises:
        ValueError: if ``nums`` exceeds the length of either sequence.
    """
    # Sample from the arguments; the previous code passed the builtin `list`
    # type to random.sample, which raises TypeError.
    negative_questions = random.sample(questions, nums)
    negative_docs = random.sample(docs, nums)

    return [negative_questions, negative_docs]

In [114]:
def get_train_data():
    """Collect the question and document text for every training pair.

    Reads the module-level frames ``train_question``, ``train_doc`` and
    ``train_doc_question``. For each (qid, docid) row of
    ``train_doc_question`` the matching question and document entries are
    looked up and appended, so the two returned lists are aligned.

    Returns:
        list: ``[questions, docs]``, two lists with one entry per row of
        ``train_doc_question``.

    Raises:
        KeyError: if a qid or docid has no row in its lookup table.
    """
    # id -> list of that row's remaining column values
    qdic = train_question.set_index('qid').T.to_dict('list')
    docdic = train_doc.set_index('docid').T.to_dict('list')

    questions = []
    docs = []

    # question id and the corresponding doc id, walked in row order
    for qid, docid in zip(train_doc_question['qid'], train_doc_question['docid']):
        # Position 1 is the second non-key column of the row.
        # NOTE(review): presumably the text column — confirm against the TSV schema.
        questions.append(qdic[qid][1])
        docs.append(docdic[docid][1])

    # Return the accumulated list; the previous code returned `question`,
    # i.e. only the last question string.
    return [questions, docs]

In [116]:
# Fetch the aligned question/document texts of the training pairs, then
# tokenize, stop-word-filter and stem both sides.
questions, docs =  get_train_data()
# The texts are wrapped in a NumPy array before being iterated.
# NOTE(review): this presumably coerces non-string entries (e.g. NaN) to
# strings so that .lower() does not fail — confirm before removing.
positive_questions = splitWordByLibrary(np.array(questions))
positive_docs  = splitWordByLibrary(np.array(docs))

In [91]:
# Draw 1000 randomly paired questions and documents to serve as negatives.
# (A stray trailing ':' previously made this cell a SyntaxError.)
negative_questions, negative_docs = create_negative_input(positive_questions, positive_docs, 1000)

In [23]:
# Encode the true pairs (label 1) and the randomly drawn pairs (label 0).
positive_input_data = create_input(positive_questions, positive_docs, dictionary)
# Negatives must be labelled explicitly; the default label is 1.
negative_input_data = create_input(negative_questions, negative_docs, dictionary, isPositive=0)
# Both sides are plain lists of dicts, so concatenate them. np.vstack would
# treat each list as one row and either fail on unequal lengths or yield a
# 2-row array that preprocess_data cannot iterate item by item.
input_data = positive_input_data + negative_input_data
result = preprocess_data(input_data, VOCAB_PAD_ID, VOCAB_GO_ID)

NameError: name 'positive_questions' is not defined

In [79]:
# One row per example: 'data' holds the packed id sequence, 'label' its class.
# Build the frame straight from the [x, label] rows. Routing them through
# np.array() creates a ragged array, which recent NumPy versions reject.
df = DataFrame(result, columns=['data', 'label'])


In [75]:
# Write the training frame to ../data/train_baseline.csv using to_csv's
# default options.
df.to_csv('../data/train_baseline.csv')