In [1]:
import numpy as np
import gensim
import nltk
import inflect
import re
import pandas as pd

Couldn't import dot_parser, loading of dot files will not be possible.


In [2]:
# Load a set of pretrained word embeddings.
# Here we use Google's pretrained word2vec model (GoogleNews vectors).
# The download link can be found on this page:
# https://github.com/mmihaltz/word2vec-GoogleNews-vectors  (download the .gz file linked in the README)

path = './model/GoogleNews-vectors-negative300.bin'
# `Word2Vec.load_word2vec_format` was deprecated (and later removed) in favour
# of `KeyedVectors.load_word2vec_format`. Prefer the new entry point and fall
# back to the old one on gensim releases that predate `KeyedVectors`.
_embedding_loader = getattr(gensim.models, 'KeyedVectors', gensim.models.Word2Vec)
model = _embedding_loader.load_word2vec_format(path, binary=True)

In [6]:
# load data 

def clean_str(string):
    """
    Normalize a raw sentence into a lower-cased, space-separated token string.

    Adapted from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character other than letters, digits and ``( ) , ! ? ' ` `` is
         replaced by a space;
      2. the contraction suffixes 's, 've, n't, 're, 'd, 'll are split off
         from the preceding word;
      3. ``, ! ( ) ?`` are surrounded by spaces so they become separate tokens;
      4. runs of whitespace are collapsed, the ends are trimmed and the result
         is lower-cased.

    Unlike the original snippet, punctuation is padded with plain spaces: the
    original replacement strings (" \\( ", " \\) ", " \\? ") are not regular
    expressions, so their backslash was written into the output.

    :param string: raw sentence.
    :return: cleaned sentence.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # Detach contraction suffixes from the word they belong to.
    for suffix in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(suffix, " " + suffix)

    # Turn punctuation marks into stand-alone tokens (no stray backslashes).
    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")

    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

def load_test_data(path, file_name):
    """
    Read a header-less, tab-separated file of question pairs and clean both columns.

    :param path: location prefix of the file. It is concatenated directly with
        `file_name`, so it must already end with a path separator (e.g. 'data2/').
    :param file_name: name of the .tsv file; each row is
        hardcoded_question <TAB> user_question.
    :return: DataFrame with the columns 'hardcoded_question' and
        'user_question', every cell having been passed through `clean_str`.
    """
    df = pd.read_csv(path + file_name, sep='\t', header=None)
    df.columns = ['hardcoded_question', 'user_question']
    # Replace each column as a whole. The previous cell-by-cell chained
    # assignment (df[col][i] = ...) raises SettingWithCopyWarning and does not
    # modify `df` at all once pandas Copy-on-Write is enabled.
    for column in df.columns:
        df[column] = df[column].map(clean_str)
    return df

In [7]:
# Load the test set: a .tsv file whose rows are  hardcoded_question \t user_question.
path = 'data2/'
file_name = 'test_set2.tsv'
data = load_test_data(path, file_name)

# Flatten both columns into one list: all hardcoded questions first,
# followed by all user questions.
questions = data['hardcoded_question'].tolist()
questions.extend(data['user_question'].tolist())

In [8]:
keep_pos = ['NN', 'NNS', 'VB', 'JJ']

def filter_pos(question):
    '''
    Tokenize and POS-tag a sentence with nltk, keeping only the tokens whose
    tag is listed in the module-level `keep_pos`.

    Returns the surviving (word, POS) tuples in sentence order.
    '''
    kept = []
    for tagged_token in nltk.pos_tag(nltk.word_tokenize(question)):
        if tagged_token[1] in keep_pos:
            kept.append(tagged_token)
    return kept

def plural_to_singular(tagged_pos):
    '''
    Return the words of `tagged_pos`, with plural nouns put in singular form.

    Rationale given by the original author: similarity(A, A') > similarity(A, As').

    tagged_pos : list of (word, POS) tuples.

    Words tagged 'NNS' are singularized with inflect; every other word is
    returned unchanged. The result is a list of words in the input order.
    '''
    p = inflect.engine()
    q_singular = []
    for pos in tagged_pos:
        word = pos[0]
        if pos[1] == 'NNS':
            # singular_noun() returns False when inflect does not consider the
            # word a plural (the tagger and inflect can disagree); keep the
            # original word in that case instead of emitting False.
            word = p.singular_noun(word) or word
        q_singular.append(word)
    return q_singular

def embed_question(question):
    '''
    Embed a question as the mean word vector of its retained words.

    The question is filtered with `filter_pos`, its plural nouns are
    singularized with `plural_to_singular`, and the vectors of the resulting
    words that are present in `model.vocab` are averaged.

    Returns a numpy array with the same shape as one word vector. When none of
    the words is in the vocabulary the zero vector is returned (the previous
    code divided by zero and produced an all-NaN vector).
    '''
    tagged_words = filter_pos(question)
    words = plural_to_singular(tagged_words)
    # The vector of an arbitrary word ('random') is looked up only to learn the
    # embedding shape; this relies on that word being in the model.
    embedding = np.zeros(model['random'].shape)
    n_found = 0
    for word in words:
        if word in model.vocab:
            n_found += 1
            embedding += model[word]
    if n_found:
        embedding /= float(n_found)
    return embedding

In [9]:
# One embedding vector per question, in the same order as `questions`.
embedded_questions = list(map(embed_question, questions))

In [11]:
import csv

# Save the output files: one tab-separated embedding vector per row, and the
# cleaned questions in the same order, one per line.
with open('data2/question_embeddings.tsv', 'w') as embeddings_file:
    csv.writer(embeddings_file, delimiter='\t').writerows(embedded_questions)

with open('data2/questions.tsv', 'w') as questions_file:
    questions_file.writelines(question + '\n' for question in questions)