# Paraphrase Candidate Generation

To generate **non-academic** to **academic** word pairs for paraphrasing, we used the paraphrases (word pairs) in CoInCo, WordNet and PPDB as the starting point.

For the CoInCo dataset, we have included only those word pairs where: 1) the target word is non-academic, 2) the substitution candidate is academic, and 3) the target word has a higher word frequency than the substitute candidate in our academic resources. Since the academic resource is not exhaustive, some proper academic terms may be mistakenly considered **non-academic**.

We have collected a total of 23,476 word pairs from the CoInCo training set. The dataset is prepared with 4 candidates for each informal target, where 2 candidates are academic and 2 are non-academic. When we do not have enough appropriate candidates, we extract further candidates from WordNet and PPDB.

In [None]:
import random, pickle, re, pickle
from collections import Counter
import xml.etree.ElementTree as ET

from tqdm import tqdm_notebook

import pandas as pd
import numpy as np

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [None]:
# Seed NumPy's global random generator so values drawn from np.random are
# the same on every run.
np.random.seed(42) # to replicate the results

In [None]:
# Placeholder paths to the external resources; replace them before running.
CoInCo = '<path-to-coinco.xml>'
COCA_ALL = 'path-to-(COCA)allWords.xlsx>'
COMPILED_LIST = '<path-to-academic_keyphrases.xlsx>'
COCA_LIST = '<path-to-(COCA)acadCore.xlsx>'
NAWL = '<path-to-NAWL_Headwords.txt>'
ACL_FREQ = '<path-to-academic_unigrams.pkl>' # obtained while compiling the resources
BEAUTIFUL_DATA = '<path-to-(beautiful_data)count_1w.txt>'
GLOVE_PATH = 'path-to-glove.840B.300d.txt'

# Positional indices used to address the children of each element under the
# CoInCo XML root.
PRECONTEXT = 0
TARGETSENTENCE = 1
POSTCONTEXT = 2
TOKENS = 3

In [None]:
# Convert the GloVe text file to word2vec text format in a temporary file and
# load the result as gensim KeyedVectors.
# NOTE(review): datapath() is gensim's helper for its bundled test data;
# presumably GLOVE_PATH is an absolute path so it passes through unchanged - confirm.
glove_file = datapath(GLOVE_PATH)
tmp_file = get_tmpfile("test_word2vec.txt")
_ = glove2word2vec(glove_file, tmp_file)
model = KeyedVectors.load_word2vec_format(tmp_file)

In [None]:
# Parse the CoInCo XML file; `root` is iterated below, one child per sentence entry.
tree = ET.parse(CoInCo)
root = tree.getroot()

In [None]:
# Stripped target-sentence text of every entry under the XML root, in document order.
sentences = [entry[TARGETSENTENCE].text.strip() for entry in root]

In [None]:
# Shuffle in place with a fixed seed (9), then cut the list at 65% into a
# training portion and a test portion.
random.Random(9).shuffle(sentences)
train_cutoff = int(0.65 * len(sentences))
train_sentences, test_sentences = sentences[:train_cutoff], sentences[train_cutoff:]

In [None]:
# Sanity check: sizes of the train and test portions and their sum.
print(len(train_sentences), len(test_sentences), len(train_sentences)+len(test_sentences))

In [None]:
# Cut the training portion again at 80%: the first part is kept for training,
# the remainder for validation.
val_cutoff = int(0.8 * len(train_sentences))
t_sentences, v_sentences = train_sentences[:val_cutoff], train_sentences[val_cutoff:]

In [None]:
# Build one dictionary per split (train / validation / test), keyed by token id.
# Each value holds the token's attributes, its sentence context and the list of
# annotated substitutions as (lemma, pos, freq) tuples.
t_d = dict()
v_d = dict()
test_d = dict()

# Membership is tested for every sentence; sets make each test O(1) instead of
# a scan over the whole sentence list.
t_sentence_set = set(t_sentences)
v_sentence_set = set(v_sentences)
test_sentence_set = set(test_sentences)

for child in root:
    target_sentence = child[TARGETSENTENCE].text.strip()

    # Same priority as the original if/elif chain: train, then validation, then test.
    if target_sentence in t_sentence_set:
        split_d = t_d
    elif target_sentence in v_sentence_set:
        split_d = v_d
    elif target_sentence in test_sentence_set:
        split_d = test_d
    else:
        continue

    # A context element without text yields None; treat it as an empty string.
    precontext = (child[PRECONTEXT].text or '').strip()
    postcontext = (child[POSTCONTEXT].text or '').strip()

    for token in child[TOKENS]:
        token_id = token.get('id')
        # Tokens with the id 'XXX' are not stored.
        if token_id == 'XXX':
            continue

        t = dict()
        # NOTE: the key spelling 'precontenxt' is kept unchanged so that any
        # consumer of these dictionaries keeps working.
        t['precontenxt'] = precontext
        t['postcontext'] = postcontext
        t['wordform'] = token.get('wordform')
        t['lemma'] = token.get('lemma')
        t['posMASC'] = token.get('posMASC')
        t['posTT'] = token.get('posTT')
        t['problematic'] = token.get('problematic')
        t['substitutions'] = [
            (subst.get('lemma'), subst.get('pos'), subst.get('freq'))
            for substitutions in token
            for subst in substitutions
        ]
        t['targetsentence'] = target_sentence
        split_d[token_id] = t

In [None]:
# Full training dictionary: the entries of t_d followed by those of v_d
# (a v_d entry wins if a token id occurs in both).
train_d = {**t_d, **v_d}

In [None]:
# Number of tokens in the train, validation, train+validation and test dictionaries.
len(t_d), len(v_d), len(train_d), len(test_d)

In [None]:
# Compiled academic key-phrase list: the `phrase` column of the given sheet.
academic_df = pd.read_excel(COMPILED_LIST, sheet_name='<sheet-name>')
academic_list = academic_df.phrase.tolist()

In [None]:
# COCA academic core list: the `word` column of the 'list' sheet.
coca_df = pd.read_excel(COCA_LIST, sheet_name='list')
coca_list = coca_df.word.tolist()

In [None]:
# NAWL headwords: every whitespace-separated token of the file.
with open(NAWL, 'r') as f:
    nawl_list = f.read().split()

In [None]:
# Load the pickled academic frequency table (indexed later with tuples of tokens).
# NOTE(review): pickle.load executes arbitrary code from the file - only load
# a pickle that was produced locally / comes from a trusted source.
with open(ACL_FREQ, 'rb') as f:
    acl_freq = pickle.load(f)

In [None]:
# Unigram counts read from a tab-separated "word<TAB>count" file.
# Counts are stored as integers (they were previously kept as strings, which
# breaks Counter arithmetic and comparisons); unknown words still yield 0.
beatiful_data_freq = Counter()
with open(BEAUTIFUL_DATA, 'r') as f:
    # Stream the file line by line instead of loading it into memory at once.
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline or an empty file).
            continue
        word, freq = line.split('\t')
        beatiful_data_freq[word] = int(freq)

In [None]:
# General COCA word table; its `word` and `COCA-All` columns are used below
# for frequency look-ups.
allwords_df = pd.read_excel(COCA_ALL, sheet_name='list')

In [None]:
import pymysql
pymysql.install_as_MySQLdb()
import MySQLdb
import nltk
from nltk.corpus import wordnet

class DBS:
    """Access to the two external paraphrase sources: PPDB 2.0 (MySQL) and WordNet (NLTK)."""

    def __init__(self, host, username, password):
        """Store the MySQL connection settings; no connection is opened here."""
        self.host = host
        self.username = username
        self.password = password

    def get_ppdb2_candidates(self, phrase):
        """Return all ``(target, ppdb2score)`` rows for ``phrase``, best score first.

        The query is parameterized, so phrases containing quotes (e.g. "don't")
        neither break the statement nor allow SQL injection, and the connection
        is closed once the rows have been fetched.
        """
        db = MySQLdb.connect(self.host, self.username, self.password, "ppdb2")
        try:
            cursor = db.cursor()
            cursor.execute(
                "select target, ppdb2score from ppdb where source = %s order by ppdb2score desc",
                (str(phrase),),
            )
            return cursor.fetchall()
        finally:
            db.close()

    def get_wordnet_candidates(self, word):
        """Return the lemma names of every WordNet synset of ``word``.

        Spaces in ``word`` are replaced by underscores before the look-up.
        The result is a list of strings and may contain duplicates.
        """
        word = word.replace(' ', '_')
        return [lemma.name() for syn in wordnet.synsets(word) for lemma in syn.lemmas()]

In [None]:
# Shared accessor for the PPDB database and WordNet (host, user, password).
db = DBS('ltdatabase1', 'dummy', 'dummy')

In [None]:
def replace_with_substitute(sentence, target, candidate):
    """Return ``sentence`` with every whole-word occurrence of ``target`` replaced.

    Only occurrences that are not glued to another word character are replaced,
    so substituting "he" no longer rewrites "the" or "theme".  ``target`` and
    ``candidate`` are treated as literal text.  An empty ``target`` leaves the
    sentence unchanged.
    """
    if not target:
        return sentence
    pattern = r'(?<!\w)' + re.escape(target) + r'(?!\w)'
    # A function replacement keeps backslashes in `candidate` literal.
    return re.sub(pattern, lambda _match: candidate, sentence)

In [None]:
def is_repeat(candidate, subs):
    """Return True when ``candidate`` equals the first item of any entry in ``subs``."""
    return any(candidate == entry[0] for entry in subs)

In [None]:
# Random 300-dimensional vector returned by get_word_embedding() when a word
# cannot be looked up (presumably 300 matches the GloVe vector size - confirm).
UNK_embed = np.random.rand(300,)

In [None]:
def get_word_embedding(word):
    """Return the embedding of ``word``, or ``UNK_embed`` when it has none.

    ``model`` is a ``KeyedVectors`` instance and is indexed directly: the
    ``.wv`` alias on ``KeyedVectors`` was only a deprecated self-reference and
    is gone in gensim 4, where the former bare ``except`` would have silently
    mapped every word to ``UNK_embed``.
    """
    try:
        return model[word]
    except (KeyError, TypeError):
        # KeyError: out-of-vocabulary word; TypeError: non-string key such as None.
        return UNK_embed

In [None]:
def get_sentence_embedding(sentence):
    """Return the mean word embedding of the alphanumeric tokens in ``sentence``.

    Tokens are maximal runs of ASCII letters and digits; everything else is
    treated as a separator.  When the sentence contains no such token the
    fallback vector ``UNK_embed`` is returned instead of the NaN that
    averaging an empty list would produce.
    """
    tokens = re.findall(r"[a-zA-Z0-9]+", sentence)
    if not tokens:
        return UNK_embed
    return np.mean([get_word_embedding(token) for token in tokens], axis=0)

In [None]:
# Feature names in the order of their 1-based column index in the ranking files.
_FEATURE_ORDER = (
    'freq_beatiful',
    'freq_coca_general',
    'freq_acl',
    'cos_target',
    'euclidean_distance',
    'posMASC_le',
    'is_problematic',
    'word_length',
    'count_vowel',
)
feature2index = {name: position for position, name in enumerate(_FEATURE_ORDER, start=1)}

In [None]:
# Candidate generation for the training split.  The helpers are defined in this
# cell so that it can be run on its own.

def _academic_vocabulary():
    """Return the union of the three academic word lists as a set (O(1) membership)."""
    return set(coca_list) | set(nawl_list) | set(academic_list)


def _coca_frequency_table():
    """Map every word of ``allwords_df`` to its ``COCA-All`` value (first row wins)."""
    first_rows = allwords_df.drop_duplicates('word')
    return dict(zip(first_rows['word'], first_rows['COCA-All']))


def _non_academic_flag(target, candidate, academic_vocab, coca_freq):
    """Classify ``candidate`` as a substitute for ``target``.

    Returns '0' for an academic candidate that is less frequent than the
    target, '1' for a non-academic candidate (not in any academic list, or a
    frequency is unavailable), and None when the candidate is academic but not
    less frequent than the target (such candidates are dropped).
    """
    if candidate not in academic_vocab:
        return '1'
    target_freq = coca_freq.get(target)
    candidate_freq = coca_freq.get(candidate)
    if target_freq is None or candidate_freq is None:
        return '1'
    return '0' if target_freq > candidate_freq else None


def _candidate_features(word, freq, flag, pos, sentence_embed):
    """Build ``[candidate, freq, is_non_academic, pos, cos_target, euclidean_distance]``."""
    word_embed = get_word_embedding(word)
    cos_sim = np.dot(sentence_embed, word_embed) / (
        np.sqrt(np.dot(sentence_embed, sentence_embed)) * np.sqrt(np.dot(word_embed, word_embed))
    )
    return [word, freq, flag, pos, cos_sim, np.linalg.norm(sentence_embed - word_embed)]


def _add_external_candidates(target, candidates, sentence_embed, buckets, academic_vocab, coca_freq):
    """Top up ``buckets`` (at most 2 rows per flag) from WordNet/PPDB candidate strings.

    External candidates carry no annotation, so their frequency is '0' and
    their POS tag is 'UNK'.
    """
    for candidate in candidates:
        if all(len(rows) >= 2 for rows in buckets.values()):
            break
        flag = _non_academic_flag(target, candidate, academic_vocab, coca_freq)
        if flag is None:
            continue
        rows = buckets[flag]
        if len(rows) < 2 and not is_repeat(candidate, rows):
            rows.append(_candidate_features(candidate, '0', flag, 'UNK', sentence_embed))


def _collect_word_pairs(tokens):
    """Return ``{(token_id, wordform): [2 academic rows, 2 non-academic rows]}``.

    Only non-academic targets with at least one academic CoInCo substitute are
    considered; missing candidates are filled from WordNet, then PPDB, and a
    target is kept only when exactly two candidates of each kind are found.

    Fixes relative to the previous inline version:
    * a target counts as non-academic only when it is in NONE of the academic
      lists (the old ``or`` chain accepted almost every lemma);
    * WordNet candidates are whole lemma names (``t[0]`` used to take only the
      first character of each name);
    * external non-academic candidates are flagged '1' like the CoInCo ones;
    * a word missing from ``allwords_df`` is detected explicitly instead of
      through a bare ``except``.
    """
    academic_vocab = _academic_vocabulary()
    coca_freq = _coca_frequency_table()
    word_pairs = dict()

    for token_id in tqdm_notebook(tokens):
        info = tokens[token_id]
        lemma = info['lemma']
        if lemma in academic_vocab:
            continue

        sentence_embed = get_sentence_embedding(info['targetsentence'])
        buckets = {'0': [], '1': []}  # is_non_academic flag -> candidate rows
        for sub_lemma, pos, freq in info['substitutions']:
            flag = _non_academic_flag(lemma, sub_lemma, academic_vocab, coca_freq)
            if flag is not None and len(buckets[flag]) < 2:
                buckets[flag].append(_candidate_features(sub_lemma, freq, flag, pos, sentence_embed))

        academic_subs, non_academic_subs = buckets['0'], buckets['1']
        if not academic_subs:
            continue

        if len(academic_subs) < 2 or len(non_academic_subs) < 2:
            _add_external_candidates(lemma, db.get_wordnet_candidates(lemma),
                                     sentence_embed, buckets, academic_vocab, coca_freq)
        if len(academic_subs) < 2 or len(non_academic_subs) < 2:
            # PPDB rows are (target, score) tuples; only the target string is needed.
            ppdb_targets = [row[0] for row in db.get_ppdb2_candidates(lemma)]
            _add_external_candidates(lemma, ppdb_targets,
                                     sentence_embed, buckets, academic_vocab, coca_freq)

        if len(academic_subs) == 2 and len(non_academic_subs) == 2:
            # Within each group, the candidate with the higher frequency comes first.
            academic_subs.sort(key=lambda row: int(row[1]), reverse=True)
            non_academic_subs.sort(key=lambda row: int(row[1]), reverse=True)
            # valid_subs : [[candidate, freq, is_non_academic, pos, cos_target, euclidean_distance], ....]
            word_pairs[(token_id, info['wordform'])] = academic_subs + non_academic_subs

    return word_pairs


word_pairs_t = _collect_word_pairs(t_d)

In [None]:
# Display the collected training word pairs as the cell output.
word_pairs_t

In [None]:
# Label-encode the POS tags seen among the training candidates.
# The tags are sorted before numbering so that the encoding is identical on
# every run (ids taken from raw set iteration order can change between runs),
# and 'UNK' is always included because it is the tag of WordNet/PPDB
# candidates and the fallback for unseen tags.
pos_tags = {
    candidate_info[3]
    for p in tqdm_notebook(word_pairs_t)
    for candidate_info in word_pairs_t[p]
}
pos_tags.add('UNK')
# key=str keeps the sort well defined even if a tag is None.
le_pos = {tag: index for index, tag in enumerate(sorted(pos_tags, key=str))}

In [None]:
# Persist the training word pairs to 'word_pairs_t.pkl' in the working directory.
with open('word_pairs_t.pkl', 'wb') as f:
    pickle.dump(word_pairs_t, f)

In [None]:
# Write the training split as a ranking file, one line per candidate:
#   <target> qid:<n> 1:<freq_beatiful> 2:<freq_coca_general> 3:<freq_acl>
#   4:<cos_target> 5:<euclidean_distance> 6:<posMASC_le> 8:<word_length> 9:<count_vowel>
# All candidates of one target word share a qid.  Feature 7 (is_problematic)
# is not written.  Lines are collected in a list and joined once instead of
# growing one string with `+=`.
rank_lines = list()

for qid, p in enumerate(tqdm_notebook(word_pairs_t), start=1):
    for candidate_info in word_pairs_t[p]:
        lemma, freq, is_non_academic, posMASC, cos_target, euclidean_distance = candidate_info[:6]

        # Academic candidates keep their frequency as the target value,
        # non-academic candidates get it negated.
        if is_non_academic == '0':
            target_value = freq
        elif is_non_academic == '1':
            target_value = '-' + freq  # string concat to put it into the parser
        else:
            # Previously an unexpected flag silently reused the value of the
            # preceding candidate.
            raise ValueError('unexpected is_non_academic flag: %r' % (is_non_academic,))

        # Missing words count as frequency 0 in every resource.
        freq_beatiful = beatiful_data_freq.get(lemma, 0)

        coca_rows = allwords_df.loc[allwords_df['word'] == lemma]
        freq_coca_general = 0 if coca_rows.empty else coca_rows.iloc[0]['COCA-All']

        try:
            freq_acl = acl_freq[tuple(lemma.split())]
        except KeyError:
            freq_acl = 0

        features = (
            (feature2index['freq_beatiful'], freq_beatiful),
            (feature2index['freq_coca_general'], freq_coca_general),
            (feature2index['freq_acl'], freq_acl),
            (feature2index['cos_target'], cos_target),
            (feature2index['euclidean_distance'], euclidean_distance),
            (feature2index['posMASC_le'], le_pos[posMASC]),
            (feature2index['word_length'], len(lemma)),
            (feature2index['count_vowel'], sum(map(lemma.lower().count, 'aeiou'))),
        )
        rank_lines.append(
            str(target_value) + ' qid:' + str(qid) + ' '
            + ' '.join(str(index) + ':' + str(value) for index, value in features)
            + '\n'
        )

doc = ''.join(rank_lines)

with open('paraphrase-t.txt', 'w') as f:
    f.write(doc)

In [None]:
# Candidate generation for the validation split.  The helpers are defined in
# this cell so that it can be run on its own.

def _academic_vocabulary():
    """Return the union of the three academic word lists as a set (O(1) membership)."""
    return set(coca_list) | set(nawl_list) | set(academic_list)


def _coca_frequency_table():
    """Map every word of ``allwords_df`` to its ``COCA-All`` value (first row wins)."""
    first_rows = allwords_df.drop_duplicates('word')
    return dict(zip(first_rows['word'], first_rows['COCA-All']))


def _non_academic_flag(target, candidate, academic_vocab, coca_freq):
    """Classify ``candidate`` as a substitute for ``target``.

    Returns '0' for an academic candidate that is less frequent than the
    target, '1' for a non-academic candidate (not in any academic list, or a
    frequency is unavailable), and None when the candidate is academic but not
    less frequent than the target (such candidates are dropped).
    """
    if candidate not in academic_vocab:
        return '1'
    target_freq = coca_freq.get(target)
    candidate_freq = coca_freq.get(candidate)
    if target_freq is None or candidate_freq is None:
        return '1'
    return '0' if target_freq > candidate_freq else None


def _candidate_features(word, freq, flag, pos, sentence_embed):
    """Build ``[candidate, freq, is_non_academic, pos, cos_target, euclidean_distance]``."""
    word_embed = get_word_embedding(word)
    cos_sim = np.dot(sentence_embed, word_embed) / (
        np.sqrt(np.dot(sentence_embed, sentence_embed)) * np.sqrt(np.dot(word_embed, word_embed))
    )
    return [word, freq, flag, pos, cos_sim, np.linalg.norm(sentence_embed - word_embed)]


def _add_external_candidates(target, candidates, sentence_embed, buckets, academic_vocab, coca_freq):
    """Top up ``buckets`` (at most 2 rows per flag) from WordNet/PPDB candidate strings.

    External candidates carry no annotation, so their frequency is '0' and
    their POS tag is 'UNK'.
    """
    for candidate in candidates:
        if all(len(rows) >= 2 for rows in buckets.values()):
            break
        flag = _non_academic_flag(target, candidate, academic_vocab, coca_freq)
        if flag is None:
            continue
        rows = buckets[flag]
        if len(rows) < 2 and not is_repeat(candidate, rows):
            rows.append(_candidate_features(candidate, '0', flag, 'UNK', sentence_embed))


def _collect_word_pairs(tokens):
    """Return ``{(token_id, wordform): [2 academic rows, 2 non-academic rows]}``.

    Only non-academic targets with at least one academic CoInCo substitute are
    considered; missing candidates are filled from WordNet, then PPDB, and a
    target is kept only when exactly two candidates of each kind are found.

    Fixes relative to the previous inline version:
    * the sentence embedding is computed for every token's own sentence (the
      validation loop never assigned ``sentence_embed`` and therefore reused
      whatever value an earlier cell had left behind);
    * a target counts as non-academic only when it is in NONE of the academic
      lists (the old ``or`` chain accepted almost every lemma);
    * WordNet candidates are whole lemma names (``t[0]`` used to take only the
      first character of each name);
    * external non-academic candidates are flagged '1' like the CoInCo ones;
    * a word missing from ``allwords_df`` is detected explicitly instead of
      through a bare ``except``.
    """
    academic_vocab = _academic_vocabulary()
    coca_freq = _coca_frequency_table()
    word_pairs = dict()

    for token_id in tqdm_notebook(tokens):
        info = tokens[token_id]
        lemma = info['lemma']
        if lemma in academic_vocab:
            continue

        sentence_embed = get_sentence_embedding(info['targetsentence'])
        buckets = {'0': [], '1': []}  # is_non_academic flag -> candidate rows
        for sub_lemma, pos, freq in info['substitutions']:
            flag = _non_academic_flag(lemma, sub_lemma, academic_vocab, coca_freq)
            if flag is not None and len(buckets[flag]) < 2:
                buckets[flag].append(_candidate_features(sub_lemma, freq, flag, pos, sentence_embed))

        academic_subs, non_academic_subs = buckets['0'], buckets['1']
        if not academic_subs:
            continue

        if len(academic_subs) < 2 or len(non_academic_subs) < 2:
            _add_external_candidates(lemma, db.get_wordnet_candidates(lemma),
                                     sentence_embed, buckets, academic_vocab, coca_freq)
        if len(academic_subs) < 2 or len(non_academic_subs) < 2:
            # PPDB rows are (target, score) tuples; only the target string is needed.
            ppdb_targets = [row[0] for row in db.get_ppdb2_candidates(lemma)]
            _add_external_candidates(lemma, ppdb_targets,
                                     sentence_embed, buckets, academic_vocab, coca_freq)

        if len(academic_subs) == 2 and len(non_academic_subs) == 2:
            # Within each group, the candidate with the higher frequency comes first.
            academic_subs.sort(key=lambda row: int(row[1]), reverse=True)
            non_academic_subs.sort(key=lambda row: int(row[1]), reverse=True)
            # valid_subs : [[candidate, freq, is_non_academic, pos, cos_target, euclidean_distance], ....]
            word_pairs[(token_id, info['wordform'])] = academic_subs + non_academic_subs

    return word_pairs


word_pairs_v = _collect_word_pairs(v_d)

In [None]:
# Display the collected validation word pairs as the cell output.
word_pairs_v

In [None]:
# Persist the validation word pairs to 'word_pairs_v.pkl' in the working directory.
with open('word_pairs_v.pkl', 'wb') as f:
    pickle.dump(word_pairs_v, f)

In [None]:
# Write the validation split as a ranking file, one line per candidate:
#   <target> qid:<n> 1:<freq_beatiful> 2:<freq_coca_general> 3:<freq_acl>
#   4:<cos_target> 5:<euclidean_distance> 6:<posMASC_le> 8:<word_length> 9:<count_vowel>
# All candidates of one target word share a qid.  Feature 7 (is_problematic)
# is not written.  Lines are collected in a list and joined once instead of
# growing one string with `+=`.
rank_lines = list()

for qid, p in enumerate(tqdm_notebook(word_pairs_v), start=1):
    for candidate_info in word_pairs_v[p]:
        lemma, freq, is_non_academic, posMASC, cos_target, euclidean_distance = candidate_info[:6]

        # Academic candidates keep their frequency as the target value,
        # non-academic candidates get it negated.
        if is_non_academic == '0':
            target_value = freq
        elif is_non_academic == '1':
            target_value = '-' + freq  # string concat to put it into the parser
        else:
            # Previously an unexpected flag silently reused the value of the
            # preceding candidate.
            raise ValueError('unexpected is_non_academic flag: %r' % (is_non_academic,))

        # Missing words count as frequency 0 in every resource.
        freq_beatiful = beatiful_data_freq.get(lemma, 0)

        coca_rows = allwords_df.loc[allwords_df['word'] == lemma]
        freq_coca_general = 0 if coca_rows.empty else coca_rows.iloc[0]['COCA-All']

        try:
            freq_acl = acl_freq[tuple(lemma.split())]
        except KeyError:
            freq_acl = 0

        # POS tags that did not occur when le_pos was built map to the 'UNK' id.
        if posMASC in le_pos:
            posMASC_le = le_pos[posMASC]
        else:
            posMASC_le = le_pos['UNK']

        features = (
            (feature2index['freq_beatiful'], freq_beatiful),
            (feature2index['freq_coca_general'], freq_coca_general),
            (feature2index['freq_acl'], freq_acl),
            (feature2index['cos_target'], cos_target),
            (feature2index['euclidean_distance'], euclidean_distance),
            (feature2index['posMASC_le'], posMASC_le),
            (feature2index['word_length'], len(lemma)),
            (feature2index['count_vowel'], sum(map(lemma.lower().count, 'aeiou'))),
        )
        rank_lines.append(
            str(target_value) + ' qid:' + str(qid) + ' '
            + ' '.join(str(index) + ':' + str(value) for index, value in features)
            + '\n'
        )

doc = ''.join(rank_lines)

with open('paraphrase-val.txt', 'w') as f:
    f.write(doc)

In [None]:
word_pairs_test = dict()
no_replacement = 0
non_academic_count = 0
with_replacement = 0
for token_id in tqdm_notebook(test_d):
    lemma = test_d[token_id]['lemma']
    wordform = test_d[token_id]['wordform']
    sentence_embed = get_sentence_embedding(test_d[token_id]['targetsentence'])
    if(lemma not in coca_list or lemma not in nawl_list or lemma not in academic_list):
        non_academic_count += 1
        academic_subs = list()
        non_academic_subs = list()
        c = 0
        for subst in test_d[token_id]['substitutions']:
            s = subst[0]
            if(s in coca_list or s in nawl_list or s in academic_list):
                try:
                    if(allwords_df.loc[allwords_df['word'] == lemma].iloc[0]['COCA-All'] > allwords_df.loc[allwords_df['word'] == s].iloc[0]['COCA-All']):
                        if(c == 0):
                            with_replacement += 1
                            c = 1
                        l = list(subst)
                        pos = l.pop(1)
                        l.append('0')
                        l.append(pos)
                        word_embed = get_word_embedding(s)
                        cos_sim = np.dot(sentence_embed, word_embed)/(np.sqrt(np.dot(sentence_embed, sentence_embed))*np.sqrt(np.dot(word_embed, word_embed)))
                        l.append(cos_sim)
                        l.append(np.linalg.norm(sentence_embed-word_embed))
                        if(len(academic_subs) < 2):
                            academic_subs.append(l)
                except:
                    l = list(subst)
                    pos = l.pop(1)
                    l.append('1')
                    l.append(pos)
                    word_embed = get_word_embedding(s)
                    cos_sim = np.dot(sentence_embed, word_embed)/(np.sqrt(np.dot(sentence_embed, sentence_embed))*np.sqrt(np.dot(word_embed, word_embed)))
                    l.append(cos_sim)
                    l.append(np.linalg.norm(sentence_embed-word_embed))
                    if(len(non_academic_subs) < 2):
                        non_academic_subs.append(l)
            else:
                l = list(subst)
                pos = l.pop(1)
                l.append('1')
                l.append(pos)
                word_embed = get_word_embedding(s)
                cos_sim = np.dot(sentence_embed, word_embed)/(np.sqrt(np.dot(sentence_embed, sentence_embed))*np.sqrt(np.dot(word_embed, word_embed)))
                l.append(cos_sim)
                l.append(np.linalg.norm(sentence_embed-word_embed))
                if(len(non_academic_subs) < 2):
                    non_academic_subs.append(l)

        if(academic_subs):
            if(len(academic_subs) < 2 or len(non_academic_subs) < 2):
                word_candidates = db.get_wordnet_candidates(lemma)
                for t in word_candidates:
                    candidate = t[0]
                    if(candidate in coca_list or candidate in nawl_list or candidate in academic_list):
                        try:
                            if(not is_repeat(candidate, academic_subs)):
                                if(allwords_df.loc[allwords_df['word'] == lemma].iloc[0]['COCA-All'] > allwords_df.loc[allwords_df['word'] == candidate].iloc[0]['COCA-All']):
                                    l = [candidate, '0', '0', 'UNK']
                                    word_embed = get_word_embedding(candidate)
                                    cos_sim = np.dot(sentence_embed, word_embed)/(np.sqrt(np.dot(sentence_embed, sentence_embed))*np.sqrt(np.dot(word_embed, word_embed)))
                                    l.append(cos_sim)
                                    l.append(np.linalg.norm(sentence_embed-word_embed))
                                    if(len(academic_subs) < 2):
                                        academic_subs.append(l)
                        except:
                            if(not is_repeat(candidate, non_academic_subs)):
                                l = [candidate, '0', '0', 'UNK']
                                word_embed = get_word_embedding(candidate)
                                cos_sim = np.dot(sentence_embed, word_embed)/(np.sqrt(np.dot(sentence_embed, sentence_embed))*np.sqrt(np.dot(word_embed, word_embed)))
                                l.append(cos_sim)
                                l.append(np.linalg.norm(sentence_embed-word_embed))
                                if(len(non_academic_subs) < 2):
                                    non_academic_subs.append(l)
                    else:
                        if(not is_repeat(candidate, non_academic_subs)):
                            l = [candidate, '0', '0', 'UNK']
                            word_embed = get_word_embedding(candidate)
                            cos_sim = np.dot(sentence_embed, word_embed)/(np.sqrt(np.dot(sentence_embed, sentence_embed))*np.sqrt(np.dot(word_embed, word_embed)))
                            l.append(cos_sim)
                            l.append(np.linalg.norm(sentence_embed-word_embed))
                            if(len(non_academic_subs) < 2):
                                non_academic_subs.append(l)
            if(len(academic_subs) < 2 or len(non_academic_subs) < 2):
                ppdb_candidates = db.get_ppdb2_candidates(lemma)
                for t in ppdb_candidates:
                    candidate = t[0]
                    if(candidate in coca_list or candidate in nawl_list or candidate in academic_list):
                        try:
                            if(not is_repeat(candidate, academic_subs)):
                                if(allwords_df.loc[allwords_df['word'] == lemma].iloc[0]['COCA-All'] > allwords_df.loc[allwords_df['word'] == candidate].iloc[0]['COCA-All']):
                                    l = [candidate, '0', '0', 'UNK']
                                    word_embed = get_word_embedding(candidate)
                                    cos_sim = np.dot(sentence_embed, word_embed)/(np.sqrt(np.dot(sentence_embed, sentence_embed))*np.sqrt(np.dot(word_embed, word_embed)))
                                    l.append(cos_sim)
                                    l.append(np.linalg.norm(sentence_embed-word_embed))
                                    if(len(academic_subs) < 2):
                                        academic_subs.append(l)
                        except:
                            if(not is_repeat(candidate, non_academic_subs)):
                                l = [candidate, '0', '0', 'UNK']
                                word_embed = get_word_embedding(candidate)
                                cos_sim = np.dot(sentence_embed, word_embed)/(np.sqrt(np.dot(sentence_embed, sentence_embed))*np.sqrt(np.dot(word_embed, word_embed)))
                                l.append(cos_sim)
                                l.append(np.linalg.norm(sentence_embed-word_embed))
                                if(len(non_academic_subs) < 2):
                                    non_academic_subs.append(l)
                    else:
                        if(not is_repeat(candidate, non_academic_subs)):
                            l = [candidate, '0', '0', 'UNK']
                            word_embed = get_word_embedding(candidate)
                            cos_sim = np.dot(sentence_embed, word_embed)/(np.sqrt(np.dot(sentence_embed, sentence_embed))*np.sqrt(np.dot(word_embed, word_embed)))
                            l.append(cos_sim)
                            l.append(np.linalg.norm(sentence_embed-word_embed))
                            if(len(non_academic_subs) < 2):
                                non_academic_subs.append(l)
            if(len(academic_subs) == 2 and len(non_academic_subs) == 2):
                academic_subs = sorted(academic_subs, key=lambda x: int(x[1]), reverse=True)
                non_academic_subs = sorted(non_academic_subs, key=lambda x: int(x[1]), reverse=True)
                valid_subs = list()
                valid_subs.extend(academic_subs)
                valid_subs.extend(non_academic_subs)
                # valid_subs : [[candidate, freq, is_non_academic, pos, cos_target, euclidean_distance], ....]
                word_pairs_test[(token_id, wordform)] = valid_subs
        else:
            no_replacement += 1

In [None]:
# Print two counters, presumably accumulated by earlier cells of the
# candidate-generation step (neither is assigned in the visible part of the file).
print(non_academic_count, with_replacement)

In [None]:
# Summary of the test split: number of targets that ended up with a full
# candidate set, number of targets counted in `no_replacement`, total number of
# test tokens, and the number of test tokens not counted in `no_replacement`.
print(len(word_pairs_test), no_replacement, len(test_d), len(test_d)-no_replacement)

In [None]:
# Display the test mapping: {(token_id, wordform): [candidate rows]}, where each
# row is [candidate, freq, is_non_academic, pos, cos_target, euclidean_distance].
word_pairs_test

In [None]:
# Build an integer label encoding for the POS tags of the test candidates.
# The POS tag is element 3 of every candidate row stored in word_pairs_test.
pos_tags = set()
for p in tqdm_notebook(word_pairs_test):
    for candidate_info in word_pairs_test[p]:
        pos_tags.add(candidate_info[3])

# Enumerate the tags in sorted order. Iterating a set of strings directly yields
# an order that changes between interpreter runs (string hash randomisation),
# which made the tag -> id mapping, and therefore the POS feature written to the
# feature file, irreproducible. Sorting makes the ids depend only on the tag set.
# key=str keeps the sort well-defined even if a tag is not a string.
# NOTE(review): the mapping is derived from this split only; another cell
# rebuilds `le_pos` from word_pairs_train, so ids match across splits only when
# both splits contain the same set of tags.
le_pos = {tag: index for index, tag in enumerate(sorted(pos_tags, key=str))}

In [None]:
# Persist the assembled test word pairs to disk in pickle format.
with open('word_pairs_test.pkl', 'wb') as pickle_out:
    pickle.dump(word_pairs_test, pickle_out)

In [None]:
# Serialise the test candidates to 'paraphrase-test.txt', one line per candidate:
#     <target> qid:<n> <feature-index>:<value> <feature-index>:<value> ...
# All candidates of one target word share the same qid.

# Exceptions that mean "this word has no entry in the resource". Anything else
# (including KeyboardInterrupt, which a bare `except:` would swallow) propagates.
_LOOKUP_MISS = (KeyError, IndexError)

# The feature indices do not depend on the candidate, so look them up once.
f_1 = feature2index['freq_beatiful']
f_2 = feature2index['freq_coca_general']
f_3 = feature2index['freq_acl']
f_4 = feature2index['cos_target']
f_5 = feature2index['euclidean_distance']
f_6 = feature2index['posMASC_le']
# (no f_7: the "is_problematic" feature is not emitted)
f_8 = feature2index['word_length']
f_9 = feature2index['count_vowel']

qid = 1
lines = []  # collected and joined once; repeated `str +=` is quadratic

for p in tqdm_notebook(word_pairs_test):
    for candidate_info in word_pairs_test[p]:
        # candidate_info: [candidate, freq, is_non_academic, pos, cos_target, euclidean_distance]
        lemma = candidate_info[0]

        # Target value: the stored frequency string, prefixed with '-' for
        # candidates flagged as non-academic (string concatenation, the value
        # is written verbatim into the file).
        is_non_academic = candidate_info[2]
        if is_non_academic == '0':
            target_value = candidate_info[1]
        elif is_non_academic == '1':
            target_value = '-' + candidate_info[1]
        else:
            # Previously this case silently reused the target of the previous
            # candidate (or failed with a NameError on the very first row).
            raise ValueError('unexpected is_non_academic flag %r for candidate %r' % (is_non_academic, lemma))

        # Frequency features; a word missing from a resource gets frequency 0.
        try:
            freq_beatiful = beatiful_data_freq[lemma]
        except _LOOKUP_MISS:
            freq_beatiful = 0

        try:
            freq_coca_general = allwords_df.loc[allwords_df['word'] == lemma].iloc[0]['COCA-All']
        except _LOOKUP_MISS:
            freq_coca_general = 0

        try:
            # acl_freq is keyed by token tuples
            freq_acl = acl_freq[tuple(lemma.split())]
        except _LOOKUP_MISS:
            freq_acl = 0

        # Similarity features computed when the candidate row was built.
        cos_target = candidate_info[4]
        euclidean_distance = candidate_info[5]

        # POS tag, label-encoded through le_pos.
        posMASC_le = le_pos[candidate_info[3]]

        # Surface features of the candidate string.
        word_length = len(lemma)
        count_vowel = sum(map(lemma.lower().count, 'aeiou'))

        features = (
            (f_1, freq_beatiful),
            (f_2, freq_coca_general),
            (f_3, freq_acl),
            (f_4, cos_target),
            (f_5, euclidean_distance),
            (f_6, posMASC_le),
            (f_8, word_length),
            (f_9, count_vowel),
        )
        # Explicit str() on purpose: it keeps the textual form of numpy scalars
        # exactly as before.
        lines.append(
            str(target_value) + ' qid:' + str(qid)
            + ''.join(' ' + str(index) + ':' + str(value) for index, value in features)
            + '\n'
        )

    qid += 1

doc = ''.join(lines)

with open('paraphrase-test.txt', 'w') as f:
    f.write(doc)

In [None]:
# Print the number of entries in three word-pair mappings. word_pairs_t and
# word_pairs_v are presumably the training and validation mappings built in
# earlier cells (TODO confirm); word_pairs_test is the test mapping.
print(len(word_pairs_t), len(word_pairs_v), len(word_pairs_test))

In [None]:
# Build word_pairs_train: {(token_id, wordform): [row, row, row, row]} with two
# "academic" rows followed by two "non-academic" rows per target word, where
#     row = [candidate, freq, is_non_academic, pos, cos_target, euclidean_distance]
# Candidates come from the annotated substitutions first and are topped up from
# WordNet and then PPDB when fewer than two of either kind were found.

_ACADEMIC = 'academic'
_NON_ACADEMIC = 'non_academic'


def _in_academic_resources(word):
    """Return True when *word* occurs in at least one of the academic word lists."""
    return word in coca_list or word in nawl_list or word in academic_list


def _coca_frequency(word):
    """Return the 'COCA-All' value of the first allwords_df row for *word*, or None if there is no row."""
    rows = allwords_df.loc[allwords_df['word'] == word]
    if rows.empty:
        return None
    return rows.iloc[0]['COCA-All']


def _frequency_verdict(word, lemma_freq):
    """Decide where a word that is in the academic lists belongs.

    Returns _ACADEMIC when the target lemma is more frequent than *word*,
    None when it is not (the word is dropped), and _NON_ACADEMIC when either
    frequency is unavailable. The last case used to be reached through a bare
    `except:` around the DataFrame lookup, which also swallowed unrelated
    errors and KeyboardInterrupt.
    """
    if lemma_freq is None:
        return _NON_ACADEMIC
    word_freq = _coca_frequency(word)
    if word_freq is None:
        return _NON_ACADEMIC
    return _ACADEMIC if lemma_freq > word_freq else None


def _with_similarity(row, word, sentence_embed):
    """Append cosine similarity and Euclidean distance between the sentence and word embeddings to *row*."""
    word_embed = get_word_embedding(word)
    cos_sim = np.dot(sentence_embed, word_embed)/(np.sqrt(np.dot(sentence_embed, sentence_embed))*np.sqrt(np.dot(word_embed, word_embed)))
    row.append(cos_sim)
    row.append(np.linalg.norm(sentence_embed-word_embed))
    return row


def _add_external_candidates(candidates, lemma_freq, sentence_embed, academic_subs, non_academic_subs):
    """Top up both lists (at most two rows each) from WordNet/PPDB candidates; mutates the lists in place."""
    for entry in candidates:
        candidate = entry[0]
        if _in_academic_resources(candidate):
            if is_repeat(candidate, academic_subs):
                continue
            verdict = _frequency_verdict(candidate, lemma_freq)
            if verdict is None:
                continue
        else:
            verdict = _NON_ACADEMIC

        if verdict == _ACADEMIC:
            target = academic_subs
        else:
            if is_repeat(candidate, non_academic_subs):
                continue
            target = non_academic_subs

        # External candidates carry no annotation: freq '0' and POS 'UNK'.
        # NOTE(review): the is_non_academic slot is '0' even for rows filed
        # under non_academic_subs; with freq '0' the written target is 0 either
        # way. Left unchanged - confirm whether '1' was intended.
        row = _with_similarity([candidate, '0', '0', 'UNK'], candidate, sentence_embed)
        if len(target) < 2:
            target.append(row)


word_pairs_train = dict()
for token_id in tqdm_notebook(train_d):
    lemma = train_d[token_id]['lemma']
    wordform = train_d[token_id]['wordform']
    sentence_embed = get_sentence_embedding(train_d[token_id]['targetsentence'])

    # NOTE(review): this only skips lemmas that are in ALL three academic lists.
    # If the intent is "target is in none of the lists", the `or`s should be
    # `and`s. Left unchanged so this split stays consistent with the other loops.
    if not (lemma not in coca_list or lemma not in nawl_list or lemma not in academic_list):
        continue

    # The lemma frequency is the same for every candidate: look it up once per
    # token instead of scanning allwords_df again for each candidate.
    lemma_freq = _coca_frequency(lemma)

    academic_subs = list()
    non_academic_subs = list()

    # 1) Annotated substitutions; subst[1] is the POS and is moved behind the flag.
    for subst in train_d[token_id]['substitutions']:
        s = subst[0]
        verdict = _frequency_verdict(s, lemma_freq) if _in_academic_resources(s) else _NON_ACADEMIC
        if verdict is None:
            continue
        l = list(subst)
        pos = l.pop(1)
        l.append('0' if verdict == _ACADEMIC else '1')
        l.append(pos)
        l = _with_similarity(l, s, sentence_embed)
        target = academic_subs if verdict == _ACADEMIC else non_academic_subs
        if len(target) < 2:
            target.append(l)

    # Only targets with at least one annotated academic substitute are kept.
    if not academic_subs:
        continue

    # 2) Top up from WordNet, 3) then from PPDB, while either list is short.
    if len(academic_subs) < 2 or len(non_academic_subs) < 2:
        _add_external_candidates(db.get_wordnet_candidates(lemma), lemma_freq, sentence_embed, academic_subs, non_academic_subs)
    if len(academic_subs) < 2 or len(non_academic_subs) < 2:
        _add_external_candidates(db.get_ppdb2_candidates(lemma), lemma_freq, sentence_embed, academic_subs, non_academic_subs)

    # Store the target only when the candidate set is complete (2 + 2).
    if len(academic_subs) == 2 and len(non_academic_subs) == 2:
        # Within each kind, order by the frequency column, highest first.
        academic_subs = sorted(academic_subs, key=lambda x: int(x[1]), reverse=True)
        non_academic_subs = sorted(non_academic_subs, key=lambda x: int(x[1]), reverse=True)
        # valid_subs : [[candidate, freq, is_non_academic, pos, cos_target, euclidean_distance], ....]
        valid_subs = academic_subs + non_academic_subs
        word_pairs_train[(token_id, wordform)] = valid_subs

In [None]:
# Display the training mapping: {(token_id, wordform): [candidate rows]}, where each
# row is [candidate, freq, is_non_academic, pos, cos_target, euclidean_distance].
word_pairs_train

In [None]:
# Build an integer label encoding for the POS tags of the training candidates.
# The POS tag is element 3 of every candidate row stored in word_pairs_train.
pos_tags = set()
for p in tqdm_notebook(word_pairs_train):
    for candidate_info in word_pairs_train[p]:
        pos_tags.add(candidate_info[3])

# Enumerate the tags in sorted order. Iterating a set of strings directly yields
# an order that changes between interpreter runs (string hash randomisation),
# which made the tag -> id mapping, and therefore the POS feature written to the
# feature file, irreproducible. Sorting makes the ids depend only on the tag set.
# key=str keeps the sort well-defined even if a tag is not a string.
# NOTE(review): this overwrites any `le_pos` built earlier from another split;
# ids match across splits only when the splits contain the same set of tags.
le_pos = {tag: index for index, tag in enumerate(sorted(pos_tags, key=str))}

In [None]:
# Persist the assembled training word pairs to disk in pickle format.
with open('word_pairs_train.pkl', 'wb') as pickle_out:
    pickle.dump(word_pairs_train, pickle_out)

In [None]:
# Reload the training word pairs from the pickle file written by this notebook.
with open('word_pairs_train.pkl', 'rb') as pickle_in:
    word_pairs_train = pickle.load(pickle_in)

In [None]:
# Display the training mapping again (e.g. to inspect it after reloading the pickle).
word_pairs_train

In [None]:
# Total number of candidate rows in the training split: every stored target
# has exactly four candidates (two academic and two non-academic).
print(len(word_pairs_train)*4)

In [None]:
# Serialise the training candidates to 'paraphrase-train.txt', one line per candidate:
#     <target> qid:<n> <feature-index>:<value> <feature-index>:<value> ...
# All candidates of one target word share the same qid.

# Exceptions that mean "this word has no entry in the resource". Anything else
# (including KeyboardInterrupt, which a bare `except:` would swallow) propagates.
_LOOKUP_MISS = (KeyError, IndexError)

# The feature indices do not depend on the candidate, so look them up once.
f_1 = feature2index['freq_beatiful']
f_2 = feature2index['freq_coca_general']
f_3 = feature2index['freq_acl']
f_4 = feature2index['cos_target']
f_5 = feature2index['euclidean_distance']
f_6 = feature2index['posMASC_le']
# (no f_7: the "is_problematic" feature is not emitted)
f_8 = feature2index['word_length']
f_9 = feature2index['count_vowel']

qid = 1
lines = []  # collected and joined once; repeated `str +=` is quadratic

for p in tqdm_notebook(word_pairs_train):
    for candidate_info in word_pairs_train[p]:
        # candidate_info: [candidate, freq, is_non_academic, pos, cos_target, euclidean_distance]
        lemma = candidate_info[0]

        # Target value: the stored frequency string, prefixed with '-' for
        # candidates flagged as non-academic (string concatenation, the value
        # is written verbatim into the file).
        is_non_academic = candidate_info[2]
        if is_non_academic == '0':
            target_value = candidate_info[1]
        elif is_non_academic == '1':
            target_value = '-' + candidate_info[1]
        else:
            # Previously this case silently reused the target of the previous
            # candidate (or failed with a NameError on the very first row).
            raise ValueError('unexpected is_non_academic flag %r for candidate %r' % (is_non_academic, lemma))

        # Frequency features; a word missing from a resource gets frequency 0.
        try:
            freq_beatiful = beatiful_data_freq[lemma]
        except _LOOKUP_MISS:
            freq_beatiful = 0

        try:
            freq_coca_general = allwords_df.loc[allwords_df['word'] == lemma].iloc[0]['COCA-All']
        except _LOOKUP_MISS:
            freq_coca_general = 0

        try:
            # acl_freq is keyed by token tuples
            freq_acl = acl_freq[tuple(lemma.split())]
        except _LOOKUP_MISS:
            freq_acl = 0

        # Similarity features computed when the candidate row was built.
        cos_target = candidate_info[4]
        euclidean_distance = candidate_info[5]

        # POS tag, label-encoded through le_pos.
        posMASC_le = le_pos[candidate_info[3]]

        # Surface features of the candidate string.
        word_length = len(lemma)
        count_vowel = sum(map(lemma.lower().count, 'aeiou'))

        features = (
            (f_1, freq_beatiful),
            (f_2, freq_coca_general),
            (f_3, freq_acl),
            (f_4, cos_target),
            (f_5, euclidean_distance),
            (f_6, posMASC_le),
            (f_8, word_length),
            (f_9, count_vowel),
        )
        # Explicit str() on purpose: it keeps the textual form of numpy scalars
        # exactly as before.
        lines.append(
            str(target_value) + ' qid:' + str(qid)
            + ''.join(' ' + str(index) + ':' + str(value) for index, value in features)
            + '\n'
        )

    qid += 1

doc = ''.join(lines)

with open('paraphrase-train.txt', 'w') as f:
    f.write(doc)