In [662]:
import os
import re
import random

import spacy
import pandas as pd
import numpy as np

from copy import deepcopy
from nltk.corpus import wordnet
from textblob import Word
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import NearestNeighbors


In [2]:
# Load the spaCy English pipeline once; `nlp` is the shared parser used by the cells below.
# NOTE(review): 'en' is a shortcut-link name -- confirm it resolves in the installed spaCy
# version. The wordnet cell further down assumes the model yields 300-dimensional vectors.
nlp = spacy.load('en')

In [3]:
# Read the training pairs, replace missing values with '' and keep the first 1000 rows.
df = pd.read_csv('raw_data/train.csv').fillna('').iloc[:1000]

# Column-wise views used by the evaluation cell: both questions and the duplicate flag.
qs1 = list(df['question1'])
qs2 = list(df['question2'])
dupl = list(df['is_duplicate'])

In [360]:
class Token(object):
    ''' stores information about single token

    A token is built either from a spaCy-style object exposing ``text``,
    ``lemma_``, ``pos_`` and ``ent_type_``, or from the string produced by
    ``to_str`` (``_text:..,_lemma:..,_pos:..,_ent_type:..,``).

    Raises:
        ValueError: if a string does not start with a serialized token, or if
            the object's text consists only of whitespace.
    '''

    # One serialized token: the four fields in fixed order, each value limited
    # to letters, digits and '-', each optionally followed by a comma.
    _SERIALIZED = re.compile(
        r'_text:([\-a-zA-Z0-9]*),?'
        r'_lemma:([\-a-zA-Z0-9]*),?'
        r'_pos:([\-a-zA-Z0-9]*),?'
        r'_ent_type:([\-a-zA-Z0-9]*),?'
    )

    def __init__(self, token):
        if isinstance(token, str):
            parsed = self._SERIALIZED.match(token)
            if parsed is None:
                raise ValueError('not a serialized token: {!r}'.format(token))

            # The four capture groups are exactly the four field values.
            self.text, self.lemma_, self.pos_, self.ent_type_ = parsed.groups()
        else:
            # Tokens are serialized space-delimited, so any whitespace-only
            # token (' ', '  ', '\n', ...) would corrupt the serialized form.
            if token.text.isspace():
                raise ValueError('whitespace token: {!r}'.format(token.text))

            self.text = token.text
            self.lemma_ = token.lemma_
            self.pos_ = token.pos_
            self.ent_type_ = token.ent_type_

    def __repr__(self):
        return self.text

    def __str__(self):
        return self.text

    def to_str(self):
        ''' serializes the token into the form accepted by the constructor'''
        return '_text:{},_lemma:{},_pos:{},_ent_type:{},'.format(self.text, self.lemma_, self.pos_, self.ent_type_)

    
class Tokens(object):
    ''' stores several tokens

    Wraps a list of ``Token`` objects and offers regex-style ``match`` / ``sub``
    over their serialized form (tokens joined by a single space).
    '''

    def __init__(self, tokens):
        self.tokens = self.load(tokens)

    def load(self, tokens):
        ''' converts every item to a Token, silently skipping invalid ones'''
        out = []
        for token in tokens:
            try:
                out.append(Token(token))
            except ValueError:
                # incorrect token, most likely 'space'
                continue

        return out

    def __repr__(self):
        return str([t.text for t in self.tokens])

    def __str__(self):
        return str([t.text for t in self.tokens])

    def __iter__(self):
        # Hand out an independent iterator on every call so nested loops over
        # the same object do not share (and exhaust) one cursor.
        self.i = 0  # legacy cursor, still reset for callers using next(tokens)
        return iter(self.tokens)

    def __next__(self):
        # Kept for backward compatibility with code calling next() on the
        # object itself after iter(); regular for-loops no longer use it.
        if self.i < len(self.tokens):
            out = self.tokens[self.i]
            self.i += 1
            return out
        else:
            raise StopIteration

    def to_str(self):
        ''' serializes all tokens, separated by a single space'''
        return ' '.join([t.to_str() for t in self.tokens])

    def ext_pattern(self, pattern):
        ''' extends regex pattern to be able to match all fields of token

        Every partial token spec in ``pattern`` (e.g. ``_lemma:what``) is
        expanded to the full four-field form; fields the caller left out are
        replaced by a wildcard over the allowed value characters.
        '''

        fields = ['_text', '_lemma', '_pos', '_ent_type']

        # Replacement template shared by every pass: re-emit all four fields,
        # filling each with its captured value (empty when the group is absent).
        out_pattern = ''.join(
            r'{}:\{},'.format(field, j + 1) for j, field in enumerate(fields)
        )

        for i in range(len(fields)):
            # Field i is mandatory (the anchor); the other fields are optional
            # so a spec naming any subset of fields is recognised.
            in_pattern = ''.join(
                r'{}:([\-a-zA-Z0-9]*),?'.format(field) if i == j
                else r'(?:{}:([\-a-zA-Z0-9]*),?)?'.format(field)
                for j, field in enumerate(fields)
            )
            pattern = re.sub(in_pattern, out_pattern, pattern)

        for field in fields:
            # An empty value means "unspecified": turn it into a wildcard.
            # Both sides are literal text, so plain str.replace is sufficient.
            pattern = pattern.replace(
                '{}:,'.format(field),
                r'{}:[\-a-zA-Z0-9]*,'.format(field),
            )

        return pattern

    def match(self, pattern):
        ''' matches pattern at the start of the serialized tokens

        Returns a dict mapping each named group to a ``Tokens`` object, or
        ``None`` when the pattern does not match. A named group that did not
        participate in the match yields an empty ``Tokens``.
        '''
        m = re.match(self.ext_pattern(pattern), self.to_str())
        if m is None:
            return None

        return {
            k: Tokens(v.split(' ') if v is not None else [])
            for k, v in m.groupdict().items()
        }

    def sub(self, pattern, repl):
        ''' returns new Tokens with every match of pattern replaced by repl'''
        new_str = re.sub(self.ext_pattern(pattern), repl, self.to_str())
        return Tokens(new_str.split(' '))
        

# Quick demonstration of Tokens.sub / Tokens.match on a single question.
parsed = nlp("What is the most fast way to visit France?")
tks = Tokens(parsed)

# sub() returns a new Tokens object; the result is not kept here.
tks.sub(
    r'_lemma:can _pos:PRON',
    '_text:to,_lemma:to,_pos:to',
)

# Last expression of the cell: the dict of named groups is the cell output.
tks.match(
    r'_lemma:what _lemma:be (_pos:DET )?(_pos:ADV |_pos:ADJ )*(_lemma:way|_lemma:method|_lemma:source) (_lemma:to|_lemma:of) ?(?P<target>.*)'
)


{'target': ['visit', 'France']}

In [658]:
DEFAULT_PATH = 'top20k-english-words.txt'

class SpellingCorrector():
    ''' frequency-based spelling corrector usable as a pipeline step

    ``fit`` counts how often every token occurs in the data; ``transform``
    replaces a token by its most frequent edit-distance-1 neighbour when that
    neighbour is at least ``MIN_LEN`` characters long, occurs more than
    ``MIN_RATIO`` times as often as the token itself, and the token is not in
    the whitelist file.
    '''

    MIN_LEN = 4
    MIN_RATIO = 2

    def __init__(self, whitelist_path=DEFAULT_PATH):
        self.whitelist_path = whitelist_path
        self.whitelist = None   # loaded lazily on first use
        self.tokens = {}        # token -> number of occurrences seen by fit()
        self.corrections = {}   # cache: token -> corrected token

    @staticmethod
    def _tokenize(s):
        ''' splits s into alternating runs of word / non-word characters'''
        return tuple(re.findall(r'\w+|\W+', s))

    @staticmethod
    def _edits1(word):
        ''' Based on http://norvig.com/spell-correct.html'''

        letters    = 'abcdefghijklmnopqrstuvwxyz-'
        splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
        deletes    = [L + R[1:]               for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
        replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
        inserts    = [L + c + R               for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def _correct_token(self, old_token):
        ''' returns the correction for a single token (cached per token)'''
        if old_token in self.corrections:
            return self.corrections[old_token]

        if self.whitelist is None:
            self.whitelist = self._load_whitelist()

        correct_token = old_token
        # Whitelisted tokens are never changed, so skip generating edits for them.
        if old_token not in self.whitelist:
            counts = self.tokens
            # sorted() makes the choice between equally frequent candidates
            # independent of set iteration order (i.e. reproducible across runs).
            candidates = sorted(self._edits1(old_token))
            candidates.append(old_token)
            new_token = max(candidates, key=lambda c: counts.get(c, 0))

            new_cnt = counts.get(new_token, 0)
            old_cnt = counts.get(old_token, 0)
            if len(new_token) >= self.MIN_LEN and new_cnt > self.MIN_RATIO*old_cnt:
                correct_token = new_token

        self.corrections[old_token] = correct_token

        return correct_token

    def _correct(self, jt):
        ''' corrects every token of the string jt and glues them back together'''
        return ''.join([self._correct_token(t) for t in self._tokenize(jt)])

    def _load_whitelist(self):
        ''' reads the whitelist file, one entry per line, into a set'''
        with open(self.whitelist_path, 'r') as f:
            return {line.strip('\n') for line in f}

    def fit(self, data, *args):
        ''' adds the token counts of data to the counts collected so far'''
        for row in data:
            for token in self._tokenize(row):
                self.tokens[token] = self.tokens.get(token, 0) + 1

        # Cached decisions were derived from the previous counts.
        self.corrections = {}

        return self

    def transform(self, data):
        ''' returns a list with the corrected version of every row'''
        if self.whitelist is None:
            self.whitelist = self._load_whitelist()

        return [self._correct(row) for row in data]


class Parser(BaseEstimator, TransformerMixin):
    ''' turns raw strings into row dicts holding the parsed tokens

    Each output row has the keys ``raw`` (the input string), ``parsed``
    (``Tokens`` built from the module-level ``nlp`` pipeline) and ``query``
    (always ``None`` at this stage).
    '''

    def fit(self, rows, *args):
        # Stateless step: nothing to learn.
        return self

    def transform(self, rows):
        return [
            {
                'raw': s,
                'parsed': Tokens(nlp(s)),
                'query': None,
            }
            for s in rows
        ]
    
    
class StripPunctuation(BaseEstimator, TransformerMixin):
    ''' removes tokens tagged ``_pos:PUNCT`` from the parsed tokens of each row'''

    def fit(self, rows, *args):
        # Stateless step: nothing to learn.
        return self

    def transform(self, rows):
        return [
            {
                'raw': row['raw'],
                # Replacing the token by '' leaves an empty chunk that
                # Tokens drops when it re-parses the string.
                'parsed': row['parsed'].sub(r'_pos:PUNCT', ''),
                'query': row['query'],
            }
            for row in rows
        ]
    
    
class UnifySyntax(BaseEstimator, TransformerMixin):
    ''' rewrites "<auxiliary> <pronoun|one>" into a single "to" token

    Examples of the intended effect:
        Where can I VERB => Where to VERB
        How does one VERB => How to VERB
    '''

    def fit(self, rows, *args):
        # Stateless step: nothing to learn.
        return self

    def transform(self, rows):
        # Auxiliary verb lemma followed by a pronoun (or the lemma "one").
        aux_then_subject = r'(_lemma:be|_lemma:do|_lemma:can|_lemma:could|_lemma:shall|_lemma:should|_lemma:will|_lemma:would) (_pos:PRON|_lemma:one)'
        # Serialized replacement token.
        to_token = '_text:to,_lemma:to,_pos:to,_ent_type:'

        return [
            {
                'raw': row['raw'],
                'parsed': row['parsed'].sub(aux_then_subject, to_token),
                'query': row['query'],
            }
            for row in rows
        ]

    
class GetQuery(BaseEstimator, TransformerMixin):
    ''' base step that classifies a row by matching token patterns

    Subclasses provide ``self.patterns`` (tried in order) and ``self.type``.
    Rows that already carry a query are passed through as a deep copy; for
    the others the first matching pattern produces
    ``{'args': <named groups>, 'type': self.type}``.
    '''

    def fit(self, rows, *args):
        # Stateless step: nothing to learn.
        return self

    def _build_query(self, parsed):
        ''' returns the query for the first matching pattern, else None'''
        for pattern in self.patterns:
            args = parsed.match(pattern)
            if args is not None:
                return {'args': args, 'type': self.type}
        return None

    def transform(self, rows):
        out = []
        for row in rows:
            if row['query'] is not None:
                # Already classified by an earlier step: keep as is.
                out.append(deepcopy(row))
            else:
                out.append({
                    'raw': row['raw'],
                    'parsed': row['parsed'],
                    'query': self._build_query(row['parsed']),
                })

        return out
    
    
class HowToQuestion(GetQuery):
    ''' detects "how to ..." style questions; the rest is captured as ``target``'''

    def __init__(self):
        self.patterns = [
            # How to
            r'_lemma:how _lemma:to ?(?P<target>.*)',
            # What to do to
            r'_lemma:what _lemma:to _lemma:do _lemma:to ?(?P<target>.*)',
            # What are some ways to, What is the way to,
            # What are the most effective method to, What is the step by step guide
            r'_lemma:what _lemma:be (_pos:DET )?(_pos:ADV |_pos:ADJ )*((_lemma:step _lemma:by _lemma:step ))?(_lemma:way|_lemma:method|_lemma:source|_lemma:guide) (_lemma:to|_lemma:of) ?(?P<target>.*)',
            # How can
            r'_lemma:how _lemma:can ?(?P<target>.*)',
        ]
        self.type = 'HowTo'

class FactWhatQuestion(GetQuery):
    ''' detects questions starting with what/which + "be"

    Examples: 'What is Affiliate Marketing?',
    'What are the best white tequila brands?'.
    Everything after the verb is captured as ``subject``.
    '''

    def __init__(self):
        self.patterns = [
            r'(_lemma:what|_lemma:which) _lemma:be ?(?P<subject>.*)',
        ]
        self.type = 'FactWhat'
        
class FactWhoQuestion(GetQuery):
    ''' detects questions starting with who + "be"

    Everything after the verb is captured as ``subject``.
    '''

    def __init__(self):
        self.patterns = [
            r'_lemma:who _lemma:be ?(?P<subject>.*)',
        ]
        self.type = 'FactWho'

class FactWhenQuestion(GetQuery):
    ''' detects questions starting with when + "be"

    Example: 'When is new years eve?'.
    Everything after the verb is captured as ``subject``.
    '''

    def __init__(self):
        self.patterns = [
            r'_lemma:when _lemma:be ?(?P<subject>.*)',
        ]
        self.type = 'FactWhen'
        
class FactWhereQuestion(GetQuery):
    ''' detects questions starting with where + "be"

    Everything after the verb is captured as ``subject``.
    '''

    def __init__(self):
        self.patterns = [
            r'_lemma:where _lemma:be ?(?P<subject>.*)',
        ]
        self.type = 'FactWhere'
        

class TrueFalseQuestion(GetQuery):
    ''' detects yes/no questions of the form "be <subject> <property>"

    Examples: 'Are ferrari cars faster than porsche?',
    'Are black beans complex carbs?'.
    '''

    def __init__(self):
        # Alternative shapes of a phrase, expressed over POS tags.
        adj_verb_nouns = r'(( _pos:ADJ)*(( _pos:ADV)* _pos:VERB)?( _pos:ADJ)*( (_pos:NOUN)|(_pos:PROPN)){1,3})'
        adv_adp_nouns = r'( _pos:ADV _pos:ADP( _pos:ADJ)*( (_pos:NOUN)|(_pos:PROPN)){1,3})'
        verb_rest = r'( _pos:VERB .*)'
        adj_or_adv = r'( (_pos:ADJ)|(_pos:ADV))'
        # NOTE(review): this alternative is built but never joined into the
        # phrase pattern below -- confirm whether that is intentional.
        nouns_verb_nouns = r'(( (_pos:NOUN)|(_pos:PROPN)){1,3} _pos:VERB( (_pos:NOUN)|(_pos:PROPN)){1,3})'

        alternatives = [adj_verb_nouns, adv_adp_nouns, verb_rest, adj_or_adv]
        phrase = '({})'.format('|'.join(alternatives))
        # A phrase optionally followed by an adposition and another phrase.
        phrase_with_adp = '{0}( _pos:ADP{0})?'.format(phrase)

        self.patterns = [
            '_lemma:be(?P<subject>' + phrase_with_adp + ')(?P<property>' + phrase_with_adp + ')',
        ]
        self.type = 'TrueFalse'
        
        
class Lemmatizer(BaseEstimator, TransformerMixin):
    ''' replaces the parsed tokens of each row by the list of their lemmas'''

    def fit(self, rows, *args):
        # Stateless step: nothing to learn.
        return self

    def transform(self, rows):
        return [
            {
                'raw': row['raw'],
                'parsed': [t.lemma_ for t in row['parsed']],
                'query': row['query'],
            }
            for row in rows
        ]


In [659]:
def normalize(rows):
    ''' runs the whole normalization pipeline over raw question strings

    Builds a fresh pipeline on every call (spelling correction, parsing,
    punctuation stripping, syntax unification, then the question-type
    detectors in order) and returns the result of ``fit_transform``.
    '''
    steps = [
        SpellingCorrector(),
        Parser(),
        StripPunctuation(),
        UnifySyntax(),
        HowToQuestion(),
        FactWhatQuestion(),
        FactWhoQuestion(),
        FactWhereQuestion(),
        FactWhenQuestion(),
        TrueFalseQuestion(),
#        Lemmatizer(),
    ]

    return make_pipeline(*steps).fit_transform(rows)

In [660]:
# Smoke test on a single question; the returned rows are the cell output.
# To inspect the serialized tokens instead, append: [0]['parsed'].to_str().split( )
normalize(["How does 3D printing work?"])

[{'parsed': ['How', 'does', '3D', 'printing', 'work'],
  'query': None,
  'raw': 'How does 3D printing work?'}]

In [661]:
%%time

def count_similarity(rows1, rows2, dupl):
    ''' summarizes how well normalization separates duplicates from non-duplicates

    Args:
        rows1, rows2: normalized rows (dicts with ``parsed`` and ``query``)
            for the first and second question of every pair.
        dupl: per pair, 1 if the questions are duplicates and 0 otherwise.

    Returns:
        Tuple ``(sim0, sim1, query_all, query0, query1)``:
        share of pairs normalized to the same form among non-duplicates /
        duplicates, share of all rows that got a query, and share of pairs
        with the same query type among non-duplicates / duplicates (pairs
        where either query is missing are ignored for the last two).
    '''

    def get_type(q1, q2):
        if q1['query'] is None or q2['query'] is None:
            # np.nan (np.NaN no longer exists in NumPy 2.x); skipped by nanmean
            return np.nan
        return q1['query']['type'] == q2['query']['type']

    def get_normalized(q1, q2):
        # todo
        # Compare printable forms, exactly as done for the query args below:
        # Tokens defines no __eq__, so `==` would only test object identity.
        if str(q1['parsed']) == str(q2['parsed']):
            return True
        if q1['query'] is None or q2['query'] is None:
            return False
        return str(q1['query']['args']) == str(q2['query']['args'])

    # Split the pairs once instead of re-zipping for every statistic.
    pairs = list(zip(rows1, rows2, dupl))
    non_duplicates = [(q1, q2) for q1, q2, d in pairs if d == 0]
    duplicates = [(q1, q2) for q1, q2, d in pairs if d == 1]

    sim0 = np.mean([get_normalized(q1, q2) for q1, q2 in non_duplicates])
    sim1 = np.mean([get_normalized(q1, q2) for q1, q2 in duplicates])

    query_all = np.mean([q['query'] is not None for q in list(rows1) + list(rows2)])

    query0 = np.nanmean([get_type(q1, q2) for q1, q2 in non_duplicates])
    query1 = np.nanmean([get_type(q1, q2) for q1, q2 in duplicates])

    return sim0, sim1, query_all, query0, query1

# Normalize both question columns independently, then print the summary.
rows1 = normalize(qs1)
rows2 = normalize(qs2)

similarity_stats = count_similarity(rows1, rows2, dupl)
print('''
Fully normalized 0: {:.3f}
Fully normalized 1: {:.3f}

Query type: {:.3f}
Same type 0: {:.3f}
Same type 1: {:.3f}
'''.format(*similarity_stats))


Fully normalized 0: 0.003
Fully normalized 1: 0.016

Query type: 0.447
Same type 0: 0.840
Same type 1: 0.947

CPU times: user 10.4 s, sys: 50.3 ms, total: 10.4 s
Wall time: 10.5 s


In [166]:
# wordnet unify experiments

%%time

# Step 1: index every WordNet synset by part of speech.
# parsed_synsets: pos -> {tuple of lemma names -> synset name up to the first '.'}
parsed_synsets = {} # pos : lemmas : name
for synset in wordnet.all_synsets():
    pos = synset.pos()
    name = synset.name().split('.')[0]
    lemmas = tuple(synset.lemma_names())
    
    if pos not in parsed_synsets:
        parsed_synsets[pos] = {}
    parsed_synsets[pos][lemmas] = name


# Step 2: build one vector per synset name by summing the spaCy vectors of all
# words occurring in its lemma names.
# synset_vec: pos -> {name -> array of shape (1, 300), dtype float32}
# Entries are keyed by name, so a later synset with the same name overwrites
# an earlier one.
synset_vec = {} # pos : name : vec
for pos, synsets in parsed_synsets.items():
    synset_vec[pos] = {}
    
    for lemmas, name in synsets.items():
        # Multi-word lemmas are joined with '_'; split them into single words.
        tokens = []
        for lemma in lemmas:
            for token in lemma.split('_'):
                tokens.append(token)
    
        vec = np.zeros((1, 300), dtype=np.float32)
        for wordnet_token in tokens:
            for spacy_token in nlp(wordnet_token):
                spacy_vec = spacy_token.vector
                # A (near-)zero vector is replaced by a random one.
                # NOTE(review): no seed is set in this cell, so these random
                # vectors differ between runs.
                if sum(abs(spacy_vec)) < 0.001:
                    vec += np.random.rand(1, 300)
                else:
                    vec += spacy_vec

        synset_vec[pos][name] = vec

# Step 3: fit one cosine nearest-neighbour index per part of speech.
# pos_nn: pos -> [list of names, fitted NearestNeighbors]; row i of the fitted
# matrix belongs to names[i].
pos_nn = {} # pos : [names, nn]
for pos, name_vec in synset_vec.items():
    names = list(name_vec.keys())
    vec_lst = list(name_vec.values())
    # Stack the (1, 300) vectors into a single (n_names, 300) matrix.
    vec_matrix = np.concatenate(vec_lst, axis=0)
    
    nn = NearestNeighbors(
        n_neighbors=5,
        metric='cosine',
        algorithm='brute',
    )
    nn.fit(vec_matrix)

    pos_nn[pos] = [names, nn]


def get_similar(token, pos):
    ''' prints the nearest synset names (with distances) for a word

    Args:
        token: text whose first spaCy token supplies the query vector.
        pos: part-of-speech key selecting the index in ``pos_nn``.

    Returns nothing; one "name distance" line is printed per neighbour.
    '''
    query_vec = np.reshape(nlp(token)[0].vector, (1, 300))
    synset_names, index = pos_nn[pos]
    distances, indices = index.kneighbors(query_vec)

    # kneighbors returns one row per query vector; there is exactly one here.
    for dist, idx in zip(distances[0], indices[0]):
        print(synset_names[idx], dist)

# Demo for 'gorilla': get_similar prints its neighbours (and contributes None to the tuple),
# next to the lemma names of the first TextBlob synset and the spaCy document vector.
get_similar('gorilla', 'n'), Word('gorilla').synsets[0].lemma_names(), nlp('gorilla').vector

CPU times: user 4.31 s, sys: 246 ms, total: 4.56 s
Wall time: 4.6 s
