# Sentence Similarity Measures IIIa: Baseline Wide-Inclusive Sentence Featurization

## 0. Contents

* I. Corpora:
    * MSR Paraphrase Corpus
    * Brown
* II. Discriminativity Weighting (Brown, SpaCy lemmatization)
* III. Featurization:
    * Features:
        * Unigram Prec/Rec (Wan et al. 2006) 
        * Bleu Prec/Rec (Papineni et al. 2002)
        * Dependency Prec/Rec (Wan et al. 2006; Moll$\acute{a}$ 2003; Hovy et al. 2015)
        * F1 for Unigram, Bleu & Dependency
        * Tree Edit Distance (Zhang & Sasha Algorithm)
        * Sentence Lengths (Wan et al. 2006)
    * Featurization Function
* IV. Paraphrase Classifier:
    * Training: MSR Paraphrase Corpus
    * Classifier Types:
        * Logistic
        * SVM
* V. Evaluation

## I. Corpora

In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import brown
from spacy.en import English
from collections import defaultdict

In [2]:
parser = English()

##### Load MSR

In [4]:
train_path = "/Users/jacobsw/Desktop/WORK/OJO_CODE/SENTENCE_SIMILARITIES/CORPORA/paraphrase/msr_paraphrase_train.txt"
test_path = "/Users/jacobsw/Desktop/WORK/OJO_CODE/SENTENCE_SIMILARITIES/CORPORA/paraphrase/msr_paraphrase_test.txt"

In [5]:
df_train = pd.read_table(train_path, encoding='utf-8-sig')
df_test = pd.read_table(test_path, encoding='utf-8-sig')
df_train.head()

Unnamed: 0,Quality,#1 ID,#2 ID,#1 String,#2 String
0,1,702876,702977,"Amrozi accused his brother, whom he called the...","Referring to him as only the witness, Amrozi a..."
1,0,2108705,2108831,Yucaipa owned Dominick's before selling the ch...,Yucaipa bought Dominick's in 1995 for $693 mil...
2,1,1330381,1330521,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an..."
3,0,3344667,3344648,"Around 0335 GMT, Tab shares were up 19 cents, ...","Tab shares jumped 20 cents, or 4.6%, to set a ..."
4,1,1236820,1236712,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...


In [6]:
print df_train.shape
print df_test.shape

(4076, 5)
(1725, 5)


In [7]:
df_train.ix[0] # NB: index Quality is actually weirdly '﻿Quality'.

Quality                                                      1
#1 ID                                                   702876
#2 ID                                                   702977
#1 String    Amrozi accused his brother, whom he called the...
#2 String    Referring to him as only the witness, Amrozi a...
Name: 0, dtype: object

In [8]:
df_train.ix[0]['#1 String']

u'Amrozi accused his brother, whom he called the witness, of deliberately distorting his evidence.'

##### Make Train/Test

In [9]:
dep_lemmas = lambda parsed_s: {(token.head.lemma_,token.lemma_) for token in parsed_s
                             if token.head.lemma_!=token.lemma_} # eliminte (v, ROOT, v) cases
dep_tokens = lambda parsed_s: {(token.head.orth_,token.orth_) for token in parsed_s
                             if token.head.lemma_!=token.lemma_}

In [10]:
get_root = lambda parsed_s: [token for token in parsed_s if token.lemma_==token.head.lemma_][0]

In [19]:
def parse_msr(df, indexer):
    
    X_dic, Y_dic = defaultdict(lambda x: defaultdict(list)), \
                   defaultdict(lambda x: defaultdict(list))
    
    for i in indexer:
        
        entry_dic = defaultdict(list)
        s1, s2 = df.ix[i]['#1 String'][:-1], \
                 df.ix[i]['#2 String'][:-1] 
                # get rid of period, which causes problem in distinguishing identical tokens.
        
        parsed_s1, parsed_s2 = parser(unicode(s1)), parser(unicode(s2))
        
        entry_dic['s1'] = [token.orth_ for token in parsed_s1]
        entry_dic['s2'] = [token.orth_ for token in parsed_s2]
        entry_dic['s1_lm'] = [token.lemma_ for token in parsed_s1]
        entry_dic['s2_lm'] = [token.lemma_ for token in parsed_s2]
        
        parsed_lm_s1, parsed_lm_s2 = parser(' '.join(entry_dic['s1_lm'])), \
                                    parser(' '.join(entry_dic['s2_lm'])) # parse on lemmas.
        
        entry_dic['s1_dep_lm'] = dep_lemmas(parsed_lm_s1) # for dep lemma features.
        entry_dic['s2_dep_lm'] = dep_lemmas(parsed_lm_s2)
        entry_dic['s1_dep_tk'] = dep_tokens(parsed_s1) # for dep token features.
        entry_dic['s2_dep_tk'] = dep_tokens(parsed_s2) 
        entry_dic['s1_root_lm'] = get_root(parsed_lm_s1)
        entry_dic['s2_root_lm'] = get_root(parsed_lm_s2)
        entry_dic['s1_root_tk'] = get_root(parsed_s1)
        entry_dic['s2_root_tk'] = get_root(parsed_s2)
        entry_dic['s1_id'] = df.ix[i]['#1 ID'] # for error analysis later.
        entry_dic['s2_id'] = df.ix[i]['#2 ID']
        X_dic[i] = entry_dic
        Y_dic[i] = df.ix[i]['Quality']
    
    return X_dic, Y_dic


In [20]:
%%time
X_train, Y_train = parse_msr(df_train, df_train.index)

CPU times: user 30.3 s, sys: 311 ms, total: 30.6 s
Wall time: 30.7 s


In [21]:
%%time
X_test, Y_test = parse_msr(df_test, df_test.index)

CPU times: user 12.7 s, sys: 142 ms, total: 12.8 s
Wall time: 12.9 s


In [22]:
print 'sentence 1: ', X_train[0]['s1']; print
print 'sentence 2: ', X_train[0]['s2']; print
print 'paraphrase label: ', Y_train[0]

sentence 1:  [u'Amrozi', u'accused', u'his', u'brother', u',', u'whom', u'he', u'called', u'the', u'witness', u',', u'of', u'deliberately', u'distorting', u'his', u'evidence']

sentence 2:  [u'Referring', u'to', u'him', u'as', u'only', u'the', u'witness', u',', u'Amrozi', u'accused', u'his', u'brother', u'of', u'deliberately', u'distorting', u'his', u'evidence']

paraphrase label:  1


##### Load Brown

In [23]:
def parse_brown():
    
    sents = brown.sents()
    parsed_sents = [parser(' '.join(sent)) for sent in sents]
    lemma_words = [token.lemma_ for parsed_sent in parsed_sents for token in parsed_sent]
    
    return lemma_words

In [24]:
%%time
brown_words = parse_brown()

CPU times: user 1min 43s, sys: 910 ms, total: 1min 44s
Wall time: 1min 44s


In [25]:
N = len(brown_words)
N

1188973

## I. Discriminativity Weighting (IDF)

**Math**

* $IDF(w) = log\frac{N}{df_w}$, where $N$ is the number of words in a corpus; $df_w$ is word $w$'s frequency in the corpus.

In [52]:
from __future__ import division

In [66]:
def log(x):
    return np.log(x) if x>0 else 0 
    # intuitively N > word_count(w) for any w,
    #  therefore we cannot let idf(w) be negative
    #  even when word_count(w) = 0 for w.
def div(x, y):
    return x/y if y!=0 else 0

In [33]:
def idf(w):
    
    return log(div(N,brown_words.count(w)))

In [34]:
print "'the': ", idf('the')
print "'discriminate': ", idf('discriminate')

'the':  2.77258872224
'discriminate':  12.0426886361


## IIa. Features

### A.  Unigram Prec/Rec

**Math**

* $Uni\_Prec(s_1,s_2) = \frac{word\_overlap(s_1,s_2)}{word\_count(s_1)}$ (cf. Wan et al. 2006:133, weighted by $IDF$)


* $Uni\_Rec(s_1,s_2) = \frac{word\_overlap(s_1,s_2)}{word\_count(s_2)}$ (cf. ibid.)


* If needed, $TF-IDF: \frac{\left(\sum_{w\in s_1\cap s_2}log\frac{N}{df_w}\right)}{\left(\sum_{w\in s_1}log\frac{N}{df_w}\right)}$

In [46]:
def intersection(s1, s2):
    return set(s1).intersection(set(s2))

def word_overlap(s1, s2):
    return len(intersection(s1,s2))

def lemmatize(s):
    return [token.lemma_ for token in parser(' '.join(s))]

In [47]:
def uni_prec(s1, s2): # s1,s2 assumed to be lists of words (lemmas or tokens)

    return div(word_overlap(s1,s2),len(s1))


In [48]:
def uni_rec(s1, s2): # s1,s2 assumed to be lists of words (lemmas or tokens)

    return div(word_overlap(s1,s2),len(s2))

In [38]:
s0 = X_train[0]['s1']
s1 = X_train[0]['s2'] # known to be the paraphrase of q
s2 = X_train[1]['s1'] # known to not be the paraphrase of q
s0_lm = X_train[0]['s1_lm']
s1_lm = X_train[0]['s2_lm']
s2_lm = X_train[1]['s1_lm']

In [54]:
%%time
print uni_prec(s0,s1)
print uni_prec(s0,s2)
print uni_prec(s0_lm,s1_lm)
print uni_prec(s0_lm,s2_lm)

0.6875
0.0625
0.6875
0.0625
CPU times: user 274 µs, sys: 184 µs, total: 458 µs
Wall time: 265 µs


In [55]:
%%time
print uni_rec(s0,s1)
print uni_rec(s0,s2)
print uni_rec(s0_lm,s1_lm)
print uni_rec(s0_lm,s2_lm)

0.647058823529
0.0625
0.647058823529
0.0625
CPU times: user 240 µs, sys: 152 µs, total: 392 µs
Wall time: 297 µs


### B. BLEU Prec/Rec

**NB** (cf. Wan et al. 2006:133)

* "... Bleu metric uses the geometric average of unigram, bigram and trigram precision scores."
* "... by reversing [two sentences], ... a recall version of Bleu is obtained."

In [56]:
from nltk import bleu

In [57]:
def bleu_prec(s1, s2, lemmatized=False): # s1 as the 'hypothesis'

    return bleu(s2,s1)

In [58]:
def bleu_rec(s1, s2, lemmatized=False): # s2 as the 'hypothesis' 
    
    return bleu(s1,s2)

In [59]:
%%time
print bleu_prec(s0,s1)
print bleu_prec(s0,s2)
print bleu_prec(s0_lm,s1_lm)
print bleu_prec(s0_lm,s2_lm)

0.5
0
0.5
0
CPU times: user 6.78 ms, sys: 2.31 ms, total: 9.1 ms
Wall time: 7.54 ms


In [60]:
%%time
print bleu_rec(s0,s1)
print bleu_rec(s0,s2)
print bleu_rec(s0_lm,s1_lm)
print bleu_rec(s0_lm,s2_lm)

0.492479060505
0
0.492479060505
0
CPU times: user 7.09 ms, sys: 2.98 ms, total: 10.1 ms
Wall time: 7.75 ms


### C. Dependency Prec/Rec

**Math**

* $Dep\_Prec(s_1,s_2) = \frac{|dep\_pair(s_1)|\cap|dep\_pair(s_2)|}{|dep\_pair(s_1)|}$ (cf. Wan et al. 2006:134)


* $Dep\_Rec(s_1,s_2) = \frac{|dep\_pair(s_1)|\cap|dep\_pair(s_2)|}{|dep\_pair(s_2)|}$ (cf. ibid.)

**NB**: $relation$ in the reference confuses *dependency pair* with *dependency relation*. $relation$ refers to "... a pair of words in a parent-child relationship within the dependency tree, referred to as head-modifier relationship. ... we ignore the label of the relationships which indicates the semantic role".

In [61]:
def dep_prec(dep_pairs_s1, dep_pairs_s2):
    
    return div(len(dep_pairs_s1.intersection(dep_pairs_s2)),
               len(dep_pairs_s1))
        

In [62]:
def dep_rec(dep_pairs_s1, dep_pairs_s2):
    
    return div(len(dep_pairs_s1.intersection(dep_pairs_s2)),
               len(dep_pairs_s2))


In [63]:
s0_dep_tk = X_train[0]['s1_dep_tk']
s1_dep_tk = X_train[0]['s2_dep_tk']
s2_dep_tk = X_train[1]['s1_dep_tk']
s0_dep_lm = X_train[0]['s1_dep_lm']
s1_dep_lm = X_train[0]['s2_dep_lm']
s2_dep_lm = X_train[1]['s1_dep_lm']

In [64]:
%%time
print dep_prec(s0_dep_tk,s1_dep_tk)
print dep_prec(s0_dep_tk,s2_dep_tk)
print dep_prec(s0_dep_lm,s1_dep_lm)
print dep_prec(s0_dep_lm,s2_dep_lm)

0.571428571429
0.0
0.533333333333
0.0
CPU times: user 183 µs, sys: 68 µs, total: 251 µs
Wall time: 198 µs


In [65]:
%%time
print dep_prec(s0_dep_tk,s1_dep_tk)
print dep_prec(s0_dep_tk,s2_dep_tk)
print dep_prec(s0_dep_lm,s1_dep_lm)
print dep_prec(s0_dep_lm,s2_dep_lm)

0.571428571429
0.0
0.533333333333
0.0
CPU times: user 224 µs, sys: 87 µs, total: 311 µs
Wall time: 264 µs


### D. F1

**Math**

* $F1 = 2\cdot\frac{prec\cdot rec}{prec + rec}$ (cf. https://en.wikipedia.org/wiki/F1_score)

In [67]:
def f1(s1_info, s2_info, fn_prec, fn_rec):
    # s1_info, s2_info: could be s1/s2 as lists of words, or dep-pairs.
    
    prec, rec = fn_prec(s1_info,s2_info), fn_rec(s1_info,s2_info)
    
    return prec, rec, 2*div(prec*rec,prec+rec)

In [69]:
%%time
print f1(s0,s1,uni_prec,uni_rec)[2]
print f1(s0,s2,uni_prec,uni_rec)[2]
print f1(s0_lm,s1_lm,uni_prec,uni_rec)[2]
print f1(s0_lm,s2_lm,uni_prec,uni_rec)[2]

0.666666666667
0.0625
0.666666666667
0.0625
CPU times: user 199 µs, sys: 81 µs, total: 280 µs
Wall time: 220 µs


In [70]:
%%time
print f1(s0,s1,bleu_prec,bleu_rec)[2]
print f1(s0,s2,bleu_prec,bleu_rec)[2]
print f1(s0_lm,s1_lm,bleu_prec,bleu_rec)[2]
print f1(s0_lm,s2_lm,bleu_prec,bleu_rec)[2]

0.496211033666
0
0.496211033666
0
CPU times: user 12.5 ms, sys: 5.74 ms, total: 18.2 ms
Wall time: 14 ms


In [71]:
%%time
print f1(s0_dep_tk,s1_dep_tk,dep_prec,dep_rec)[2]
print f1(s0_dep_tk,s2_dep_tk,dep_prec,dep_rec)[2]
print f1(s0_dep_lm,s1_dep_lm,dep_prec,dep_rec)[2]
print f1(s0_dep_lm,s2_dep_lm,dep_prec,dep_rec)[2]

0.533333333333
0
0.516129032258
0
CPU times: user 232 µs, sys: 228 µs, total: 460 µs
Wall time: 280 µs


### E. Tree Edit Distance

In [72]:
from zss import simple_distance, Node
    # use zss.distance if dynamic tree modification is needed. 
    #  cf. zss api: pythonhosted.org/zss/.

In [73]:
def make_tree(token, lemmatized):
    
    node = Node(token.lemma_) if lemmatized else Node(token.orth_)
    for child in token.children:
        node.addkid(make_tree(child, lemmatized)) 
    return node

In [74]:
def tree_edit_dist(root_s1, root_s2, lemmatized=False):
    
    return simple_distance(make_tree(root_s1, lemmatized),
                           make_tree(root_s2, lemmatized))
    

In [75]:
s0_root_lm = X_train[0]['s1_root_lm']
s1_root_lm = X_train[0]['s2_root_lm']
s2_root_lm = X_train[1]['s1_root_lm']

In [76]:
%%time
print tree_edit_dist(s0_root_lm,s1_root_lm)
print tree_edit_dist(s0_root_lm,s2_root_lm)
print tree_edit_dist(s0_root_lm,s1_root_lm, lemmatized=True)
print tree_edit_dist(s0_root_lm,s2_root_lm, lemmatized=True)

15
21
15
21
CPU times: user 30.6 ms, sys: 7.67 ms, total: 38.3 ms
Wall time: 32.9 ms


In [77]:
s0_root_tk = X_train[0]['s1_root_tk']
s1_root_tk = X_train[0]['s2_root_tk']
s2_root_tk = X_train[1]['s1_root_tk']

In [78]:
%%time
print tree_edit_dist(s0_root_tk,s1_root_tk)
print tree_edit_dist(s0_root_tk,s2_root_tk)

13
21
CPU times: user 17.8 ms, sys: 7.07 ms, total: 24.8 ms
Wall time: 19.7 ms


### F. Sentence Lengths

* "... the difference in length of two sentences ... measured in words by subtracting one length from the other." (cf. Wan et al. 2006:134)
* "... this difference could be a negative or positive integer ... an absolute variant was used." (cf. ibid.)

In [101]:
def sent_len_diffs(s1, s2):
    
    diff = len(s1)-len(s2)
    
    return diff, abs(diff)

In [102]:
print sent_len_diffs(s0,s1)
print sent_len_diffs(s0,s2)

(-1, 1)
(0, 0)


## IIb: Featurization Function

**Features (22 in total)**:

* Unigram Prec/Rec + lemmatized variant: 4
* Bleu Prec/Rec + lemmatized variant: 4
* Dependency Prec/Rec + lemmatized variant: 4
* F1 Unigram, Bleu, Dependency + lemmatized variant: 6
* Tree Edit Distance + lemmatized variant: 2
* Sentence Lengths: 2

In [90]:
import numpy as np

In [81]:
class Cache(object):
    
    def __init__(self):
        self.parse = {}
        self.token_s = {}
        self.token_lm = {}
        self.dep_lm = {}
        self.dep_tk = {}
        self.root = {}
       
    def get_parse(self, s):
        if s not in self.parse:
            self.parse[s] = parser(s)
        return self.parse[s]

    def get_token_s(self, s):
        if s not in self.token_s:
            self.token_s[s] = [token.orth_ for token in self.get_parse(s)]
        return self.token_s[s]
   
    def get_token_lm(self, s):
        if s not in self.token_lm:
            self.token_lm[s] = [token.lemma_ for token in self.get_parse(s)]
        return self.token_lm[s]
   
    def get_dep_lm(self, s):
        if s not in self.dep_lm:
            self.dep_lm[s] = dep_lemmas(self.get_parse(s))
        return self.dep_lm[s]
   
    def get_dep_tk(self, s):
        if s not in self.dep_tk:
            self.dep_tk[s] = dep_tokens(self.get_parse(s))
        return self.dep_tk[s]
   
    def get_root(self, s):
        if s not in self.root:
            self.root[s] = get_root(self.get_parse(s))
        return self.root[s]
    

In [82]:
CACHE = Cache()

In [109]:
def featurize_new(s1, s2): 
    # featurize a new input.
    # s1 and s2 are strings

    s1_s = CACHE.get_token_s(s1)
    s2_s = CACHE.get_token_s(s2)    
    s1_lm = CACHE.get_token_lm(s1)
    s2_lm = CACHE.get_token_lm(s2)
    s1_dep_lm = CACHE.get_dep_lm(s1)
    s2_dep_lm = CACHE.get_dep_lm(s2)
    s1_dep_tk = CACHE.get_dep_tk(s1)
    s2_dep_tk = CACHE.get_dep_tk(s2)    
    s1_root = CACHE.get_root(s1)
    s2_root = CACHE.get_root(s2)

    return list(f1(s1_s,s2_s,uni_prec,uni_rec)+f1(s1_lm,s2_lm,uni_prec,uni_rec)+\
                f1(s1_s,s2_s,bleu_prec,bleu_rec)+f1(s1_lm,s2_lm,bleu_prec,bleu_rec)+\
                f1(s1_dep_tk,s2_dep_tk,dep_prec,dep_rec)+f1(s1_dep_lm,s2_dep_lm,dep_prec,dep_rec)+\
                (tree_edit_dist(s1_root,s2_root),)+(tree_edit_dist(s1_root,s2_root,lemmatized=True),)+\
                sent_len_diffs(s1_s, s2_s))


In [110]:
s0_asnew, s1_asnew, s2_asnew = ' '.join(s0), ' '.join(s1), ' '.join(s2)
print s0_asnew
print s1_asnew
print s2_asnew

Amrozi accused his brother , whom he called the witness , of deliberately distorting his evidence
Referring to him as only the witness , Amrozi accused his brother of deliberately distorting his evidence
Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion


In [111]:
%%time
print featurize_new(s0_asnew,s1_asnew)
print 
print featurize_new(s0_asnew,s2_asnew)

[0.6875, 0.6470588235294118, 0.6666666666666667, 0.6875, 0.6470588235294118, 0.6666666666666667, 0.5, 0.4924790605054523, 0.49621103366618263, 0.5, 0.4924790605054523, 0.49621103366618263, 0.5714285714285714, 0.5, 0.5333333333333333, 0.5714285714285714, 0.5, 0.5333333333333333, 13, 13, -1, 1]

[0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0, 0, 0, 0, 0, 0, 0.0, 0.0, 0, 0.0, 0.0, 0, 21, 21, 0, 0]
CPU times: user 44.3 ms, sys: 6.86 ms, total: 51.1 ms
Wall time: 46.1 ms


## IV. Paraphrase Classifier

### A. Featurizing Training/Test from MSR

* "... the training set contains 2753 true paraphrase pairs and 1323 false paraphrase pairs; ... the test set contains 1147 and 578 pairs, respectively." (cf. Ji & Eisenstein 2013:893)

In [262]:
print X_train[0].keys()

['s1_dep_lm', 's1_lm', 's2_dep_lm', 's2', 's1', 's2_root', 's1_dep_tk', 's1_root', 's1_id', 's2_dep_tk', 's2_id', 's2_lm']


In [420]:
def featurize_set(X, Y):
    
    X_list, Y_list = [], []
    
    for i in xrange(len(X)):

        uni_tk_prec, uni_tk_rec, uni_tk_f1 = f1_unigram(X[i]['s1'], X[i]['s2'])
        uni_lm_prec, uni_lm_rec, uni_lm_f1 = f1_unigram(X[i]['s1_lm'], X[i]['s2_lm'])
        bleu_tk_prec, bleu_tk_rec, bleu_tk_f1 = f1_bleu(X[i]['s1'], X[i]['s2'])
        bleu_lm_prec, bleu_lm_rec, bleu_lm_f1 = f1_bleu(X[i]['s1_lm'], X[i]['s2_lm'])
        dep_tk_prec, dep_tk_rec, dep_tk_f1 = f1_dep(X[i]['s1_dep_tk'], X[i]['s2_dep_tk'])
        dep_lm_prec, dep_lm_rec, dep_lm_f1 = f1_dep(X[i]['s1_dep_lm'], X[i]['s2_dep_lm'])
        tree_tk_dist = tree_edit_dist(X[i]['s1_root'], X[i]['s2_root'])
        tree_lm_dist = tree_edit_dist(X[i]['s1_root'], X[i]['s2_root'],lemmatized=True) 
        diff, abs_diff = sent_len_diffs(X[i]['s1'], X[i]['s2'])
        X_list.append(
            [uni_tk_prec, uni_tk_rec, uni_tk_f1,
             uni_lm_prec, uni_lm_rec, uni_lm_f1,
             bleu_tk_prec, bleu_tk_rec, bleu_tk_f1,
             bleu_lm_prec, bleu_lm_rec, bleu_lm_f1,
             dep_tk_prec, dep_tk_rec, dep_tk_f1,
             dep_lm_prec, dep_lm_rec, dep_lm_f1,
             tree_tk_dist, tree_lm_dist,
             diff, abs_diff]
        )
        Y_list.append(Y[i])
    
    return X_list, Y_list

In [367]:
# %%time
# X_train_fts, Y_train_fts = featurize_set(X_train, Y_train)

CPU times: user 1h 35min 22s, sys: 14.6 s, total: 1h 35min 36s
Wall time: 1h 35min 39s


In [368]:
# %%time
# X_test_fts, Y_test_fts = featurize_set(X_test, Y_test)

CPU times: user 38min 49s, sys: 4.84 s, total: 38min 54s
Wall time: 38min 54s


In [124]:
import cPickle

In [125]:
data_path = "/Users/jacobsw/Desktop/WORK/OJO_CODE/SENTENCE_SIMILARITIES/DATA/"

In [369]:
# SAVED ON THE FIRST VERSION OF FEATURES
# with open(data_path+'train1.p','wb') as f_train:
#     cPickle.dump((X_train_fts,Y_train_fts), f_train)
# with open(data_path+'test1.p','wb') as f_test:
#     cPickle.dump((X_test_fts,Y_test_fts), f_test)

# SAVED ON THE SECOND VERSION OF FEATURES
# with open(data_path+'train2.p','wb') as f_train:
#     cPickle.dump((X_train_fts,Y_train_fts), f_train)
# with open(data_path+'test2.p','wb') as f_test:
#     cPickle.dump((X_test_fts,Y_test_fts), f_test)

In [126]:
# DATA SAVED, ONLY NEED TO LOAD FROM FILE
with open(data_path+'train1.p','rb') as f_train:
    X_train_fts, Y_train_fts = cPickle.load(f_train)
with open(data_path+'test1.p','rb') as f_test:
    X_test_fts, Y_test_fts = cPickle.load(f_test)

### B. Logistic Regression

##### Fit with Defaults

In [127]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [128]:
lr = LogisticRegression()

In [129]:
lr.fit(X_train_fts, Y_train_fts)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [130]:
y_true = Y_test_fts
y_pred = lr.predict(X_test_fts)

In [131]:
print (accuracy_score(y_true,y_pred)*100)

73.1014492754


In [132]:
print classification_report(y_true,y_pred)

             precision    recall  f1-score   support

          0       0.62      0.50      0.56       578
          1       0.77      0.85      0.81      1147

avg / total       0.72      0.73      0.72      1725



In [393]:
lr.predict([X_test_fts[0]])

array([1])

In [394]:
lr.predict_proba([X_test_fts[0]])

array([[ 0.43215239,  0.56784761]])

##### Grid Search

In [404]:
from sklearn.grid_search import GridSearchCV

In [408]:
parameters = {
    'C': (.5,1.,5.,10.,20.),
    'max_iter': (50,100,200),
    'penalty': ('l1','l2')
}

In [410]:
%%time
grd = GridSearchCV(LogisticRegression(),parameters,cv=5)
grd.fit(X_train_fts,Y_train_fts)

CPU times: user 5min 18s, sys: 673 ms, total: 5min 19s
Wall time: 5min 20s


In [415]:
print "Best Score: %.6f%%" % (grd.best_score_*100)
print "Best Params: "
best_params = grd.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print "\t%s: %r" % (param_name, best_params[param_name])

Best Score: 73.527969%
Best Params: 
	C: 10.0
	max_iter: 200
	penalty: 'l2'


##### Fit with Best

In [439]:
lr = LogisticRegression(C=10.,max_iter=200,penalty='l2')
lr.fit(X_train_fts, Y_train_fts)

LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=200, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [440]:
print len(lr.coef_[0]) # 22 params
print lr.coef_[0]

22
[ 0.01036332  0.0406214  -0.04055824 -0.32283921 -0.30543069  0.66289029
  0.65056561 -0.24493119 -0.21115104 -0.15354829 -0.10937086 -0.28815355
 -1.33465664 -0.68327822 -0.93370973  0.29397644  1.0149985   0.68062963
 -0.13821721  0.03671789 -0.06489293  0.01009204]


In [441]:
y_true = Y_test_fts
y_pred = lr.predict(X_test_fts)
print (accuracy_score(y_true,y_pred)*100)
print classification_report(y_true,y_pred)

73.0434782609
             precision    recall  f1-score   support

          0       0.62      0.51      0.56       578
          1       0.77      0.84      0.81      1147

avg / total       0.72      0.73      0.72      1725



### C. SVM

In [469]:
from sklearn import svm

In [474]:
clf = svm.SVC(probability=True)

In [475]:
clf.fit(X_train_fts, Y_train_fts)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [476]:
y_true = Y_test_fts
y_pred = clf.predict(X_test_fts)

In [477]:
print (accuracy_score(y_true,y_pred)*100)

69.1594202899


In [478]:
print classification_report(y_true,y_pred)

             precision    recall  f1-score   support

          0       0.58      0.29      0.39       578
          1       0.71      0.89      0.79      1147

avg / total       0.67      0.69      0.66      1725



In [401]:
clf.predict([X_test_fts[0]])

array([1])

In [402]:
clf.predict_proba([X_test_fts[0]])

array([[ 0.28366846,  0.71633154]])

In [484]:
parameters = {
    'C': (.5,1.,5.,10.,20.),
    'degree': (2,3,4,5),
    'kernel': ('linear','poly','rbf','sigmoid'),
    'shrinking': (True, False)
}

In [None]:
%%time
grd = GridSearchCV(svm.SVC(),parameters,cv=5)
grd.fit(X_train_fts,Y_train_fts)

In [None]:
print "Best Score: %.6f%%" % (grd.best_score_*100)
print "Best Params: "
best_params = grd.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print "\t%s: %r" % (param_name, best_params[param_name])

## V. Evaluation

In [139]:
ojo_sents = u'''
What are the quality of schools in this neighborhood?
What areas have the best schools?
What are the crime statistics in this neighborhood?
What are the number of registered sex offenders in this neighborhood?
What is the walkability score in this neighborhood?
Which neighborhoods have homes that are over 2500 sq ft. 
What neighborhoods have new construction
Show me pictures of the neighborhood
Show me pictures of homes in the neighborhood
How bicycle friendly is this neighborhood?
What is the median income of this neighborhood?
What is the average demographics of this neighborhood? 
What is the poverty score of this neighborhood?
What is the best day of the week to list my home?
What is the best month to list a home like mine for the most money and shortest time?
How much has my home appreciated?
How has appreciation been in my neighborhood vs other neighborhoods?
What has the average appreciation in my neighborhood been over the last x years?
What has the average appreciation in my school district been over the last x years?
What has the average appreciation on my street been over the last x years?
Which neighborhoods are best for kids under 10
Show me the nearest parks
Show me the nearest pools
Show me the nearest dog parks
Show me the nearest urgent care / emergency room?
Show me the nearest fire / police station?
Show me the impact of railroad/trains
How has appreciation been in this neighborhood vs other neighborhoods?
What has the average appreciation in this neighborhood over the last x years?
What has the average appreciation in this school district over the last x years?
Where can I find a house that is a better fit for me for less money?
What is the commute time for this neighborhood?
Which neighborhoods have a commute time of less than 30min from [address]
I want to live in a low traffic spot
Show me diversity of neighborhood
Show me historic natural disaster trends for this area
Show me historic weather trends for this area
Where can I find a house that is a better fit for me for less money?
What confidence level does OJO have that I should list my home now?
What confidence level does OJO have that I should buy a home right now?
How much is my home worth?
What confidence level does OJO have that I should buy a home right now?
Show me district city government information
Which street(s) in this neighborhood have the highest appreciation over x years?
What is the expected appreciation for my home over the next x years?
Which neighborhood in Austin is expected to appreciate the most over the next x years that have homes similar to what I'm interested in?
What areas have mature trees?
What areas have the most greenspace?
What is the expected appreciation for homes in this area over the next x years?
Which neighborhood in Austin is expected to appreciate the most over the next x years that have homes similar to what I'm interested in?
How is this neighborhood impacted by traffic congestion and which time(s) of day?
How fast will my home sell?
Is this a pet friendly neighborhood?
What are the utility costs in this neighborhood?
I want to live in a tidy area
I want a area where the homes are setback from the streeet
Are there complete streets in this neighborhood (connecting sidewalks)?
Green building score?
Air quaiilty of city/neighborhood?
Air quaility of home (VOCs, materials)
Curbside waster services?
Curbside recycling services?
Curbside composting services?
Average heating/cooling costs?
Is sustainiable energy availalbe?
Show me the impact of flight patterns
What are the zoning breakdowns of this neighborhood? (section 8, residtential, mixed used, commercial, etc)?
What is the estimated time to sell my home right now?
How long does it take to sell a home in my neighborhood right now?
How long does it take to sell a home on my street right now?
'''

In [140]:
def drop_question_mark(s):
    return s[:-1] if s.endswith('?') else s

In [141]:
ojo_sents = ojo_sents.split('\n') # split into list of sent strings.
ojo_sents = ojo_sents[1:len(ojo_sents)-1] # get rid of ''s in front and end.
ojo_sents = list({drop_question_mark(sent) for sent in ojo_sents}) # get rid of question mark and duplicates.
# ojo_sents = [sent.split() for sent in ojo_sents]

In [142]:
ojo_sents[0]

u'Show me the impact of flight patterns'

In [143]:
q1 = u'are the schools in the neighborhood good'

In [144]:
from operator import itemgetter
from heapq import nlargest

In [145]:
def top_k_sim(q, model, k=5):
    # q: query sentence, a string
    # model: classifier
   
    preds = []
    X = [featurize_new(q, sent) for sent in ojo_sents]
    probabilities = model.predict_proba(X)[:,1]
    indices = probabilities.argsort()[::-1][:5]
    print "Predictions: "
    print
    for j,i in enumerate(indices):
        
        prob = probabilities[i]
        print '#%d Prediction: %s (prob=%.2f%%)' % (j,ojo_sents[i],prob)
        

In [146]:
%%time
top_k_sim(q1, lr)

Predictions: 

#0 Prediction: What are the zoning breakdowns of this neighborhood? (section 8, residtential, mixed used, commercial, etc) (prob=0.74%)
#1 Prediction: What are the crime statistics in this neighborhood (prob=0.58%)
#2 Prediction: What are the utility costs in this neighborhood (prob=0.58%)
#3 Prediction: What is the walkability score in this neighborhood (prob=0.57%)
#4 Prediction: What are the number of registered sex offenders in this neighborhood (prob=0.55%)
CPU times: user 385 ms, sys: 3.75 ms, total: 388 ms
Wall time: 387 ms
