# Sentence Similarity Measures III: Wide-Inclusive Sentence Featurization

## 0. Contents

* I. Corpora:
    * MSR Paraphrase Corpus
    * Brown
* II. Discriminativity Weighting (Brown, SpaCy lemmatization)
* III. Featurization:
    * Features:
        * Unigram Prec/Rec (Wan et al. 2006) 
        * Bleu Prec/Rec (Papineni et al. 2002)
        * Dependency Prec/Rec (Wan et al. 2006; Moll$\acute{a}$ 2003; Hovy et al. 2015)
        * F1 for Unigram, Bleu & Dependency
        * Tree Edit Distance (Zhang & Sasha Algorithm)
        * Sentence Lengths (Wan et al. 2006)
    * Featurization Function
* IV. Paraphrase Classifier:
    * Training: MSR Paraphrase Corpus
    * Classifier Types:
        * Logistic
        * SVM
* V. Evaluation

## I. Corpora

In [237]:
import numpy as np
import pandas as pd
from nltk.corpus import brown
from spacy.en import English
from collections import defaultdict

In [16]:
parser = English()

##### Load MSR

In [229]:
train_path = "/Users/jacobsw/Desktop/WORK/OJO_CODE/SENTENCE_SIMILARITIES/CORPORA/paraphrase/msr_paraphrase_train.txt"
test_path = "/Users/jacobsw/Desktop/WORK/OJO_CODE/SENTENCE_SIMILARITIES/CORPORA/paraphrase/msr_paraphrase_test.txt"

In [234]:
df_train = pd.read_csv(train_path, delimiter='\t')
df_test = pd.read_csv(test_path, delimiter='\t')
df_train.head()

Unnamed: 0,﻿Quality,#1 ID,#2 ID,#1 String,#2 String
0,1,702876,702977,"Amrozi accused his brother, whom he called the...","Referring to him as only the witness, Amrozi a..."
1,0,2108705,2108831,Yucaipa owned Dominick's before selling the ch...,Yucaipa bought Dominick's in 1995 for $693 mil...
2,1,1330381,1330521,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an..."
3,0,3344667,3344648,"Around 0335 GMT, Tab shares were up 19 cents, ...","Tab shares jumped 20 cents, or 4.6%, to set a ..."
4,1,1236820,1236712,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...


In [241]:
print df_train.shape
print df_test.shape

(4076, 5)
(1725, 5)


In [268]:
df_train.ix[0] # NB: index Quality is actually weirdly '﻿Quality'.

﻿Quality                                                     1
#1 ID                                                   702876
#2 ID                                                   702977
#1 String    Amrozi accused his brother, whom he called the...
#2 String    Referring to him as only the witness, Amrozi a...
Name: 0, dtype: object

In [252]:
df_train.ix[0]['#1 String']

'Amrozi accused his brother, whom he called the witness, of deliberately distorting his evidence.'

##### Make Train/Test

In [614]:
from zss import Node # for tree edit distance later.

In [562]:
dep_lemmas = lambda parsed_s: {(token.head.lemma_,token.lemma_) for token in parsed_s
                              if token.head.lemma_!=token.lemma_} # eliminte (v, ROOT, v) cases
dep_tokens = lambda parsed_s: {(token.head.orth_,token.orth_) for token in parsed_s
                              if token.head.lemma_!=token.lemma_} 

In [617]:
get_root_lm = lambda parsed_s: Node([token.lemma_ for token in parsed_s if token.dep_=='ROOT'][0])
get_root_tk = lambda parsed_s: Node([token.orth_ for token in parsed_s if token.dep_=='ROOT'][0])

In [563]:
X_train, X_test = defaultdict(lambda x: defaultdict(list)), defaultdict(lambda x: defaultdict(list))
Y_train, Y_test = defaultdict(int), defaultdict(int)

In [618]:
def parse_msr(df, indexer):
    
    X_dic, Y_dic = defaultdict(lambda x: defaultdict(list)), \
                   defaultdict(lambda x: defaultdict(list))
    
    for i in indexer:
        entry_dic = defaultdict(list)
        s1, s2 = unicode(df.ix[i]['#1 String'].decode('utf-8','ignore')), \
                 unicode(df.ix[i]['#2 String'].decode('utf-8','ignore'))
        parsed_s1, parsed_s2 = parser(s1), parser(s2)
        entry_dic['s1'] = [token.orth_ for token in parsed_s1]
        entry_dic['s2'] = [token.orth_ for token in parsed_s2]
        entry_dic['s1_lm'] = [token.lemma_ for token in parsed_s1]
        entry_dic['s2_lm'] = [token.lemma_ for token in parsed_s2]
        entry_dic['s1_dep_lm'] = dep_lemmas(parsed_s1) # for dep lemma features.
        entry_dic['s2_dep_lm'] = dep_lemmas(parsed_s2)
        entry_dic['s1_dep_tk'] = dep_tokens(parsed_s1) # for dep token features.
        entry_dic['s2_dep_tk'] = dep_tokens(parsed_s2)  
        entry_dic['s1_root_lm'] = get_root_lm(parsed_s1)
        entry_dic['s2_root_lm'] = get_root_lm(parsed_s2)
        entry_dic['s1_root_tk'] = get_root_tk(parsed_s1)
        entry_dic['s2_root_tk'] = get_root_tk(parsed_s2)
        entry_dic['s1_id'] = df.ix[i]['#1 ID'] # for error analysis later.
        entry_dic['s2_id'] = df.ix[i]['#2 ID']
        X_dic[i] = entry_dic
        Y_dic[i] = df.ix[i]['﻿Quality']
    
    return X_dic, Y_dic


In [619]:
%%time
X_train, Y_train = parse_msr(df_train, df_train.index)

CPU times: user 18.7 s, sys: 204 ms, total: 18.9 s
Wall time: 19 s


In [620]:
%%time
X_test, Y_test = parse_msr(df_test, df_test.index)

CPU times: user 7.61 s, sys: 21.9 ms, total: 7.64 s
Wall time: 7.65 s


In [621]:
print 'sentence 1: ', X_train[0]['s1']; print
print 'sentence 2: ', X_train[0]['s2']; print
print 'paraphrase label: ', Y_train[0]

sentence 1:  [u'Amrozi', u'accused', u'his', u'brother', u',', u'whom', u'he', u'called', u'the', u'witness', u',', u'of', u'deliberately', u'distorting', u'his', u'evidence', u'.']

sentence 2:  [u'Referring', u'to', u'him', u'as', u'only', u'the', u'witness', u',', u'Amrozi', u'accused', u'his', u'brother', u'of', u'deliberately', u'distorting', u'his', u'evidence', u'.']

paraphrase label:  1


##### Load Brown

In [3]:
def parse_brown():
    
    sents = brown.sents()
    parsed_sents = [parser(' '.join(sent)) for sent in sents]
    lemma_words = [token.lemma_ for parsed_sent in parsed_sents for token in parsed_sent]
    
    return lemma_words

In [4]:
%%time
brown_words = parse_brown()

CPU times: user 1min 38s, sys: 770 ms, total: 1min 39s
Wall time: 1min 39s


In [5]:
N = len(brown_words)
N

1188973

## I. Discriminativity Weighting (IDF)

**Math**

* $IDF(w) = log\frac{N}{df_w}$, where $N$ is the number of words in a corpus; $df_w$ is word $w$'s frequency in the corpus.

In [496]:
from __future__ import division

In [497]:
log = lambda x: np.log(x) if x>0 else 0 
    # intuitively N > word_count(w) for any w,
    #  therefore we cannot let idf(w) be negative
    #  even when word_count(w) = 0 for w.
div = lambda x,y: x/y if y!=0 else 0

In [498]:
def idf(w):
    
    return log(div(N,brown_words.count(w)))

In [499]:
print "'the': ", idf('the')
print "'discriminate': ", idf('discriminate')

'the':  2.83230709
'discriminate':  12.0426903182


## IIa. Features

### A.  Unigram Prec/Rec

**Math**

* $Uni\_Prec(s_1,s_2) = \frac{word\_overlap(s_1,s_2)\cdot \left(\sum_{w\in s_1\cap s_2}log\frac{N}{df_w}\right)}{word\_count(s_1)}$ (cf. Wan et al. 2006:133, weighted by $IDF$)


* $Uni\_Rec(s_1,s_2) = \frac{word\_overlap(s_1,s_2)\cdot \left(\sum_{w\in s_1\cap s_2}log\frac{N}{df_w}\right)}{word\_count(s_2)}$ (cf. ibid.)

In [500]:
intersection = lambda s1,s2: set(s1).intersection(set(s2))
word_overlap = lambda s1,s2: len(intersection(s1,s2))
lemmatize = lambda s: [token.lemma_ for token in parser(' '.join(s))]

In [501]:
def uni_prec(s1, s2): # s1,s2 assumed to be lists of words (lemmas or tokens)

    return div(word_overlap(s1,s2) * \
               sum(idf(w) for w in intersection(s1,s2)),
               len(s1))


In [504]:
def uni_rec(s1, s2):   
    
    return div(word_overlap(s1,s2) * \
               sum(idf(w) for w in intersection(s1,s2)),
               len(s2))


In [569]:
s0 = X_train[0]['s1']
s1 = X_train[0]['s2'] # known to be the paraphrase of q
s2 = X_train[1]['s1'] # known to not be the paraphrase of q
s0_lm = X_train[0]['s1_lm']
s1_lm = X_train[0]['s2_lm']
s2_lm = X_train[1]['s1_lm']

In [570]:
%%time
print uni_prec(s0,s1)
print uni_prec(s0,s2)
print uni_prec(s0_lm,s1_lm)
print uni_prec(s0_lm,s2_lm)

39.3330201597
0.706130559219
54.5317127116
0.706130559219
CPU times: user 799 ms, sys: 2.96 ms, total: 802 ms
Wall time: 804 ms


In [571]:
%%time
print uni_rec(s0,s1)
print uni_rec(s0,s2)
print uni_rec(s0_lm,s1_lm)
print uni_rec(s0_lm,s2_lm)

37.1478523731
0.706130559219
51.5021731165
0.706130559219
CPU times: user 801 ms, sys: 3.91 ms, total: 805 ms
Wall time: 808 ms


### B. BLEU Prec/Rec

**NB** (cf. Wan et al. 2006:133)

* "... Bleu metric uses the geometric average of unigram, bigram and trigram precision scores."
* "... by reversing [two sentences], ... a recall version of Bleu is obtained."

In [572]:
from nltk import bleu

In [573]:
def bleu_prec(s1, s2, lemmatized=False): # s1 as the 'hypothesis'

    return bleu(s2,s1)

In [574]:
def bleu_rec(s1, s2, lemmatized=False): # s2 as the 'hypothesis' 
    
    return bleu(s1,s2)

In [575]:
%%time
print bleu_prec(s0,s1)
print bleu_prec(s0,s2)
print bleu_prec(s0_lm,s1_lm)
print bleu_prec(s0_lm,s2_lm)

0.585659602743
0.492479060505
0.585659602743
0.492479060505
CPU times: user 8.07 ms, sys: 2.98 ms, total: 11 ms
Wall time: 8.78 ms


In [576]:
%%time
print bleu_rec(s0,s1)
print bleu_rec(s0,s2)
print bleu_rec(s0_lm,s1_lm)
print bleu_rec(s0_lm,s2_lm)

0.57735026919
0.492479060505
0.57735026919
0.492479060505
CPU times: user 7.5 ms, sys: 3.31 ms, total: 10.8 ms
Wall time: 8.4 ms


### C. Dependency Prec/Rec

**Math**

* $Dep\_Prec(s_1,s_2) = \frac{|dep\_pair(s_1)|\cap|dep\_pair(s_2)|}{|dep\_pair(s_1)|}$ (cf. Wan et al. 2006:134)


* $Dep\_Rec(s_1,s_2) = \frac{|dep\_pair(s_1)|\cap|dep\_pair(s_2)|}{|dep\_pair(s_2)|}$ (cf. ibid.)

**NB**: $relation$ in the reference confuses *dependency pair* with *dependency relation*. $relation$ refers to "... a pair of words in a parent-child relationship within the dependency tree, referred to as head-modifier relationship. ... we ignore the label of the relationships which indicates the semantic role".

In [577]:
def dep_prec(dep_pairs_s1, dep_pairs_s2):
    
    return div(len(dep_pairs_s1.intersection(dep_pairs_s2)),
               len(dep_pairs_s1))
        

In [578]:
def dep_rec(dep_pairs_s1, dep_pairs_s2):
    
    return div(len(dep_pairs_s1.intersection(dep_pairs_s2)),
               len(dep_pairs_s2))


In [579]:
s0_dep_tk = X_train[0]['s1_dep_tk']
s1_dep_tk = X_train[0]['s2_dep_tk']
s2_dep_tk = X_train[1]['s1_dep_tk']
s0_dep_lm = X_train[0]['s1_dep_lm']
s1_dep_lm = X_train[0]['s2_dep_lm']
s2_dep_lm = X_train[1]['s1_dep_lm']

In [580]:
%%time
print dep_prec(s0_dep_tk,s1_dep_tk)
print dep_prec(s0_dep_tk,s2_dep_tk)
print dep_prec(s0_dep_lm,s1_dep_lm)
print dep_prec(s0_dep_lm,s2_dep_lm)

0.6
0.0
0.6
0.0
CPU times: user 290 µs, sys: 344 µs, total: 634 µs
Wall time: 294 µs


In [581]:
%%time
print dep_prec(s0_dep_tk,s1_dep_tk)
print dep_prec(s0_dep_tk,s2_dep_tk)
print dep_prec(s0_dep_lm,s1_dep_lm)
print dep_prec(s0_dep_lm,s2_dep_lm)

0.6
0.0
0.6
0.0
CPU times: user 201 µs, sys: 66 µs, total: 267 µs
Wall time: 211 µs


### D. F1

**Math**

* $F1 = 2\cdot\frac{prec\cdot rec}{prec + rec}$ (cf. https://en.wikipedia.org/wiki/F1_score)

In [582]:
def f1_unigram(s1, s2):
    
    prec, rec = uni_prec(s1,s2), uni_rec(s1,s2)
    
    return prec, rec, 2*div(prec*rec,prec+rec) # so later we only do uni_prec/rec once!

In [583]:
def f1_bleu(s1, s2):
    
    prec, rec = bleu_prec(s1,s2), bleu_rec(s1,s2)
    
    return prec, rec, 2*div(prec*rec,prec+rec)

In [584]:
def f1_dep(dep_pairs_s1, dep_pairs_s2):
   
    prec, rec = dep_prec(dep_pairs_s1,dep_pairs_s2), \
                dep_rec(dep_pairs_s1,dep_pairs_s2)
    
    return prec, rec, 2*div(prec*rec,prec+rec)

In [585]:
%%time
print f1_unigram(s0,s1)[2]
print f1_unigram(s0,s2)[2]
print f1_unigram(s0_lm,s1_lm)[2]
print f1_unigram(s0_lm,s2_lm)[2]

38.2092195838
0.706130559219
52.973663777
0.706130559219
CPU times: user 1.64 s, sys: 9.47 ms, total: 1.65 s
Wall time: 1.66 s


In [586]:
%%time
print f1_bleu(s0,s1)[2]
print f1_bleu(s0,s2)[2]
print f1_bleu(s0_lm,s1_lm)[2]
print f1_bleu(s0_lm,s2_lm)[2]

0.581475252201
0.492479060505
0.581475252201
0.492479060505
CPU times: user 14.2 ms, sys: 3.73 ms, total: 17.9 ms
Wall time: 15 ms


In [587]:
%%time
print f1_dep(s0_dep_tk,s1_dep_tk)[2]
print f1_dep(s0_dep_tk,s2_dep_tk)[2]
print f1_dep(s0_dep_lm,s1_dep_lm)[2]
print f1_dep(s0_dep_lm,s2_dep_lm)[2]

0.5625
0
0.5625
0
CPU times: user 207 µs, sys: 165 µs, total: 372 µs
Wall time: 215 µs


### E. Tree Edit Distance

In [626]:
from zss import simple_distance
    # use zss.distance if dynamic tree modification is needed. 
    #  cf. zss api: pythonhosted.org/zss/.

In [631]:
def make_zss_tree(node, dep_pairs):
    
    for dep_pair in dep_pairs:
        if node.label==dep_pair[0]:
            kid = make_zss_tree(Node(dep_pair[1]), dep_pairs)
            node.addkid(kid)
    
    return node

In [632]:
def tree_edit_dist(root_s1, root_s2, dep_pairs_s1, dep_pairs_s2):
   
    tree_s1, tree_s2 = make_zss_tree(root_s1,dep_pairs_s1), \
                       make_zss_tree(root_s2,dep_pairs_s2)
    
    return simple_distance(tree_s1, tree_s2)
    

In [633]:
s0_root_lm = X_train[0]['s1_root_lm']
s1_root_lm = X_train[0]['s2_root_lm']
s2_root_lm = X_train[1]['s1_root_lm']
s0_root_tk = X_train[0]['s1_root_tk']
s1_root_tk = X_train[0]['s2_root_tk']
s2_root_tk = X_train[1]['s1_root_tk']

In [634]:
%%time
print tree_edit_dist(s0_root_tk,s1_root_tk,s0_dep_tk,s1_dep_tk)
print tree_edit_dist(s0_root_tk,s2_root_tk,s0_dep_tk,s2_dep_tk)

57
96
CPU times: user 259 ms, sys: 4.29 ms, total: 264 ms
Wall time: 262 ms


In [635]:
%%time
print tree_edit_dist(s0_root_lm,s1_root_lm,s0_dep_lm,s1_dep_lm)
print tree_edit_dist(s0_root_lm,s2_root_lm,s0_dep_lm,s2_dep_lm)

17
33
CPU times: user 24.5 ms, sys: 2.84 ms, total: 27.4 ms
Wall time: 25.2 ms


##### Step-by-Step Walkthrough of Tree Building

**a. Parse Sample Sents**

In [374]:
sample1 = u'i ate a big mac'
sample2 = u'i ate a small mac'
parsed_sample1 = parser(sample1)
parsed_sample2 = parser(sample2)
print dep_pairs(parsed_sample1)
print dep_pairs(parsed_sample2)

set([(u'mac', u'a'), (u'eat', u'mac'), (u'mac', u'big'), (u'eat', u'i')])
set([(u'mac', u'a'), (u'eat', u'mac'), (u'eat', u'i'), (u'mac', u'small')])


In [375]:
for token in parsed_sample1:
    print token.head.lemma_,token.dep_,token.lemma_

eat nsubj i
eat ROOT eat
mac det a
mac amod big
eat dobj mac


**b. Make Tree & Check for Samples**

In [376]:
test1 = make_zss_tree(Node('eat'),dep_pairs(parsed_sample1))

In [377]:
print test1

2:eat
2:mac
0:a
0:big
0:i


In [378]:
print test1.get_children(test1)
mac, i = test.get_children(test1)

[<zss.simple_tree.Node object at 0x116adf4d0 mac>, <zss.simple_tree.Node object at 0x18287b950 i>]


In [379]:
print mac.get_children(mac)
a, big = mac.get_children(mac)

[<zss.simple_tree.Node object at 0x11a661f10 a>, <zss.simple_tree.Node object at 0x18287b990 big>]


In [380]:
print i.get_children(i)

[]


In [381]:
print a.get_children(a)
print big.get_children(big)

[]
[]


In [382]:
test2 = make_zss_tree(Node('eat'),dep_pairs(parsed_sample2))

In [383]:
print test2

2:eat
2:mac
0:a
0:small
0:i


In [384]:
print test2.get_children(test2)
mac, i = test.get_children(test2)

[<zss.simple_tree.Node object at 0x11a661c90 mac>, <zss.simple_tree.Node object at 0x18287bc90 i>]


In [385]:
print mac.get_children(mac)
a, small = mac.get_children(mac)

[<zss.simple_tree.Node object at 0x11a661c50 a>, <zss.simple_tree.Node object at 0x11a661c10 small>]


In [386]:
print i.get_children(i)

[]


In [387]:
print a.get_children(a)
print small.get_children(small)

[]
[]


**c. Compute Edit Distance**

In [388]:
simple_distance(test1,test2)

1

### F. Sentence Lengths

* "... the difference in length of two sentences ... measured in words by subtracting one length from the other." (cf. Wan et al. 2006:134)
* "... this difference could be a negative or positive integer ... an absolute variant was used." (cf. ibid.)

In [389]:
def sent_len_diffs(s1, s2):
    
    diff = len(s1)-len(s2)
    
    return [diff, abs(diff)]

In [390]:
print sent_len_diffs(s0,s1)
print sent_len_diffs(s0,s2)

[-1, 1]
[0, 0]


## IIb: Featurization Function

**Features (22 in total)**:

* Unigram Prec/Rec + lemmatized variant: 4
* Bleu Prec/Rec + lemmatized variant: 4
* Dependency Prec/Rec + lemmatized variant: 4
* F1 Unigram, Bleu, Dependency + lemmatized variant: 6
* Tree Edit Distance + lemmatized variant: 2
* Sentence Lengths: 2

In [408]:
import numpy as np

In [409]:
def featurize(s1, s2): # s is a list of words.
    
    unigram_fts = [uni_prec(s1,s2),uni_rec(s1,s2),
                   uni_prec(s1,s2,lemmatized=True),uni_rec(s1,s2,lemmatized=True)]
    bleu_fts = [bleu_prec(s1,s2),bleu_rec(s1,s2),
                bleu_prec(s1,s2,lemmatized=True),bleu_rec(s1,s2,lemmatized=True)]
    dep_fts = [dep_prec(s1,s2),dep_rec(s1,s2),
               dep_prec(s1,s2,lemmatized=True),dep_rec(s1,s2,lemmatized=True)]
    f1_fts = [f1_unigram(s1,s2),f1_bleu(s1,s2),f1_dep(s1,s2),
              f1_unigram(s1,s2,lemmatized=True),f1_bleu(s1,s2,lemmatized=True),f1_dep(s1,s2,lemmatized=True)]
    tree_fts = [tree_edit_dist(s1,s2),
                tree_edit_dist(s1,s2,lemmatized=True)]
    len_fts = sent_len_diffs(s1,s2)
                   
    return np.asarray(unigram_fts+bleu_fts+dep_fts+f1_fts+tree_fts+len_fts)
                   

In [411]:
%%time
print featurize(s0,s1)

[ 39.33302016  37.14785237  54.53171271  51.50217312   0.5856596
   0.57735027   0.5856596    0.57735027   0.6          0.52941176   0.5625
   0.52941176  38.20921958   0.58147525   0.5625      52.97366378
   0.58147525   0.54545455  17.          10.          -1.           1.        ]
CPU times: user 2.9 s, sys: 12.9 ms, total: 2.92 s
Wall time: 2.93 s


## IV. Paraphrase Classifier

### A. Featurizing Training/Test from MSR

* "... the training set contains 2753 true paraphrase pairs and 1323 false paraphrase pairs; ... the test set contains 1147 and 578 pairs, respectively." (cf. Ji & Eisenstein 2013:893)

In [413]:
print X_train[0]

defaultdict(<type 'list'>, {'s2_id': 702977, 's2': [u'Referring', u'to', u'him', u'as', u'only', u'the', u'witness', u',', u'Amrozi', u'accused', u'his', u'brother', u'of', u'deliberately', u'distorting', u'his', u'evidence', u'.'], 's1': [u'Amrozi', u'accused', u'his', u'brother', u',', u'whom', u'he', u'called', u'the', u'witness', u',', u'of', u'deliberately', u'distorting', u'his', u'evidence', u'.'], 's1_id': 702876})


In [470]:
def featurize_set(X, Y):
    
    X_list, Y_list = [], []
    for i in xrange(len(X)):
        try:
            X_list.append(featurize(X[i]['s1'],X[i]['s2']))
            Y_list.append(Y[i])
        except Exception:
            print Exception
            print 'idx: ', i
    
    return X_list, Y_list

In [465]:
%%time
X_train_fts, Y_train_fts = featurize_set(X_train, Y_train)

<type 'exceptions.Exception'>
idx:  24


KeyboardInterrupt: 

In [None]:
%%time
X_test_fts, Y_test_fts = featurize_set(X_test, Y_test)

In [None]:
# TODO: long ass sentence issue

In [466]:
X_train[24]

defaultdict(list,
            {'s1': [u'A',
              u'BMI',
              u'of',
              u'25',
              u'or',
              u'above',
              u'is',
              u'considered',
              u'overweight',
              u';',
              u'30',
              u'or',
              u'above',
              u'is',
              u'considered',
              u'obese',
              u'.'],
             's1_id': 1713015,
             's2': [u'A',
              u'BMI',
              u'between',
              u'18.5',
              u'and',
              u'24.9',
              u'is',
              u'considered',
              u'normal',
              u',',
              u'over',
              u'25',
              u'is',
              u'considered',
              u'overweight',
              u'and',
              u'30',
              u'or',
              u'greater',
              u'is',
              u'defined',
              u'as',
              u'obese',
              

In [467]:
s241 = X_train[24]['s1']
s242 = X_train[24]['s2']

In [468]:
tree_edit_dist(s241,s242)

21

In [None]:
import cPickle

In [None]:
data_path = "/Users/jacobsw/Desktop/WORK/OJO_CODE/SENTENCE_SIMILARITIES/DATA/"

In [None]:
with open(data_path+'train.p','wb') as f_train:
    cPickle.dump((X_train_fts,Y_train_fts), f_train)
with open(data_path+'test.p','wb') as f_test:
    cPickle.dump((X_test_fts,Y_test_fts), f_test)

In [437]:
t_X, t_Y = [X_train[i] for i in range(5)], [Y_train[i] for i in range(5)]

In [440]:
%%time
t_Xlist, t_Ylist = featurize_set(t_X,t_Y)

CPU times: user 15.1 s, sys: 37.6 ms, total: 15.1 s
Wall time: 15.2 s


In [441]:
len(X_train)

4076

In [442]:
4076/5

815.2

In [443]:
815.2*15.2

12391.04

In [444]:
12391.04/60

206.51733333333334

In [446]:
206.51733333333334/60 # time needed to featurize the entrie training

3.441955555555556

In [447]:
len(X_test)

1725

In [449]:
((1725/5)*15.2)/3600

1.4566666666666668

In [450]:
3.441955555555556 + 1.4566666666666668 # expected time for featurizing entire corpus

4.898622222222222

In [423]:
tx1 = featurize(s0,s1)

In [424]:
tx1

array([ 39.33302016,  37.14785237,  54.53171271,  51.50217312,
         0.5856596 ,   0.57735027,   0.5856596 ,   0.57735027,
         0.6       ,   0.52941176,   0.5625    ,   0.52941176,
        38.20921958,   0.58147525,   0.5625    ,  52.97366378,
         0.58147525,   0.54545455,  17.        ,  10.        ,
        -1.        ,   1.        ])

In [425]:
ty1 = 1

In [426]:
tx2 = featurize(s0,s2)
ty2 = 0

In [417]:
from sklearn.linear_model import LogisticRegression

In [418]:
lr = LogisticRegression()

In [427]:
lr.fit([tx1,tx2],[ty1,ty2])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### B. Logistic Regression

### C. SVM