In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot
from nltk.corpus import stopwords
from collections import Counter
import sys  

reload(sys)  
sys.setdefaultencoding('utf8')

%matplotlib inline
import re

In [2]:
#CoreNLP web server : java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://localhost:9000')
properties={
  'annotators': 'tokenize,ssplit,pos,depparse,parse,mention,openie,ner',
  'outputFormat': 'json'}

In [3]:
def ngrams(input, n):
    input = input.split(' ')
    output = []
    for i in range(len(input)-n+1):
        output.append(input[i:i+n])
    return [' '.join(x) for x in output]

In [4]:
def get_qn_word(sentence):
    qn_words = []
    found = re.match( r'(.*)?(what|where|why|how|when|which)(.*)?', sentence)
    if found :
        if found.group(2) <> None:
            qn_words.append(found.group(2))
    return qn_words

In [5]:
def get_root(dep_tree):
    for dep in dep_tree:
        if dep['dep'] == 'ROOT':
            return str(dep['dependentGloss']).lower()

def get_subj(dep_tree):
    for dep in dep_tree:
        if dep['dep'] == 'nsubj':
            return str(dep['dependentGloss']).lower()

def get_obj(dep_tree):
    for dep in dep_tree:
        if dep['dep'] == 'dobj':
            return str(dep['dependentGloss']).lower()

In [6]:
def get_openie_features(sentence):
    subject_list = []
    object_list = []
    rel_list = []
    if 'openie' in sentence:
        for rel_map in sentence['openie']:
            subject_list.append(rel_map['subject'])
            object_list.append(rel_map['object'])
            rel_list.append(rel_map['relation'])
    return (set(object_list), set(rel_list), set(subject_list))

In [7]:
def nlp_features(row):
    openie_feature_req = False
    question1 = nlp.annotate(str(row['question1']), properties)
    question2 = nlp.annotate(str(row['question2']), properties)
    return_obj = []
    qn1_ner_list = []
    qn2_ner_list = []
    qn1_lemma_list = []
    qn2_lemma_list = []
    if type(question1) == dict and type(question1) == dict:
        if 'sentences' in question1 and 'sentences' in question2 and len(question1['sentences']) <> 0:
            for token in question1['sentences'][0]['tokens']:
                qn1_lemma_list.append(token['lemma'].lower())
                if token['ner'] <> 'O':
                    qn1_ner_list.append(token['originalText'].lower())
            for token in question1['sentences'][0]['tokens']:
                qn2_lemma_list.append(token['lemma'].lower())
                if token['ner'] <> 'O':
                    qn2_ner_list.append(token['originalText'].lower())
            
            #ner_overlap        
            return_obj.append(len(set(qn1_ner_list).intersection(set(qn2_ner_list))))
    
            #lemma_overlap
            return_obj.append(len(set(qn1_lemma_list).intersection(set(qn2_lemma_list)))/float(len(qn1_lemma_list) + len(qn2_lemma_list)))
    
            #ner_q1_count
            return_obj.append(len(qn1_ner_list))
    
            #ner_q2_count
            return_obj.append(len(qn2_ner_list))
    
            #qn_word_overlap
            return_obj.append(len(set(get_qn_word(str(row['question1']).lower())).intersection(set(get_qn_word(str(row['question2']).lower())))))
        
            #bigram_overlap
            return_obj.append(len(set(ngrams(str(row['question1']).lower(),2)).intersection(set(ngrams(str(row['question2']).lower(),2)))))
        
            #trigram_overlap
            return_obj.append(len(set(ngrams(str(row['question1']).lower(),3)).intersection(set(ngrams(str(row['question2']).lower(),3)))))
        
            #root_match
            return_obj.append(1 if get_root(question1['sentences'][0]['enhancedPlusPlusDependencies']) == get_root(question2['sentences'][0]['enhancedPlusPlusDependencies']) else 0)
        
            #subj_match
            return_obj.append(1 if get_subj(question1['sentences'][0]['enhancedPlusPlusDependencies']) == get_subj(question2['sentences'][0]['enhancedPlusPlusDependencies']) else 0)
        
            #dobj_match
            return_obj.append(1 if get_obj(question1['sentences'][0]['enhancedPlusPlusDependencies']) == get_obj(question2['sentences'][0]['enhancedPlusPlusDependencies']) else 0)
        
            if openie_feature_req:
                #openie features
                if 'openie' in question1['sentences'][0] and 'openie' in question2['sentences'][0]: 
                    qn1_openie_features = get_openie_features(question1['sentences'][0])
                    qn2_openie_features = get_openie_features(question2['sentences'][0])
        
                    if not qn1_openie_features and not qn2_openie_features:
                        #subject_overlap
                        return_obj.append(len(set(qn1_openie_features[2]).intersection(set(qn2_openie_features[2]))))
                        #relation_overlap
                        return_obj.append(len(set(qn1_openie_features[1]).intersection(set(qn2_openie_features[1]))))
                        #object_overlap
                        return_obj.append(len(set(qn1_openie_features[0]).intersection(set(qn2_openie_features[0]))))
                    
                else:
                    return_obj.append(0)
                    return_obj.append(0)
                    return_obj.append(0)
        else:
            if openie_feature_req:
                return_obj = [0,0.0,0,0,0,0,0,0,0,0,0,0,0]
            else :
                return_obj = [0,0.0,0,0,0,0,0,0,0,0]
    else:
        if openie_feature_req:
            return_obj = [0,0.0,0,0,0,0,0,0,0,0,0,0,0]
        else :
            return_obj = [0,0.0,0,0,0,0,0,0,0,0]        
    return return_obj

In [8]:
data_train = pd.read_csv('data/train.csv')
data_test = pd.read_csv('data/test.csv')

In [9]:
nlp_features_train = data_train.apply(nlp_features, axis=1, raw=True)
nlp_features_train_df = pd.DataFrame(list(nlp_features_train), columns=['ner_overlap','lemma_overlap', 
                                                                           'ner_q1_count','ner_q2_count','qn_word_overlap',
                                                                        'bigram_overlap', 'trigram_overlap',
                                                                       'root_match', 'subj_match', 'dobj_match'])

In [10]:
nlp_features_train_df.to_csv('data/nlp_features_train.csv', sep=',')

In [9]:
nlp_features_test = data_test.apply(nlp_features, axis=1, raw=True)
nlp_features_test_df = pd.DataFrame(list(nlp_features_test), columns=['ner_overlap','lemma_overlap', 
                                                                           'ner_q1_count','ner_q2_count','qn_word_overlap',
                                                                        'bigram_overlap', 'trigram_overlap',
                                                                       'root_match', 'subj_match', 'dobj_match'])
nlp_features_test_df.to_csv('data/nlp_features_test.csv', sep=',')

TypeError: ('string indices must be integers', u'occurred at index 124593')

In [None]:
#124593
data_train[0:25].apply(nlp_features, axis=1, raw=True)

In [141]:
data_train[5:10]['is_duplicate']

5    1
6    0
7    1
8    0
9    0
Name: is_duplicate, dtype: int64