In [1]:
%matplotlib inline

In [136]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [3]:
# Read the train and test CSV files from the local data/ directory.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [4]:
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [19]:
# Carve a local dev split out of the labelled data: 31% of rows go to dev_test_df.
# random_state=0 pins the shuffle so the split is the same on every run.
dev_train_df, dev_test_df = train_test_split(train_df, test_size=0.31, random_state=0)

In [8]:
# Mean of the is_duplicate column; the printed value is a fraction (0-1), not a percentage.
print "Duplicate percentage:", np.average(train_df['is_duplicate'])

Duplicate percentage: 0.369197853026


In [14]:
# Pool the ids from both question columns into one Series (qid1 rows first, then qid2).
qids = pd.concat([train_df['qid1'], train_df['qid2']], ignore_index=True)

In [38]:
# Pool the text of both question columns into one Series of strings
# (question1 rows first, then question2); astype(str) also stringifies missing values.
train_qs = pd.concat([train_df['question1'], train_df['question2']], ignore_index=True).astype(str)

In [21]:
# Number of distinct question ids across qid1 and qid2.
print "Total question count:", len(qids.unique())
# Number of question ids that occur more than once in the pooled id list.
print "Duplicate question count:", sum(qids.value_counts() > 1)

 Total question count: 537933
Duplicate question count: 111780


In [39]:
# Share of pooled questions whose text contains the literal '[math]' tag.
print "Fraction contains math:", np.average(train_qs.apply(lambda q: '[math]' in q))

Fraction contains math: 0.00117366247001


## Baseline

In [20]:
# Constant baseline: predict the dev-train duplicate rate for every dev-test row.
# (The original referenced `dev_train_tf`, which is never defined; the split cell
# creates `dev_train_df`.)
baseline_prob = np.average(dev_train_df['is_duplicate'])
log_loss(dev_test_df['is_duplicate'], [baseline_prob] * dev_test_df.shape[0])

0.65923423793715774

## Preprocessing

In [402]:
# NLTK components shared by tokenize().
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()
tokenizer = TreebankWordTokenizer()

# Precompiled patterns used by the text-cleaning helpers.
# One [math]...[/math] span, matched non-greedily.
MATH_RE = re.compile(r'\[math\].*?\[\/math\]')
# A run of digits followed by a comma, only when "000" comes right after the comma.
DIGIT_COMMA_RE = re.compile(r'(\d+),(?=000)')
# Letters (group 1), any number of '.'/'-' characters, then digits (group 2).
DIGIT_LETTER_RE = re.compile(r"([a-zA-Z]+)[\.\-]*(\d+)")
# Letters (group 1) followed by '/' or '-', only when more letters come next.
LETTER_LETTER_RE = re.compile(r"([a-zA-Z]+)[/\-](?=[a-zA-Z]+)")

def tokenize(text):
    """Tokenize ``text``, lemmatize then stem each token, and join the results with single spaces."""
    stems = []
    for tok in tokenizer.tokenize(text):
        stems.append(stemmer.stem(lemmatizer.lemmatize(tok)))
    return " ".join(stems)

def clean_unicode(text):
    """Return ``text`` with every character whose ordinal is above 127 removed."""
    return ''.join(ch for ch in text if ord(ch) < 128)

def split_digit_letter(text):
    """Put a single space between a run of letters and the digits that follow it.

    Any '.' or '-' characters sitting between the letters and the digits are
    dropped, e.g. ``ABC123 -> ABC 123``.
    """
    def _spaced(match):
        return match.group(1) + ' ' + match.group(2)

    return DIGIT_LETTER_RE.sub(_spaced, text)

def replace_math_syntax(text):
    """Collapse each ``[math]...[/math]`` span into the placeholder ``_math_``.

    Example: ``[math]a=b+c[/math] -> _math_``
    """
    placeholder = '_math_'
    # Splitting on the (group-free) pattern and re-joining with the placeholder
    # substitutes every match.
    return placeholder.join(MATH_RE.split(text))

def merge_digit_comma(text):
    """Remove the thousands comma that precedes a ``000`` group.

    Example: ``15,000 -> 15000``
    """
    def _digits_only(match):
        return match.group(1)

    return DIGIT_COMMA_RE.sub(_digits_only, text)

def split_letter_letter(text):
    """Turn a '-' or '/' that joins two words into a space.

    Examples:
        ``mind-blowing -> mind blowing``
        ``improvement/clarification -> improvement clarification``
    """
    def _with_space(match):
        return match.group(1) + ' '

    return LETTER_LETTER_RE.sub(_with_space, text)

def load_replace_words_re():
    """Build the ``(compiled pattern, replacement template)`` rules used by replace_words.

    Each non-blank line of ``data/word_replace.csv`` is read as
    ``word,replacement``. ``word`` is inserted into the pattern as-is (it is not
    escaped) and only matches when it is delimited by non-word characters or the
    ends of the string. Three fixed symbol replacements are appended after the
    file-driven rules.

    Returns:
        list of ``(pattern, replacement)`` tuples, in the order they should be applied.

    Raises:
        ValueError: if a non-blank line does not split into exactly two fields.
    """
    replace_words_re = []
    with open('data/word_replace.csv') as f:
        for line in f:
            # Strip only the line terminator; slicing off the last character
            # would truncate a final line that has no trailing newline.
            line = line.rstrip('\r\n')
            if not line:
                # Tolerate blank lines (e.g. an empty line at the end of the file).
                continue
            word, replaced_word = line.split(',')
            pattern = re.compile(r"(\W|^)%s(?=\W|$)" % word)
            # \g<1> instead of \1: a replacement that starts with a digit would
            # otherwise be read as part of the group number (e.g. \11).
            replace_words_re.append((pattern, r'\g<1>%s' % replaced_word))

    return replace_words_re + [
        (re.compile(r'₹'), ' rupee '),
        (re.compile(r'’'), "'"),
        (re.compile(r'∧'), '^')
    ]

# Rules are loaded once, when this cell runs.
REPLACE_WORDS_RE = load_replace_words_re()


def replace_words(text):
    """Apply every ``(pattern, replacement)`` rule in REPLACE_WORDS_RE to ``text``, in order."""
    result = text
    for pattern, substitute in REPLACE_WORDS_RE:
        result = pattern.sub(substitute, result)
    return result

def preprocess_text(text):
    """Run the full cleaning pipeline on one question and return the tokenized result.

    The cleaning steps run in a fixed order (math placeholder, word
    replacements, number/word splitting, non-ASCII removal) before the text is
    handed to tokenize().
    """
    cleaning_steps = (
        replace_math_syntax,
        replace_words,
        merge_digit_comma,
        split_digit_letter,
        split_letter_letter,
        clean_unicode,
    )
    for step in cleaning_steps:
        text = step(text)
    return tokenize(text)

In [405]:
# Sanity check: hyphen-joined words should come back separated by spaces.
split_letter_letter('Vinyl-Leather-Rubber')

'Vinyl Leather Rubber'

In [396]:
# Run the cleaning pipeline over every question1 value and keep the result in a new column.
train_df['cleaned_question1'] = train_df['question1'].apply(preprocess_text)

In [397]:
# Inspect the cleaned question1 text.
train_df['cleaned_question1']

0         what is the step by step guid to invest in sha...
1         what is the stori of kohinoor ( koh i-noor ) d...
2         how can i increas the speed of my internet con...
3           whi am i mental veri lone ? how can i solv it ?
4         which one dissolv in water quik sugar , salt ,...
5         astrolog : i am a capricorn sun cap moon and c...
6                                      should i buy tiago ?
7                           how can i be a good geologist ?
8                              when do you use instead of ?
9         motorola ( compani ) : can i hack my charter m...
10        method to find separ of slit use fresnel bipri...
11               how do i read and find my youtub comment ?
12                     what can make physic easi to learn ?
13                  what wa your first sexual experi like ?
14        what are the law to chang your status from a s...
15        what would a trump presid mean for current int...
16                                  what

In [242]:
# Sample one duplicate pair whose question1 contains a [math] block.
# The label comparison is parenthesised because `&` binds tighter than `==`;
# without the parentheses the expression is evaluated as `(mask & is_duplicate) == 1`.
train_df[(train_df['question1'].str.contains(r'\[math\]')) & (train_df.is_duplicate == 1)].sample().values

array([[120855, 195960, 195961,
        'How would you simplify [math]i^{100}[/math]?',
        'How do I simplify i\xe2\x88\xa7100?', 1]], dtype=object)

In [238]:
# Sample one non-duplicate pair whose question1 contains a [math] block.
train_df.loc[
    train_df['question1'].str.contains('\[math\]') & (train_df['is_duplicate'] == 0)
].sample().values

array([[184627, 101978, 282026,
        'How do I integrate [math]\\displaystyle\\int_{\\frac\\pi6}^{\\frac\\pi4}\\sqrt{1-\\cos2x}\\,dx[/math]?',
        'How can I integrate [math] \\int_{0}^{\\frac{\\pi}{4}} \\ln(1+\\tan(x)) dx [/math]?',
        0]], dtype=object)

In [257]:
# Duplicate pairs for which is_ascii(question2) is False.
# NOTE(review): is_ascii is not defined in any visible cell — confirm it exists
# (or define it) so this cell survives Restart Kernel -> Run All.
train_df[(train_df['question2'].apply(lambda text: is_ascii(text)) == False) & (train_df['is_duplicate']==1)]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
120,120,241,242,Why my question was marked as needing imrovement?,How can I ask a question without getting marke...,1
176,176,353,354,What is it like to live in Cologne?,"What is it like to live in Köln, Germany?",1
199,199,399,400,What are the effects of demonitization of 500 ...,What will be the impact of scrapping of ₹500 a...,1
307,307,614,615,What job possibilities exist for a Bachelors i...,What jobs are available with a bachelor’s degr...,1
721,721,1438,1439,How does Quora quickly mark questions as needi...,Why does Quora mark my questions as needing im...,1
791,791,1577,1578,What is the Hizmet movement?,What is the Gülen movement?,1
907,907,1809,1810,What is your view on the recent demonetization...,What are your views on demonetization of ₹500 ...,1
1141,1141,2274,2275,What is the best way for a native Chinese spea...,How can I learn English well as a Chinese stud...,1
1452,1452,2892,2893,How do you view the Indian government's decisi...,What are going to be the rammifications of the...,1
1530,1530,3047,3048,"How long will the Pokémon GO ""fever"" will last?",How long you thing this Pokémon GO trend will ...,1


In [251]:
# Sample strings containing non-ASCII content.
# NOTE: the second assignment overwrites the first, so only the last string is left in `s`.
s = '4cos\xc2\xb2\xce\xb8+4sec\xc2\xb2\xce\xb8=5'
s = 'When do you use シ instead of し'

In [260]:
# tokenize() on its own keeps the hyphens (see the output below), which is why
# preprocess_text calls split_letter_letter first.
tokenize('Vinyl-Leather-Rubber')

u'vinyl-leather-rubb'

In [298]:
# Check that the non-ASCII bytes in the formula are dropped before tokenizing.
tokenize(clean_unicode('4cos\xc2\xb2\xce\xb8+4sec\xc2\xb2\xce\xb8=5'))

u'4cos+4sec=5'