In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, log_loss
from nltk import word_tokenize
import scipy.sparse as sps
from sklearn.linear_model import LogisticRegression
from scipy.spatial.distance import cosine

### Data read

In [2]:
# Load the Quora question-pair training file and preview its first rows.
df = pd.read_csv(filepath_or_buffer='data/quora-train.csv')
df.head(n=5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [3]:
# First question of the pair stored at row position 3.
df['question1'].iloc[3]

'Why am I mentally very lonely? How can I solve it?'

In [4]:
# Second question of the pair stored at row position 3.
df['question2'].iloc[3]

'Find the remainder when [math]23^{24}[/math] is divided by 24,23?'

In [5]:
# Preview the first pairs that are labelled as duplicates.
df.loc[df['is_duplicate'] == 1].head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
11,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1
12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1
13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1


In [6]:
# List the rows whose second question is missing.
df.loc[df['question2'].isna()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
105780,105780,174363,174364,How can I develop android app?,,0
201841,201841,303951,174364,How can I create an Android app?,,0


In [7]:
# Replace missing question text with the placeholder token 'a' so the
# vectorizer fitted later never receives NaN. Only the two text columns are
# filled: a blanket fillna('a') would also write the string into any numeric
# column that happened to contain NaN and silently turn it into object dtype.
df = df.fillna({'question1': 'a', 'question2': 'a'})
# Sanity check: no row should be left with a missing second question.
df[df.question2.isnull()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate


### Validation split (single stratified hold-out)

In [8]:
# One stratified 70/30 hold-out split, seeded so it is reproducible.
x = df.loc[:, ['question1', 'question2']]
y = df.is_duplicate
sss = StratifiedShuffleSplit(test_size=0.3, n_splits=1, random_state=42)
# Only one split is requested, so take the sole (train, test) index pair.
train_indexes, test_indexes = list(sss.split(x, y))[0]
train, test = df.iloc[train_indexes], df.iloc[test_indexes]

### TF-IDF

Options to explore: use n-grams, cap the vocabulary size with `max_features`, and remove stop words.

In [None]:
# Fit the TF-IDF vocabulary and IDF weights on the training split only, so the
# held-out questions cannot leak into the features used to score the model.
# Both question columns are pooled so the two sides share a single vocabulary.
all_texts = np.concatenate((np.array(train['question1']), np.array(train['question2'])))
vectorizer = TfidfVectorizer(tokenizer=word_tokenize)
vectorizer.fit(all_texts)

In [None]:
def extract_tfidf_vectors(df, vectorizer):
    """Transform both question columns of ``df`` with ``vectorizer``.

    Parameters
    ----------
    df : object indexable by the keys 'question1' and 'question2'
        Source of the two text collections.
    vectorizer : object exposing ``transform``
        Applied to 'question1' first, then to 'question2'.

    Returns
    -------
    tuple
        ``(q1_vector, q2_vector)``: the transformed 'question1' and
        'question2' collections, in that order.
    """
    return tuple(vectorizer.transform(df[column]) for column in ('question1', 'question2'))

# Vectorize each split with the same fitted vectorizer.
q1_vector_train, q2_vector_train = extract_tfidf_vectors(df=train, vectorizer=vectorizer)
q1_vector_test, q2_vector_test = extract_tfidf_vectors(df=test, vectorizer=vectorizer)

In [None]:
def get_cosine(a, b):
    """Row-wise cosine similarity between two equally shaped matrices.

    The computation stays sparse and vectorized: no row is densified and
    there is no Python-level loop, so it scales to large vocabularies. It also
    avoids handing the 2-D ``(1, n)`` arrays produced by ``.toarray()`` to
    ``scipy.spatial.distance.cosine``, which is documented for 1-D input.

    Parameters
    ----------
    a, b : scipy sparse matrix or array-like, shape (n_rows, n_features)
        Row ``i`` of ``a`` is compared with row ``i`` of ``b``.

    Returns
    -------
    numpy.ndarray, shape (n_rows,)
        Cosine similarity per row. Rows where either vector is all zeros get
        0.0 instead of NaN, so the result is always finite.
    """
    a = sps.csr_matrix(a)
    b = sps.csr_matrix(b)

    # Element-wise product summed per row is the row-wise dot product.
    dot = np.asarray(a.multiply(b).sum(axis=1)).ravel()
    norm_a = np.sqrt(np.asarray(a.multiply(a).sum(axis=1)).ravel())
    norm_b = np.sqrt(np.asarray(b.multiply(b).sum(axis=1)).ravel())
    denominator = norm_a * norm_b

    # Divide only where the denominator is non-zero; other entries stay 0.0.
    similarity = np.zeros(dot.shape, dtype=float)
    np.divide(dot, denominator, out=similarity, where=denominator > 0)
    return similarity

# Cosine similarity of every question pair, computed separately per split.
train_cosine = get_cosine(a=q1_vector_train, b=q2_vector_train)
test_cosine = get_cosine(a=q1_vector_test, b=q2_vector_test)

In [None]:
# Each pair is stacked twice, once per question order, with the cosine
# similarity appended as the final feature column.
x_train = sps.vstack([
    sps.hstack([q1_vector_train, q2_vector_train, train_cosine[:, np.newaxis]]),
    sps.hstack([q2_vector_train, q1_vector_train, train_cosine[:, np.newaxis]]),
])
# Labels are repeated so they line up with the doubled rows.
y_train = np.tile(np.asarray(train['is_duplicate']), 2)

model_related = LogisticRegression(class_weight='balanced')
model_related.fit(x_train, y_train)

In [None]:
# Build the test features the same way as the training features: both
# question orders stacked, cosine similarity as the final column.
x_test = sps.vstack([
    sps.hstack([q1_vector_test, q2_vector_test, test_cosine[:, np.newaxis]]),
    sps.hstack([q2_vector_test, q1_vector_test, test_cosine[:, np.newaxis]]),
])
# Labels are repeated so they line up with the doubled rows.
y_test = np.tile(np.asarray(test['is_duplicate']), 2)

# Class probabilities and hard labels for the doubled test set.
y_prediction_proba = model_related.predict_proba(x_test)
y_prediction = model_related.predict(x_test)

In [None]:
# Recall of the predicted labels on the hold-out set.
recall_score(y_true=y_test, y_pred=y_prediction)

In [None]:
# Precision of the predicted labels on the hold-out set.
precision_score(y_true=y_test, y_pred=y_prediction)

In [None]:
# F1 score of the predicted labels on the hold-out set.
f1_score(y_true=y_test, y_pred=y_prediction)

In [None]:
# Accuracy of the predicted labels on the hold-out set.
accuracy_score(y_true=y_test, y_pred=y_prediction)

In [None]:
# Log loss of the predicted class probabilities on the hold-out set.
log_loss(y_true=y_test, y_pred=y_prediction_proba)