# Prepare Models

In [1]:
# import libraries
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
pal = sns.color_palette()

from scipy.spatial.distance import cosine
import pickle

In [2]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [3]:
stop = stopwords.words("english")

In [4]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
train["question1"] = train["question1"].fillna("").apply(lambda q: str(q))
train["question2"] = train["question2"].fillna("").apply(lambda q: str(q))
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 6 columns):
id              404290 non-null int64
qid1            404290 non-null int64
qid2            404290 non-null int64
question1       404290 non-null object
question2       404290 non-null object
is_duplicate    404290 non-null int64
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


In [6]:
test.head()

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [7]:
test["question1"] = test["question1"].fillna("").apply(lambda q: str(q))
test["question2"] = test["question2"].fillna("").apply(lambda q: str(q))
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2345796 entries, 0 to 2345795
Data columns (total 3 columns):
test_id      int64
question1    object
question2    object
dtypes: int64(1), object(2)
memory usage: 53.7+ MB


In [8]:
print("Test NA")
print("Train question1,", train["question1"].isnull().sum())
print("Train question2,", train["question2"].isnull().sum())
print("Test question1,", test["question1"].isnull().sum())
print("Test question2,", test["question2"].isnull().sum())

Test NA
Train question1, 0
Train question2, 0
Test question1, 0
Test question2, 0


In [9]:
questions = list(train["question1"] + train["question2"] + test["question1"] + test["question2"])
questions = [str(q) for q in questions]

In [12]:
tfidf = TfidfVectorizer(max_features=512, stop_words=stop).fit(questions)

In [13]:
# save the tfidf model for later use
with open("tfidf_512_model.pkl", "wb") as handle:
    pickle.dump(tfidf, handle)

# uncomment below to load the tfidf_model
# tfidf = pickle.load(open("tfidf_model.pkl", "rb" ) )


In [23]:
train["q1"] = [x for x in tfidf.transform(train["question1"])];
train["q2"] = [x for x in tfidf.transform(train["question2"])];
test["q1"] = [x for x in tfidf.transform(test["question1"])];
test["q2"] = [x for x in tfidf.transform(test["question2"])];

In [None]:
# save as pickle -- don't run this....will run out of ram and hang comp
# train.to_pickle("train_tfidf.pkl");
# test.to_pickle("test_tfidf.pkl");

In [24]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1,q2
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"(0, 109796)\t0.112512127617\n (0, 101948)\t...","(0, 109796)\t0.117529377869\n (0, 101948)\t..."
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"(0, 109796)\t0.115928611486\n (0, 100817)\t...","(0, 110997)\t0.165165302109\n (0, 109796)\t..."
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,"(0, 109928)\t0.297810499531\n (0, 108469)\t...","(0, 101352)\t0.293859974832\n (0, 94958)\t0..."
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,"(0, 110103)\t0.202745782934\n (0, 107486)\t...","(0, 109881)\t0.146158427322\n (0, 100817)\t..."
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"(0, 109913)\t0.138140309407\n (0, 109125)\t...","(0, 110997)\t0.276220867073\n (0, 109913)\t..."


# Predictions

In [53]:
calc_cosine = (lambda x: cosine(x["q1"].toarray(), x["q2"].toarray()));
predict_dup = (lambda x: 1 if x > 0.5 else 0);

In [25]:
train["cosine"] = train.apply(calc_cosine, axis=1)

  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


In [34]:
train["predict"] = train["cosine"].apply(predict_dup);

In [39]:
(train["is_duplicate"] == train["predict"]).mean()

0.33352049271562495

In [51]:
test["cosine"] = test.apply(calc_cosine, axis=1);

  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


In [77]:
print("NA Count,", sum(test["cosine"].isnull()))

NA Count, 381


In [78]:
test["cosine"] = test["cosine"].fillna(test["cosine"].mean());
test["predict"] = test["cosine"].apply(predict_dup);

In [79]:
output = pd.DataFrame({
    "test_id": test["test_id"],
    "is_duplicate": test["cosine"]
});
output.head(10)

Unnamed: 0,is_duplicate,test_id
0,0.715819,0
1,0.393013,1
2,0.334081,2
3,1.0,3
4,0.49641,4
5,0.725554,5
6,0.385808,6
7,0.746358,7
8,0.331727,8
9,0.634419,9


In [80]:
output.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2345796 entries, 0 to 2345795
Data columns (total 2 columns):
is_duplicate    float64
test_id         int64
dtypes: float64(1), int64(1)
memory usage: 35.8 MB


In [81]:
output.to_csv("submission/tfidf_1024_cosine_similarity.csv", index=False);