In [1]:
import numpy as np
import pandas as pd

In [2]:
df_train = pd.read_csv('quora-question-pairs/train.csv')
df_train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [3]:
df_train.shape

(404290, 6)

In [4]:
def check_null_rows(df):
    is_NaN = df.isnull()
    row_has_NaN = is_NaN.any(axis=1)
    rows_with_NaN = df[row_has_NaN]
    return rows_with_NaN

In [5]:
check_null_rows(df_train)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
105780,105780,174363,174364,How can I develop android app?,,0
201841,201841,303951,174364,How can I create an Android app?,,0
363362,363362,493340,493341,,My Chinese name is Haichao Yu. What English na...,0


In [6]:
df_train = df_train.sample(30000, random_state=42)

In [7]:
df_train.shape

(30000, 6)

In [8]:
check_null_rows(df_train)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate


# Feature Engineering

### Column's with number of character in each question

In [9]:
df_train['q1_char_len'] = df_train['question1'].str.len()
df_train['q2_char_len'] = df_train['question2'].str.len()

In [10]:
df_train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_char_len,q2_char_len
8067,8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,0,34,34
368101,368101,12736,104117,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...,0,53,51
70497,70497,121486,121487,Which is more advisable and better material fo...,What is the best server setup for buddypress?,0,96,45
226567,226567,254474,258192,How do I improve logical programming skills?,How can I improve my logical skills for progra...,1,44,52
73186,73186,48103,3062,How close we are to see 3rd world war?,How close is a World War III?,1,38,29


### Column's with number of words in each question

In [11]:
df_train['q1_word_len'] = df_train['question1'].apply(lambda x: len(x.split(" ")))
df_train['q2_word_len'] = df_train['question2'].apply(lambda x: len(x.split(" ")))

In [12]:
df_train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_char_len,q2_char_len,q1_word_len,q2_word_len
8067,8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,0,34,34,8,8
368101,368101,12736,104117,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...,0,53,51,11,9
70497,70497,121486,121487,Which is more advisable and better material fo...,What is the best server setup for buddypress?,0,96,45,16,8
226567,226567,254474,258192,How do I improve logical programming skills?,How can I improve my logical skills for progra...,1,44,52,7,9
73186,73186,48103,3062,How close we are to see 3rd world war?,How close is a World War III?,1,38,29,9,7


### Column with total common words in question1 and question2

In [13]:
def common_words(row):
    word_list1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
    word_list2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))
    return len(word_list1 & word_list2)

In [14]:
df_train['common_words'] = df_train.apply(common_words, axis=1)

In [15]:
df_train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_char_len,q2_char_len,q1_word_len,q2_word_len,common_words
8067,8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,0,34,34,8,8,7
368101,368101,12736,104117,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...,0,53,51,11,9,6
70497,70497,121486,121487,Which is more advisable and better material fo...,What is the best server setup for buddypress?,0,96,45,16,8,2
226567,226567,254474,258192,How do I improve logical programming skills?,How can I improve my logical skills for progra...,1,44,52,7,9,4
73186,73186,48103,3062,How close we are to see 3rd world war?,How close is a World War III?,1,38,29,9,7,3


### Column with total unique words in the questions

In [16]:
def unique_words(row):
    word_list1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
    word_list2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))
    return len(set(list(word_list1) + list(word_list2)))

In [17]:
df_train['total_unique_words'] = df_train.apply(unique_words, axis=1)

In [18]:
df_train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_char_len,q2_char_len,q1_word_len,q2_word_len,common_words,total_unique_words
8067,8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,0,34,34,8,8,7,9
368101,368101,12736,104117,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...,0,53,51,11,9,6,14
70497,70497,121486,121487,Which is more advisable and better material fo...,What is the best server setup for buddypress?,0,96,45,16,8,2,22
226567,226567,254474,258192,How do I improve logical programming skills?,How can I improve my logical skills for progra...,1,44,52,7,9,4,12
73186,73186,48103,3062,How close we are to see 3rd world war?,How close is a World War III?,1,38,29,9,7,3,13


### Column's with clean questions

In [19]:
from nltk.corpus import stopwords
from nltk import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

tokenizer = RegexpTokenizer(r"\w+")
lemmatizer = WordNetLemmatizer()

In [20]:
def clean_question(ques):
    ques = ques.lower()
    words = tokenizer.tokenize(ques)
    words = [lemmatizer.lemmatize(word.strip()) for word in words if word not in stopwords.words('english')]
    return " ".join(words)

In [21]:
df_train['clean_question1'] = df_train['question1'].apply(clean_question)
df_train['clean_question2'] = df_train['question2'].apply(clean_question)

In [22]:
df_train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_char_len,q2_char_len,q1_word_len,q2_word_len,common_words,total_unique_words,clean_question1,clean_question2
8067,8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,0,34,34,8,8,7,9,play pokémon go korea,play pokémon go china
368101,368101,12736,104117,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...,0,53,51,11,9,6,14,best side dish crab cake,good side dish buffalo chicken
70497,70497,121486,121487,Which is more advisable and better material fo...,What is the best server setup for buddypress?,0,96,45,16,8,2,22,advisable better material crash test automobil...,best server setup buddypress
226567,226567,254474,258192,How do I improve logical programming skills?,How can I improve my logical skills for progra...,1,44,52,7,9,4,12,improve logical programming skill,improve logical skill programming
73186,73186,48103,3062,How close we are to see 3rd world war?,How close is a World War III?,1,38,29,9,7,3,13,close see 3rd world war,close world war iii


### Convert the question into vector

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=3000)

In [24]:
questions = list(df_train['clean_question1']) + list(df_train['clean_question2'])
q1_arr, q2_arr = np.vsplit(tfidf.fit_transform(questions).toarray(), 2)

In [25]:
q1 = pd.DataFrame(q1_arr, index=df_train.index)
q2 = pd.DataFrame(q2_arr, index=df_train.index)
df = pd.concat([q1, q2], axis=1)

In [26]:
df.shape

(30000, 6000)

# Creating matrix of features and labels

In [27]:
df_train.columns

Index(['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate',
       'q1_char_len', 'q2_char_len', 'q1_word_len', 'q2_word_len',
       'common_words', 'total_unique_words', 'clean_question1',
       'clean_question2'],
      dtype='object')

In [28]:
df_temp = df_train[['is_duplicate',
       'q1_char_len', 'q2_char_len', 'q1_word_len', 'q2_word_len',
       'common_words', 'total_unique_words']]
df_temp.head()

Unnamed: 0,is_duplicate,q1_char_len,q2_char_len,q1_word_len,q2_word_len,common_words,total_unique_words
8067,0,34,34,8,8,7,9
368101,0,53,51,11,9,6,14
70497,0,96,45,16,8,2,22
226567,1,44,52,7,9,4,12
73186,1,38,29,9,7,3,13


In [29]:
df_final = pd.concat([df, df_temp], axis=1)
df_final.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2997,2998,2999,is_duplicate,q1_char_len,q2_char_len,q1_word_len,q2_word_len,common_words,total_unique_words
8067,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,34,34,8,8,7,9
368101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,53,51,11,9,6,14
70497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,96,45,16,8,2,22
226567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1,44,52,7,9,4,12
73186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1,38,29,9,7,3,13


In [30]:
df_final.shape

(30000, 6007)

In [31]:
X = df_final.drop(columns=['is_duplicate'])
y = df_final['is_duplicate']

In [32]:
X.shape, y.shape

((30000, 6006), (30000,))

# Train the model

### Splitting the data into train and test

In [33]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Train using Random Forest Classifier

In [34]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [35]:
rf.fit(X_train, y_train)



RandomForestClassifier()

In [36]:
y_pred = rf.predict(X_test)



In [37]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.7706666666666667

# Create a pipeline

### Convert the given questions into the required features to predict directly

In [38]:
def pipeline(question1, question2):
    dic = {'question1': [question1], 'question2': [question2]}
    df = pd.DataFrame(dic)

    df['q1_char_len'] = df['question1'].str.len()
    df['q2_char_len'] = df['question2'].str.len()

    df['q1_word_len'] = df['question1'].apply(lambda x: len(x.split(" ")))
    df['q2_word_len'] = df['question2'].apply(lambda x: len(x.split(" ")))

    df['common_words'] = df.apply(common_words, axis=1)
    
    df['total_unique_words'] = df.apply(unique_words, axis=1)

    df['clean_question1'] = df['question1'].apply(clean_question)
    df['clean_question2'] = df['question2'].apply(clean_question)

    questions = list(df['clean_question1']) + list(df['clean_question2'])
    q1_arr, q2_arr = np.vsplit(tfidf.transform(questions).toarray(), 2)

    q1 = pd.DataFrame(q1_arr, index=df.index)
    q2 = pd.DataFrame(q2_arr, index=df.index)
    df_concat = pd.concat([q1, q2], axis=1)

    df_temp = df[['q1_char_len', 'q2_char_len', 'q1_word_len', 'q2_word_len', 'common_words', 'total_unique_words']]
    
    df_final = pd.concat([df_concat, df_temp], axis=1)
    
    X = df_final

    return rf.predict(X)

In [46]:
output = pipeline('Difference between tensorflow and pytorch', 'How tensorflow differ from pytorch')[0]
if output == 1:
    print('Questions are duplicate')
else:
    print('Questions are not duplicate')

Questions are not duplicate


