In [None]:
!pip install distance

In [None]:
!pip install fuzzywuzzy

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings, nltk, re, distance
warnings.filterwarnings('ignore')
from nltk.corpus import stopwords
from bs4 import BeautifulSoup 
from tqdm.notebook import tqdm
tqdm.pandas()
from fuzzywuzzy import fuzz
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout   
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, plot_confusion_matrix, plot_precision_recall_curve
import joblib

In [None]:
nltk.download('stopwords')

In [None]:
train = pd.read_csv('../input/training-data/train.csv')
# Work on a 100k-row subsample. The seed is fixed so that the subsample, and
# every feature/model derived from it, is the same after Restart & Run All.
# min() keeps the cell working if the CSV has fewer than 100k rows.
train = train.sample(n=min(100000, len(train)), random_state=101).reset_index(drop=True)
train.head()

In [None]:
train.shape

In [None]:
train.isnull().sum()

In [None]:
train = train.dropna()

In [None]:
train.duplicated().sum()

In [None]:
# Contraction -> expansion lookup. Built once when the cell runs instead of on
# every call to text_preprocess.
# https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
# https://stackoverflow.com/a/19794953
_CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

# Ordered (pattern, replacement) rules. They run after the text has been
# lowercased and every character other than ASCII letters/digits has been turned
# into a space, so only patterns made of lowercase letters, digits and spaces
# can match. Rules from the earlier version that could never fire on such text
# (patterns containing apostrophes, hyphens, upper-case letters or the `\0`
# escape) are omitted.
_CLEANUP_RULES = [
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r" m ", " am "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        # Keep the surrounding spaces so neighbouring words are not glued together.
        (r" 9 11 ", " 911 "),
        (r"\s{2,}", " "),
        (r"quikly", "quickly"),
        (r" usa ", " America "),
        (r" u s ", " America "),
        (r" uk ", " England "),
        (r"india", "India"),
        (r"china", "China"),
        (r"chinese", "Chinese"),
        (r"imrovement", "improvement"),
        (r"intially", "initially"),
        (r"quora", "Quora"),
        (r" dms ", " direct messages "),
        (r"demonitization", "demonetization"),
        (r"actived", "active"),
        (r"kms", " kilometers "),
        (r" cs ", " computer science "),
        (r" upvotes ", " up votes "),
        (r"calender", "calendar"),
        # Word boundaries stop these acronyms from being rewritten inside
        # longer words ("curiosity", "amongst", ...).
        (r"\bios\b", "operating system"),
        (r"\bgps\b", "GPS"),
        (r"\bgst\b", "GST"),
        (r"programing", "programming"),
        (r"bestfriend", "best friend"),
        (r"\bdna\b", "DNA"),
        (r"banglore", "Banglore"),
    )
]


def text_preprocess(ques):
    """Normalise one raw question string for feature extraction.

    Steps, in order:
      1. Cast to ``str``, lowercase and strip.
      2. Remove HTML markup (only attempted when the text contains ``<`` or
         ``&``). This runs first because step 6 deletes the angle brackets the
         parser needs.
      3. Spell out currency/percent/at symbols and abbreviate round thousands,
         millions and billions to ``k``/``m``/``b``.
      4. Expand contractions word by word via ``_CONTRACTIONS``.
      5. Expand any leftover ``'ve``, ``n't``, ``'re`` and ``'ll`` suffixes.
      6. Replace every character that is not an ASCII letter or digit by a space.
      7. Apply the spelling/abbreviation fixes in ``_CLEANUP_RULES``.
      8. Collapse runs of whitespace and strip.

    Parameters
    ----------
    ques : object
        The raw question; anything accepted by ``str()``.

    Returns
    -------
    str
        The cleaned question. It is mostly lowercase, but some rules insert
        capitalised words (e.g. ``America``, ``India``, ``GPS``).
    """
    ques = str(ques).lower().strip()

    # Eliminate HTML tags while the markup is still intact. The check skips the
    # parser for text that has neither a tag opener nor an entity.
    # NOTE(review): a bare comparison such as "a<b and c>d" is parsed as a tag
    # and dropped here — confirm that is acceptable for math-style questions.
    if '<' in ques or '&' in ques:
        ques = BeautifulSoup(ques, 'html.parser').get_text()

    for symbol, spelled in (
        ('%', ' percent'),
        ('$', ' dollar '),
        ('₹', ' rupee '),
        ('€', ' euro '),
        ('@', ' at '),
        ('[math]', ''),
    ):
        ques = ques.replace(symbol, spelled)

    ques = ques.replace(',000,000,000 ', 'b ')
    ques = ques.replace(',000,000 ', 'm ')
    ques = ques.replace(',000 ', 'k ')
    ques = re.sub(r'([0-9]+)000000000', r'\1b', ques)
    ques = re.sub(r'([0-9]+)000000', r'\1m', ques)
    ques = re.sub(r'([0-9]+)000', r'\1k', ques)

    # Decontract words (whole-token lookup, then generic suffix fallbacks).
    ques = ' '.join(_CONTRACTIONS.get(word, word) for word in ques.split())
    for suffix, expansion in (
        ("'ve", " have"),
        ("n't", " not"),
        ("'re", " are"),
        ("'ll", " will"),
    ):
        ques = ques.replace(suffix, expansion)

    # Remove punctuation and any other non-alphanumeric character.
    ques = re.sub(r"[^A-Za-z0-9]", " ", ques)

    for pattern, replacement in _CLEANUP_RULES:
        ques = pattern.sub(replacement, ques)

    # Several rules above insert padded replacements; normalise the spacing.
    return re.sub(r"\s+", " ", ques).strip()

In [None]:
# Clean both question columns; progress_apply comes from tqdm's pandas integration.
for question_col in ('question1', 'question2'):
    train[question_col] = train[question_col].progress_apply(text_preprocess)

In [None]:
train.head()

In [None]:
# Character length of each cleaned question.
for prefix, question_col in (('q1', 'question1'), ('q2', 'question2')):
    train[prefix + '_len'] = train[question_col].map(len)

In [None]:
# Number of whitespace-separated tokens in each cleaned question.
for prefix, question_col in (('q1', 'question1'), ('q2', 'question2')):
    train[prefix + '_num_words'] = train[question_col].map(lambda text: len(text.split()))

In [None]:
def common_words(row):
    """Count the distinct lowercased words that occur in both questions of ``row``.

    ``row`` is anything indexable by ``'question1'`` / ``'question2'`` whose
    values are strings.
    """
    vocab_q1 = {token.lower().strip() for token in row['question1'].split()}
    vocab_q2 = {token.lower().strip() for token in row['question2'].split()}
    return len(vocab_q1.intersection(vocab_q2))

In [None]:
train['common_words'] = train.apply(common_words,axis=1)
train.head()

In [None]:
def total_words(row):
  """Return the number of distinct lowercased words in question1 plus that in question2.

  A word present in both questions is counted once per question.
  """
  distinct_counts = []
  for question_col in ('question1', 'question2'):
    vocab = {token.lower().strip() for token in row[question_col].split()}
    distinct_counts.append(len(vocab))
  return sum(distinct_counts)

In [None]:
train['total_words'] = train.apply(total_words,axis=1)
train.head()

In [None]:
# Share of distinct words the two questions have in common.
# When both questions are empty after cleaning, total_words is 0 and the
# division yields NaN; fill with 0 so later scaling/model fitting does not
# receive missing values.
train['shared_words'] = round(train.common_words / train.total_words,2).fillna(0)
train.head()

In [None]:
# Lazily-filled cache of NLTK's English stop words (see _english_stopwords).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
  """Return NLTK's English stop words as a frozenset, loading them only once."""
  global _ENGLISH_STOPWORDS
  if _ENGLISH_STOPWORDS is None:
    _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
  return _ENGLISH_STOPWORDS


def generate_token_features(row, stop_words=None):
  """Compute eight token-overlap features for one question pair.

  Parameters
  ----------
  row : mapping
      Must provide string values under ``'question1'`` and ``'question2'``.
  stop_words : collection of str, optional
      Words treated as stop words. Defaults to NLTK's English list, which is
      loaded once and cached rather than re-read for every token.

  Returns
  -------
  numpy.ndarray
      float32 array of shape (8,):
      [0], [1]  common non-stop-word count / (min, max question length)
      [2], [3]  common stop-word count     / (min, max question length)
      [4], [5]  common token count         / (min, max question length)
      [6]       1 if both questions start with the same token, else 0
      [7]       1 if both questions end with the same token, else 0
      All zeros when either question has no tokens.
  """
  ques1 = row['question1']
  ques2 = row['question2']
  dummy = 0.0001
  token_features = np.zeros((8,),'float32')
  # NOTE(review): these are character lengths, so the ratios below divide a
  # word count by a character count. Kept as-is to leave the feature values
  # unchanged; confirm whether token counts were intended.
  q1_len, q2_len = len(ques1), len(ques2)
  tokens_ques1 = ques1.split()
  tokens_ques2 = ques2.split()

  if len(tokens_ques1) == 0 or len(tokens_ques2) == 0:
    return token_features

  if stop_words is None:
    stop_words = _english_stopwords()

  unique_q1 = set(tokens_ques1)
  unique_q2 = set(tokens_ques2)

  non_stopwords_q1 = {word for word in unique_q1 if word not in stop_words}
  non_stopwords_q2 = {word for word in unique_q2 if word not in stop_words}
  stopwords_q1 = unique_q1 - non_stopwords_q1
  stopwords_q2 = unique_q2 - non_stopwords_q2

  common_word_count = len(non_stopwords_q1 & non_stopwords_q2)
  common_stop_count = len(stopwords_q1 & stopwords_q2)
  common_token_count = len(unique_q1 & unique_q2)

  shorter = min(q1_len,q2_len) + dummy
  longer = max(q1_len,q2_len) + dummy

  token_features[0] = common_word_count / shorter
  token_features[1] = common_word_count / longer
  token_features[2] = common_stop_count / shorter
  token_features[3] = common_stop_count / longer
  token_features[4] = common_token_count / shorter
  token_features[5] = common_token_count / longer
  token_features[6] = int(tokens_ques1[0] == tokens_ques2[0])
  token_features[7] = int(tokens_ques1[-1] == tokens_ques2[-1])

  return token_features

In [None]:
token_features = train.progress_apply(generate_token_features,axis=1)

# Column names in the same order as the slots of each per-row feature vector.
token_feature_names = [
    'cwc_min', 'cwc_max',
    'csc_min', 'csc_max',
    'ctc_min', 'ctc_max',
    'first_word_same', 'last_word_same',
]
for slot, column in enumerate(token_feature_names):
    train[column] = [vector[slot] for vector in token_features]

train.head()

In [None]:
def generate_length_features(row):
    """Compute three length-based features for one question pair.

    Parameters
    ----------
    row : mapping
        Must provide string values under ``'question1'`` and ``'question2'``.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (3,):
        [0] absolute difference between the two token counts
        [1] mean of the two token counts
        [2] length of the longest common substring divided by
            (length of the shorter question in characters + 1)
        All zeros when either question has no tokens.
    """
    ques1 = row['question1']
    ques2 = row['question2']
    length_features = np.zeros((3,),'float32')
    q1_tokens = ques1.split()
    q2_tokens = ques2.split()

    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return length_features

    length_features[0] = abs(len(q1_tokens) - len(q2_tokens))
    length_features[1] = (len(q1_tokens) + len(q2_tokens))/2

    # lcsubstrings yields every longest common substring; they all share the
    # same length, so measure one of them. (Using len() of the collection would
    # count how many such substrings exist, not how long they are.)
    strs = list(distance.lcsubstrings(ques1, ques2))
    longest = len(strs[0]) if strs else 0
    length_features[2] = longest / (min(len(ques1), len(ques2)) + 1)
    return length_features

In [None]:
length_features = train.progress_apply(generate_length_features,axis=1)

# generate_length_features fills slot 0 with the absolute token-count
# difference and slot 1 with the mean token count; map each column to the slot
# that matches its name.
train['mean_ques_len'] = list(map(lambda x: x[1], length_features))
train['abs_len_diff'] = list(map(lambda x: x[0], length_features))
train['longest_sub_ratio'] = list(map(lambda x: x[2], length_features))

train.head()

In [None]:
def generate_fuzzy_features(row):
  """Return four fuzzywuzzy similarity scores for the question pair in ``row``.

  The result is a float32 array of shape (4,) holding, in order:
  QRatio, partial_ratio, token_set_ratio and token_sort_ratio of
  ``row['question1']`` against ``row['question2']``.
  """
  first, second = row['question1'], row['question2']

  # Order matters: callers read the scores back by position.
  scorers = (
      fuzz.QRatio,            # fuzz ratio
      fuzz.partial_ratio,     # fuzz partial ratio
      fuzz.token_set_ratio,   # token set ratio
      fuzz.token_sort_ratio,  # token sort ratio
  )
  return np.array([scorer(first, second) for scorer in scorers], dtype='float32')

In [None]:
fuzz_features = train.progress_apply(generate_fuzzy_features,axis=1)

# Column names in the same order as the slots of each per-row score vector.
fuzz_feature_names = ['fuzz_ratio', 'fuzz_partial_ratio', 'token_set_ratio', 'token_sort_ratio']
for slot, column in enumerate(fuzz_feature_names):
    train[column] = [scores[slot] for scores in fuzz_features]

train.head()

In [None]:
sns.pairplot(train[['cwc_min','cwc_max','ctc_min','ctc_max','csc_min','csc_max','is_duplicate']],hue='is_duplicate')

In [None]:
sns.pairplot(train[['first_word_same','last_word_same','is_duplicate']],hue='is_duplicate')

In [None]:
sns.pairplot(train[['mean_ques_len','abs_len_diff','longest_sub_ratio','is_duplicate']],hue='is_duplicate')

In [None]:
sns.pairplot(train[['fuzz_ratio','fuzz_partial_ratio','token_set_ratio','token_sort_ratio','is_duplicate']],hue='is_duplicate')

In [None]:
scaler = MinMaxScaler()
scaled_train = scaler.fit_transform(train[['cwc_min','cwc_max','ctc_min','ctc_max','csc_min','csc_max','first_word_same','last_word_same','abs_len_diff','mean_ques_len','longest_sub_ratio','fuzz_ratio','fuzz_partial_ratio','token_set_ratio','token_sort_ratio']])
target = train.is_duplicate.values

In [None]:
pca = PCA(n_components=2,random_state=101)
X_pca = pca.fit_transform(scaled_train)

In [None]:
plt.figure(figsize=(12,8))
fig = plt.scatter(X_pca[:,0],X_pca[:,1],c=target)
plt.legend(handles=fig.legend_elements()[0],labels=list([0,1]))
plt.show();

In [None]:
rest_data = train.drop(['id','qid1','qid2','question1','question2'],axis=1)
rest_data.head()

In [None]:
ques_df = train[['question1','question2']]
ques_df.head()

In [None]:
questions = np.array(list(ques_df.question1) + list(ques_df.question2))
cv = CountVectorizer(max_features=500)
ques1_arr, ques2_arr = np.vsplit(cv.fit_transform(questions).toarray(),2)

In [None]:
# Give the bag-of-words columns unique string names. With the default integer
# labels both frames would use 0..N-1, so the concatenated frame would have
# duplicate labels and, once joined with the named feature columns, a mix of
# str and int column names.
q1_temp = pd.DataFrame(ques1_arr).add_prefix('q1_bow_')
q2_temp = pd.DataFrame(ques2_arr).add_prefix('q2_bow_')
ques = pd.concat([q1_temp,q2_temp],axis=1)
ques.head()

In [None]:
ques.index = rest_data.index

In [None]:
df = pd.concat([rest_data,ques],axis=1)
df.head()

In [None]:
X = df.drop('is_duplicate',axis=1)
y = df.is_duplicate

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,shuffle=True,random_state=101)

In [None]:
features = X_train.columns
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_train = pd.DataFrame(scaled_X_train,columns=features)
scaled_X_train.head()

In [None]:
scaled_X_test = scaler.transform(X_test)
scaled_X_test = pd.DataFrame(scaled_X_test,columns=features)
scaled_X_test.head()

In [None]:
def train_and_evaluate_model(model):
    """Fit ``model`` on the scaled training split and report test-split results.

    Prints a classification report and draws the confusion matrix and the
    precision-recall curve. Reads ``scaled_X_train``, ``y_train``,
    ``scaled_X_test`` and ``y_test`` from the notebook namespace; the passed
    estimator is fitted in place.
    """
    model.fit(scaled_X_train, y_train)
    y_pred = model.predict(scaled_X_test)
    print(classification_report(y_test, y_pred))
    for draw in (plot_confusion_matrix, plot_precision_recall_curve):
        draw(model, scaled_X_test, y_test)

In [None]:
train_and_evaluate_model(LogisticRegression())

In [None]:
train_and_evaluate_model(RandomForestClassifier())

In [None]:
train_and_evaluate_model(DecisionTreeClassifier())

In [None]:
train_and_evaluate_model(KNeighborsClassifier())

In [None]:
train_and_evaluate_model(LinearSVC())

In [None]:
train_and_evaluate_model(MultinomialNB())

In [None]:
train_and_evaluate_model(GradientBoostingClassifier())

In [None]:
train_and_evaluate_model(AdaBoostClassifier())

In [None]:
train_and_evaluate_model(BaggingClassifier());

In [None]:
train_and_evaluate_model(ExtraTreesClassifier());

In [None]:
def create_model():
    """Build and compile the feed-forward binary classifier.

    Architecture: Dense(32, relu) -> Dense(64, relu) -> Dense(128, relu)
    -> Dropout(0.28) -> Dense(1, sigmoid), compiled with binary cross-entropy
    loss, the Adam optimizer and an accuracy metric. No input shape is given
    to the first layer.

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.
    """
    ann = Sequential()
    ann.add(Dense(units=32,activation='relu'))
    ann.add(Dense(units=64,activation='relu'))
    ann.add(Dense(units=128,activation='relu'))
    ann.add(Dropout(0.28))
    ann.add(Dense(units=1,activation='sigmoid'))
    # `metrics` is documented as a list; a bare string is not accepted by
    # every Keras version.
    ann.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    return ann

In [None]:
model = create_model()
model

In [None]:
# NOTE(review): with patience=40 and epochs=50, early stopping can only fire
# if validation accuracy peaks within the first ten epochs.
es = EarlyStopping(monitor='val_accuracy',mode='max',patience=40,verbose=1)
# min_lr has to be below the optimizer's starting learning rate, otherwise the
# callback can never lower it. The model is compiled with optimizer='adam',
# i.e. Adam's default rate of 0.001, so the previous min_lr=0.001 made this
# callback a no-op.
rl = ReduceLROnPlateau(monitor='val_accuracy',mode='max',patience=5,verbose=2,min_lr=1e-6,factor=0.1)

# NOTE(review): the test split doubles as validation data, so the callbacks
# are tuned on the same rows that are later reported as test performance.
r = model.fit(scaled_X_train,
         y_train,
         epochs=50,
         batch_size=32,
         validation_data=(scaled_X_test,y_test),
         callbacks=[es,rl])

In [None]:
plt.figure(figsize=(12,8))
plt.plot(r.history['loss'],'r',label='train loss')
plt.plot(r.history['val_loss'],'b',label='test loss')
plt.xlabel('Number of Epochs')
plt.ylabel('Loss')
plt.title('Loss Graph')
plt.legend();

In [None]:
plt.figure(figsize=(12,8))
plt.plot(r.history['accuracy'],'r',label='train accuracy')
plt.plot(r.history['val_accuracy'],'b',label='test accuracy')
plt.xlabel('Number of Epochs')
plt.ylabel('Accuracy')
plt.title('Accuracy Graph')
plt.legend();

In [None]:
loss, acc = model.evaluate(scaled_X_test,y_test)
print("Test Loss:",loss)
print("Test Accuracy:",acc)

In [None]:
model.save('quora_duplicate_detector.h5')
lm = load_model('quora_duplicate_detector.h5')
lm

In [None]:
et = ExtraTreesClassifier()
et.fit(scaled_X_train,y_train)

In [None]:
# NOTE: despite the .h5 extension this is a joblib pickle, not an HDF5 file;
# the name is kept so existing consumers still find it.
joblib.dump(et,'extra_trees_clf.h5',compress=2)
# The classifiers expect bag-of-words counts from `cv` and features scaled by
# `scaler`; persist both fitted objects so new questions can be transformed
# the same way at inference time.
joblib.dump(cv,'count_vectorizer.joblib',compress=2)
joblib.dump(scaler,'minmax_scaler.joblib',compress=2)