In [3]:
import pandas as pd

In [4]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from string import punctuation
import numpy as np 
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

stopwords = set( set(nltk.corpus.stopwords.words('english')).union(set(punctuation)) )

def preprocess_tokenize(text):
    tokens = word_tokenize(re.sub(r'[^\x00-\x7F]+', ' ', text.lower()))
    tokens_without_stopwords = [word for word in tokens if word not in stopwords]
    return ' '.join(tokens_without_stopwords)

preprocessor = np.vectorize(preprocess_tokenize)

def preprocess_tree(q1,q2,dup):
    q1_preprocessed, q2_preprocessed = preprocessor(q1), preprocessor(q2)
    df = pd.DataFrame({'question1': q1_preprocessed, 'question2': q2_preprocessed, 'is_duplicate': dup})
    df.to_csv('preprocessed_neural.csv', index=False)
    return q1_preprocessed, q2_preprocessed

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [37]:
df = pd.read_csv("drive/MyDrive/preprocessed_neural.csv")
df.dropna(how='any').reset_index(drop=True)
df.head()


Unnamed: 0,question1,question2,is_duplicate
0,step step guide invest share market india,step step guide invest share market,0
1,story kohinoor kohinoor diamond,would happen indian government stole kohinoor ...,0
2,increase speed internet connection using vpn,internet speed increased hacking dns,0
3,mentally lonely solve,find remainder math2324math divided 2423,0
4,one dissolve water quikly sugar salt methane c...,fish would survive salt water,0


In [7]:
def get_length_feature(question1, question2, vector):
    if (type(question1) is not float):
        l1 = len(question1.split())
    else:
        l1 = 1
    if(type(question2) is not float):
        l2 = len(question2.split())
    else:
        l2 = 1
    
    if(l2 == 0):
        l2 = 1
    vector.append(l1)
    vector.append(l2)
    vector.append(l1-l2)
    vector.append(l1/l2)

    return vector

In [8]:
def get_count_lowercased(question1, question2, vector):

    q1 = []
    q2 = []

    for word in word_tokenize(question1):
        if word.islower():
            q1.append(word)
    
    for word in word_tokenize(question2):
        if word.islower():
            q2.append(word)
  

    count = len(set(q1) & set(q2))

    vector.append(count)
    vector.append(count/237)

    return vector

In [9]:
def get_count_lowercased_without_stopwords(question1, question2, vector):
    q1 = []
    q2 = []

    for word in word_tokenize(question1):
        if word.islower() and word not in stopwords:
            q1.append(word)
    
    for word in word_tokenize(question2):
        if word.islower() and word not in stopwords:
            q2.append(word)


    count = len(set(q1) & set(q2))

    vector.append(count)
    vector.append(count/237)

    return vector

In [10]:
def same_last_words(question1, question2, vector):
    q1 = []
    q2 = []

    for word in word_tokenize(question1):
        q1.append(word)
    
    for word in word_tokenize(question2):
        q2.append(word)
    
    if(len(q1) > 0 and len(q2) > 0):
        vector.append(q1[-1] == q2[-1])
    else:
        vector.append(False)

    return vector

In [11]:
def get_count_uppercased(question1, question2, vector):
    
    q1 = []
    q2 = []

    for word in word_tokenize(question1):
        if word.isupper():
            q1.append(word)
    
    for word in word_tokenize(question2):
        if word.isupper():
            q2.append(word)


    count = len(set(q1) & set(q2))

    vector.append(count)
    vector.append(count/237)

    return vector

In [12]:
def same_prefix(question1, question2, vector):
    q1 = []
    q2 = []
    
    for word in word_tokenize(question1):
        q1.append(word)
    
    for word in word_tokenize(question2):
        q2.append(word)
    s=0
    for i in range(3):
        s = s + (q1[:i]==q2[:i])
    vector.append(s) 
    vector.append(s/237)
    s=0
    for i in range(4):
        s = s + (q1[:i]==q2[:i])
    vector.append(s) 
    vector.append(s/237)
    s=0
    for i in range(5):
        s = s + (q1[:i]==q2[:i])
    vector.append(s) 
    vector.append(s/237)
    s=0
    for i in range(6):
        s = s + (q1[:i]==q2[:i])
    vector.append(s) 
    vector.append(s/237)
    
    return vector

In [13]:
def misc_features(question1, question2, vector):
    q1 = []
    q2 = []

    for word in word_tokenize(question1):
        q1.append(word)
    
    for word in word_tokenize(question2):
        q2.append(word)
    
    vector.append('not' in q1)
    vector.append('not' in q2)
    vector.append('not' in q1 and 'not' in q2)

    q11 = []
    q21 = []

    for word in q1:
        if word.isdigit():
            q11.append(word)
    
    for word in q2:
        if word.isdigit():
            q21.append(word)
   
    vector.append(len(set(q11) and set(q21)))

    q12 = []
    q22 = []

    for word in q1:
        q12.append(ps.stem(word))
    
    for word in q2:
        q22.append(ps.stem(word))

    vector.append(len(set(q12) & set(q22)))
    vector.append(len(set(q12) & set(q22))/237)

    return vector

In [14]:
def create_feature_vector(question1, question2):
    vector = []
    vector = get_length_feature(question1, question2, vector)
    vector = get_count_lowercased(question1, question2, vector)
    vector = get_count_lowercased_without_stopwords(
        question1, question2, vector)
    vector = same_last_words(question1, question2, vector)
    vector = get_count_uppercased(question1, question2, vector)
    vector = same_prefix(question1, question2, vector)
    vector = misc_features(question1, question2, vector)
    return vector

In [15]:
# train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df, df['is_duplicate'], test_size=0.2, random_state=42)


In [17]:
# create feature vectors
X_train = X_train.apply(lambda x: create_feature_vector(str(x['question1']), str(x['question2'])), axis=1)
X_test = X_test.apply(lambda x: create_feature_vector(str(x['question1']), str(x['question2'])), axis=1)

In [19]:
X_train = np.array(X_train.to_list())
X_test = np.array(X_test.to_list())


In [28]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier



def avg_acc_and_f1(no_of_features):
  X_train1 = X_train[:,:no_of_features]
  X_test1 = X_test[:,:no_of_features]

  clf = DecisionTreeClassifier(max_depth=10, min_samples_leaf = 5)
  clf.fit(X_train1, y_train)
  y_pred = clf.predict(X_test1)
  a1 = accuracy_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)
  print(".")

  clf = RandomForestClassifier(max_depth = None, min_samples_leaf=5, n_estimators=50)
  clf.fit(X_train1, y_train)
  y_pred = clf.predict(X_test1)
  a2 = accuracy_score(y_test, y_pred)
  f2 = f1_score(y_test, y_pred)
  print(".")

  clf = GradientBoostingClassifier(max_depth=4, n_estimators=500)
  clf.fit(X_train1, y_train)
  y_pred = clf.predict(X_test1)
  a3 = accuracy_score(y_test, y_pred)
  f3 = f1_score(y_test, y_pred)

  return np.mean([a1,a2,a3]), np.mean([f1,f2,f3])

In [29]:
print("L\n")
a, f = avg_acc_and_f1(4)
print("\nAccuracy: ",a)
print("\nF score: ",f)

L

.
.

Accuracy:  0.643203373273485

F score:  0.23284812267009902


In [30]:
print("L, LC\n")
a, f = avg_acc_and_f1(6)
print("\nAccuracy: ",a)
print("\nF score: ",f)

L, LC

.
.

Accuracy:  0.6960838866837309

F score:  0.5835277745248307


In [31]:
print("L, LC, LCXS\n")
a, f = avg_acc_and_f1(8)
print("\nAccuracy: ",a)
print("\nF score: ",f)

L, LC, LCXS

.
.

Accuracy:  0.6960406078816881

F score:  0.5824456784387457


In [32]:
print("L, LC, LCXS, LW\n")
a, f = avg_acc_and_f1(9)
print("\nAccuracy: ",a)
print("\nF score: ",f)

L, LC, LCXS, LW

.
.

Accuracy:  0.7233124358546327

F score:  0.6200591001459548


In [33]:
print("L, LC, LCXS, LW, CAP\n")
a, f = avg_acc_and_f1(11)
print("\nAccuracy: ",a)
print("\nF score: ",f)

L, LC, LCXS, LW, CAP

.
.

Accuracy:  0.7233928107727121

F score:  0.6210903060972116


In [34]:
print("L, LC, LCXS, LW, CAP, PRE\n")
a, f = avg_acc_and_f1(19)
print("\nAccuracy: ",a)
print("\nF score: ",f)

L, LC, LCXS, LW, CAP, PRE

.
.

Accuracy:  0.7254021837246973

F score:  0.6293394045981935


In [35]:
print("L, LC, LCXS, LW, CAP, PRE, M\n")
a, f = avg_acc_and_f1(25)
print("\nAccuracy: ",a)
print("\nF score: ",f)

L, LC, LCXS, LW, CAP, PRE, M

.
.

Accuracy:  0.7400798803031989

F score:  0.6513022182739713
