In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score, log_loss, f1_score,
    precision_score, recall_score
)
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

import scipy
import numpy as np
from sklearn.svm import SVC
import pickle


In [13]:
# Directory holding the input CSV files, relative to the notebook.
BASE_DIR = './dataset/'
# Alternative input: pd.read_csv(f'{BASE_DIR}train_preprocessed.csv')
# Read the training pairs and replace every missing value with an empty string.
train = pd.read_csv(f'{BASE_DIR}train.csv').fillna("")
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [14]:
# Separate the label from the feature frame.
# Guarded so the cell is safe to re-run: once 'is_duplicate' has been removed from
# `train`, executing the cell again keeps the existing `y_true` instead of raising KeyError.
if 'is_duplicate' in train.columns:
    y_true = train['is_duplicate']
    train = train.drop(columns=['id', 'is_duplicate'])
print(train.shape)

(404290, 4)


In [18]:
def get_ngram_embedding(train, ngram=(1,1), y=None, max_features=5000, test_size=0.3, random_state=42):
    """Build TF-IDF n-gram features for both question columns and split them.

    Each of ``question1`` and ``question2`` is vectorized with its own vocabulary
    (at most ``max_features`` terms per column); the two sparse blocks are stacked
    side by side, question1 columns first.

    NOTE(review): the vectorizers are fitted on every row *before* the split, so the
    held-out rows contribute to the vocabulary and IDF weights. This is kept as-is
    because the pickled models evaluated in this notebook were trained on exactly
    these features; changing it requires retraining them.

    Parameters
    ----------
    train : DataFrame
        Must provide the ``question1`` and ``question2`` text columns.
    ngram : tuple of (int, int)
        ``ngram_range`` passed to ``TfidfVectorizer``.
    y : array-like, optional
        Labels aligned with ``train``. Defaults to the notebook-level ``y_true``.
    max_features : int
        Vocabulary size limit per question column.
    test_size : float
        Fraction of rows placed in the test split.
    random_state : int
        Seed for the split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    if y is None:
        # Backward-compatible default: fall back to the label series defined in the notebook.
        y = y_true

    blocks = []
    for column in ('question1', 'question2'):
        # One vectorizer per column: vocabulary and IDF are learned from that column only.
        vectorizer = TfidfVectorizer(
            analyzer='word',
            max_features=max_features,
            token_pattern=r'\w{1,}',
            ngram_range=ngram,
        )
        blocks.append(vectorizer.fit_transform(train[column].values))

    X = scipy.sparse.hstack(blocks)

    return tuple(train_test_split(X, y, test_size=test_size, random_state=random_state))

In [15]:
# Stratified 70/30 hold-out split of the question frame.
# The seed is fixed so a fresh kernel reproduces the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    train, y_true, stratify=y_true, test_size=0.3, random_state=42
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(283003, 4) (121287, 4) (283003,) (121287,)


## Logistic Regression

In [8]:
def logistic_regression(ngram = (1,1)):
    """Train and evaluate a logistic-regression classifier on TF-IDF n-gram features.

    Features and the 70/30 split (seed 42) come from ``get_ngram_embedding``, the same
    helper the evaluation cells use when scoring the pickled models, so training and
    later evaluation always see the same columns and the same held-out rows.

    Parameters
    ----------
    ngram : tuple of (int, int)
        ``ngram_range`` forwarded to the TF-IDF vectorizer, e.g. ``(1, 1)`` for unigrams.

    Returns
    -------
    LogisticRegression
        The fitted classifier. Log loss, accuracy, F1, precision and recall on the
        held-out split are printed as a side effect.
    """
    X_train, X_test, y_train, y_test = get_ngram_embedding(train, ngram)

    clf = LogisticRegression(random_state=42, max_iter = 1000)
    clf.fit(X_train, y_train)

    # Log loss needs class probabilities; the remaining metrics use hard labels.
    predict_proba_y = clf.predict_proba(X_test)
    print("Log loss:", log_loss(y_test, predict_proba_y, labels = clf.classes_))

    predict_y = clf.predict(X_test)
    print("Accuracy : ",accuracy_score(y_test, predict_y)*100, "%")
    print("F1 score :", f1_score(y_test, predict_y)*100, "%")
    print("Precision :", precision_score(y_test, predict_y)*100, "%")
    print("Recall :", recall_score(y_test, predict_y)*100, "%")

    return clf


### Unigram

In [19]:
# Score the saved unigram logistic-regression model on the held-out split.
# The context manager closes the model file as soon as it has been read.
# NOTE: unpickling can execute arbitrary code - only load model files produced by this project.
with open("./models/analysis3/lr_unigram_unprocessed.pkl", "rb") as model_file:
    clf = pickle.load(model_file)
_, X_test, _, y_test = get_ngram_embedding(train)
predict_y = clf.predict(X_test)
print("Accuracy : ",accuracy_score(y_test, predict_y)*100, "%")
print("F1 score :", f1_score(y_test, predict_y)*100, "%")
print("Precision :", precision_score(y_test, predict_y)*100, "%")
print("Recall :", recall_score(y_test, predict_y)*100, "%")

# To retrain and overwrite the saved model:
# clf = logistic_regression()
# with open("./models/analysis3/lr_unigram_unprocessed.pkl", "wb") as model_file:
#     pickle.dump(clf, model_file)

Accuracy :  74.38884629020423 %
F1 score : 61.11924699285293 %
Precision : 69.84494793454628 %
Recall : 54.33161982330819 %


### Bigram

In [20]:
# Score the saved bigram logistic-regression model on the held-out split.
# The context manager closes the model file as soon as it has been read.
# NOTE: unpickling can execute arbitrary code - only load model files produced by this project.
with open("./models/analysis3/lr_bigram_unprocessed.pkl", "rb") as model_file:
    clf = pickle.load(model_file)
_, X_test, _, y_test = get_ngram_embedding(train, (2,2))
predict_y = clf.predict(X_test)
print("Accuracy : ",accuracy_score(y_test, predict_y)*100, "%")
print("F1 score :", f1_score(y_test, predict_y)*100, "%")
print("Precision :", precision_score(y_test, predict_y)*100, "%")
print("Recall :", recall_score(y_test, predict_y)*100, "%")

# To retrain and overwrite the saved model:
# clf = logistic_regression((2,2))
# with open("./models/analysis3/lr_bigram_unprocessed.pkl", "wb") as model_file:
#     pickle.dump(clf, model_file)

Accuracy :  73.88508249029162 %
F1 score : 58.511474379126064 %
Precision : 71.114719648486 %
Recall : 49.70291741771814 %


### Trigram

In [21]:
# Score the saved trigram logistic-regression model on the held-out split.
# The context manager closes the model file as soon as it has been read.
# NOTE: unpickling can execute arbitrary code - only load model files produced by this project.
with open("./models/analysis3/lr_trigram_unprocessed.pkl", "rb") as model_file:
    clf = pickle.load(model_file)
_, X_test, _, y_test = get_ngram_embedding(train, (3,3))
predict_y = clf.predict(X_test)
print("Accuracy : ",accuracy_score(y_test, predict_y)*100, "%")
print("F1 score :", f1_score(y_test, predict_y)*100, "%")
print("Precision :", precision_score(y_test, predict_y)*100, "%")
print("Recall :", recall_score(y_test, predict_y)*100, "%")

# To retrain and overwrite the saved model:
# clf = logistic_regression((3,3))
# with open("./models/analysis3/lr_trigram_unprocessed.pkl", "wb") as model_file:
#     pickle.dump(clf, model_file)

Accuracy :  72.8239629968587 %
F1 score : 54.00298636598333 %
Precision : 72.40850235760796 %
Recall : 43.05805906046242 %


## SVM

In [9]:

def svm(ngram = (1,1), kernel = "linear"):
    """Fit an SVC on TF-IDF n-gram features of the two question columns.

    Reads the notebook-level ``train`` frame and ``y_true`` labels, vectorizes
    ``question1`` and ``question2`` separately (at most 100 terms each), stacks the
    two blocks side by side and trains on a 70/30 split (seed 42). The confusion
    matrix, accuracy and F1 on the held-out part are printed.

    NOTE(review): kernel SVC training may be very slow on a dataset of this size -
    confirm the runtime is acceptable or consider a linear-only solver.

    Parameters
    ----------
    ngram : tuple of (int, int)
        ``ngram_range`` passed to ``TfidfVectorizer``.
    kernel : str
        Kernel name passed to ``SVC``.

    Returns
    -------
    SVC
        The fitted classifier.
    """
    # Vocabulary and IDF weights are learned independently for each question column.
    per_question = [
        TfidfVectorizer(
            analyzer='word', max_features=100, token_pattern=r'\w{1,}', ngram_range=ngram
        ).fit_transform(train[column].values)
        for column in ('question1', 'question2')
    ]
    features = scipy.sparse.hstack(per_question)

    X_train, X_test, y_train, y_test = train_test_split(
        features, y_true, test_size=0.3, random_state=42
    )

    clf = SVC(gamma='auto', kernel=kernel)
    clf.fit(X_train, y_train)

    # Log loss is not reported: the classifier is created without probability
    # estimates, so only hard-label metrics are computed.
    predict_y = clf.predict(X_test)
    print("Confusion Matrix", confusion_matrix(y_test, predict_y))
    print("Accuracy : ",accuracy_score(y_test, predict_y)*100, "%")
    print("F1 score :", f1_score(y_test, predict_y)*100, "%")

    return clf


### Unigram

In [None]:
# Train the unigram linear-kernel SVM and persist it.
# The context manager closes (and therefore flushes) the model file even if pickling
# fails part-way. `pickle` is already imported in the imports cell at the top.
clf = svm()
with open("./models/analysis3/svm_unigram_linear.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)