In [2]:
import os
import numpy as np 
import pandas as pd
import nltk
import string
from nltk.stem.porter import PorterStemmer
import re
nltk.download('punkt')
nltk.download('stopwords')
stemmer = PorterStemmer()
stopwords = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package punkt to /home/jeopardy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jeopardy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
# load data
df = pd.read_csv('questions.csv.zip')
df.dropna(how="any").reset_index(drop=True)

# train validate test split 70:20:10
from sklearn.model_selection import train_test_split
X_train_q1, X_test_q1, X_train_q2, X_test_q2, y_train, y_test = train_test_split(df['question1'], df['question2'], df['is_duplicate'], test_size=0.3, random_state=42, stratify=df['is_duplicate'])
X_val_q1, X_test_q1, X_val_q2, X_test_q2, y_val, y_test = train_test_split(X_test_q1, X_test_q2, y_test, test_size=(1/3), random_state=42, stratify=y_test)


Ratio of duplicate questions in the splits
Train set:  0.36919749967314835
Validation set:  0.36920279997031835
Test set:  0.369190432610255


In [6]:
y_train, y_val, y_test = np.array(y_train), np.array(y_val), np.array(y_test)
# ratio of duplicate questions in train, validation and test set
print("Ratio of duplicate questions in the splits")
print("Train set: ", y_train.sum()/len(y_train))
print("Validation set: ", y_val.sum()/len(y_val))
print("Test set: ", y_test.sum()/len(y_test))

Ratio of duplicate questions in the splits
Train set:  0.36919749967314835
Validation set:  0.36920279997031835
Test set:  0.369190432610255


In [3]:
INPUT = './input/'
TRAIN_LINEAR_PATH = INPUT + 'train_linear.csv.zip'
TEST_LINEAR_PATH = INPUT + 'test_linear.csv.zip'
VAL_LINEAR_PATH = INPUT + 'val_linear.csv.zip'

In [None]:
train_linear = pd.DataFrame({'question1': X_train_q1, 'question2': X_train_q2, 'is_duplicate': y_train})
val_linear = pd.DataFrame({'question1': X_val_q1, 'question2': X_val_q2, 'is_duplicate': y_val})
test_linear = pd.DataFrame({'question1': X_test_q1, 'question2': X_test_q2, 'is_duplicate': y_test})
allQuestions = pd.concat((train_linear['question1'], train_linear['question2'])).reset_index(drop=True).astype(str)

In [22]:
# dump split files
os.makedirs(INPUT, exist_ok=True)  
train_linear.to_csv(TRAIN_LINEAR_PATH, index=False, compression='zip')
val_linear.to_csv(VAL_LINEAR_PATH, index=False, compression='zip')
test_linear.to_csv(TEST_LINEAR_PATH, index=False, compression='zip')

In [4]:
train_linear = pd.read_csv(TRAIN_LINEAR_PATH)
val_linear = pd.read_csv(VAL_LINEAR_PATH)
test_linear = pd.read_csv(TEST_LINEAR_PATH)
X_train_q1, X_train_q2, y_train = train_linear['question1'].astype('U').values, train_linear['question2'].astype('U').values, train_linear['is_duplicate'].values
X_val_q1, X_val_q2, y_val = val_linear['question1'].astype('U').values, val_linear['question2'].astype('U').values, val_linear['is_duplicate'].values
X_test_q1, X_test_q2, y_test = test_linear['question1'].astype('U').values, test_linear['question2'].astype('U').values, test_linear['is_duplicate'].values

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack as sparse_hstack, vstack as sparse_vstack, save_npz, load_npz

In [32]:
stemmer = PorterStemmer()

def tokenize(text: str) -> list[str]:
    tokens = nltk.word_tokenize(re.sub(r'[^\x00-\x7F]+',' ', text))
    tokens = [stemmer.stem(w) for w in tokens if stemmer.stem(w) not in stopwords]
    return tokens


In [5]:
N_GRAMS_PATH = './n_gram_features/'
UNIGRAM_PATH = N_GRAMS_PATH + 'unigrams_linear/'
BIGRAM_PATH = N_GRAMS_PATH + 'bigrams_linear/'
TRIGRAM_PATH = N_GRAMS_PATH + 'trigrams_linear/'
os.makedirs(N_GRAMS_PATH, exist_ok=True)
os.makedirs(UNIGRAM_PATH, exist_ok=True)
os.makedirs(BIGRAM_PATH, exist_ok=True)
os.makedirs(TRIGRAM_PATH, exist_ok=True)

### Creating Unigram features

In [44]:
unigramVectorizer = CountVectorizer(
                        analyzer='word', 
                        ngram_range=(1,1), 
                        lowercase=True,
                        tokenizer=tokenize
                    )
                    
unigramVectorizer.fit(allQuestions)
q1_train = unigramVectorizer.transform(train_linear['question1'].astype(str))
q2_train = unigramVectorizer.transform(train_linear['question2'].astype(str))
X_train_unigram = sparse_hstack([q1_train, q2_train])
q1_val = unigramVectorizer.transform(val_linear['question1'].astype(str))
q2_val = unigramVectorizer.transform(val_linear['question2'].astype(str))
X_val_unigram = sparse_hstack([q1_val, q2_val])
q1_test = unigramVectorizer.transform(test_linear['question1'].astype(str))
q2_test = unigramVectorizer.transform(test_linear['question2'].astype(str))
X_test_unigram = sparse_hstack([q1_test, q2_test])

save_npz(UNIGRAM_PATH + "train.npz", X_train_unigram)
save_npz(UNIGRAM_PATH + "val.npz", X_val_unigram)
save_npz(UNIGRAM_PATH + "test.npz", X_test_unigram)

### Creating Bigram features

In [45]:
bigramVectorizer = CountVectorizer(
                        analyzer='word', 
                        ngram_range=(1,2), 
                        lowercase=True,
                        tokenizer=tokenize
                    )
                    
bigramVectorizer.fit(allQuestions)
q1_train = bigramVectorizer.transform(train_linear['question1'].astype(str))
q2_train = bigramVectorizer.transform(train_linear['question2'].astype(str))
X_train_bigram = sparse_hstack([q1_train, q2_train])
q1_val = bigramVectorizer.transform(val_linear['question1'].astype(str))
q2_val = bigramVectorizer.transform(val_linear['question2'].astype(str))
X_val_bigram = sparse_hstack([q1_val, q2_val])
q1_test = bigramVectorizer.transform(test_linear['question1'].astype(str))
q2_test = bigramVectorizer.transform(test_linear['question2'].astype(str))
X_test_bigram = sparse_hstack([q1_test, q2_test])

save_npz(BIGRAM_PATH + "train.npz", X_train_bigram)
save_npz(BIGRAM_PATH + "val.npz", X_val_bigram)
save_npz(BIGRAM_PATH + "test.npz", X_test_bigram)

### Creating Trigram features

In [46]:
trigramVectorizer = CountVectorizer(
                        analyzer='word', 
                        ngram_range=(1,3), 
                        lowercase=True,
                        tokenizer=tokenize
                    )
                    
trigramVectorizer.fit(allQuestions)
q1_train = trigramVectorizer.transform(train_linear['question1'].astype(str))
q2_train = trigramVectorizer.transform(train_linear['question2'].astype(str))
X_train_trigram = sparse_hstack([q1_train, q2_train])
q1_val = trigramVectorizer.transform(val_linear['question1'].astype(str))
q2_val = trigramVectorizer.transform(val_linear['question2'].astype(str))
X_val_trigram = sparse_hstack([q1_val, q2_val])
q1_test = trigramVectorizer.transform(test_linear['question1'].astype(str))
q2_test = trigramVectorizer.transform(test_linear['question2'].astype(str))
X_test_trigram = sparse_hstack([q1_test, q2_test])

save_npz(TRIGRAM_PATH + "train.npz", X_train_trigram)
save_npz(TRIGRAM_PATH + "val.npz", X_val_trigram)
save_npz(TRIGRAM_PATH + "test.npz", X_test_trigram)

## Logistic Regression

### Unigrams

In [53]:
# X_train_unigram = load_npz(UNIGRAM_PATH + "train.npz")
# X_test_unigram = load_npz(UNIGRAM_PATH + "test.npz")
unigramLogisticRegressor = SGDClassifier(
                            loss='log_loss', 
                            penalty='l2', 
                            alpha=0.00001, 
                            max_iter=1000,
                            n_iter_no_change=20,
                            learning_rate='optimal',
                            n_jobs=-1,
                            random_state=42)
unigramLogisticRegressor.fit(X_train_unigram, y_train)
y_pred_unigram_logistic = unigramLogisticRegressor.predict(X_test_unigram)
print("Unigram Logistic Regression Accuracy: ", accuracy_score(y_test, y_pred_unigram_logistic))
print("Unigram Logistic Regression F1 Score: ", f1_score(y_test, y_pred_unigram_logistic))

Unigram Logistic Regression Accuracy:  0.7418437260382399
Unigram Logistic Regression F1 Score:  0.6310840903467534


### Bigrams

In [54]:
# X_train_bigram = load_npz(BIGRAM_PATH + "train.npz")
# X_test_bigram = load_npz(BIGRAM_PATH + "test.npz")
bigramLogisticRegressor = SGDClassifier(
                            loss='log_loss', 
                            penalty='l2', 
                            alpha=0.00001, 
                            max_iter=1000,
                            n_iter_no_change=20,
                            learning_rate='optimal',
                            n_jobs=-1,
                            random_state=42)
bigramLogisticRegressor.fit(X_train_bigram, y_train)
y_pred_bigram_logistic = bigramLogisticRegressor.predict(X_test_bigram)
print("bigram Logistic Regression Accuracy: ", accuracy_score(y_test, y_pred_bigram_logistic))
print("bigram Logistic Regression F1 Score: ", f1_score(y_test, y_pred_bigram_logistic))

bigram Logistic Regression Accuracy:  0.7962106408765984
bigram Logistic Regression F1 Score:  0.7066405554566494


### Trigrams

In [7]:
# X_train_trigram = load_npz(TRIGRAM_PATH + "train.npz")
# X_test_trigram = load_npz(TRIGRAM_PATH + "test.npz")
trigramLogisticRegressor = SGDClassifier(
                            loss='log_loss', 
                            penalty='l2', 
                            alpha=0.00001, 
                            max_iter=1000,
                            n_iter_no_change=20,
                            learning_rate='optimal',
                            n_jobs=-1,
                            random_state=42)
trigramLogisticRegressor.fit(X_train_trigram, y_train)
y_pred_trigram_logistic = trigramLogisticRegressor.predict(X_test_trigram)
print("trigram Logistic Regression Accuracy: ", accuracy_score(y_test, y_pred_trigram_logistic))
print("trigram Logistic Regression F1 Score: ", f1_score(y_test, y_pred_trigram_logistic))

trigram Logistic Regression Accuracy:  0.8114472284746098
trigram Logistic Regression F1 Score:  0.7147614593077642


### Trigrams Tuned

Applying GridSearchCV on Trigrams model to get the best set of parameters

In [81]:
trigramLogisticRegressor = SGDClassifier(
                            loss='log_loss',
                            penalty='l2',
                            max_iter=1000,
                            learning_rate='optimal',
                            n_jobs=-1,
                            random_state=42)
parameters = dict({
                'alpha':[0.01, 0.001, 0.0001, 0.00001, 0.000001],
                'n_iter_no_change': [5, 10, 15, 20]
            })
cv_stratified_splitter = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(trigramLogisticRegressor, 
                            parameters, 
                            cv=cv_stratified_splitter, 
                            scoring=['accuracy', 'f1'], 
                            n_jobs=-1,
                            refit='f1')
grid_search.fit(sparse_vstack([X_train_trigram, X_val_trigram]), np.concatenate((y_train, y_val)))


In [86]:
gridSearchCVResults = pd.DataFrame.from_dict(grid_search.cv_results_).reset_index()
gridSearchCVResults

Unnamed: 0,index,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_n_iter_no_change,params,split0_test_accuracy,split1_test_accuracy,...,std_test_accuracy,rank_test_accuracy,split0_test_f1,split1_test_f1,split2_test_f1,split3_test_f1,split4_test_f1,mean_test_f1,std_test_f1,rank_test_f1
0,0,32.057936,4.823794,1.462032,0.469907,0.01,5,"{'alpha': 0.01, 'n_iter_no_change': 5}",0.692152,0.691969,...,0.000602,17,0.35287,0.351276,0.352789,0.354524,0.356614,0.353614,0.001818,17
1,1,34.839753,5.847555,1.570614,0.270366,0.01,10,"{'alpha': 0.01, 'n_iter_no_change': 10}",0.69119,0.691969,...,0.000497,18,0.347076,0.351426,0.348466,0.349686,0.349993,0.349329,0.001469,18
2,2,41.940447,10.329488,1.597674,0.275955,0.01,15,"{'alpha': 0.01, 'n_iter_no_change': 15}",0.691122,0.691461,...,0.000457,19,0.34657,0.348641,0.347652,0.350843,0.35041,0.348823,0.001617,19
3,3,44.677039,6.685752,1.307405,0.39377,0.01,20,"{'alpha': 0.01, 'n_iter_no_change': 20}",0.690916,0.690843,...,0.000539,20,0.345848,0.345646,0.346039,0.347637,0.349897,0.347013,0.001605,20
4,4,24.846832,6.631401,1.670751,0.193314,0.001,5,"{'alpha': 0.001, 'n_iter_no_change': 5}",0.741415,0.742827,...,0.001528,14,0.546314,0.550779,0.54732,0.547302,0.555941,0.549531,0.003546,14
5,5,31.92757,2.169471,0.891322,0.097146,0.001,10,"{'alpha': 0.001, 'n_iter_no_change': 10}",0.742473,0.743638,...,0.001614,13,0.552838,0.556443,0.547028,0.549382,0.549654,0.551069,0.003262,13
6,6,35.785039,6.948313,1.889833,0.513881,0.001,15,"{'alpha': 0.001, 'n_iter_no_change': 15}",0.741072,0.743129,...,0.001499,15,0.54513,0.551458,0.546284,0.545874,0.550921,0.547933,0.00269,15
7,7,59.535236,2.297733,1.418594,0.29013,0.001,20,"{'alpha': 0.001, 'n_iter_no_change': 20}",0.741236,0.741796,...,0.00146,16,0.546055,0.544507,0.539016,0.542346,0.547395,0.543864,0.002949,16
8,8,32.525184,8.522953,1.53044,0.418499,0.0001,5,"{'alpha': 0.0001, 'n_iter_no_change': 5}",0.78066,0.78287,...,0.001323,9,0.652092,0.650675,0.650521,0.648484,0.657971,0.651949,0.003223,10
9,9,49.180857,11.419878,1.625803,0.334466,0.0001,10,"{'alpha': 0.0001, 'n_iter_no_change': 10}",0.780578,0.782719,...,0.001481,12,0.650713,0.653565,0.643136,0.640263,0.642371,0.64601,0.005172,12


<u>Best Parameters</u>

In [87]:
print("Best Parameters: ", grid_search.best_params_)

Best Parameters:  {'alpha': 1e-06, 'n_iter_no_change': 20}


In [92]:
bestAlpha = grid_search.best_params_['alpha']
bestNIterNoChange = grid_search.best_params_['n_iter_no_change']
trigramTunedLogisticRegressor = SGDClassifier(
                            loss='log_loss',
                            alpha=bestAlpha,
                            penalty='l2',
                            max_iter=1000,
                            learning_rate='optimal',
                            n_iter_no_change=bestNIterNoChange,
                            n_jobs=-1,
                            random_state=42)
trigramTunedLogisticRegressor.fit(X_train_trigram, y_train)
y_pred_trigram_tuned_logistic = trigramTunedLogisticRegressor.predict(X_test_trigram)
print("trigram Tuned Logistic Regression Accuracy: ", 100*accuracy_score(y_test, y_pred_trigram_tuned_logistic))
print("trigram Tuned Logistic Regression F1 Score: ", 100*f1_score(y_test, y_pred_trigram_tuned_logistic))


trigram Tuned Logistic Regression Accuracy:  80.31116277919315
trigram Tuned Logistic Regression F1 Score:  72.37454015409178


## SVM

According to paper, the default parameters to be used for SVM are $C=1.0$ and $kernel=linear$ unless specified otherwise.

##### Unigram Linear SVM Model 

In [8]:
# X_train_unigram = load_npz(UNIGRAM_PATH + "train.npz")
# X_test_unigram = load_npz(UNIGRAM_PATH + "test.npz")
unigramLinearSVM = LinearSVC(C=1.0, max_iter=10000, random_state=42)
unigramLinearSVM.fit(X_train_unigram, y_train)
y_pred_unigram_linear_svm = unigramLinearSVM.predict(X_test_unigram)
print("Unigram Linear SVM Accuracy: ", accuracy_score(y_test, y_pred_unigram_linear_svm))
print("Unigram Linear SVM F1 Score: ", f1_score(y_test, y_pred_unigram_linear_svm))

Unigram Linear SVM Accuracy:  0.7339038808775878
Unigram Linear SVM F1 Score:  0.6413282656531306


##### Bigram Linear SVM Model

In [8]:
# X_train_bigram = load_npz(BIGRAM_PATH + "train.npz")
# X_test_bigram = load_npz(BIGRAM_PATH + "test.npz")
bigramLinearSVM = LinearSVC(C=1.0, max_iter=10000, random_state=42)
bigramLinearSVM.fit(X_train_bigram, y_train)
y_pred_bigram_linear_svm = bigramLinearSVM.predict(X_test_bigram)
print("Bigram Linear SVM Accuracy: ", accuracy_score(y_test, y_pred_bigram_linear_svm))
print("Bigram Linear SVM F1 Score: ", f1_score(y_test, y_pred_bigram_linear_svm))

Bigram Linear SVM Accuracy:  0.7765465383759182
Bigram Linear SVM F1 Score:  0.6993877279382404


##### Trigram Linear SVM Model

In [8]:
# X_train_trigram = load_npz(TRIGRAM_PATH + "train.npz")
# X_test_trigram = load_npz(TRIGRAM_PATH + "test.npz")
trigramLinearSVM = LinearSVC(C=1.0, max_iter=10000, random_state=42)
trigramLinearSVM.fit(X_train_trigram, y_train)
y_pred_trigram_linear_svm = trigramLinearSVM.predict(X_test_trigram)
print("Trigram Linear SVM Accuracy: ", accuracy_score(y_test, y_pred_trigram_linear_svm))
print("Trigram Linear SVM F1 Score: ", f1_score(y_test, y_pred_trigram_linear_svm))

Trigram Linear SVM Accuracy:  0.7926241064582354
Trigram Linear SVM F1 Score:  0.7131910235358511


### Parameter tuning for Trigram SVM Model with different kernels

In [10]:
# X_train_trigram = load_npz(TRIGRAM_PATH + "train.npz")
# X_test_trigram = load_npz(TRIGRAM_PATH + "test.npz")
# X_val_trigram = load_npz(TRIGRAM_PATH + "val.npz")

In [12]:
parameters = dict({
                    'C':[0.001, 0.005, 0.1, 0.5, 1.0, 10, 50], 
                    'kernel':['linear', 'rbf']
                })
trigramSVM = SVC(max_iter=-1, random_state=42, gamma='auto')
cv_stratified_splitter = StratifiedKFold(n_splits=2)
grid_search = GridSearchCV(trigramSVM, 
                            parameters, 
                            cv=cv_stratified_splitter, 
                            scoring=['accuracy', 'f1'], 
                            n_jobs=-1,
                            refit='accuracy',
                            verbose=1)
grid_search.fit(sparse_vstack([X_train_trigram, X_val_trigram]), np.concatenate((y_train, y_val)))

Fitting 2 folds for each of 14 candidates, totalling 28 fits


In [14]:
print("Best Parameters: ", grid_search.best_params_)

Best Parameters:  {'C': 0.1, 'kernel': 'linear'}


In [15]:
bestC=grid_search.best_params_['C']
bestKernel=grid_search.best_params_['kernel']
trigramTunedSVM = SVC(C=bestC, kernel=bestKernel, max_iter=-1, random_state=42, gamma='auto')
trigramTunedSVM.fit(X_train_trigram, y_train)
y_pred_trigram_tuned_svm = trigramTunedSVM.predict(X_test_trigram)
print("Trigram Tuned SVM Accuracy: ", accuracy_score(y_test, y_pred_trigram_tuned_svm))
print("Trigram Tuned SVM F1 Score: ", f1_score(y_test, y_pred_trigram_tuned_svm))

Trigram Tuned SVM Accuracy:  0.8011445843330283
Trigram Tuned SVM F1 Score:  0.712977065767285


## Sentence embeddings as feature vectors

50-dimensional word vectors are obtained using GloVe vectors (GloVe.6B.50d.txt). The sentence embeddings are obtained by simply summing the word embeddings in a sentence. The sentence embeddings are then used as feature vectors for classification in the following two ways:
- Plain sentence embeddings
- Distance measure between vectors

In [85]:
GLOVE_PATH = INPUT + 'glove.6B.50d.txt'

In [79]:
def preprocessAndTokenizeForGlove(text: str) -> list[str]:
    text = re.sub(r'[^\x00-\x7F]+',' ', text.lower())
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(text)
    tokens = [stemmer.stem(w) for w in tokens if stemmer.stem(w) not in stopwords]
    return tokens

In [83]:
try:
    X_train_q1_tokenized = np.array([preprocessAndTokenizeForGlove(ques) for ques in X_train_q1], dtype=object)
    X_train_q2_tokenized = np.array([preprocessAndTokenizeForGlove(ques) for ques in X_train_q2], dtype=object)
    X_test_q1_tokenized = np.array([preprocessAndTokenizeForGlove(ques) for ques in X_test_q1], dtype=object)
    X_test_q2_tokenized = np.array([preprocessAndTokenizeForGlove(ques) for ques in X_test_q2], dtype=object)
    X_val_q1_tokenized = np.array([preprocessAndTokenizeForGlove(ques) for ques in X_val_q1], dtype=object)
    X_val_q2_tokenized = np.array([preprocessAndTokenizeForGlove(ques) for ques in X_val_q2], dtype=object)
except Exception as e:
    print(e)

In [86]:
# Load GloVe Word Embeddings
GloVe_embeddings = {}
with open(GLOVE_PATH, 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        GloVe_embeddings[word] = vector

### Creating sentence embeddings for each question

According to the paper, the sentence embeddings are obtained by simply summing the embeddings of all the tokens in a sentence.

In [181]:
X_train_q1_embeddings = np.array([np.sum([GloVe_embeddings[w] for w in ques if w in GloVe_embeddings] + [np.zeros((50,))], axis=0) for ques in X_train_q1_tokenized])
X_train_q2_embeddings = np.array([np.sum([GloVe_embeddings[w] for w in ques if w in GloVe_embeddings] + [np.zeros((50,))], axis=0) for ques in X_train_q2_tokenized])
X_test_q1_embeddings = np.array([np.sum([GloVe_embeddings[w] for w in ques if w in GloVe_embeddings] + [np.zeros((50,))], axis=0) for ques in X_test_q1_tokenized])
X_test_q2_embeddings = np.array([np.sum([GloVe_embeddings[w] for w in ques if w in GloVe_embeddings] + [np.zeros((50,))], axis=0) for ques in X_test_q2_tokenized])
X_val_q1_embeddings = np.array([np.sum([GloVe_embeddings[w] for w in ques if w in GloVe_embeddings] + [np.zeros((50,))], axis=0) for ques in X_val_q1_tokenized])
X_val_q2_embeddings = np.array([np.sum([GloVe_embeddings[w] for w in ques if w in GloVe_embeddings] + [np.zeros((50,))], axis=0) for ques in X_val_q2_tokenized])

In [5]:
QUESTION_EMBEDDINGS = './question_embeddings/'
os.makedirs(QUESTION_EMBEDDINGS, exist_ok=True)
TRAIN_Q1_EMBEDDINGS = QUESTION_EMBEDDINGS + 'train_q1_embeddings.npz'
TRAIN_Q2_EMBEDDINGS = QUESTION_EMBEDDINGS + 'train_q2_embeddings.npz'
TEST_Q1_EMBEDDINGS = QUESTION_EMBEDDINGS + 'test_q1_embeddings.npz'
TEST_Q2_EMBEDDINGS = QUESTION_EMBEDDINGS + 'test_q2_embeddings.npz'
VAL_Q1_EMBEDDINGS = QUESTION_EMBEDDINGS + 'val_q1_embeddings.npz'
VAL_Q2_EMBEDDINGS = QUESTION_EMBEDDINGS + 'val_q2_embeddings.npz'

In [183]:
np.savez_compressed(TRAIN_Q1_EMBEDDINGS, X_train_q1_embeddings)
np.savez_compressed(TRAIN_Q2_EMBEDDINGS, X_train_q2_embeddings)
np.savez_compressed(TEST_Q1_EMBEDDINGS, X_test_q1_embeddings)
np.savez_compressed(TEST_Q2_EMBEDDINGS, X_test_q2_embeddings)
np.savez_compressed(VAL_Q1_EMBEDDINGS, X_val_q1_embeddings)
np.savez_compressed(VAL_Q2_EMBEDDINGS, X_val_q2_embeddings)

In [6]:
X_train_q1_embeddings = np.load(TRAIN_Q1_EMBEDDINGS)['arr_0']
X_train_q2_embeddings = np.load(TRAIN_Q2_EMBEDDINGS)['arr_0']
X_test_q1_embeddings = np.load(TEST_Q1_EMBEDDINGS)['arr_0']
X_test_q2_embeddings = np.load(TEST_Q2_EMBEDDINGS)['arr_0']
X_val_q1_embeddings = np.load(VAL_Q1_EMBEDDINGS)['arr_0']
X_val_q2_embeddings = np.load(VAL_Q2_EMBEDDINGS)['arr_0']

### 1. Plain sentence embeddings

$100$-dimensional feature vector = $50$-dimensional question $1$ sentence embedding + $50$-dimensional question $2$ sentence embedding

In [188]:
X_train_plain_embeddings = np.hstack((X_train_q1_embeddings, X_train_q2_embeddings))
X_test_plain_embeddings = np.hstack((X_test_q1_embeddings, X_test_q2_embeddings))
X_val_plain_embeddings = np.hstack((X_val_q1_embeddings, X_val_q2_embeddings))

In [191]:
SVMmodel = SVC(kernel='rbf', C=1.0, random_state=42, max_iter=-1, gamma='auto')
SVMmodel.fit(X_train_plain_embeddings, y_train)
y_pred = SVMmodel.predict(X_test_plain_embeddings)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

Accuracy: 0.7738499097182715
F1 Score: 0.6989414766371224


In [193]:
SVMmodel = SVC(kernel='linear', C=1.0, random_state=42, max_iter=-1, gamma='auto')
SVMmodel.fit(X_train_plain_embeddings, y_train)
y_pred = SVMmodel.predict(X_test_plain_embeddings)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))



Accuracy: 0.63856125306092164
F1 Score: 0.6193053333333333


### 2. Distance measure between vectors
Feature vector is obtained by taking various distance measures between the sentence embeddings of the two questions:
- Bray Curtis distance:
$$
d_{Bray Curtis} = \frac{1}{2} \sum_{i=1}^{50} \frac{|q_1[i] - q_2[i]|}{|q_1[i] + q_2[i]|}
$$
- Canberra distance:
$$
d_{Canberra} = \sum_{i=1}^{50} \frac{|q_1[i] - q_2[i]|}{|q_1[i]| + |q_2[i]|}
$$
- Chebyshev distance:
$$d_{Chebyshev} = max(|q_1[i] - q_2[i]|)$$
- City block distance:
$$d_{City block} = \sum_{i=1}^{50} |q_1[i] - q_2[i]|$$
- Correlation distance:
$$
\begin{aligned}
d_{Correlation} = 1 - \frac{\sum_{i=1}^{50} (q_1[i] - \bar{q_1})(q_2[i] - \bar{q_2})}{\sqrt{\sum_{i=1}^{50} (q_1[i] - \bar{q_1})^2} \sqrt{\sum_{i=1}^{50} (q_2[i] - \bar{q_2})^2}}
\end{aligned}
$$
- Cosine distance: 
$$
d_{Cosine} = 1 - \frac{\sum_{i=1}^{50} q_1[i]q_2[i]}{\sqrt{\sum_{i=1}^{50} q_1[i]^2} \sqrt{\sum_{i=1}^{50} q_2[i]^2}}
$$
- Euclidean distance:
$$d_{Euclidean} = \sqrt{\sum_{i=1}^{50} (q_1[i] - q_2[i])^2}$$

In [7]:
import scipy.spatial.distance as scipyDistance

def distances(q1, q2):
    distanceFeatureVector = np.array([
        scipyDistance.braycurtis(q1, q2),
        scipyDistance.canberra(q1, q2),
        scipyDistance.chebyshev(q1, q2),
        scipyDistance.cityblock(q1, q2),
        scipyDistance.correlation(q1, q2),
        scipyDistance.cosine(q1, q2),
        scipyDistance.euclidean(q1, q2)
    ])
    distanceFeatureVector = np.nan_to_num(distanceFeatureVector)
    return distanceFeatureVector


X_train_distances = np.array([distances(q1, q2) for q1, q2 in zip(X_train_q1_embeddings, X_train_q2_embeddings)])
X_test_distances = np.array([distances(q1, q2) for q1, q2 in zip(X_test_q1_embeddings, X_test_q2_embeddings)])
X_val_distances = np.array([distances(q1, q2) for q1, q2 in zip(X_val_q1_embeddings, X_val_q2_embeddings)])

  return l1_diff.sum() / l1_sum.sum()
  dist = 1.0 - uv / np.sqrt(uu * vv)


In [7]:
distanceSVM = SVC(kernel='rbf', C=1.0, random_state=42, max_iter=-1, gamma='auto')
distanceSVM.fit(X_train_distances, y_train)
y_pred = distanceSVM.predict(X_test_distances)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

Accuracy: 0.683458686969944
F1 Score: 0.679439494994484


In [8]:
distanceSVM = SVC(kernel='linear', C=1.0, random_state=42, max_iter=-1, gamma='auto')
distanceSVM.fit(X_train_distances, y_train)
y_pred = distanceSVM.predict(X_test_distances)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

Accuracy: 0.639099378488921
F1 Score: 0.625289938484038
