# Duplicates Predictions 
## Based on Features Engineerings

In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing, svm, metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import time

In [2]:
import pickle
def pickle_store(obj, filename):
    with open(filename, 'wb') as file:
        pickle.dump(obj, file)

def pickle_load(filename):
    with open(filename, 'rb') as file:
        return pickle.load(file)

In [3]:
def evaluation(clf, clf_name, X, y, k=5):
    starting_tm = time.time()
    clf_precision = 0
    clf_recall = 0
    clf_f1 = 0
    clf_accuracy = 0
    
    skf = StratifiedKFold(n_splits=k)
    for train_index, test_index in skf.split(X, y):
        
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        
        clf_precision += metrics.precision_score(y_test, predictions, average='micro')
        clf_recall += metrics.recall_score(y_test, predictions, average='micro')
        clf_f1 += metrics.f1_score(y_test, predictions, average='micro')
        clf_accuracy += metrics.accuracy_score(y_test, predictions)
    
     # compute the average of each value
    precision_score = clf_precision/k
    recall_score = clf_recall/k
    f1_score = clf_f1/k
    accuracy_score = clf_accuracy/k
    
    print(clf_name + "\nPrecision: " + str(precision_score)
          + "\nRecall: " + str(recall_score)
          + "\nF1-Measure: " + str(f1_score) 
          + "\nAccuracy: " + str(accuracy_score)
          + "\nExecution time: " + str(time.time() - starting_tm))

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier

def grid_evaluattion(clf, msg, X, y, tuned_parameters, scores):
    print(msg)
    
    # Split the dataset in two equal parts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)    
    for score in scores:
        print("# Tuning hyper-parameters for %s\n" % score)
    
        clf = GridSearchCV(clf, tuned_parameters, scoring=score)
        clf.fit(X_train, y_train)
    
        print("Best parameters set found on development set: ")
        print(clf.best_params_)
        print("\nGrid scores on development set:")
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()
    
        print("Detailed classification report:\n")
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.\n")
        
        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
        print() 

## Loading the Data

In [5]:
data = pd.read_csv("data/train.csv")

data = data.fillna('')
y = data["IsDuplicate"]
print(len(data["IsDuplicate"]))

283004


# Features Extraction

## Basic Fetures
- Length of question1
- Length of question2
- Difference between the two lengths
- Character length of question1 without spaces
- Character length of question2 without spaces
- Number of words in question1
- Number of words in question2
- Number of common words in question1 and question2


In [6]:
data['len_q1'] = data["Question1"].apply(lambda x: len(str(x)))
data['len_q2'] = data["Question2"].apply(lambda x: len(str(x)))
data['diff_len'] = data.len_q1 - data.len_q2

# character length based features
data['len_char_q1'] = data["Question1"].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_char_q2'] = data["Question2"].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))

# word length based features
data['len_word_q1'] = data["Question1"].apply(lambda x: len(str(x).split()))
data['len_word_q2'] = data["Question2"].apply(lambda x: len(str(x).split()))

# common words in the two questions
data['common_words'] = data.apply(lambda x: len(set(str(x[["Question1"]]).lower().split()).intersection(set(str(x['Question2']).lower().split()))), axis=1)



In [7]:
feats_1 = ['len_q1', 'len_q2', 'diff_len',
    'len_char_q1', 'len_char_q2', 
    'len_word_q1', 'len_word_q2', 
    'common_words']

## Fuzzy Features
- QRatio
- WRatio
- Partial ratio
- Partial token set ratio
- Partial token sort ratio
- Token set ratio
- Token sort ratio

In [8]:
data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(str(x['Question1']), str(x['Question2'])), axis=1)

data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['Question1']), str(x['Question2'])), axis=1)

data['fuzz_partial_ratio'] = data.apply(lambda x: fuzz.partial_ratio(str(x['Question1']), str(x['Question2'])), axis=1)

data['fuzz_partial_token_set_ratio'] = data.apply(lambda x:fuzz.partial_token_set_ratio(str(x['Question1']), str(x['Question2'])), axis=1)

data['fuzz_partial_token_sort_ratio'] = data.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['Question1']), str(x['Question2'])), axis=1)

data['fuzz_token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(str(x['Question1']), str(x['Question2'])), axis=1)

data['fuzz_token_sort_ratio'] = data.apply(lambda x: fuzz.token_sort_ratio(str(x['Question1']), str(x['Question2'])), axis=1)

In [9]:
feats_2 = ['fuzz_qratio', 'fuzz_WRatio',
    'fuzz_partial_ratio', 'fuzz_partial_token_set_ratio', 
    'fuzz_partial_token_sort_ratio', 'fuzz_token_set_ratio', 
    'fuzz_token_sort_ratio']

## Mapping with Word2vec embeddings
### Features:
- Cosine distance between vectors of question1 and question2
- Manhattan distance between vectors of question1 and question2
- Jaccard similarity between vectors of question1 and question2
- Canberra distance between vectors of question1 and question2
- Euclidean distance between vectors of question1 and question2
- Minkowski distance between vectors of question1 and question2
- Braycurtis distance between vectors of question1 and question2

In [10]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/teomandi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/teomandi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [12]:
from nltk.corpus import stopwords
from nltk import word_tokenize
stop_words = set(stopwords.words('english'))

#adds the vectors for all words in a sentence that are available 
#in the Google news vectors and gives a normalized vector at the end
def sentence2vec(s, model): 
    M = []
    words = word_tokenize(str(s).lower())
    for word in words:
        #It shouldn't be a stopword
        if word not in stop_words:
            #nor contain numbers
            if word.isalpha():
                #and be part of word2vec
                if word in model:
                    M.append(model[word])
    M = np.array(M)  # <---
    if len(M) > 0:
        v = M.sum(axis=0)
        return v / np.sqrt((v ** 2).sum())
    else:
        return np.zeros(300)


In [13]:
w2v_q1 = np.array([sentence2vec(q, model) for q in data["Question1"]])
w2v_q2 = np.array([sentence2vec(q, model) for q in data["Question2"]])

In [2]:
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis

data['cosine_distance'] = [cosine(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
data['cityblock_distance'] = [cityblock(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
data['jaccard_distance'] = [jaccard(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
data['canberra_distance'] = [canberra(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
data['euclidean_distance'] = [euclidean(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
data['minkowski_distance'] = [minkowski(x,y,3) for (x,y) in zip(w2v_q1, w2v_q2)]
data['braycurtis_distance'] = [braycurtis(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]




In [15]:
feats_3 = ['cosine_distance', 'cityblock_distance', 
         'jaccard_distance', 'canberra_distance', 
         'euclidean_distance', 'minkowski_distance',
         'braycurtis_distance']


---

In [16]:
d = data[ feats_1 + feats_2 + feats_3 ]
d = d.fillna(d.mean())
train = d.values

In [19]:
pickle_store(train, "vars/trainf1f2f3")

# Hyper Paramaters Look up

- ## Stohastic Gradient Descent

In [30]:
import time
starting_tm = time.time()
sgd_clf = SGDClassifier(n_jobs=12)
msg ="Using Stohastic Gradient Descent"
tuned_parameters = [{'loss':['hinge', 'modified_huber', 'log', 'squared_hinge'], 'max_iter':[3000]}]
scores = ['accuracy']
grid_evaluattion(sgd_clf, msg, train, y, tuned_parameters, scores)
print("Execution time: " + str(time.time() - starting_tm))


Using Stohastic Gradient Descent
# Tuning hyper-parameters for accuracy

Best parameters set found on development set: 
{'loss': 'modified_huber', 'max_iter': 3000}

Grid scores on development set:
0.635 (+/-0.043) for {'loss': 'hinge', 'max_iter': 3000}
0.656 (+/-0.043) for {'loss': 'modified_huber', 'max_iter': 3000}
0.653 (+/-0.037) for {'loss': 'log', 'max_iter': 3000}
0.608 (+/-0.018) for {'loss': 'squared_hinge', 'max_iter': 3000}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.63      1.00      0.77     88789
           1       0.00      0.00      0.00     52713

    accuracy                           0.63    141502
   macro avg       0.31      0.50      0.39    141502
weighted avg       0.39      0.63      0.48    141502


Execution time: 140.95240998268127


- ## Random Forest Classifier

In [31]:
## hyper param for Random Forest

starting_tm = time.time()
rf = RandomForestClassifier(n_jobs=4)
msg = "Using Random Forest Classifier"
tuned_parameters = [{'n_estimators': [i*10 for i in range(30, 51)]}]
scores = ['accuracy']
grid_evaluattion(rf, msg, train, y, tuned_parameters, scores)


Using Random Forest Classifier
# Tuning hyper-parameters for accuracy

Best parameters set found on development set: 
{'n_estimators': 460}

Grid scores on development set:
0.742 (+/-0.001) for {'n_estimators': 300}
0.741 (+/-0.002) for {'n_estimators': 310}
0.742 (+/-0.001) for {'n_estimators': 320}
0.742 (+/-0.001) for {'n_estimators': 330}
0.742 (+/-0.002) for {'n_estimators': 340}
0.742 (+/-0.002) for {'n_estimators': 350}
0.742 (+/-0.003) for {'n_estimators': 360}
0.741 (+/-0.002) for {'n_estimators': 370}
0.742 (+/-0.002) for {'n_estimators': 380}
0.742 (+/-0.001) for {'n_estimators': 390}
0.742 (+/-0.001) for {'n_estimators': 400}
0.742 (+/-0.002) for {'n_estimators': 410}
0.742 (+/-0.002) for {'n_estimators': 420}
0.742 (+/-0.000) for {'n_estimators': 430}
0.742 (+/-0.002) for {'n_estimators': 440}
0.743 (+/-0.002) for {'n_estimators': 450}
0.743 (+/-0.002) for {'n_estimators': 460}
0.742 (+/-0.001) for {'n_estimators': 470}
0.742 (+/-0.001) for {'n_estimators': 480}
0.742 (+/-

---

## Evaluation

In [32]:
import xgboost as xgb

print("Classifiers Performance using " + str(train.shape[0]) + " documents.\n")

rf = RandomForestClassifier(n_estimators=460, n_jobs=4)
# svm_clf =svm.SVC(gamma=0.1, C=10, kernel='rbf')
xgb_clf = xgb.XGBClassifier(random_state=1,learning_rate=0.01)
sgd_clf = SGDClassifier(loss='hinge', max_iter=3000, n_jobs=4)

clfs = [(rf, "Random Forest Classifier"), (xgb_clf, "XGBoost"), (sgd_clf, "Stohastic Gradient Descent")] #(svm_clf, "SVM Classifier")
for clf, clf_name in clfs:
    evaluation(clf, clf_name, train, y)
    print("\n\n")

Classifiers Performance using 283004 documents.

Random Forest Classifier
Precision: 0.7468693052401548
Recall: 0.7468693052401548
F1-Measure: 0.7468693052401549
Accuracy: 0.7468693052401548
Execution time: 472.4215581417084



XGBoost
Precision: 0.7186294166683436
Recall: 0.7186294166683436
F1-Measure: 0.7186294166683435
Accuracy: 0.7186294166683436
Execution time: 74.0878894329071



Stohastic Gradient Descent
Precision: 0.6302844364241136
Recall: 0.6302844364241136
F1-Measure: 0.6302844364241136
Accuracy: 0.6302844364241136
Execution time: 117.15589690208435





In [18]:
rf = RandomForestClassifier(n_estimators=460, n_jobs=4)
evaluation(rf, "Random Forest 460", train, y)


Random Forest 430
Precision: 0.7464806220874293
Recall: 0.7464806220874293
F1-Measure: 0.7464806220874293
Accuracy: 0.7464806220874293
Execution time: 499.32548666000366


---

# Predictions

In [20]:
test = pd.read_csv("data/test_without_labels.csv")

test['len_q1'] = test["Question1"].apply(lambda x: len(str(x)))
test['len_q2'] = test["Question2"].apply(lambda x: len(str(x)))
test['diff_len'] = test.len_q1 - test.len_q2
test['len_char_q1'] = test["Question1"].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
test['len_char_q2'] = test["Question2"].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
test['len_word_q1'] = test["Question1"].apply(lambda x: len(str(x).split()))
test['len_word_q2'] = test["Question2"].apply(lambda x: len(str(x).split()))
test['common_words'] = test.apply(lambda x: len(set(str(x[["Question1"]]).lower().split()).intersection(set(str(x['Question2']).lower().split()))), axis=1)

test['fuzz_qratio'] = test.apply(lambda x: fuzz.QRatio(str(x['Question1']), str(x['Question2'])), axis=1)
test['fuzz_WRatio'] = test.apply(lambda x: fuzz.WRatio(str(x['Question1']), str(x['Question2'])), axis=1)
test['fuzz_partial_ratio'] = test.apply(lambda x: fuzz.partial_ratio(str(x['Question1']), str(x['Question2'])), axis=1)
test['fuzz_partial_token_set_ratio'] = test.apply(lambda x:fuzz.partial_token_set_ratio(str(x['Question1']), str(x['Question2'])), axis=1)
test['fuzz_partial_token_sort_ratio'] = test.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['Question1']), str(x['Question2'])), axis=1)
test['fuzz_token_set_ratio'] = test.apply(lambda x: fuzz.token_set_ratio(str(x['Question1']), str(x['Question2'])), axis=1)
test['fuzz_token_sort_ratio'] = test.apply(lambda x: fuzz.token_sort_ratio(str(x['Question1']), str(x['Question2'])), axis=1)

#---

w2v_q1 = np.array([sent2vec(q, model) for q in test["Question1"]])
w2v_q2 = np.array([sent2vec(q, model) for q in test["Question2"]])

test['cosine_distance'] = [cosine(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
test['cityblock_distance'] = [cityblock(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
test['jaccard_distance'] = [jaccard(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
test['canberra_distance'] = [canberra(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
test['euclidean_distance'] = [euclidean(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
test['minkowski_distance'] = [minkowski(x,y,3) for (x,y) in zip(w2v_q1, w2v_q2)]
test['braycurtis_distance'] = [braycurtis(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]

In [21]:
dtest = test[ feats_1 + feats_2 + feats_3 ]
dtest = dtest.fillna(dtest.mean())
vtest = dtest.values

In [22]:
pickle_store(d, "vars/testf1f2f3")

### Prediciting with:

In [23]:
# Random Forest

rf = RandomForestClassifier(n_estimators=460, n_jobs=4)
rf.fit(train, y)
predictions = rf.predict(vtest)

predictions_df = pd.DataFrame(data={'Id': list(test['Id']), 'Predicted':predictions})
predictions_df.to_csv("data/predictionsRFfull460.csv", index=False)

In [51]:
import xgboost as xgb

xgb_clf = xgb.XGBClassifier(random_state=1,learning_rate=0.01)
xgb_clf.fit(train, y)
predictions = xgb_clf.predict(vtest)

predictions_df = pd.DataFrame(data={'Id': list(test['Id']), 'Predicted':predictions})
predictions_df.to_csv("data/predictionsXGBfull.csv", index=False)

### Storing the results

In [52]:
predictions_df = pd.DataFrame(data={'Id': list(test['Id']), 'Predicted':predictions})
predictions_df.to_csv("data/predictions.csv", index=False)