# Duplicates Predictions 
## Based on TF-IDF Feature

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing, svm, metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import time

In [1]:
import pickle
def pickle_store(obj, filename):
    with open(filename, 'wb') as file:
        pickle.dump(obj, file)

def pickle_load(filename):
    with open(filename, 'rb') as file:
        return pickle.load(file)

In [3]:
def evaluation(clf, clf_name, X, y, k=5):
    starting_tm = time.time()
    clf_precision = 0
    clf_recall = 0
    clf_f1 = 0
    clf_accuracy = 0
    
    skf = StratifiedKFold(n_splits=k)
    for train_index, test_index in skf.split(X, y):
        
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        
        clf_precision += metrics.precision_score(y_test, predictions, average='micro')
        clf_recall += metrics.recall_score(y_test, predictions, average='micro')
        clf_f1 += metrics.f1_score(y_test, predictions, average='micro')
        clf_accuracy += metrics.accuracy_score(y_test, predictions)
    
     # compute the average of each value
    precision_score = clf_precision/k
    recall_score = clf_recall/k
    f1_score = clf_f1/k
    accuracy_score = clf_accuracy/k
    
    print(clf_name + "\nPrecision: " + str(precision_score)
          + "\nRecall: " + str(recall_score)
          + "\nF1-Measure: " + str(f1_score) 
          + "\nAccuracy: " + str(accuracy_score)
          + "\nExecution time: " + str(time.time() - starting_tm))

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier

def grid_evaluattion(clf, msg, X, y, tuned_parameters, scores):
    print(msg)
    
    # Split the dataset in two equal parts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)    
    for score in scores:
        print("# Tuning hyper-parameters for %s\n" % score)
    
        clf = GridSearchCV(clf, tuned_parameters, scoring=score)
        clf.fit(X_train, y_train)
    
        print("Best parameters set found on development set: ")
        print(clf.best_params_)
        print("\nGrid scores on development set:")
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()
    
        print("Detailed classification report:\n")
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.\n")
        
        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
        print() 

# test all the classifiers
def hyper_parameter_tuning(train_set, labels):
    starting_tm = time.time()
    rf = RandomForestClassifier(n_jobs=4)
    msg = "Using Random Forest Classifier"
    tuned_parameters = [{'n_estimators': [i*10 for i in range(10, 31)]}]
    scores = ['accuracy']
    grid_evaluattion(rf, msg, train_set, labels, tuned_parameters, scores)
    
    svm_clf = svm.SVC()
    msg = "Using SMV Classifier"
    tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1, 1e-1, 1e-2,], 'C': [10, 100, 1000]}]
    scores = ['accuracy']
    grid_evaluattion(svm_clf, msg, train_set, labels, tuned_parameters, scores)
    
    sgd_clf = SGDClassifier(n_jobs=12)
    msg ="Using Stohastic Gradient Descent"
    tuned_parameters = [{'loss':['hinge', 'modified_huber', 'log', 'squared_hinge'], 'max_iter':[3000]}]
    scores = ['accuracy']
    grid_evaluattion(sgd_clf, msg, train_set, labels, tuned_parameters, scores)
    print("Execution time: " + str(time.time() - starting_tm))

In [5]:
data = pd.read_csv("data/train.csv")

In [6]:
data = data.fillna('')
y = data["IsDuplicate"]
print(len(data["IsDuplicate"]))

283004


## Vectorizing

In [16]:
tfv = TfidfVectorizer(stop_words='english')

In [17]:
q1q2 = data["Question1"].fillna("") 
q1q2 += " " + data["Question2"].fillna("")
train = tfv.fit_transform(q1q2)
print(train.shape)

(283004, 73264)


## SVD

In [18]:
svd = TruncatedSVD(n_components=180)
train = svd.fit_transform(train)
print(train.shape)

(283004, 180)


---

# Hyper Paramaters Look up

- ## Stohastic Gradient Descent

In [24]:
sgd_clf = SGDClassifier(n_jobs=12)
msg ="Using Stohastic Gradient Descent"
tuned_parameters = [{'loss':['hinge', 'modified_huber', 'log', 'squared_hinge'], 'max_iter':[6000]}]
scores = ['accuracy']
grid_evaluattion(sgd_clf, msg, train, y, tuned_parameters, scores)
print("Execution time: " + str(time.time() - starting_tm))

Using Stohastic Gradient Descent
# Tuning hyper-parameters for accuracy

Best parameters set found on development set: 
{'loss': 'modified_huber', 'max_iter': 6000}

Grid scores on development set:
0.696 (+/-0.003) for {'loss': 'hinge', 'max_iter': 6000}
0.705 (+/-0.003) for {'loss': 'modified_huber', 'max_iter': 6000}
0.701 (+/-0.003) for {'loss': 'log', 'max_iter': 6000}
0.605 (+/-0.030) for {'loss': 'squared_hinge', 'max_iter': 6000}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.71      0.89      0.79     88789
           1       0.68      0.39      0.50     52713

    accuracy                           0.70    141502
   macro avg       0.70      0.64      0.64    141502
weighted avg       0.70      0.70      0.68    141502


Execution time: 3417.5345871448517


- ## Random Forest Classifier

In [None]:
starting_tm = time.time()
rf = RandomForestClassifier(n_jobs=4)
msg = "Using Random Forest Classifier"
tuned_parameters = [{'n_estimators': [i*10 for i in range(30, 51)]}]
scores = ['accuracy']
grid_evaluattion(rf, msg, train, y, tuned_parameters, scores)

---

## Evaluation

In [25]:
import xgboost as xgb

print("Classifiers Performance using " + str(train.shape[0]) + " documents.\n")

rf = RandomForestClassifier(n_estimators=460, n_jobs=4)
# svm_clf =svm.SVC(gamma=0.1, C=10, kernel='rbf')
sgd_clf = SGDClassifier(loss='modified_huber', max_iter=6000, n_jobs=4)
xgb_clf = xgb.XGBClassifier(random_state=1,learning_rate=0.01)

clfs = [(rf, "Random Forest Classifier"), (xgb_clf, "XGBoost"), (sgd_clf, "Stohastic Gradient Descent")]
for clf, clf_name in clfs:
    evaluation(clf, clf_name, train, y)
    print("\n\n")



Classifiers Performance using 283004 documents.

Random Forest Classifier
Precision: 0.7814871869849752
Recall: 0.7814871869849752
F1-Measure: 0.7814871869849752
Accuracy: 0.7814871869849752
Execution time: 5115.616062879562



XGBoost
Precision: 0.7040960672637082
Recall: 0.7040960672637082
F1-Measure: 0.7040960672637082
Accuracy: 0.7040960672637082
Execution time: 811.0072736740112



Stohastic Gradient Descent
Precision: 0.7042939363222616
Recall: 0.7042939363222616
F1-Measure: 0.7042939363222616
Accuracy: 0.7042939363222616
Execution time: 11.409584283828735





---

# Predictions

In [27]:
test = pd.read_csv("data/test_without_labels.csv")
tfv = TfidfVectorizer(stop_words='english')


In [28]:
testq1q2 = test["Question1"].fillna("") 
testq1q2 += " " + test["Question2"].fillna("")
vtest = tfv.fit_transform(testq1q2)
print(vtest.shape)

svd = TruncatedSVD(n_components=180)
vtest = svd.fit_transform(vtest)

(121287, 50444)


### Prediciting with:


In [31]:
# Random Forest

rf = RandomForestClassifier(n_estimators=460, n_jobs=4)
rf.fit(train, y)
predictions = rf.predict(vtest)

# predictions_df = pd.DataFrame(data={'Id': list(test['Id']), 'Predicted':predictions})
# predictions_df.to_csv("data/predictionsRFfull460_TF.csv", index=False)

In [32]:
predictions = rf.predict(vtest)


---

### Storing the results

In [34]:
predictions_df = pd.DataFrame(data={'Id': list(test['Id']), 'Predicted':predictions})
predictions_df.to_csv("data/predictionsRFfull460_2_TF.csv", index=False)