In [163]:
# import pandas as pd
# import numpy as np

# from sklearn.linear_model import LogisticRegression

# from sklearn.model_selection import train_test_split
# from sklearn.grid_search import GridSearchCV  
# from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# from sklearn.metrics import confusion_matrix
# from sklearn.model_selection import GridSearchCV

# import pickle as pkl


## Logistic Regression Classifier

Here, I implement a simple logistic regression classifier to the different feature groups that I am experimenting with (described more in `xgboost/xgboost_extended_features`. 

I am by no means tuning this model as best I can - I do not expect a logistic regression to be able to learn functions complex enough to solve this problem well. I am doing this so that I can have a baseline to compare my more advanced ensemble tree classifeirs with. 

### 0. Load data

In [29]:
data = pd.read_csv("../data/features.csv")

In [30]:
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.2,Unnamed: 0.1.1,id,qid1,qid2,question1,question2,is_duplicate,...,lstm_2_q2_pred,lstm_3_q1_pred,lstm_3_q2_pred,lstm_4_q1_pred,lstm_4_q2_pred,lstm_5_q1_pred,lstm_5_q2_pred,lstm_vote_q1,lstm_vote_q2,lstm_vote_agree
0,0,1,1,1,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,...,2,1,1,2,2,1,1,1,1,True
1,1,2,2,2,1,3,4,What is the story of Kohinoor Koh - i - Noor D...,What would happen if the Indian government sto...,0,...,2,1,3,2,1,5,1,2,1,False
2,2,3,3,3,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,...,1,1,1,1,1,1,1,1,1,True
3,3,4,4,4,3,7,8,Why am I mentally very lonely ? How can I solv...,Find the remainder when math 23 24 math is div...,0,...,5,1,5,1,1,1,5,1,5,False
4,4,5,5,5,4,9,10,"Which one dissolve in water quickly sugar , sa...",Which fish would survive in salt water ?,0,...,2,2,2,2,2,3,2,3,2,False


### 1. Experimental setups

I'll split the data into train/test sets, using a 80:20 split. I will be usin 3-fold cross validation fo hyper-parameter tuning. As usual, I set my random seed to `550` everywhere. 

In [31]:
features = [col for col in data.columns if col != "is_duplicate"]
labels = data['is_duplicate'].values

# Split data into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    data[features], labels, test_size=0.2, random_state=550, )

In [32]:
# sanity check
print(data.shape)
print
print(X_train.shape)
print(X_test.shape)
print
print(y_train.shape)
print(y_test.shape)

(404288, 68)

(323430, 69)
(80858, 69)

(323430,)
(80858,)


In [33]:
data.columns[1:5]

Index([u'Unnamed: 0.1', u'Unnamed: 0.1', u'Unnamed: 0.1.1', u'id'], dtype='object')

In [34]:
X_train.columns[1:5]

Index([u'Unnamed: 0.1', u'Unnamed: 0.1', u'Unnamed: 0.1', u'Unnamed: 0.1'], dtype='object')

The reason there is an extra column in the training data than in the original column is that when splitting an extra two columns named `Unnamed` were added.

That's annoying. 

### 2. Features to train on

Here are the different feature groups I will test and compare

In [39]:
# all features
all_features = [u'len_intersection',
                 u'num_words_q1',
                 u'num_words_q2',
                 u'num_chars_q1',
                 u'num_chars_q2',
                 u'num_chars_diff',
                 u'partial_ratio',
                 u'partial_ratio_sw',
                 u'token_set_ratio',
                 u'token_set_ratio_sw',
                 u'partial_token_sort_ratio_sw',
                 u'wratio_sw',
                 u'word_intersection_tfidf_weight',
                 u'word_symmetric_difference_tfidf_weight',
                 u'euclidean_distance_sentence_embeddings',
                 u'cosine_distance_sentence_embeddings',
                 u'cityblock_distance_sentence_embeddings',
                 u'braycurtis_distance_sentence_embeddings',
                 u'euclidean_distance_max_tfidf_word',
                 u'cosine_distance_max_tfidf_word',
                 u'lch_similarity',
                 u'embedding_similarity_score', 
                 'lstm_vote_q1', 
                 'lstm_vote_q2',
                 'lstm_vote_agree']

In [38]:
# Excluding TF-IDF features
no_tfidf = [u'len_intersection',
                 u'num_words_q1',
                 u'num_words_q2',
                 u'num_chars_q1',
                 u'num_chars_q2',
                 u'num_chars_diff',
                 u'partial_ratio',
                 u'partial_ratio_sw',
                 u'token_set_ratio',
                 u'token_set_ratio_sw',
                 u'partial_token_sort_ratio_sw',
                 u'wratio_sw',
                 u'euclidean_distance_sentence_embeddings',
                 u'cosine_distance_sentence_embeddings',
                 u'cityblock_distance_sentence_embeddings',
                 u'braycurtis_distance_sentence_embeddings',
                 u'lch_similarity',
                 u'embedding_similarity_score', 
                 'lstm_vote_q1', 
                 'lstm_vote_q2',
                 'lstm_vote_agree']

In [40]:
# Excluding similarity scores
no_similarity_scores = [u'len_intersection',
                 u'num_words_q1',
                 u'num_words_q2',
                 u'num_chars_q1',
                 u'num_chars_q2',
                 u'num_chars_diff',
                 u'partial_ratio',
                 u'partial_ratio_sw',
                 u'token_set_ratio',
                 u'token_set_ratio_sw',
                 u'partial_token_sort_ratio_sw',
                 u'wratio_sw',
                 u'word_intersection_tfidf_weight',
                 u'word_symmetric_difference_tfidf_weight',
                 u'euclidean_distance_sentence_embeddings',
                 u'cosine_distance_sentence_embeddings',
                 u'cityblock_distance_sentence_embeddings',
                 u'braycurtis_distance_sentence_embeddings',
                 u'euclidean_distance_max_tfidf_word',
                 u'cosine_distance_max_tfidf_word',
                'lstm_vote_q1', 
                 'lstm_vote_q2',
                 'lstm_vote_agree']

In [41]:
# No TREC features
no_trec = [u'len_intersection',
                 u'num_words_q1',
                 u'num_words_q2',
                 u'num_chars_q1',
                 u'num_chars_q2',
                 u'num_chars_diff',
                 u'partial_ratio',
                 u'partial_ratio_sw',
                 u'token_set_ratio',
                 u'token_set_ratio_sw',
                 u'partial_token_sort_ratio_sw',
                 u'wratio_sw',
                 u'word_intersection_tfidf_weight',
                 u'word_symmetric_difference_tfidf_weight',
                 u'euclidean_distance_sentence_embeddings',
                 u'cosine_distance_sentence_embeddings',
                 u'cityblock_distance_sentence_embeddings',
                 u'braycurtis_distance_sentence_embeddings',
                 u'euclidean_distance_max_tfidf_word',
                 u'cosine_distance_max_tfidf_word',
                 u'lch_similarity',
                 u'embedding_similarity_score']

In [42]:
# similarity or tfidf
no_similarity_or_tfidf = list(set(no_similarity_scores).intersection(set(no_tfidf)))

In [44]:
# No similarity or trec
no_similarity_or_trec = list(set(no_similarity_scores).intersection(set(no_trec)))

In [45]:
# no tfidf or trec
no_tfidf_or_trec = list(set(no_tfidf).intersection(no_trec))

In [46]:
# no extra features
no_extra_features = list(set(no_tfidf_or_trec).intersection(set(no_similarity_scores)))

### 3. Determining the regularization parameter `C`. 

The only parameter tuning that I will do is with the regularization term, $C = \frac{1}{\lambda}$. I will only get to the precision of a power of 10, since I'm not expecting (or trying) to get great performance. 

In [49]:
# parameters to serach through
C_grid = {"C": [1e-2, 1e-1, 1, 1e2, 1e3, 1e4]}

In [50]:
# a vanilla logistic regression (with regularization)
model_full = LogisticRegression(random_state = 550)

In [56]:
full_grid_search = GridSearchCV(estimator = model_full, param_grid = C_grid,  n_jobs = -1, \
                                scoring= "neg_log_loss", cv = 3, verbose = 3)

In [57]:
# search through the different regularization values. 
full_grid_result = full_grid_search.fit(X_train[all_features], y_train )

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=0.01 ..........................................................
[CV] C=0.01 ..........................................................
[CV] C=0.01 ..........................................................
[CV] C=0.1 ...........................................................
[CV] .................... C=0.01, score=-0.532362371105, total=  27.6s
[CV] C=0.1 ...........................................................
[CV] .................... C=0.01, score=-0.531210507564, total=  27.6s
[CV] C=0.1 ...........................................................
[CV] .................... C=0.01, score=-0.532734558122, total=  29.4s
[CV] C=1 .............................................................
[CV] ..................... C=0.1, score=-0.525532774716, total=  42.0s
[CV] C=1 .............................................................
[CV] ..................... C=0.1, score=-0.525737471425, total=  28.8s
[CV] C=1 ........

[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:  2.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:  2.7min finished


In [58]:
full_grid_result.best_params_

{'C': 1000.0}

The highest value of `C` was chosen - which means an even higher value might be prefered. I'll run it again using higher regularization parameters

In [61]:
# parameters to serach through
C_grid = {"C": [1e4, 1e5, 1e6]}

In [62]:
full_grid_search = GridSearchCV(estimator = model_full, param_grid = C_grid,  n_jobs = -1, \
                                scoring= "neg_log_loss", cv = 3, verbose = 3)

In [63]:
# search through the different regularization values. 
full_grid_result = full_grid_search.fit(X_train[all_features], y_train )

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] C=10000.0 .......................................................
[CV] C=10000.0 .......................................................
[CV] C=10000.0 .......................................................
[CV] C=100000.0 ......................................................
[CV] ................ C=100000.0, score=-0.524688920246, total=  44.9s
[CV] C=100000.0 ......................................................
[CV] ................. C=10000.0, score=-0.524872247639, total=  49.3s
[CV] C=100000.0 ......................................................
[CV] .................. C=10000.0, score=-0.52461356948, total=  49.9s
[CV] C=1000000.0 .....................................................
[CV] ................. C=10000.0, score=-0.522934089239, total=  50.9s
[CV] C=1000000.0 .....................................................
[CV] ............... C=1000000.0, score=-0.522940816562, total=  34.0s
[CV] C=1000000.0 .

[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed:  1.5min remaining:   44.2s


[CV] ............... C=1000000.0, score=-0.524709348239, total=  43.1s
[CV] ................ C=100000.0, score=-0.522937767939, total=  48.0s
[CV] ............... C=1000000.0, score=-0.525446749697, total=  19.7s


[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  1.8min finished


### 4. Training models and testing on Test set. 

Now that we have that the regularziation parameter should be in the neighborhood of `10000`, I will train the logistic regression on the different subsets of the data. 

Note: as I am not doing any more parameter tuning, I will not be using a developement set. 

In [68]:
model_full = LogisticRegression(C=10000, n_jobs= -1, random_state = 550).fit(X_train[all_features], y_train)

  " = {}.".format(self.n_jobs))


In [69]:
# test error
accuracy_score(y_test, model_full.predict(X_test[all_features]))

0.6962576368448391

In [176]:
print(classification_report(y_test, model_full.predict(X_test[all_features]), digits = 4))

             precision    recall  f1-score   support

          0     0.7549    0.7710    0.7628     51235
          1     0.5887    0.5671    0.5777     29623

avg / total     0.6940    0.6963    0.6950     80858



In [70]:
# no tfidf scores
model_no_tfidf = LogisticRegression(C=10000, random_state = 550).fit(X_train[no_tfidf], y_train)

In [71]:
accuracy_score(y_test, model_no_tfidf.predict(X_test[no_tfidf]))

0.68384080734126496

In [177]:
print(classification_report(y_test, model_no_tfidf.predict(X_test[no_tfidf]), digits = 4))

             precision    recall  f1-score   support

          0     0.7408    0.7707    0.7555     51235
          1     0.5736    0.5336    0.5529     29623

avg / total     0.6796    0.6838    0.6813     80858



In [72]:
# No similarity scores
model_no_similiarity_scores = LogisticRegression(C=10000, random_state = 550).fit(X_train[no_similarity_scores], y_train)

In [74]:
accuracy_score(y_test, model_no_similiarity_scores.predict(X_test[no_similarity_scores]))

0.68377897054094838

In [178]:
print(classification_report(y_test, model_no_similiarity_scores.predict(X_test[no_similarity_scores]),
                            digits = 4))

             precision    recall  f1-score   support

          0     0.7412    0.7697    0.7552     51235
          1     0.5733    0.5351    0.5536     29623

avg / total     0.6797    0.6838    0.6813     80858



In [75]:
# No TREC predictions
model_no_trec = LogisticRegression(C=10000, random_state = 550).fit(X_train[no_trec], y_train)

In [76]:
accuracy_score(y_test, model_no_trec.predict(X_test[no_trec]))

0.69510747235895021

In [171]:
print(classification_report(y_test, model_no_trec.predict(X_test[no_trec]), digits=4))

             precision    recall  f1-score   support

          0     0.7537    0.7706    0.7621     51235
          1     0.5873    0.5645    0.5756     29623

avg / total     0.6927    0.6951    0.6938     80858



In [77]:
# No TFIDF or Similarity Scores
model_no_similarity_or_tfidf = LogisticRegression(C=10000, random_state = 550).fit(X_train[no_similarity_or_tfidf], y_train)

In [78]:
accuracy_score(y_test, model_no_similarity_or_tfidf.predict(X_test[no_similarity_or_tfidf]))

0.67434267481263455

In [172]:
print(classification_report(y_test, model_no_similarity_or_tfidf.predict(X_test[no_similarity_or_tfidf]), digits=4))

             precision    recall  f1-score   support

          0     0.7302    0.7710    0.7500     51235
          1     0.5615    0.5072    0.5330     29623

avg / total     0.6684    0.6743    0.6705     80858



In [79]:
# No TF-IDF or TREC
model_no_tfidf_or_trec = LogisticRegression(C=10000, random_state = 550).fit(X_train[no_tfidf_or_trec], y_train)

In [80]:
accuracy_score(y_test, model_no_tfidf_or_trec.predict(X_test[no_tfidf_or_trec]))

0.68363056222018848

In [173]:
print(classification_report(y_test, model_no_tfidf_or_trec.predict(X_test[no_tfidf_or_trec]), digits=4))

             precision    recall  f1-score   support

          0     0.7407    0.7703    0.7552     51235
          1     0.5733    0.5337    0.5528     29623

avg / total     0.6794    0.6836    0.6811     80858



In [82]:
# No similarity or TREC
model_no_similarity_or_trec = LogisticRegression(C=10000, random_state = 550).fit(X_train[no_similarity_or_trec], y_train)

In [83]:
accuracy_score(y_test, model_no_similarity_or_trec.predict(X_test[no_similarity_or_trec]))

0.68270301021543944

In [174]:
print(classification_report(y_test, model_no_similarity_or_trec.predict(X_test[no_similarity_or_trec]), digits=4))

             precision    recall  f1-score   support

          0     0.7397    0.7704    0.7547     51235
          1     0.5721    0.5311    0.5508     29623

avg / total     0.6783    0.6827    0.6800     80858



In [84]:
# No extra features
model_no_extra_features =  LogisticRegression(C=10000, random_state = 550).fit(X_train[no_extra_features], y_train)

In [85]:
accuracy_score(y_test, model_no_extra_features.predict(X_test[no_extra_features]))

0.67279675480471934

In [175]:
print(classification_report(y_test, model_no_extra_features.predict(X_test[no_extra_features]), digits=4))

             precision    recall  f1-score   support

          0     0.7287    0.7705    0.7490     51235
          1     0.5593    0.5038    0.5301     29623

avg / total     0.6666    0.6728    0.6688     80858



Finally, a random baseline which predicts zero or one with equal probability

In [142]:
np.random.seed(550)
accuracy_score(y_test, np.random.randint(2, size = y_test.shape[0]))

0.50148408320759852

### 5. Save the models!

In [154]:
model_names = ["all_features", "no_similarity_scores", "no_tfidf", "no_trec", "no_similarity_or_tfidf", \
       "no_similarity_or_trec", "no_tfidf_or_trec", "no_extra_features"]
models = [model_full, model_no_similiarity_scores, model_no_tfidf, model_no_trec, model_no_similarity_or_tfidf, \
         model_no_similarity_or_trec, model_no_tfidf_or_trec, model_no_extra_features]
file_names = ["../models/logreg_" + name + ".pickle" for name in model_names]

In [155]:
file_names

['../models/logreg_all_features.pickle',
 '../models/logreg_no_similarity_scores.pickle',
 '../models/logreg_no_tfidf.pickle',
 '../models/logreg_no_trec.pickle',
 '../models/logreg_no_similarity_or_tfidf.pickle',
 '../models/logreg_no_similarity_or_trec.pickle',
 '../models/logreg_no_tfidf_or_trec.pickle',
 '../models/logreg_no_extra_features.pickle']

In [156]:
for i in zip(file_names, models):
    with open(i[0], 'wb') as handle:
        pkl.dump(i[1], handle)

In [157]:
# testing it out
test = pkl.load(open('../models/logreg_all_features.pickle', 'rb'))

In [162]:
test

LogisticRegression(C=10000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=550, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

---
That's a wrap! Very lazy - I know. I just wanted to see how good my xgb models really are, relative to a more simple model. 