# Feature Selection

We will use forward selection using AIC criterion on logistic regression.

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

import random

import os
import os.path as path
import pickle

In [18]:
train = pd.read_csv('data/train_out.csv')

read = open('train/train_na.pkl', 'rb')
train_na = pickle.load(read)
read.close()

train.index = train.msno
train = train[~train_na]
train = train.sample(n = 10000)


train_X = train.drop(['msno', 'is_churn', 'concated'], axis = 1)
train_y = train.is_churn

In [19]:
train.columns

Index(['msno', 'is_churn', '201501', '201502', '201503', '201504', '201505',
       '201506', '201507', '201508', '201509', '201510', '201511', '201512',
       '201601', '201602', '201603', '201604', '201605', '201606', '201607',
       '201608', '201609', '201610', '201611', '201612', '201701', '201702',
       'bd', 'registration_init_time', 'expiration_date', 'payment_plan_days',
       'transaction_date', 'membership_expire_date', 'is_cancel',
       'is_auto_renew', 'num_transactions', 'num_unq', 'total_secs',
       'concated', 'consecutive_zeros', 'consecutive_ones', 'city_1.0',
       'city_3.0', 'city_4.0', 'city_5.0', 'city_6.0', 'city_7.0', 'city_8.0',
       'city_9.0', 'city_10.0', 'city_11.0', 'city_12.0', 'city_13.0',
       'city_14.0', 'city_15.0', 'city_16.0', 'city_17.0', 'city_18.0',
       'city_19.0', 'city_20.0', 'city_21.0', 'city_22.0', 'city_999.0',
       'gender_NA', 'gender_female', 'gender_male', 'registered_via_3.0',
       'registered_via_4.0', 'registe

Let's see how logistic regression does using all the features.

In [20]:
log = LogisticRegression()
log.fit(train_X, train_y)
score = log.score(train_X, train_y)
print('Best Guess score:              %f' % (1 - np.mean(train_y)))
print('Logistic Regression score:     %f' % score)

Best Guess score:              0.932700
Logistic Regression score:     0.968000


93% is pretty bad, since that is just the mean of what we are predicting. Thus we must do forward selection to identify relevant variables.

### Forward Variable Selection

The basic idea of forward variable selection is to continually add more variables to our model and seeing which variable creates the most improvements. We will continue iterating only if the model is improving. Otherwise, we will stop in order to avoid taking in noisy and irrelevant features.

In [21]:
isImproving = True
features = train_X.columns
ranked_features = []
score = []
logloss = []
AUC = [0]
i = 0
while isImproving:
    top_AUC = AUC[i]
    print('Iteration %s' % str(i+1))
    for var in train_X.columns.difference(ranked_features):
        log = LogisticRegression()
        log.fit(train_X[[var] + ranked_features], train_y)
        pred = log.predict(train_X[[var] + ranked_features])
        prob = log.predict_proba(train_X[[var] + ranked_features])
        curr_AUC = roc_auc_score(train_y, pred)
        if curr_AUC > top_AUC:
            best_feature = var
            top_AUC = curr_AUC
            best_score= np.mean(pred == train_y)
            top_log_loss = log_loss(train_y, prob)
    if(AUC[i] >= top_AUC):
        print('Logfit is not improving...')
        isImproving = False
        break
    else:
        print('best feature: %s' % best_feature)
        print('AUC:          %f' % top_AUC)
        print('Score:        %f' % best_score)
        print('Log Loss:     %f' % top_log_loss)
        print('-----------------------')
        AUC.append(top_AUC)
        ranked_features.append(best_feature)
        score.append(best_score)
        logloss.append(top_log_loss)
    i += 1

Iteration 1
best feature: expiration_date
AUC:          0.685736
Score:        0.957700
Log Loss:     0.199016
-----------------------
Iteration 2
best feature: 201702
AUC:          0.751973
Score:        0.959100
Log Loss:     0.130417
-----------------------
Iteration 3
best feature: is_auto_renew
AUC:          0.809379
Score:        0.964600
Log Loss:     0.096936
-----------------------
Iteration 4
best feature: 201612
AUC:          0.823426
Score:        0.963800
Log Loss:     0.093696
-----------------------
Iteration 5
best feature: payment_plan_days
AUC:          0.829997
Score:        0.963200
Log Loss:     0.090591
-----------------------
Iteration 6
best feature: 201509
AUC:          0.831859
Score:        0.964100
Log Loss:     0.090419
-----------------------
Iteration 7
best feature: city_12.0
AUC:          0.833344
Score:        0.964300
Log Loss:     0.090377
-----------------------
Iteration 8
best feature: city_6.0
AUC:          0.833505
Score:        0.964600
Log Los

In [25]:
ranked_features = ['expiration_date',
                  'membership_expire_date',
                  'registered_via_7.0',
                  'is_auto_renew',
                  'is_cancel']

### Forward Variable Selection with Cross Validation

We can also find the best features with cross validation fold (n_fold = 5). This will give a better idea ofwhat our actual score will be in the end.

In [22]:
n_fold = 3
cv = StratifiedKFold(n_splits = n_fold, random_state = 42)


In [28]:
train_X = train_X.reset_index().drop(['msno'], axis = 1)
train_y = train_y.reset_index().drop(['msno'], axis = 1)

In [35]:
train_X.index.shape

(10000,)

In [36]:
isImproving = True
features = train_X.columns
ranked_features2 = []
score = []
curr_score = []
logloss = []
curr_logloss = []
AUC = [[0]]
curr_cv_AUC = []
i = 0
while isImproving:
    top_AUC = np.mean(AUC[i])
    print('Iteration %s' % str(i+1))
    for var in train_X.columns.difference(ranked_features2):
        ### Add CV
        
        curr_cv_AUC = []
        curr_logloss = []
        curr_score = []
        
        for i_trn, i_val in cv.split(train_X, train_y):
            
            log = LogisticRegression()
            log.fit(train_X.loc[i_trn, [var] + ranked_features2], train_y[i_trn])
            pred = log.predict(train_X.loc[i_val, [var] + ranked_features2])
            prob = log.predict_proba(train_X.loc[i_val, [var] + ranked_features2])
            curr_AUC = roc_auc_score(train_y[i_val], pred)
            curr_cv_AUC.append(curr_AUC)
            curr_score.append(pred == train_y[i_val])
            curr_logloss.append(log_loss(train_y[i_val], prob))
          
        if np.mean(curr_cv_AUC) > np.mean(top_AUC):
            best_feature = var
            top_AUC = curr_cv_AUC
            best_score= curr_score
            top_log_loss = curr_logloss

    if(np.mean(AUC[i]) >= np.mean(top_AUC)):
        print('Logfit is not improving...')
        isImproving = False
        break
    else:
        print('best feature: %s' % best_feature)
        print('AUC:          %f' % np.mean(top_AUC))
        print('Score:        %f' % np.mean(best_score))
        print('Log Loss:     %f' % np.mean(top_log_loss))
        print('-----------------------')
        score.append(curr_score)
        logloss.append(curr_logloss)
        AUC.append(curr_cv_AUC)
        AUC.append(top_AUC)
        ranked_features2.append(best_feature)
        score.append(best_score)
        logloss.append(top_log_loss)
    i += 1

Iteration 1


IndexError: too many indices for array

[[0.22852491985673834, 0.22771551762172304, 0.22826281756063532],
 [0.19168042715323549, 0.19166939403941957, 0.19118863098225081],
 [0.18128228601947433, 0.18068370876893211, 0.18071214246597558],
 [0.14575567569476386, 0.14517778847512317, 0.14490048127113253],
 [0.14165583797073164, 0.14080240696779375, 0.14086321786718495],
 [0.13762068423989871, 0.13649567387260006, 0.13640331458236057],
 [0.13596901720417562, 0.13467281080735002, 0.13479734456770029],
 [0.12979934839749135, 0.1289415959384472, 0.1290855058112341],
 [0.12818631881663184, 0.12716833201299571, 0.12756544221999314],
 [0.12818631881663184, 0.12716833201299571, 0.12756544221999314]]

In [25]:
log = LogisticRegression()
log.fit(train_X[ranked_features], train_y)
prob = log.predict_proba(train_X[ranked_features])
predict = log.predict(train_X[ranked_features])
print("Log Loss of Logistic Regression:         %f" % log_loss(train_y, prob))
print('Accuracy of Logistic Regression:         %f' % (1 - np.mean(train_y - predict)))
output = open('models/log_probs.pkl', 'wb')
pickle.dump(prob, output, protocol=pickle.HIGHEST_PROTOCOL)
output.close
print('Logistic Regression Probs Saved ~~~~')
output = open('train/best_features.pkl', 'wb')
pickle.dump(ranked_features, output, protocol=pickle.HIGHEST_PROTOCOL)
output.close()
print('Best features saved!')

Log Loss of Logistic Regression:         0.094033
Accuracy of Logistic Regression:         0.986200
Logistic Regression Probs Saved ~~~~
Best features saved!
