Source: https://subscription.packtpub.com/book/big_data_and_business_intelligence/9781789616729/7/ch07lvl1sec55/training-a-logistic-regression-model

In [32]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

n_rows = 30000
df = pd.read_csv("train.csv", nrows=n_rows)
X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], 
                                                     axis=1).values
Y = df['click'].values
n_train = 1000
X_train = X[:n_train]
Y_train = Y[:n_train]
X_test = X[n_train:]
Y_test = Y[n_train:]
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
X_train_enc = enc.fit_transform(X_train)
X_test_enc = enc.transform(X_test)

In [23]:
def sigmoid(input):
    return 1.0 / (1 + np.exp(-input))

In [24]:
def update_weights_sgd(X_train, y_train, weights, 
                                           learning_rate):
    """ One weight update iteration: moving weights by one step based on each individual sample
    Args:
    X_train, y_train (numpy.ndarray, training data set)
    weights (numpy.ndarray)
    learning_rate (float)
    Returns:
    numpy.ndarray, updated weights
    """
    for X_each, y_each in zip(X_train, y_train):
        prediction = compute_prediction(X_each, weights)
        weights_delta = X_each.T * (y_each - prediction)
        weights += learning_rate * weights_delta
    return weights

In [28]:
>>> def compute_cost(X, y, weights):
...     """ Compute the cost J(w)
...     Args:
...         X, y (numpy.ndarray, data set)
...         weights (numpy.ndarray)
...     Returns:
...         float
...     """
...     predictions = compute_prediction(X, weights)
...     cost = np.mean(-y * np.log(predictions)
                      - (1 - y) * np.log(1 - predictions))
...     return cost

In [25]:
>>> def compute_prediction(X, weights):
...     """ Compute the prediction y_hat based on current weights
...     Args:
...         X (numpy.ndarray)
...         weights (numpy.ndarray)
...     Returns:
...         numpy.ndarray, y_hat of X under weights
...     """
...     z = np.dot(X, weights)
...     predictions = sigmoid(z)
...     return predictions

In [26]:
>>> def train_logistic_regression_sgd(X_train, y_train, max_iter, 
                              learning_rate, fit_intercept=False):
...     """ Train a logistic regression model via SGD
...     Args:
...     X_train, y_train (numpy.ndarray, training data set)
...     max_iter (int, number of iterations)
...     learning_rate (float)
...     fit_intercept (bool, with an intercept w0 or not)
...     Returns:
...     numpy.ndarray, learned weights
...     """
...     if fit_intercept:
...         intercept = np.ones((X_train.shape[0], 1))
...         X_train = np.hstack((intercept, X_train))
...     weights = np.zeros(X_train.shape[1])
...     for iteration in range(max_iter):
...         weights = update_weights_sgd(X_train, y_train, weights, 
                                                     learning_rate)
...         # Check the cost for every 2 (for example) iterations
...         if iteration % 2 == 0:
...             print(compute_cost(X_train, y_train, weights))
...     return weights   

In [30]:
>>> def predict(X, weights):
...     if X.shape[1] == weights.shape[0] - 1:
...         intercept = np.ones((X.shape[0], 1))
...         X = np.hstack((intercept, X))
...     return compute_prediction(X, weights)

In [33]:
import timeit
start_time = timeit.default_timer()
weights = train_logistic_regression_sgd(X_train_enc.toarray(), 
    Y_train, max_iter=10, learning_rate=0.01, fit_intercept=True)

print("--- %0.3fs seconds ---" % 
                      (timeit.default_timer() - start_time))

pred = predict(X_test_enc.toarray(), weights)
print('Training samples: {0}, AUC on testing set: {1:.3f}'.format(n_train, roc_auc_score(Y_test, pred)))

0.41983169269579085
0.4021233331759999
0.3918493614983904
0.38399065762361156
0.3774836704388587
--- 0.172s seconds ---
Training samples: 1000, AUC on testing set: 0.684


In [34]:
>>> from sklearn.linear_model import SGDClassifier
>>> sgd_lr = SGDClassifier(loss='log', penalty=None, 
             fit_intercept=True, n_iter=10, 
             learning_rate='constant', eta0=0.01)

In [37]:
>>> sgd_lr.fit(X_train_enc.toarray(), Y_train)
>>> pred = sgd_lr.predict_proba(X_test_enc.toarray())[:, 1]
>>> print('Training samples: {0}, AUC on testing set: {1:.3f}'.format(n_train, roc_auc_score(Y_test, pred)))




Training samples: 1000, AUC on testing set: 0.682


In [38]:
>>> sgd_lr_l1 = SGDClassifier(loss='log', penalty='l1', alpha=0.0001, 
                             fit_intercept=True, n_iter=10, 
                             learning_rate='constant', eta0=0.01)
>>> sgd_lr_l1.fit(X_train_enc.toarray(), Y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.01, fit_intercept=True,
       l1_ratio=0.15, learning_rate='constant', loss='log', max_iter=None,
       n_iter=10, n_iter_no_change=5, n_jobs=None, penalty='l1',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [40]:
>>> coef_abs = np.abs(sgd_lr_l1.coef_)
>>> print(coef_abs)


[[7.31162825e-02 7.22992299e-02 2.11133851e-01 1.32195846e-01
  1.40252709e-01 3.76614751e-02 9.41960760e-03 1.40295772e-01
  1.01083968e-02 1.55753667e-03 6.15903358e-03 7.01961347e-03
  4.94103500e-02 2.74633403e-02 9.17245788e-04 2.12395575e-02
  5.97860863e-04 2.17144122e-02 5.00039561e-03 9.51680698e-03
  1.13374230e-02 0.00000000e+00 7.21754264e-03 0.00000000e+00
  7.94848432e-02 1.39830542e-03 0.00000000e+00 7.16765310e-03
  2.60361525e-02 1.79613267e-02 9.05360469e-03 3.73093433e-04
  8.62136409e-03 9.76495365e-02 0.00000000e+00 1.87183675e-02
  2.28209434e-02 7.54983234e-04 5.52393588e-02 0.00000000e+00
  6.59221110e-02 8.73496197e-03 7.66520004e-02 1.18257538e-02
  1.67143253e-02 4.95905934e-04 5.60343673e-02 1.62618099e-01
  1.30932217e-01 1.70530279e-01 9.50300513e-02 4.14397650e-03
  4.70746557e-02 5.83370303e-03 4.32156405e-02 0.00000000e+00
  0.00000000e+00 1.61473436e-02 9.63180174e-02 3.58885217e-02
  7.66738298e-03 0.00000000e+00 1.01534632e-02 1.99583459e-02
  3.2225

In [41]:
>>> print(np.sort(coef_abs)[0][:10])
>>> bottom_10 = np.argsort(coef_abs)[0][:10]

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [42]:
>>> feature_names = enc.get_feature_names()
>>> print('10 least important features are:\n', 
                                   feature_names[bottom_10])

10 least important features are:
 ['x8_beb6d112' 'x8_96b83ebe' 'x8_fd621b1f' 'x8_9d21b1a9' 'x5_f4510c5e'
 'x8_9ea0eb04' 'x2_f5476ff8' 'x6_813f3323' 'x17_100002' 'x8_f16efb5e']


In [44]:
>>> print(np.sort(coef_abs)[0][-10:])
>>> top_10 = np.argsort(coef_abs)[0][-10:]
>>> print('10 most important features are:\n', feature_names[top_10])

[0.27109837 0.27109837 0.2720559  0.29767184 0.30206933 0.30462499
 0.30462499 0.42518759 0.42518759 0.57089128]
10 most important features are:
 ['x17_100228' 'x16_171' 'x18_157' 'x13_50' 'x8_a5bce124' 'x16_1063'
 'x14_1993' 'x3_98572c79' 'x2_d9750ee7' 'x11_15701']
