# Using regularized logistic regression to classify email

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import utils
from sklearn import linear_model
#import sklearn.cross_validation
from sklearn import model_selection
#from sklearn.cross_validation import KFold
import scipy.io

# IPython magic: render matplotlib figures inline in the notebook
# rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # default figure size (width, height)
plt.rcParams['image.interpolation'] = 'nearest' # default interpolation for images
plt.rcParams['image.cmap'] = 'gray' # default colormap for images

# Load the autoreload extension and enable mode 2 so that edits to external
# python modules (e.g. utils.py) are picked up without restarting the kernel;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [3]:
# No modifications in this cell
# complete the functions in utils.py; then run the cell

# Load the spam data; the helper returns train/test features and labels.
Xtrain,Xtest,ytrain,ytest = utils.load_spam_data()

# Preprocess the data: build three alternative representations of the
# training features (std / logt / bin). The exact transforms live in utils.py.

Xtrain_std,mu,sigma = utils.std_features(Xtrain)
Xtrain_logt = utils.log_features(Xtrain)
Xtrain_bin = utils.bin_features(Xtrain)

# Apply the same preprocessing to the test features. The std variant reuses the
# mu and sigma returned for the training set instead of recomputing them on
# Xtest (presumably per-feature mean and standard deviation -- confirm in utils.py).
Xtest_std = (Xtest - mu)/sigma
Xtest_logt = utils.log_features(Xtest)
Xtest_bin = utils.bin_features(Xtest)

# find good lambda by cross validation for these three sets

def run_dataset(X,ytrain,Xt,ytest,typea,penalty):
    """Select lambda by cross-validation, fit logistic regression, report test accuracy.

    Parameters
    ----------
    X : training feature matrix (already preprocessed).
    ytrain : training labels; a 1-D array or an (n, 1) column vector.
    Xt : test feature matrix, preprocessed the same way as X.
    ytest : test labels; a 1-D array or an (n, 1) column vector.
    typea : str naming the preprocessing variant; used only in the printed report.
    penalty : regularization type passed to LogisticRegression. "l2" uses the
        lbfgs solver; any other value (e.g. "l1") uses liblinear.

    Returns
    -------
    None. The chosen lambda, the fitted intercept and coefficients, and the
    test accuracy are printed.
    """
    # The three numbers are forwarded to the helper unchanged
    # (presumably lambda low, high, step -- confirm in utils.py).
    best_lambda = utils.select_lambda_crossval(X,ytrain,0.1,5.1,0.5,penalty)
    print("best_lambda = %.3f" %best_lambda)

    # train a classifier on best_lambda and run it
    # lbfgs does not support the l1 penalty, so everything else goes to liblinear.
    solver = 'lbfgs' if penalty == "l2" else 'liblinear'
    lreg = linear_model.LogisticRegression(penalty=penalty, C=1.0/best_lambda,
                                           solver=solver, fit_intercept=True,
                                           max_iter=1000)

    # Flatten the labels: an (n, 1) column vector would trigger a conversion
    # warning in fit() and, worse, broadcast against the 1-D predictions below
    # into an n-by-n comparison, yielding a meaningless accuracy.
    lreg.fit(X, np.ravel(ytrain))

    # Prints the intercept first, then the coefficient array.
    print("Coefficients = %s" %lreg.intercept_,lreg.coef_)

    predy = lreg.predict(Xt)
    accuracy = np.mean(predy == np.ravel(ytest))
    print("Accuracy on set aside test set for %s = %.4f" %(typea, accuracy))

# Every preprocessing variant is evaluated under each penalty, in the order
# L2 then L1, and std -> logt -> bin within each penalty.
feature_variants = (
    ("std", Xtrain_std, Xtest_std),
    ("logt", Xtrain_logt, Xtest_logt),
    ("bin", Xtrain_bin, Xtest_bin),
)
penalty_banners = (
    ("l2", "L2 Penalty experiments -----------"),
    ("l1", "L1 Penalty experiments -----------"),
)

for penalty_name, banner in penalty_banners:
    print(banner)
    for variant_name, train_features, test_features in feature_variants:
        run_dataset(train_features, ytrain, test_features, ytest, variant_name, penalty_name)

L2 Penalty experiments -----------
best_lambda = 0.100
Coefficients = [-4.86040334] [[-2.74989021e-02 -2.25078145e-01  1.21933342e-01  2.27519296e+00
   2.70510908e-01  2.32902366e-01  9.28075879e-01  2.95203238e-01
   1.62391465e-01  6.78236056e-02 -8.32285561e-02 -1.60331906e-01
  -4.73192480e-02  1.09035781e-02  1.88419472e-01  8.20190621e-01
   5.10133424e-01  3.99166951e-02  2.67707779e-01  3.47720628e-01
   2.60450591e-01  3.63304053e-01  7.24359842e-01  1.96754553e-01
  -3.15921102e+00 -4.03826233e-01 -1.25620964e+01 -6.07598298e-02
  -1.55647644e+00 -5.63375367e-02 -3.19234250e-02  4.07136076e-01
  -3.68546152e-01 -1.39395310e+00 -5.81551120e-01  4.43928532e-01
   4.22303189e-02 -1.56981219e-01 -4.55678098e-01 -1.02371553e-01
  -3.52786604e+00 -1.73940286e+00 -4.36522978e-01 -1.06146498e+00
  -9.18569211e-01 -1.75180743e+00 -1.67476526e-01 -9.53309298e-01
  -3.65592304e-01 -1.36376560e-01 -6.58582617e-02  2.06565615e-01
   1.70665682e+00  1.22652148e+00 -3.33950136e-01  1.55492