In [25]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


## 0. Load Data

Here, I'll use the data preprocessed using the "Largest bounding circle" method.

In [4]:
# Locations of the preprocessed ("circle" method) training images and labels.
# NOTE(review): "preproccessed" is spelled this way in the path; presumably it
# matches the directory name on disk - confirm before correcting it.
X_PATH = "../data/preproccessed/circle/X_train.npy"
Y_PATH = "../data/preproccessed/circle/y_train.npy"

# Read both arrays into memory.
X = np.load(X_PATH)
y = np.load(Y_PATH)

Right now the pictures are stored as matrices - we can unroll them into vectors so they are easier to use with logistic regression.

In [9]:
# Flatten every image into a single feature vector, keeping one row per entry
# along the first axis. Using X.shape[0] with -1 (instead of -1 with a
# hard-coded 28*28) means the number of rows can never change silently if the
# image size differs, and the cell stays idempotent when re-run.
X = X.reshape(X.shape[0], -1)

In [12]:
# Collapse the label array to one dimension (one label per entry).
y = y.ravel()

In [10]:
# Sanity check: display the shape of X after flattening.
X.shape

(50000, 784)

In [13]:
# Sanity check: display the shape of y after flattening.
y.shape

(50000,)

Split the data into training and validation sets:

In [14]:
# Hold out a fixed fraction of the data for validation; the fixed seed makes
# the split repeatable across runs.
VALID_FRACTION = 0.2
SPLIT_SEED = 1

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=VALID_FRACTION,
    random_state=SPLIT_SEED,
)

## 1. Creating a hyperparameter grid to search through

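The most important hyperparameter with regularized logistic regression is the regularization coefficient, `C` (in scikit-learn, `C` is the inverse of the regularization strength, so smaller values mean stronger regularization). I'll also try both the l1 and l2 penalties.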

In [17]:
# Candidate values for C: 0.01 multiplied by successive powers of 3 (8 values).
C_values = [.01 * 3**exponent for exponent in range(8)]

# Search space: every combination of penalty type and C value.
grid = dict(penalty=["l1", "l2"], C=C_values)

In [16]:
# Display the C values that will be searched. Reading them from `grid` (rather
# than repeating the comprehension) guarantees the values shown are the ones
# actually used by the grid search.
grid["C"]

[0.01, 0.03, 0.09, 0.27, 0.81, 2.43, 7.29, 21.87]

## 2. Grid search through hyperparameters

In [18]:
# Base estimator for the grid search.
# The solver is pinned to "liblinear": the grid includes an "l1" penalty, which
# the default solver in newer scikit-learn releases (lbfgs) does not support.
# liblinear is also the solver the recorded run used, so results stay comparable.
# random_state=1 keeps the fit repeatable.
clf = LogisticRegression(solver="liblinear", random_state=1)

In [20]:
# Exhaustive search over `grid`, using all available cores (n_jobs=-1) and
# logging each fit (verbose=3).
# cv=3 is set explicitly so the search matches the recorded run
# ("Fitting 3 folds for each of 16 candidates"); newer scikit-learn releases
# default to 5 folds, which would change both the scores and the runtime.
gs = GridSearchCV(clf, grid, cv=3, n_jobs=-1, verbose=3)

In [21]:
# Run the grid search on the training split only (the validation split stays
# untouched). WARNING: this is slow - the recorded run performed 48 fits and
# took about 426 minutes in total.
gs.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] C=0.01, penalty=l1 ..............................................
[CV] C=0.01, penalty=l1 ..............................................
[CV] C=0.01, penalty=l1 ..............................................
[CV] C=0.01, penalty=l2 ..............................................
[CV] ..... C=0.01, penalty=l1, score=0.7302444144549408, total= 2.4min
[CV] C=0.01, penalty=l2 ..............................................
[CV] ..... C=0.01, penalty=l1, score=0.7249062265566392, total= 2.5min
[CV] C=0.01, penalty=l2 ..............................................
[CV] ..... C=0.01, penalty=l1, score=0.7368736873687368, total= 2.5min
[CV] C=0.03, penalty=l1 ..............................................
[CV] ..... C=0.03, penalty=l1, score=0.7303943619733093, total= 3.8min
[CV] C=0.03, penalty=l1 ..............................................
[CV] ..... C=0.03, penalty=l1, score=0.7358235823582359, total= 3.1min
[CV] C=0.03, pen

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 203.5min


[CV] ..... C=0.27, penalty=l2, score=0.7214971497149715, total=60.1min
[CV] C=0.81, penalty=l2 ..............................................
[CV] ...... C=0.27, penalty=l2, score=0.709152288072018, total=60.2min
[CV] C=0.81, penalty=l2 ..............................................
[CV] ..... C=0.27, penalty=l2, score=0.7174239016344279, total=64.5min
[CV] C=2.43, penalty=l1 ..............................................
[CV] ..... C=2.43, penalty=l1, score=0.7304693357324936, total= 2.9min
[CV] C=2.43, penalty=l1 ..............................................
[CV] ..... C=2.43, penalty=l1, score=0.7352985298529853, total= 3.2min
[CV] C=2.43, penalty=l1 ..............................................
[CV] ..... C=2.43, penalty=l1, score=0.7206301575393849, total= 3.2min
[CV] C=2.43, penalty=l2 ..............................................
[CV] ..... C=0.81, penalty=l2, score=0.7172739541160594, total=59.6min
[CV] C=2.43, penalty=l2 ..............................................
[CV] .

[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed: 426.4min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.01, 0.03, 0.09, 0.27, 0.81, 2.43, 7.29, 21.87]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [22]:
# Show the parameter combination selected by the grid search.
gs.best_params_

{'C': 0.01, 'penalty': 'l1'}

Getting the Validation score:

In [24]:
# Keep a handle on the estimator that the search refit with the best
# parameters (the recorded GridSearchCV repr shows refit=True).
best_clf = gs.best_estimator_

In [26]:
# Accuracy on the held-out validation split. accuracy_score is documented as
# accuracy_score(y_true, y_pred), so the true labels go first. Accuracy is
# symmetric, so the value is unchanged, but the correct order avoids mistakes
# if this line is later adapted to an asymmetric metric.
accuracy_score(y_valid, best_clf.predict(X_valid))

0.7336