# Logistic Regresssion

## Load X,y data from NPZ

Using the function added to *mylib.py* file, it's now easy to grab data and X/y vectors ready to be used for model training and tuning

In [19]:
# Run content of mylib.py file
%run mylib.py

# Load data from NPZ file
#data=loadNpz()
(data, X, y)=loadXy()

Loading 'train' set
  loading  data
     shape: (281, 299, 299, 3) - dtype: float64
  loading  features
     shape: (281, 2048) - dtype: float64
  loading  filenames
     shape: (281,) - dtype: <U46
  loading  labels
     shape: (281,) - dtype: int32


Loading 'test' set
  loading  data
     shape: (51, 299, 299, 3) - dtype: float64
  loading  features
     shape: (51, 2048) - dtype: float64
  loading  filenames
     shape: (51,) - dtype: <U50
  loading  labels
     shape: (51,) - dtype: int32


Loading 'valid' set
  loading  data
     shape: (139, 299, 299, 3) - dtype: float64
  loading  features
     shape: (139, 2048) - dtype: float64
  loading  filenames
     shape: (139,) - dtype: <U30
  loading  labels
     shape: (139,) - dtype: int32


building 'trainX' set
  building  data
     shape: (420, 299, 299, 3) - dtype: float64
  building  features
     shape: (420, 2048) - dtype: float64
  building  filenames
     shape: (420,) - dtype: <U46
  building  labels
     shape: (420,) - dt

## Evaluate without any hyperparameters tuning

To do so, I will simply create a LogisticRegression estimator object using default parameters, fit it, and evaluate it agains the *test* dataset.

> Note: I've explicitly set some hyperparameters to avoid warnings about default values that will be changed in the future.


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Cross Validation K-Fold dimension
CV=5

lr=LogisticRegression()

grid_param={
    'multi_class': ['auto'],
    'C': [1.0],
    'solver': ['lbfgs'],
    'max_iter': [1000]
}



grid_lr=GridSearchCV(lr, grid_param, cv=CV, refit=True, return_train_score=True, verbose=True, n_jobs=-1, iid=True)

grid_lr.fit(X['trainX'], y['trainX'])

accuracy_lr=grid_lr.score(X['test'], y['test'])*100

print("LogisticRegression score : {:.1f}%".format(accuracy_lr))


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.2s finished


LogisticRegression score : 96.1%


## Do some hyperparameters tuning with GridSearchCV

In [22]:

lr=LogisticRegression(multi_class='ovr', solver='liblinear', max_iter=10000)

grid_param={
    'multi_class': ['ovr', 'auto'],
    'C': np.logspace(-4, 4, num=10),
    'solver': ['sag', 'saga', 'lbfgs', 'liblinear'],
    'max_iter': [1000],
}

grid_lr=GridSearchCV(lr, grid_param, cv=CV, refit=True, return_train_score=True, verbose=True, n_jobs=-1, iid=True)

grid_lr.fit(X['trainX'], y['trainX'])




Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 10.7min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='ovr',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'multi_class': ['ovr', 'auto'], 'C': array([1.00000e-04, 7.74264e-04, 5.99484e-03, 4.64159e-02, 3.59381e-01,
       2.78256e+00, 2.15443e+01, 1.66810e+02, 1.29155e+03, 1.00000e+04]), 'solver': ['sag', 'saga', 'lbfgs', 'liblinear'], 'max_iter': [1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=True)

In [23]:
import pandas as pd

columns=['param_solver', 'param_multi_class', 'param_C', 'mean_test_score', 'std_test_score', 'mean_train_score']
pd.DataFrame(grid_lr.cv_results_).sort_values('mean_test_score', ascending=False)[columns].head()




Unnamed: 0,param_solver,param_multi_class,param_C,mean_test_score,std_test_score,mean_train_score
37,saga,auto,0.359381,0.940476,0.011882,1.0
36,sag,auto,0.359381,0.940476,0.011882,1.0
29,saga,auto,0.0464159,0.935714,0.014651,0.995234
53,saga,auto,21.5443,0.935714,0.012824,1.0
32,sag,ovr,0.359381,0.935714,0.01111,1.0


In [24]:
best_lr=grid_lr.best_estimator_

best_accuracy_lr=best_lr.score(X['test'], y['test'])*100

print("LogisticRegression score : {:.1f}%".format(best_accuracy_lr))



LogisticRegression score : 96.1%


In [25]:
saveModel(best_lr, 'logistic')

Saving model logistic to model-logistic.sav


## Compute probability

In [44]:
idx=[x*10 for x in range(10)]

print(idx)

proba_df=pd.DataFrame(best_lr.predict_proba(X['train'][idx]), columns=data['class_name'])

proba_df['labels']=data['train']['filenames'][idx]
proba_df


[0, 10, 20, 30, 40, 50, 60, 70, 80, 90]


Unnamed: 0,bike,car,motorcycle,other,truck,van,labels
0,0.998757,0.00024,0.000152,0.00057,9.7e-05,0.000184,bike/bike-0001.png
1,0.999584,0.000105,0.000109,0.000181,9e-06,1.2e-05,bike/bike-0011.png
2,0.998002,0.000731,0.000102,0.000919,0.000169,7.6e-05,bike/bike-0021.png
3,0.998835,0.000246,0.000338,0.000492,2.7e-05,6.2e-05,bike/bike-0031.png
4,0.999398,0.000224,2.6e-05,0.000273,2.9e-05,4.9e-05,bike/bike-0041.png
5,0.999153,0.00025,3.1e-05,0.000497,2.4e-05,4.5e-05,bike/bike-0051.png
6,0.997892,0.001015,0.000492,7e-05,1.6e-05,0.000515,bike/bike-0061.png
7,0.000524,0.994302,0.000162,0.000726,5.6e-05,0.00423,car/car-0005.png
8,1.9e-05,0.999811,1.8e-05,2.8e-05,6e-06,0.000117,car/car-0015.png
9,4.5e-05,0.99952,1.1e-05,8.8e-05,1e-05,0.000326,car/car-0025.png
