In [1]:
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import os

In [2]:
df = pd.read_csv(os.path.join("..", "Resources", "myopia.csv"))
df.head()

Unnamed: 0,AGE,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY,MYOPIC
0,6,-0.052,21.889999,3.69,3.498,14.7,45,8,0,0,10,34,1,1,1
1,6,0.608,22.379999,3.702,3.392,15.29,4,0,1,1,7,12,1,1,0
2,6,1.179,22.49,3.462,3.514,15.52,14,0,2,0,10,14,0,0,0
3,6,0.525,22.200001,3.862,3.612,14.73,18,11,0,0,4,37,0,1,1
4,5,0.697,23.290001,3.676,3.454,16.16,14,0,0,0,4,4,1,0,0


In [3]:
target = df["MYOPIC"]
target_names = ["negative", "positive"]

In [4]:
data = df.drop("MYOPIC", axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,AGE,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY
0,6,-0.052,21.889999,3.69,3.498,14.7,45,8,0,0,10,34,1,1
1,6,0.608,22.379999,3.702,3.392,15.29,4,0,1,1,7,12,1,1
2,6,1.179,22.49,3.462,3.514,15.52,14,0,2,0,10,14,0,0
3,6,0.525,22.200001,3.862,3.612,14.73,18,11,0,0,4,37,0,1
4,5,0.697,23.290001,3.676,3.454,16.16,14,0,0,0,4,4,1,0


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [6]:
# Create the Logistic Model
model = LogisticRegression(max_iter=1000)
model

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [7]:
# Create the GridSearch estimator along with a parameter object containing the values to adjust
# Try adjusting `C` with values of 0.001, 0.01, 0.1, 1, 10, 100. Adjust `tol` by using .0001, 0.001, and 0.01.
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'tol': [0.0001, 0.001, 0.01]}
grid_clf = GridSearchCV(model, param_grid, verbose=3)

In [8]:
# Fit the model by using the grid search estimator. 
# This will take the LogisticRegression model and try each combination of parameters.
grid_clf.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END ...............C=0.001, tol=0.0001;, score=0.860 total time=   0.0s
[CV 2/5] END ...............C=0.001, tol=0.0001;, score=0.860 total time=   0.0s
[CV 3/5] END ...............C=0.001, tol=0.0001;, score=0.860 total time=   0.0s
[CV 4/5] END ...............C=0.001, tol=0.0001;, score=0.870 total time=   0.0s
[CV 5/5] END ...............C=0.001, tol=0.0001;, score=0.870 total time=   0.0s
[CV 1/5] END ................C=0.001, tol=0.001;, score=0.860 total time=   0.0s
[CV 2/5] END ................C=0.001, tol=0.001;, score=0.860 total time=   0.0s
[CV 3/5] END ................C=0.001, tol=0.001;, score=0.860 total time=   0.0s
[CV 4/5] END ................C=0.001, tol=0.001;, score=0.870 total time=   0.0s
[CV 5/5] END ................C=0.001, tol=0.001;, score=0.870 total time=   0.0s
[CV 1/5] END .................C=0.001, tol=0.01;, score=0.860 total time=   0.0s
[CV 2/5] END .................C=0.001, tol=0.01;

0,1,2
,estimator,LogisticRegre...max_iter=1000)
,param_grid,"{'C': [0.001, 0.01, ...], 'tol': [0.0001, 0.001, ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,
,verbose,3
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.01
,C,100
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [9]:
# List the best parameters for this dataset
print(grid_clf.best_params_)

{'C': 100, 'tol': 0.01}


In [10]:
# List the best score
print(grid_clf.best_score_)

0.8920289855072463


In [11]:
# Create the parameter object for the RandomizedSearchCV estimator
# Try adjusting `C` with values from 0 to 10 and 'tol` with values from 0 to 0.001
param_grid = {
    'C' : np.arange(0, 10, 0.01),
    'tol': np.arange(0, 0.001, 1e-5),
}
param_grid

{'C': array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
        0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
        0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
        0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
        0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
        0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
        0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
        0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
        0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
        0.99, 1.  , 1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, 1.08, 1.09,
        1.1 , 1.11, 1.12, 1.13, 1.14, 1.15, 1.16, 1.17, 1.18, 1.19, 1.2 ,
        1.21, 1.22, 1.23, 1.24, 1.25, 1.26, 1.27, 1.28, 1.29, 1.3 , 1.31,
        1.32, 1.33, 1.34, 1.35, 1.36, 1.37, 1.38, 1.39, 1.4 , 1.41, 1.42,
        1.43, 1.44, 1.45, 1.46, 1

In [12]:
# Create the RandomizedSearch estimator by using the LogisticRegression model and the parameter grid that you created
from sklearn.model_selection import RandomizedSearchCV
random_clf = RandomizedSearchCV(model, param_grid, random_state=0, verbose=3)

In [13]:
# Fit the model by using the randomized search estimator. 
# This will take the LogisticRegression model and a random sample of combinations of parameters
random_clf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ...............C=6.82, tol=0.00068;, score=0.882 total time=   0.0s
[CV 2/5] END ...............C=6.82, tol=0.00068;, score=0.849 total time=   0.0s
[CV 3/5] END ...............C=6.82, tol=0.00068;, score=0.871 total time=   0.0s
[CV 4/5] END ...............C=6.82, tol=0.00068;, score=0.913 total time=   0.0s
[CV 5/5] END ...............C=6.82, tol=0.00068;, score=0.880 total time=   0.0s
[CV 1/5] END .C=4.3500000000000005, tol=0.00067;, score=0.882 total time=   0.0s
[CV 2/5] END .C=4.3500000000000005, tol=0.00067;, score=0.849 total time=   0.0s
[CV 3/5] END .C=4.3500000000000005, tol=0.00067;, score=0.871 total time=   0.0s
[CV 4/5] END .C=4.3500000000000005, tol=0.00067;, score=0.902 total time=   0.0s
[CV 5/5] END .C=4.3500000000000005, tol=0.00067;, score=0.880 total time=   0.0s
[CV 1/5] END C=4.26, tol=0.00013000000000000002;, score=0.882 total time=   0.0s
[CV 2/5] END C=4.26, tol=0.00013000000000000002;

0,1,2
,estimator,LogisticRegre...max_iter=1000)
,param_distributions,"{'C': array([0. , ..., 9.98, 9.99]), 'tol': array([0.0e+0...-04, 9.9e-04])}"
,n_iter,10
,scoring,
,n_jobs,
,refit,True
,cv,
,verbose,3
,pre_dispatch,'2*n_jobs'
,random_state,0

0,1,2
,penalty,'l2'
,dual,False
,tol,np.float64(0.00068)
,C,np.float64(6.82)
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [14]:
# List the best parameters for this dataset
print(random_clf.best_params_)

{'tol': np.float64(0.00068), 'C': np.float64(6.82)}


In [15]:
# List the best score
print(random_clf.best_score_)

0.8791257597007947


In [16]:
# Make predictions with the hypertuned model
predictions = random_clf.predict(X_test)

In [17]:
# Calculate the classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

    negative       0.91      0.97      0.94       137
    positive       0.56      0.28      0.37        18

    accuracy                           0.89       155
   macro avg       0.73      0.62      0.66       155
weighted avg       0.87      0.89      0.87       155

