# **`LOGISTIC REGRESSION HYPERPARAMETER`**

In [1]:
import pandas as pd
## Lets see one more example of complex data

# make a prediction with multinomial logistic model:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

#define dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_classes=2, random_state=1)

In [2]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

## Model training hyperparameter tuning
## Gridsearchcv

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix ,accuracy_score , classification_report
import warnings
warnings.filterwarnings("ignore")

In [4]:
parameter = {"penalty":('l1', 'l2', 'elasticnet'),"C":[1,10,20,30]}

In [5]:
classifier = LogisticRegression()

In [6]:
clf = GridSearchCV(classifier,param_grid=parameter,cv=5)

In [7]:
# Splitting of Training data into train and validation
clf.fit(X_train,y_train)

In [8]:
clf.best_params_

{'C': 1, 'penalty': 'l2'}

In [9]:
clf.best_score_

0.81875

In [10]:
classifier = LogisticRegression(C=1,penalty="l2")

In [11]:
classifier.fit(X_train,y_train)

In [12]:
y_pred = classifier.predict(X_test)

In [13]:
y_pred

array([1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1])

In [14]:
classifier.predict_proba(X_test)

array([[0.04642154, 0.95357846],
       [0.0720128 , 0.9279872 ],
       [0.76804229, 0.23195771],
       [0.60627995, 0.39372005],
       [0.02549653, 0.97450347],
       [0.39168378, 0.60831622],
       [0.70975553, 0.29024447],
       [0.24680441, 0.75319559],
       [0.49009698, 0.50990302],
       [0.19073432, 0.80926568],
       [0.77749059, 0.22250941],
       [0.93050038, 0.06949962],
       [0.95630477, 0.04369523],
       [0.83426471, 0.16573529],
       [0.95639081, 0.04360919],
       [0.91755689, 0.08244311],
       [0.047208  , 0.952792  ],
       [0.19753084, 0.80246916],
       [0.82997678, 0.17002322],
       [0.32847084, 0.67152916],
       [0.88317244, 0.11682756],
       [0.38723375, 0.61276625],
       [0.9188561 , 0.0811439 ],
       [0.89388609, 0.10611391],
       [0.11796127, 0.88203873],
       [0.65668978, 0.34331022],
       [0.80843533, 0.19156467],
       [0.60902702, 0.39097298],
       [0.65765904, 0.34234096],
       [0.91256312, 0.08743688],
       [0.

In [15]:
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[92 17]
 [27 64]]
0.78
              precision    recall  f1-score   support

           0       0.77      0.84      0.81       109
           1       0.79      0.70      0.74        91

    accuracy                           0.78       200
   macro avg       0.78      0.77      0.78       200
weighted avg       0.78      0.78      0.78       200



## Randomized SearchCV

In [16]:
from sklearn.model_selection import RandomizedSearchCV

In [17]:
random_cf = RandomizedSearchCV(LogisticRegression(),param_distributions=parameter,cv=5,n_iter=20)

In [18]:
random_cf

In [19]:
random_cf.fit(X_train,y_train)

In [20]:
random_cf.best_params_

{'penalty': 'l2', 'C': 1}

## Internal assignment : Logistic regression on IRIS dataset

In [21]:
from sklearn.datasets import load_iris

In [22]:
dataset = load_iris()

In [30]:
print(dataset.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [38]:
dataset.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [40]:
df = pd.DataFrame(dataset.data,columns=dataset.feature_names)

In [41]:
df["target"] = dataset.target

## data for binomial logistic regression

In [45]:
data = df[df["target"] != 2]

In [46]:
data.target.unique()

array([0, 1])

In [67]:
##X and y
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

In [68]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

## GRIDSEARCHCV

In [69]:
parameter = {"penalty":('l1', 'l2', 'elasticnet'),"C":[1,10,20,30], "solver" : ('lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga')}

In [70]:
classifier = LogisticRegression()

In [71]:
clf = GridSearchCV(classifier,param_grid=parameter,cv=10)

In [72]:
clf.fit(X_train,y_train)

In [73]:
clf.best_params_

{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}

In [74]:
clf.best_score_

1.0

In [75]:
classifier = LogisticRegression(C=1,penalty = "l1",solver="saga")

In [76]:
classifier.fit(X_train,y_train)

In [77]:
y_pred = classifier.predict(X_test)

In [78]:
y_pred

array([1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0])

In [91]:
classifier.predict_proba(X_test)

array([[2.30776236e-06, 9.99997692e-01],
       [2.35992477e-04, 9.99764008e-01],
       [1.47336946e-05, 9.99985266e-01],
       [9.97497987e-01, 2.50201338e-03],
       [9.93844782e-01, 6.15521843e-03],
       [9.98265718e-01, 1.73428163e-03],
       [9.99885688e-01, 1.14312154e-04],
       [9.52701622e-04, 9.99047298e-01],
       [9.98833075e-01, 1.16692464e-03],
       [9.99048103e-01, 9.51897133e-04],
       [9.96844590e-01, 3.15540971e-03],
       [9.96117750e-01, 3.88224999e-03],
       [2.94288139e-05, 9.99970571e-01],
       [9.99682153e-01, 3.17847359e-04],
       [9.74988869e-05, 9.99902501e-01],
       [9.99256068e-01, 7.43932404e-04],
       [8.89838540e-06, 9.99991102e-01],
       [3.39794377e-06, 9.99996602e-01],
       [9.98295049e-01, 1.70495119e-03],
       [9.96979749e-01, 3.02025067e-03]])

In [80]:
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[12  0]
 [ 0  8]]
1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         8

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20



## RANDOMIZEDSEARCCV

In [81]:
classifier = LogisticRegression()

In [82]:
clf = RandomizedSearchCV(classifier,param_distributions=parameter,cv=10)

In [83]:
clf.fit(X_train,y_train)

In [84]:
clf.best_params_

{'solver': 'newton-cholesky', 'penalty': 'l2', 'C': 30}

In [85]:
clf.best_score_

1.0

In [86]:
classifier = LogisticRegression(C=30,penalty="l2",solver="newton-cholesky")

In [87]:
classifier.fit(X_train,y_train)

In [88]:
y_pred = classifier.predict(X_test)

In [89]:
classifier.predict_proba(X_test)

array([[2.30776236e-06, 9.99997692e-01],
       [2.35992477e-04, 9.99764008e-01],
       [1.47336946e-05, 9.99985266e-01],
       [9.97497987e-01, 2.50201338e-03],
       [9.93844782e-01, 6.15521843e-03],
       [9.98265718e-01, 1.73428163e-03],
       [9.99885688e-01, 1.14312154e-04],
       [9.52701622e-04, 9.99047298e-01],
       [9.98833075e-01, 1.16692464e-03],
       [9.99048103e-01, 9.51897133e-04],
       [9.96844590e-01, 3.15540971e-03],
       [9.96117750e-01, 3.88224999e-03],
       [2.94288139e-05, 9.99970571e-01],
       [9.99682153e-01, 3.17847359e-04],
       [9.74988869e-05, 9.99902501e-01],
       [9.99256068e-01, 7.43932404e-04],
       [8.89838540e-06, 9.99991102e-01],
       [3.39794377e-06, 9.99996602e-01],
       [9.98295049e-01, 1.70495119e-03],
       [9.96979749e-01, 3.02025067e-03]])

In [90]:
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[12  0]
 [ 0  8]]
1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         8

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

