## Logistic Regression (OVO - One vs One and OVR - One vs Rest)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Making dataset

In [2]:
from sklearn.datasets import make_classification

In [3]:
## create the dataset
X, y = make_classification(n_samples=10000, n_features=10, n_classes=2, random_state=42)

In [5]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.633563,0.357385,-0.503931,0.935066,0.647981,-0.050796,-1.933989,2.081684,0.041266,-0.258298
1,1.283905,1.109459,-0.908953,1.006586,0.492219,1.107295,1.243526,-0.172200,1.150359,0.147744
2,-0.966476,-0.593314,0.458020,1.032323,1.283685,-0.317640,1.499045,0.434477,0.423678,1.251380
3,2.429309,-1.306530,-1.869925,3.092164,2.028800,-0.879635,-0.393494,-0.101213,-1.624066,0.443553
4,-1.204798,0.078464,0.705181,0.224765,0.618707,1.534946,-0.302288,2.325055,0.495505,0.538133
...,...,...,...,...,...,...,...,...,...,...
9995,-1.606404,0.228927,0.959690,0.145821,0.682755,-0.927143,-0.280438,0.789222,-1.330100,-1.463687
9996,1.154048,-0.120265,-0.643621,-0.467386,-0.825604,0.725398,-1.439272,-1.132146,1.511610,-0.114986
9997,0.841468,-0.593749,-0.391671,-0.955036,-1.169619,0.683856,-1.629486,0.289335,-0.434358,-1.271335
9998,1.080252,-0.607761,-0.488605,-1.338506,-1.605447,-1.689724,0.202908,0.291496,0.827980,-1.069399


In [6]:
pd.DataFrame(y)

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,0
...,...
9995,0
9996,0
9997,0
9998,0


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.30, random_state=42)

### Implementing Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression

In [10]:
logistic_reg = LogisticRegression()

In [12]:
logistic_reg.fit(X_train, y_train)

In [13]:
y_pred = logistic_reg.predict(X_test)

In [15]:
print(y_pred)

[1 1 1 ... 0 1 0]


In [16]:
y_proba = logistic_reg.predict_proba(X_test)

In [17]:
print(y_proba)

[[0.01640705 0.98359295]
 [0.11288143 0.88711857]
 [0.11091352 0.88908648]
 ...
 [0.95175071 0.04824929]
 [0.47883592 0.52116408]
 [0.97621358 0.02378642]]


### Performance Metrics

In [18]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [19]:
score = accuracy_score(y_test, y_pred)
score

0.8923333333333333

In [20]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[1329,  155],
       [ 168, 1348]])

In [22]:
cr = classification_report(y_test, y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.89      0.90      0.89      1484
           1       0.90      0.89      0.89      1516

    accuracy                           0.89      3000
   macro avg       0.89      0.89      0.89      3000
weighted avg       0.89      0.89      0.89      3000



## Hyperparameter Tuning and Cross Validation

In [23]:
logistic_reg = LogisticRegression()

In [43]:
penalty = ['l1', 'l2', 'elasticnet']
c_vals = [0.1,0.01,1.0,10.0,100.0]
solvers = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']

In [44]:
hyperparameters = dict(penalty = penalty, C = c_vals, solver = solvers)

In [45]:
hyperparameters

{'penalty': ['l1', 'l2', 'elasticnet'],
 'C': [0.1, 0.01, 1.0, 10.0, 100.0],
 'solver': ['lbfgs',
  'liblinear',
  'newton-cg',
  'newton-cholesky',
  'sag',
  'saga']}

#### Grid Search CV

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [50]:
cv = StratifiedKFold()
grid = GridSearchCV(estimator=logistic_reg, param_grid=hyperparameters, scoring='accuracy', cv=cv, n_jobs=-1)

In [51]:
grid

In [52]:
grid.fit(X_train, y_train)

250 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/dhruvsmac/Desktop/development/machine learning/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/dhruvsmac/Desktop/development/machine learning/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/dhruvsmac/Desktop/development/machine learning/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py"

In [54]:
grid.best_params_

{'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}

In [55]:
grid.best_score_

np.float64(0.8917142857142857)

In [56]:
y_pred = grid.predict(X_test)

In [58]:
## Scoring
score = accuracy_score(y_test, y_pred)
score

0.895

In [61]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[1332,  152],
       [ 163, 1353]])

In [60]:
cr = classification_report(y_test, y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.89      0.90      0.89      1484
           1       0.90      0.89      0.90      1516

    accuracy                           0.90      3000
   macro avg       0.89      0.90      0.89      3000
weighted avg       0.90      0.90      0.90      3000

