In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
import os

In [2]:
wisconsin = pd.read_csv('G:/My Drive/#myCDACfiles/MachineLearning/Datasets/BreastCancer.csv', index_col=0)
le = LabelEncoder()
wisconsin['Class'] = le.fit_transform(wisconsin['Class'])
le.classes_

array(['Benign', 'Malignant'], dtype=object)

In [3]:
X, y = wisconsin.drop('Class', axis=1), wisconsin['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=25, stratify=y)

In [4]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.intercept_, lr.coef_

(array([-10.13685623]),
 array([[ 0.54707603,  0.04474491,  0.20952885,  0.19219542, -0.00722953,
          0.39656381,  0.60692755,  0.19441253,  0.53085196]]))

In [5]:
X_test.shape

(210, 9)

In [6]:
lr.predict(X_test)

array([0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0])

In [7]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(confusion_matrix(y_test, y_pred))

[[137   1]
 [  6  66]]


In [8]:
(137+66)/210

0.9666666666666667

In [9]:
accuracy_score(y_test, y_pred)

0.9666666666666667

In [10]:
hr = pd.read_csv('G:/My Drive/#myCDACfiles/MachineLearning/Datasets/HR_comma_sep.csv')
X, y = hr.drop('left', axis=1), hr['left']
ohe = OneHotEncoder(drop='first', sparse_output=False).set_output(transform='pandas')
col_trnf = ColumnTransformer([('OHE',ohe, make_column_selector(dtype_include=object))],
                             remainder = 'passthrough', verbose_feature_names_out=False)
col_trnf = col_trnf.set_output(transform='pandas')
X = col_trnf.fit_transform(X)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=25, stratify=y)

In [12]:
lr = LogisticRegression(solver='liblinear', penalty='l1')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[3206  223]
 [ 672  398]]
0.8010669037563903


In [13]:
lr = LogisticRegression(solver='lbfgs', penalty='l2')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.93      0.88      3429
           1       0.65      0.41      0.50      1070

    accuracy                           0.81      4499
   macro avg       0.74      0.67      0.69      4499
weighted avg       0.79      0.81      0.79      4499



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Hyper params tuning

In [18]:
solvers = ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
Cs = np.linspace(0.001, 15, 20)
penalties = ['l2',None]
scores = []
for s in solvers:
    for p in penalties:
        for c in Cs:
            lr = LogisticRegression(solver=s, penalty=p, C=c)
            lr.fit(X_train, y_train)
            y_pred = lr.predict(X_test)
            scores.append([s, p,c, accuracy_score(y_test, y_pred)])
df_scores = pd.DataFrame(scores, columns=['solver', 'penalty','C', 'score'])
df_scores.sort_values('score', ascending=False)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

Unnamed: 0,solver,penalty,C,score
15,lbfgs,l2,11.842316,0.812625
6,lbfgs,l2,4.737526,0.811291
7,lbfgs,l2,5.526947,0.809958
14,lbfgs,l2,11.052895,0.809513
9,lbfgs,l2,7.105789,0.809291
...,...,...,...,...
138,sag,l2,14.210579,0.750611
123,sag,l2,2.369263,0.750389
122,sag,l2,1.579842,0.750389
158,sag,,14.210579,0.750389
