In [3]:
%matplotlib inline

In [76]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pickle

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer

from sklearn.linear_model import LogisticRegression, Lasso, Ridge, ElasticNet

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import classification_report, roc_curve

## Misc notes

conda env create  
Docker  
ML Flow  

pareidolia

sklearn pipelines  
luigi


sklearn make_column_selector

pickle in python

irreducible error (noise) - неизбежна грешка

variance, statistical error

bias, assumption error

L1 Regularization (Lasso):  

Minimizes the sum of absolute values of the coefficients.
Encourages sparsity by forcing some coefficients to be exactly zero, which helps in feature selection.

L2 Regularization (Ridge):

Minimizes the sum of squares of the coefficients.
Discourages large weights but keeps all coefficients small rather than setting some of them to zero.

L0 Regularization:

Counts the number of non-zero parameters. Ideally, it would produce the sparsest possible model, but it's hard to optimize and not used in practice.

L∞ Regularization:

Minimizes the maximum absolute value of the coefficients, controlling the influence of the largest weight and ensuring robustness to extremes.

hyperopt (also for sklearn) - alternative to GridSearchCV

optuna - mostly for neural networks

Yellowbrick - visualization library

## Lecture demo

In [5]:
EPS = 1e-10

In [6]:
diabetes_data = pd.read_csv("data/diabetic_data.csv")

In [7]:
diabetes_data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [8]:
diabetes_data.columns[diabetes_data.columns.str.contains("_id")]

Index(['encounter_id', 'admission_type_id', 'discharge_disposition_id',
       'admission_source_id'],
      dtype='object')

In [9]:
diabetes_data.pioglitazone.unique()

array(['No', 'Steady', 'Up', 'Down'], dtype=object)

In [10]:
diabetes_data.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

In [11]:
diabetes_attributes = diabetes_data.drop(columns = "readmitted")
diabetes_target = diabetes_data.readmitted

In [12]:
diabetes_attributes_dummies = pd.get_dummies(diabetes_attributes)

In [13]:
logistic_regression = LogisticRegression()

In [14]:
logistic_regression.fit(diabetes_attributes_dummies, diabetes_target)

In [15]:
logistic_regression.coef_

array([[-3.31701395e-09, -3.86905107e-09, -1.15814534e-15, ...,
        -3.26369841e-16, -1.63318872e-16, -3.02903976e-16],
       [-1.50329991e-10,  4.51003945e-09, -1.47130836e-16, ...,
        -1.14848644e-16, -1.11876233e-16,  7.41720589e-17],
       [ 3.46734394e-09, -6.40988385e-10,  1.30527618e-15, ...,
         4.41218484e-16,  2.75195106e-16,  2.28731917e-16]])

In [16]:
scaler = MinMaxScaler()

In [17]:
diabetes_attributes_scaled = scaler.fit_transform(diabetes_attributes_dummies)

In [18]:
logistic_regression.fit(diabetes_attributes_scaled, diabetes_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
logistic_regression.score(diabetes_attributes_scaled, diabetes_target)

0.5951987893795571

In [20]:
logistic_regression.score(diabetes_attributes_dummies, diabetes_target)



0.5386966177308727

In [21]:
pipeline = Pipeline([
    ("scaler", MinMaxScaler()),
    ("model", LogisticRegression())
])

In [22]:
pipeline

In [23]:
sample_data = diabetes_data.sample(5000, random_state = 42)

In [24]:
sample_attributes = sample_data.drop(columns = "readmitted")

In [25]:
sample_target = sample_data.readmitted

In [26]:
sample_attributes = pd.get_dummies(sample_attributes)

In [27]:
pipeline.fit(sample_attributes, sample_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
ohe = OneHotEncoder()

In [29]:
sample_data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [30]:
categorical_columns = sample_data.dtypes[sample_data.dtypes == np.object_].index.values

In [31]:
categorical_columns = categorical_columns[:-1]

In [32]:
categorical_columns

array(['race', 'gender', 'age', 'weight', 'payer_code',
       'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum',
       'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone',
       'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide',
       'citoglipton', 'insulin', 'glyburide-metformin',
       'glipizide-metformin', 'glimepiride-pioglitazone',
       'metformin-rosiglitazone', 'metformin-pioglitazone', 'change',
       'diabetesMed'], dtype=object)

In [33]:
numerical_columns = [
    'admission_type_id', 'discharge_disposition_id', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
    'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses'
]

In [34]:
preprocessor = ColumnTransformer([
    ("categorical", OneHotEncoder(), categorical_columns),
    ("numerical", MinMaxScaler(), numerical_columns)
], remainder = "passthrough")

In [35]:
preprocessor

In [36]:
log_transformer = FunctionTransformer(np.log10)

In [37]:
number_processor = Pipeline([
    ("log_transformer", FunctionTransformer(lambda x: np.log10(x + EPS))),
    ("minmax", MinMaxScaler())
])

In [38]:
number_processor

In [39]:
preprocessor = ColumnTransformer([
    ("categorical", OneHotEncoder(), categorical_columns),
    ("numerical", number_processor, numerical_columns)
])

In [40]:
preprocessor

In [41]:
pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("classifier", LogisticRegression())
])

In [42]:
sample_attributes = sample_data.drop(columns = "readmitted")
sample_target = sample_data.readmitted

In [43]:
pipeline.fit(sample_attributes, sample_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [44]:
pipeline.score(sample_attributes, sample_target)

0.6536

In [45]:
pickle.dump(logistic_regression, open("prediction_pipeline.pkl", "wb"))

In [46]:
pipeline.predict(sample_attributes)

array(['NO', 'NO', 'NO', ..., 'NO', '>30', 'NO'], dtype=object)

In [47]:
diabetes_data = pd.read_csv("data/diabetic_data.csv")

In [48]:
sample_data_2 = diabetes_data.sample(5000, random_state = 12341234)
attributes = sample_data.drop(columns = ["readmitted"])
target = sample_data.readmitted

In [49]:
pipeline.fit(attributes, target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [52]:
pipeline["classifier"].coef_

array([[-0.55514279,  0.02120968,  0.04448777, ...,  0.13727923,
         0.3915153 , -0.13270708],
       [ 0.12678974,  0.08800068,  0.03791863, ...,  0.19328463,
         0.18011862,  0.59068217],
       [ 0.42835304, -0.10921036, -0.0824064 , ..., -0.33056387,
        -0.57163392, -0.45797509]])

In [53]:
pipeline_reg = Pipeline([
    ("preprocess", preprocessor),
    ("classifier", LogisticRegression(C = 0.00001))
])

In [54]:
pipeline_reg.fit(attributes, target)

In [55]:
pipeline_reg["classifier"].coef_

array([[-7.88055964e-05,  3.02571721e-05, -7.33316206e-06, ...,
         1.79056917e-04,  7.65948701e-04,  1.09658901e-04],
       [-1.36925626e-04, -6.14118072e-05, -2.42008740e-05, ...,
         6.62637403e-04,  1.51759689e-03,  3.15485794e-04],
       [ 2.15731223e-04,  3.11546351e-05,  3.15340360e-05, ...,
        -8.41694319e-04, -2.28354559e-03, -4.25144695e-04]])

In [57]:
train_test_split(attributes)

[        encounter_id  patient_nbr             race  gender      age    weight  \
 54218      158104902     70891839  AfricanAmerican  Female  [40-50)         ?   
 56962      163012440     53776458        Caucasian  Female  [70-80)         ?   
 14311       55890654     20016900        Caucasian  Female  [70-80)         ?   
 1566        10645878     12590199        Caucasian  Female  [70-80)         ?   
 85393      270267876     67982049        Caucasian    Male  [80-90)         ?   
 ...              ...          ...              ...     ...      ...       ...   
 100121     421925582     57454947        Caucasian  Female  [70-80)         ?   
 81409      252264120     85416777        Caucasian    Male  [60-70)  [75-100)   
 6301        31419150     14364063        Caucasian  Female  [70-80)         ?   
 35956      110939484     19274094        Caucasian  Female  [70-80)         ?   
 65004      181032426     21117519        Caucasian  Female  [60-70)         ?   
 
         admis

In [67]:
attributes_train, attributes_test, target_train, target_test = train_test_split(
    attributes,
    target,
    test_size = 0.1,
    random_state = 121212,
    stratify = target
)

In [60]:
attributes_train.shape, attributes_test.shape, target_train.shape, target_test.shape

((4500, 49), (500, 49), (4500,), (500,))

In [64]:
target.value_counts(normalize = True)

readmitted
NO     0.5346
>30    0.3514
<30    0.1140
Name: proportion, dtype: float64

In [65]:
target_train.value_counts(normalize = True)

readmitted
NO     0.534667
>30    0.351333
<30    0.114000
Name: proportion, dtype: float64

In [66]:
target_test.value_counts(normalize = True)

readmitted
NO     0.534
>30    0.352
<30    0.114
Name: proportion, dtype: float64

In [68]:
pipeline.fit(attributes_train, target_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [69]:
pipeline.score(attributes_train, target_train)

0.6611111111111111

In [None]:
pipeline.score(attributes_test, target_test)

In [72]:
print(classification_report(target_train, pipeline.predict(attributes_train)))

              precision    recall  f1-score   support

         <30       0.67      0.18      0.28       513
         >30       0.62      0.54      0.58      1581
          NO       0.68      0.84      0.75      2406

    accuracy                           0.66      4500
   macro avg       0.66      0.52      0.54      4500
weighted avg       0.66      0.66      0.64      4500



In [74]:
roc_curve(pipeline.predict_proba)

# or

roc_curve(pipeline.decision_function)

# and not .predict

TypeError: roc_curve() missing 1 required positional argument: 'y_score'

In [79]:
cv = GridSearchCV(pipeline, param_grid = {
    "classifier__C": [1, 10, 100],
    "preprocess__numerical__minmax__feature_range": [(-1, 1), (0, 1), (-5, 5)]
}, cv = 10).fit(attributes, target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 444, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/pipeline.py", line 718, in score
    Xt = transform.transform(Xt)
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3

In [81]:
cv.best_estimator_

In [82]:
cv.best_params_

{'classifier__C': 1, 'preprocess__numerical__minmax__feature_range': (-1, 1)}

In [83]:
cv.cv_results_

{'mean_fit_time': array([0.36259732, 0.36528413, 0.38364916, 0.41433687, 0.37383745,
        0.38515785, 0.46861107, 0.43655906, 0.41420364]),
 'std_fit_time': array([0.02694638, 0.04796   , 0.07022053, 0.02125337, 0.04679792,
        0.04000985, 0.06384144, 0.10257405, 0.05966125]),
 'mean_score_time': array([0.00872223, 0.00724046, 0.00732613, 0.00804622, 0.00730131,
        0.00718019, 0.00865386, 0.00897255, 0.00826979]),
 'std_score_time': array([0.00558287, 0.00051906, 0.00036108, 0.000887  , 0.00050358,
        0.00038222, 0.00068029, 0.00264954, 0.00177019]),
 'param_classifier__C': masked_array(data=[1, 1, 1, 10, 10, 10, 100, 100, 100],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_preprocess__numerical__minmax__feature_range': masked_array(data=[(-1, 1), (0, 1), (-5, 5), (-1, 1), (0, 1), (-5, 5),
                    (-1, 1), (0, 1), (-5, 5)],
              mas