# Simple machine learning with Pandas

For this module, we will be exploring simple machine learning techniques using the pandas library. You will learn about:

* Prepping data
* Exploring data
* Cleaning data
* Making a model
* Tuning the model
* Evaluating the model
 

## Get Data

In [None]:
# https://archive.ics.uci.edu/ml/datasets/heart+disease
import numpy as np
import pandas as pd

import glob

In [None]:
# Column names supplied to read_csv, since the data files are read without
# a header row.
names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']

# glob returns matches in file-system order, which can differ between
# machines. Sorting pins the row order of the concatenated frame so that
# later seeded operations (e.g. the train/test split) pick the same rows.
files = sorted(glob.glob('data/processed*.data'))

# '?' marks a missing value in the raw files; every file is parsed with the
# pyarrow engine into pyarrow-backed columns and stacked into one frame.
df = pd.concat([pd.read_csv(f, sep=',', names=names, na_values='?',
                            dtype_backend='pyarrow',
                            engine='pyarrow')
                for f in files], axis='index', ignore_index=True)
df

In [None]:
df.fbs.value_counts()

In [None]:
df.dtypes

In [None]:
def tweak_heart(df): 
  return (df
   .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]', 
            'chol': 'int16[pyarrow]',
            'num': 'int8[pyarrow]', 'trestbps': 'int16[pyarrow]',
            'fbs': 'bool[pyarrow]', 'restecg': 'int8[pyarrow]',
            'thalach': 'int16[pyarrow]', 'exang': 'bool[pyarrow]',
           })
   .assign(sex=df.sex
                .astype('string[pyarrow]')
                .replace({'1.0': 'male', '0.0':'female'}),
          thal=(df.thal.astype('string[pyarrow]')
                       .replace({'3.0': 'normal',
                          '6.0': 'fixed', '7.0': 'reversible'})
                       .astype('category')),
          slope=(df.slope
                .astype('string[pyarrow]')
                .replace({'1.0': 'upsloping', '2.0': 'flat',
                         '3.0': 'downsloping'})
               ),
    )
  )

heart = tweak_heart(df)

In [None]:
heart

In [None]:
df.memory_usage(deep=True).sum()

In [None]:
heart.memory_usage(deep=True).sum()

## Explore Data

In [None]:
# Mean of every numeric column for each value of `num`. With axis='index'
# the colour gradient is computed separately for each column, so shades
# compare the `num` groups within one feature.
(heart
    .groupby('num')
    .mean(numeric_only=True)
    .style
    .background_gradient(cmap='RdBu', axis='index')
)

In [None]:
# Same per-`num` means, transposed so each feature is a row. With
# axis='columns' the gradient is computed separately for each row.
# NOTE(review): the cast to float[pyarrow] presumably gives the transposed
# frame one uniform dtype before styling — confirm.
(heart
    .groupby('num')
    .mean(numeric_only=True)
    .T
    .astype('float[pyarrow]')
    .style
    .background_gradient(cmap='RdBu', axis='columns')
)

In [None]:
# Spearman rank correlation between the numeric columns. vmin/vmax pin the
# colour scale to the full [-1, 1] range so colours are comparable across
# cells rather than rescaled to the observed extremes.
(heart
    .corr(method='spearman', numeric_only=True)
    .style
    .background_gradient(cmap='RdBu', axis='columns', vmin=-1, vmax=1)
)

In [None]:
heart.plot.scatter(x='num', y='thalach')

In [None]:
import numpy as np
def jitter(df, col, amount=1):
    """Return ``df[col]`` with uniform random noise added.

    The noise lies in ``[-amount / 2, amount / 2)`` and is drawn from NumPy's
    global random state, so results vary between calls unless it is seeded.
    """
    noise = np.random.random(len(df)) * amount
    return df[col] + noise - amount / 2
# Scatter of thalach against a jittered copy of `num` (noise of up to ±0.4),
# so points sharing the same `num` value no longer sit on one vertical line;
# alpha=.5 makes overlapping points visible.
(heart
 .assign(numj=lambda df_:jitter(df_, 'num', .8))
 .plot.scatter(x='numj', y='thalach', alpha=.5)
)

In [None]:
heart.plot.scatter(x='num', y='cp')

In [None]:
import numpy as np
def jitter(df, col, amount=1):
    """Add uniform noise of total width ``amount`` to column ``col``.

    Each value is shifted by a random offset in
    ``[-amount / 2, amount / 2)`` taken from NumPy's global random state.
    Returns the shifted column; ``df`` is left unchanged.
    """
    half_width = amount / 2
    scaled = np.random.random(len(df)) * amount
    return df[col] + scaled - half_width
# Both axes hold a small set of discrete codes, so both are jittered (noise
# of up to ±0.4 each) to turn stacked points into visible clouds.
(heart
 .assign(numj=lambda df_:jitter(df_, 'num', .8),
         cpj=lambda df_:jitter(df_, 'cp', .8))
 .plot.scatter(x='numj', y='cpj', alpha=.5)
)

In [None]:
(heart
 .groupby('sex')
 .mean(numeric_only=True)
 .T
)

In [None]:
heart.assign(sex=(heart.sex == 'male').astype('int8[pyarrow]')).corr(numeric_only=True)

## Prepping for ML

In [None]:
heart.columns

In [None]:
heart.num

In [None]:
heart.num.value_counts()

In [None]:
heart.isna()

In [None]:
heart.isna().any()

In [None]:
heart.isna().sum()

In [None]:
heart.isna().mean()*100

In [None]:
heart.num.value_counts(dropna=False)

In [None]:
heart.dtypes

In [None]:
heart.select_dtypes('int64[pyarrow]')

In [None]:
heart.select_dtypes('number')

In [None]:
import xgboost as xg
from sklearn import model_selection

# Feature matrix: every column except the target `num`, converted away from
# the pyarrow-backed dtypes into floats and pandas categoricals.
X = (heart
     .assign(**heart.select_dtypes(object).astype('category'),
            # boolean is not supported 
             **heart.select_dtypes('number').astype(float),
            )
     # sex/slope (pyarrow strings) and fbs/exang (pyarrow booleans) are
     # converted explicitly — presumably because the selectors above do not
     # pick them up; verify against heart.dtypes.
     .astype({'sex': 'category', 'fbs': float, 'exang': float,
             'slope': 'category'})
     .drop(columns=['num'])
     
    )
# Target: the `num` column.
y = heart.num

# Stratify on the target so the class proportions are kept in both splits;
# random_state makes the split repeatable for a fixed row order.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, stratify=y, random_state=42)

# Baseline model with default hyperparameters; enable_categorical lets
# XGBoost consume the category-typed columns directly.
xgb = xg.XGBClassifier(enable_categorical=True, tree_method='hist')
xgb.fit(X_train, y_train)

In [None]:
xgb.score(X_test, y_test)

In [None]:
xgb.score(X_train, y_train)

## Tuning the Model

In [None]:
!pip install hyperopt

In [None]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import accuracy_score, roc_auc_score  
from sklearn import metrics

from typing import Any, Dict, Union

def hyperparameter_tuning(space: Dict[str, Union[float, int]], 
                    X_train: pd.DataFrame, y_train: pd.Series, 
                    X_test: pd.DataFrame, y_test: pd.Series, 
                    early_stopping_rounds: int=50,
                    metric:callable=accuracy_score) -> Dict[str, Any]:
    """
    Train and score one XGBoost classifier for a hyperopt trial.

    This function takes a dictionary of hyperparameters, training 
    and test data, and an optional value for early stopping rounds, 
    and returns a dictionary in the shape hyperopt's ``fmin`` expects.
    The model is trained using the training data and evaluated on the 
    test data. Because ``fmin`` minimizes, the loss is the negative of 
    ``metric`` computed on the test predictions.

    Parameters
    ----------
    space : Dict[str, Union[float, int]]
        A dictionary of hyperparameters for the XGBoost classifier.
        It is not modified; ``max_depth`` is converted to ``int`` on a
        copy, every other value is passed through unchanged.
    X_train : pd.DataFrame
        The training data.
    y_train : pd.Series
        The training target.
    X_test : pd.DataFrame
        The test data. It is also passed as the last evaluation set
        during fitting.
    y_test : pd.Series
        The test target.
    early_stopping_rounds : int, optional
        The number of early stopping rounds to use. The default value 
        is 50.
    metric : callable
        Metric to maximize, called as ``metric(y_test, predictions)``.
        Default is accuracy.

    Returns
    -------
    Dict[str, Any]
        A dictionary with keys ``'loss'`` (float, the negated metric),
        ``'status'`` (``STATUS_OK``) and ``'model'`` (the fitted
        XGBoost classifier).
    """
    # Only integer-valued XGBoost parameters are truncated: hp.quniform
    # yields floats such as 4.0 for max_depth. reg_alpha is a continuous
    # L1 penalty and must keep its sampled value; truncating it would make
    # the evaluated model differ from the value hyperopt reports as best.
    int_vals = ['max_depth']
    model_params = {k: (int(val) if k in int_vals else val)
                    for k, val in space.items()}
    model_params['early_stopping_rounds'] = early_stopping_rounds
    model = xg.XGBClassifier(**model_params)
    evaluation = [(X_train, y_train),
                  (X_test, y_test)]
    model.fit(X_train, y_train,
              eval_set=evaluation, 
              verbose=False)

    pred = model.predict(X_test)
    score = metric(y_test, pred)
    # Negate so that minimizing the loss maximizes the metric.
    return {'loss': -score, 'status': STATUS_OK, 'model': model}

# Fixed settings carried through every tuning round.
params = {'random_state': 42}

# Search spaces tuned one group at a time. After each round the best values
# found are frozen into `params`, so later rounds search only their own
# group on top of the earlier results.
rounds = [#{'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),},
          {'max_depth': hp.quniform('max_depth', 1, 8, 1),  # tree
           'min_child_weight': hp.loguniform('min_child_weight', -2, 3)},
          {'subsample': hp.uniform('subsample', 0.5, 1),   # stochastic
           'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1)},

          {'reg_alpha': hp.uniform('reg_alpha', 0, 10),
            'reg_lambda': hp.uniform('reg_lambda', 1, 10),},
          {'gamma': hp.loguniform('gamma', -10, 10)}, # regularization
          {'learning_rate': hp.loguniform('learning_rate', -7, 0)} # boosting
]

all_trials = []
# The loop variable is deliberately not called `round`: at notebook top level
# that name would shadow the builtin round() for every later cell.
for search_space in rounds:
    params = {**params, **search_space,
              'enable_categorical': True, 'tree_method': 'hist'}
    trials = Trials()
    # Each trial is scored with the weighted F1 on the test split.
    best = fmin(fn=lambda space: hyperparameter_tuning(space, X_train,
                                        y_train, X_test, y_test,
                                        metric=lambda *args: metrics.f1_score(*args, average='weighted')),
        space=params,
        algo=tpe.suggest,
        max_evals=20,
        trials=trials,
    )
    # Replace this round's search expressions with the best values found.
    params = {**params, **best}
    all_trials.append(trials)

In [None]:
params

In [None]:
# Hyperparameters written out explicitly so this cell can be run without
# repeating the search (presumably copied from an earlier tuning run).
params = dict(
    random_state=42,
    max_depth=4,
    min_child_weight=14.29114166877302,
    enable_categorical=True,
    tree_method='hist',
    subsample=0.721322460424803,
    colsample_bytree=0.5936738818753071,
    reg_alpha=0.4623525681816598,
    reg_lambda=6.545829654680657,
    gamma=0.003379106348942887,
    learning_rate=0.8040669979502167,
)

# Allow up to 2500 trees, with early stopping after 50 rounds.
xgb_step = xg.XGBClassifier(
    **params,
    early_stopping_rounds=50,
    n_estimators=2500,
)

# Evaluation metrics are printed every 100 boosting rounds.
xgb_step.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=100,
)

In [None]:
xgb_step.score(X_test, y_test)

In [None]:
xgb.score(X_test, y_test)

In [None]:
xgb_step.score(X_train, y_train)

In [None]:
xgb.score(X_train, y_train)

## Confusion Matrix

In [None]:
metrics.ConfusionMatrixDisplay.from_estimator(xgb_step, X_test, y_test,
        cmap='Blues')

In [None]:
metrics.ConfusionMatrixDisplay.from_estimator(xgb_step, X_train, y_train,
        cmap='Blues')

In [None]:
# Checking for overfitting with default model
metrics.ConfusionMatrixDisplay.from_estimator(xgb, X_test, y_test,
        cmap='Blues')

In [None]:
metrics.ConfusionMatrixDisplay.from_estimator(xgb, X_train, y_train,
        cmap='Blues')