In [None]:
# Mount Google Drive (Colab only) so the project folders under
# /content/drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Replace the preinstalled xgboost with the newest release.
# NOTE(review): versions are not pinned. The recorded run resolved to
# xgboost 1.5.0 and scikit-learn 1.0.1; later releases may behave differently.
!pip uninstall xgboost -y
!pip install -U xgboost

# Reinstall scikit-learn the same way.
!pip uninstall scikit-learn -y
!pip install -U scikit-learn

Found existing installation: xgboost 0.90
Uninstalling xgboost-0.90:
  Successfully uninstalled xgboost-0.90
Collecting xgboost
  Downloading xgboost-1.5.0-py3-none-manylinux2014_x86_64.whl (173.5 MB)
[K     |████████████████████████████████| 173.5 MB 28 kB/s 
Installing collected packages: xgboost
Successfully installed xgboost-1.5.0


Found existing installation: scikit-learn 1.0.1
Uninstalling scikit-learn-1.0.1:
  Successfully uninstalled scikit-learn-1.0.1
Collecting scikit-learn
  Downloading scikit_learn-1.0.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (23.2 MB)
[K     |████████████████████████████████| 23.2 MB 1.5 MB/s 
Installing collected packages: scikit-learn
Successfully installed scikit-learn-1.0.1


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import numpy as np
import matplotlib.gridspec as gridspec
import seaborn as sns

from scipy.stats import ttest_ind
from scipy.stats import chi2_contingency
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import brier_score_loss
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import RocCurveDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
import warnings
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
import xgboost as xgb
import joblib
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
import os

In [None]:
# Project root on the mounted Drive; every other location hangs off it.
basedir = '/content/drive/MyDrive/recidivism/'

# Data folders, one per processing stage.
_data_root = os.path.join(basedir, 'data')
raw_datadir = os.path.join(_data_root, '01_raw')
interim_datadir = os.path.join(_data_root, '02_interim')
processed_datadir = os.path.join(_data_root, '03_processed')

# Destination for the joblib-persisted models.
modelsdir = os.path.join(basedir, 'models')

In [None]:
import sklearn
def get_feature_names(column_transformer):
    """Get feature names from all transformers.

    Accepts a fitted ColumnTransformer or Pipeline and recurses into nested
    pipelines. Every name is prefixed with the name of the transformer that
    produced it (``"<transformer>__<feature>"``); passthrough columns are
    reported without a prefix.

    A transformer is asked for its names through ``get_feature_names`` when
    it has that method, otherwise through ``get_feature_names_out``. The
    former was deprecated in scikit-learn 1.0 and removed in 1.2, so the
    fallback keeps the helper usable on newer releases. Transformers with
    neither method trigger a warning and contribute their input column
    names instead.

    NOTE(review): relies on the private ``_iter`` methods of Pipeline and
    ColumnTransformer, whose signatures are not stable across scikit-learn
    releases.

    Parameters
    ----------
    column_transformer : ColumnTransformer or Pipeline
        Fitted object whose output columns should be named.

    Returns
    -------
    feature_names : list of strings
        Names of the features produced by transform.
    """
    def input_names(name, column):
        # Prefixed input column labels. Empty when the columns are unknown
        # (pipeline steps carry column=None) or cannot be iterated
        # (e.g. a slice or a callable selector).
        if column is None:
            return []
        if isinstance(column, str):
            return [name + "__" + column]
        try:
            return [name + "__" + str(f) for f in column]
        except TypeError:
            return []

    # Name lookup for a single (name, transformer, columns) triple. All three
    # are passed explicitly instead of being read from the enclosing loop.
    def get_names(name, trans, column):
        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            return []
        if trans == 'passthrough':
            if hasattr(column_transformer, '_df_columns'):
                if ((not isinstance(column, slice))
                        and all(isinstance(col, str) for col in column)):
                    return column
                else:
                    return column_transformer._df_columns[column]
            else:
                indices = np.arange(column_transformer._n_features)
                return ['x%d' % i for i in indices[column]]

        # Keep the historical method first so existing output is unchanged
        # where it still exists; fall back to its replacement otherwise.
        for getter in ('get_feature_names', 'get_feature_names_out'):
            if hasattr(trans, getter):
                return [name + "__" + str(f) for f in getattr(trans, getter)()]

        # No naming method available: warn and use the input column names.
        warnings.warn("Transformer %s (type %s) does not "
                             "provide get_feature_names. "
                             "Will return input column names if available"
                             % (str(name), type(trans).__name__))
        return input_names(name, column)

    ### Start of processing
    feature_names = []

    # Pipelines yield (index, name, transformer); normalise them to the
    # 4-tuples a ColumnTransformer yields, with unknown (None) columns.
    if isinstance(column_transformer, sklearn.pipeline.Pipeline):
        l_transformers = [(name, trans, None, None)
                          for _, name, trans in column_transformer._iter()]
    else:
        # For column transformers, follow the original method
        l_transformers = list(column_transformer._iter(fitted=True))

    for name, trans, column, _ in l_transformers:
        if isinstance(trans, sklearn.pipeline.Pipeline):
            # Recursive call on the nested pipeline
            _names = get_feature_names(trans)
            # If no step of the pipeline reports names, fall back to the
            # input columns (which may be unknown, giving an empty list).
            if len(_names) == 0:
                _names = input_names(name, column)
            feature_names.extend(_names)
        else:
            feature_names.extend(get_names(name, trans, column))

    return feature_names

In [None]:
# Train/test split files inside the processed-data folder.
xtrain_path = os.path.join(processed_datadir, "Xtrain.csv")
xtest_path = os.path.join(processed_datadir, "Xtest.csv")
ytrain_path = os.path.join(processed_datadir, "ytrain.csv")
ytest_path = os.path.join(processed_datadir, "ytest.csv")

X_train = pd.read_csv(xtrain_path)
X_test = pd.read_csv(xtest_path)
# Collapse the single-column target frames to Series. The `squeeze=` keyword
# of read_csv was deprecated in pandas 1.4 and removed in 2.0, so squeeze
# after loading; this form works on old and new pandas alike.
y_train = pd.read_csv(ytrain_path).squeeze("columns")
y_test = pd.read_csv(ytest_path).squeeze("columns")

# Sanity check: row counts of features and targets must agree per split.
print("X_train:",X_train.shape)
print("X_test:",X_test.shape)
print("y_train:",y_train.shape)
print("y_test:",y_test.shape)

X_train: (17430, 42)
X_test: (7470, 42)
y_train: (17430,)
y_test: (7470,)


In [None]:
# Columns whose distinct raw values are collected below.
columns = [
    'Age_at_Release', 'Education_Level',
    'Dependents', 'Prison_Years',
    'Prior_Arrest_Episodes_Felony', 'Prior_Arrest_Episodes_Misd',
    'Prior_Arrest_Episodes_Violent', 'Prior_Arrest_Episodes_Property',
    'Prior_Arrest_Episodes_Drug', 'Prior_Arrest_Episodes_PPViolationCharges',
    'Prior_Conviction_Episodes_Felony', 'Prior_Conviction_Episodes_Misd',
    'Prior_Conviction_Episodes_Prop', 'Prior_Conviction_Episodes_Drug',
    'Delinquency_Reports', 'Program_Attendances',
    'Program_UnexcusedAbsences', 'Residence_Changes',
    'Drug_Test_Results', 'Percent_Days_Employed',
]

# One entry per column: the array of unique values seen in the training split.
_unique_by_column = {}
for _col in columns:
    _unique_by_column[_col] = X_train[_col].unique()
result = pd.Series(_unique_by_column)

In [None]:
class GroupImputer(BaseEstimator, TransformerMixin):
    """Fill missing values of one DataFrame column with a per-group statistic.

    During ``fit`` the chosen statistic of ``target`` is computed for every
    combination of ``group_cols``; ``transform`` then replaces missing
    ``target`` values with the statistic of the row's group.

    Parameters
    ----------
    group_cols : list
        Columns that define the groups.
    target : str
        Name of the column to impute.
    metric : str
        Aggregate used as the replacement value, one of ['mean', 'median'].

    Returns
    -------
    X : array-like
        ``transform`` returns the underlying values (``DataFrame.values``)
        of a copy of the input with ``target`` imputed.
    """

    def __init__(self, group_cols, target, metric='mean'):
        # Validate arguments up front; the messages are part of the contract.
        assert metric in ['mean', 'median'], 'Unrecognized value for metric, should be mean/median'
        assert type(group_cols) == list, 'group_cols should be a list of columns'
        assert type(target) == str, 'target should be a string'

        self.group_cols = group_cols
        self.target = target
        self.metric = metric

    def fit(self, X, y=None):
        """Learn the per-group replacement values from ``X``."""
        keys_have_missing = pd.isnull(X[self.group_cols]).any(axis=None)
        assert not keys_have_missing, 'There are missing values in group_cols'

        # One row per group: the group key columns plus the aggregated target.
        target_by_group = X.groupby(self.group_cols)[self.target]
        self.impute_map_ = target_by_group.agg(self.metric).reset_index(drop=False)
        return self

    def transform(self, X, y=None):
        """Return the values of a copy of ``X`` with ``target`` imputed."""
        # make sure that the imputer was fitted
        check_is_fitted(self, 'impute_map_')

        imputed = X.copy()
        for _, group_row in self.impute_map_.iterrows():
            # Rows whose key columns all equal this group's key.
            in_group = (imputed[self.group_cols] == group_row[self.group_cols]).all(axis=1)
            replacement = group_row[self.target]
            imputed.loc[in_group, self.target] = imputed.loc[in_group, self.target].fillna(replacement)

        return imputed.values



## Define Feature Selector
Use SelectKBest to select best categorical features


In [None]:
# Univariate feature selector that scores features by mutual information
# with the target (k is left at its default here).
selector = SelectKBest(score_func=mutual_info_classif)

## Create imputers
* Impute `Avg_Days_per_DrugTest` using the column mean

In [None]:

# Imputer with scikit-learn's default settings.
# NOTE(review): not referenced by any transformer or pipeline in the visible
# cells -- confirm it is still needed.
simpleImputer = SimpleImputer()

## Create Encoders

### One Hot Encoder

In [None]:
# One-hot encoder; drop='if_binary' keeps a single indicator column for
# features that have exactly two categories.
oneHot = OneHotEncoder(drop='if_binary')



### Ordinal Encoder

#### Define order for categories and create encoder

In [None]:
def _capped_counts(cap):
    """Return ['0', '1', ..., str(cap - 1), '<cap> or more'] in ascending order."""
    return [str(n) for n in range(cap)] + ['%d or more' % cap]


# Ordered (lowest -> highest) category labels, one list per ordinal column.
ageList = ['18-22', '23-27', '28-32', '33-37', '38-42', '43-47', '48 or older']
education = ['Less than HS diploma', 'High School Diploma', 'At least some college']
dependents = _capped_counts(3)
prisonYears = ['Less than 1 year', '1-2 years', 'Greater than 2 to 3 years', 'More than 3 years']

# Prior arrest episodes.
felony = _capped_counts(10)
misd = _capped_counts(6)
violent = _capped_counts(3)
prop = _capped_counts(5)
drug = _capped_counts(5)
ppviolation = _capped_counts(5)

# Prior conviction episodes.
felony2 = _capped_counts(3)
misd2 = _capped_counts(4)
prop2 = _capped_counts(3)
drug2 = _capped_counts(2)

# Supervision activity.
delinquency = _capped_counts(4)
attendances = _capped_counts(10)
unexcused = _capped_counts(3)
residence = _capped_counts(3)
employment = ['None', 'Less Than Half', 'More Than Half', 'All']

# The position of each list must match the position of its column in the
# column list handed to the encoder.
ordinalEnc = OrdinalEncoder(categories=[
    ageList, education, dependents, prisonYears,
    felony, misd, violent, prop, drug, ppviolation,
    felony2, misd2, prop2, drug2,
    delinquency, attendances, unexcused, residence, employment,
])

In [None]:
# Columns handled by the ordinal encoder, in the order of its category lists.
ordlist = [
    'Age_at_Release',
    'Education_Level',
    'Dependents',
    'Prison_Years',
    'Prior_Arrest_Episodes_Felony',
    'Prior_Arrest_Episodes_Misd',
    'Prior_Arrest_Episodes_Violent',
    'Prior_Arrest_Episodes_Property',
    'Prior_Arrest_Episodes_Drug',
    'Prior_Arrest_Episodes_PPViolationCharges',
    'Prior_Conviction_Episodes_Felony',
    'Prior_Conviction_Episodes_Misd',
    'Prior_Conviction_Episodes_Prop',
    'Prior_Conviction_Episodes_Drug',
    'Delinquency_Reports',
    'Program_Attendances',
    'Program_UnexcusedAbsences',
    'Residence_Changes',
    'Percent_Days_Employed',
]

# Columns handled by the one-hot encoder.
onehotlist = [
    'Gender',
    'Race',
    'Residence_PUMA',
    'Gang_Affiliated',
    'Supervision_Level_First',
    'Prison_Offense',
    'Prior_Arrest_Episodes_DVCharges',
    'Prior_Arrest_Episodes_GunCharges',
    'Prior_Conviction_Episodes_Viol',
    'Prior_Conviction_Episodes_PPViolationCharges',
    'Prior_Conviction_Episodes_DomesticViolenceCharges',
    'Prior_Conviction_Episodes_GunCharges',
    'Prior_Revocations_Parole',
    'Prior_Revocations_Probation',
    'Condition_MH_SA',
    'Condition_Cog_Ed',
    'Condition_Other',
    'Violations_Instruction',
    'Violations_FailToReport',
    'Violations_MoveWithoutPermission',
    'Employment_Exempt',
    'Drug_Test_Results',
]

# Every categorical column: the ordinal columns followed by the one-hot ones.
catlist = ordlist + onehotlist

# Column passed to the standard scaler.
scalerlist = ['Supervision_Risk_Score_First']

## Define Standard Scaler

In [None]:
# Standard scaler with default settings, used for the numeric column(s).
scaler = StandardScaler()

## Define Column Transformer for use in models

In [None]:


# Full preprocessing: ordinal-encode, one-hot encode and standardise the
# respective column groups; every other column passes through unchanged.
# sparse_threshold=0 makes the combined output a dense array.
ct = ColumnTransformer(
    transformers=[
        ('ordinal', ordinalEnc, ordlist),
        ('oneHot', oneHot, onehotlist),
        ('scaler', scaler, ['Supervision_Risk_Score_First']),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

# Scaling-only variant: standardise the risk score, pass the rest through.
ct2 = ColumnTransformer(
    transformers=[
        ('scaler', scaler, ['Supervision_Risk_Score_First']),
    ],
    remainder='passthrough',
)

In [None]:
# Every hyper-parameter grid in this notebook tunes `select__k`, so the
# selection step has to expose a `k` parameter: use SelectKBest scored by
# mutual information. (SelectFromModel has no `k`, and no `lsvc_select`
# estimator is defined anywhere in the notebook.)
selector = SelectKBest(score_func=mutual_info_classif)

# Model Building

## Define function to calculate scores and produce ROC curve
## Define common variables for use in all models

In [None]:
def calc_scores(model,X_test,y_test,estimator_name):
    """Score a fitted binary classifier and draw its ROC curve.

    Parameters
    ----------
    model : estimator
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    X_test : array-like
        Features to score on.
    y_test : array-like
        True binary labels for ``X_test``.
    estimator_name : str
        Label used for the curve in the ROC plot.

    Returns
    -------
    scores : list of float
        ``[brier score, F1, recall, precision, ROC AUC]``. The Brier score
        and ROC AUC use the predicted probability of the positive class
        (second ``predict_proba`` column); the others use ``predict``.
    display : RocCurveDisplay
        ROC curve built from the positive-class probabilities.
    """
    pos_proba = model.predict_proba(X_test)[:, 1]
    preds = model.predict(X_test)

    scores = [
        brier_score_loss(y_test, pos_proba),
        f1_score(y_test, preds),
        recall_score(y_test, preds),
        precision_score(y_test, preds),
        roc_auc_score(y_test, pos_proba),
    ]

    # from_predictions computes the curve itself from the probabilities, so
    # no separate roc_curve call is needed.
    display = RocCurveDisplay.from_predictions(y_test, pos_proba, name=estimator_name)
    return scores, display

# Metric labels, in the same order as the score list returned by calc_scores.
tests = ['Brier Score', 'F1 Score', 'Recall', 'Precision', 'ROC_AUC']

# Splitter shared by every grid search: plain 10-fold stratified CV.
# NOTE(review): RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=3)
# is the alternative if less noisy estimates are worth 3x the fits.
cv = StratifiedKFold(n_splits=10)

## K-Nearest Neighbors

In [None]:
# Preprocess -> select features -> k-nearest-neighbours classifier.
knn_model = KNeighborsClassifier()

knn_pipeline = Pipeline(steps=[
    ("columnTransformer", ct),
    ('select', selector),
    ("knn", knn_model),
])


In [None]:
# Search space for the KNN pipeline; keys follow the "<step>__<parameter>" form.
knn_params = {
    'knn__n_neighbors': [5, 10, 15, 20],
    'knn__metric': ['minkowski', 'chebyshev'],
    'select__k': [5, 10, 15, 20, 25],
}


In [None]:
# Grid search over knn_params, ranked by ROC AUC, using the shared `cv`
# splitter and all available cores.
knn_cv = GridSearchCV(
    estimator=knn_pipeline,
    param_grid=knn_params,
    scoring='roc_auc',
    n_jobs=-1,
    cv=cv,
)

In [None]:
# Run the KNN grid search on the training split.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt;
# cells that read knn_cv.best_estimator_ need a completed fit.
knn_cv.fit(X_train,y_train)



KeyboardInterrupt: ignored

In [None]:

# Persist the refit best pipeline and the whole search object separately.
joblib.dump(knn_cv.best_estimator_, os.path.join(modelsdir,'model_knn_auc_ksel_best.pkl'))
joblib.dump(knn_cv, os.path.join(modelsdir,'model_knn_auc_ksel.pkl'))

['/content/drive/MyDrive/recidivism/models/model_knn_sel.pkl']

## Logistic Regression with Elastic Net Regularization

In [None]:
# The saga solver is stochastic, so seed it for reproducible searches
# (same seed as the random-forest and SGD models).
lr_model = LogisticRegression(random_state=42)

# Elastic-net logistic regression search space. Single-value lists pin a
# setting; l1_ratio sweeps from pure L2 (0) to pure L1 (1) in 20 steps.
# NOTE(review): GridSearchCV clones the estimator for every fit, so
# warm_start presumably has no effect here and only doubles the grid.
lr_params = {
    'lr__warm_start': [True, False],
    'lr__penalty': ['elasticnet'],
    'lr__C': [0.01, 0.05, 0.1, 0.15, 0.25, 0.5, 1.0],
    'lr__solver': ['saga'],
    'lr__max_iter': [5000],
    'lr__l1_ratio': np.linspace(0,1,20,True),
    'select__k': [5,10,15,20,25],
}

# The selection step must be named 'select' for the 'select__k' grid key to
# resolve (same step name as in the KNN pipeline).
lr_pipeline = Pipeline([("columnTransformer",ct),('select', selector),("lr", lr_model)])


In [None]:
# Grid search over lr_params, ranked by ROC AUC, using the shared `cv`
# splitter and all available cores; fitted on the training split.
lr_cv = GridSearchCV(
    estimator=lr_pipeline,
    param_grid=lr_params,
    scoring='roc_auc',
    n_jobs=-1,
    cv=cv,
)
lr_cv.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('columnTransformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          sparse_threshold=0,
                                                          transformers=[('ordinal',
                                                                         OrdinalEncoder(categories=[['18-22',
                                                                                                     '23-27',
                                                                                                     '28-32',
                                                                                                     '33-37',
                                                                                                     '38-42',
                                                                

In [None]:
# Persist the refit best pipeline and the whole search object separately.
joblib.dump(lr_cv.best_estimator_, os.path.join(modelsdir,'model_lr_auc_ksel_best.pkl'))
joblib.dump(lr_cv, os.path.join(modelsdir,'model_lr_auc_ksel.pkl'))


['/content/drive/MyDrive/recidivism/models/model_lr_sel.pkl']

In [None]:
# Hyper-parameter combination with the best mean cross-validated ROC AUC.
lr_cv.best_params_

{'lr__C': 0.05,
 'lr__l1_ratio': 0.0,
 'lr__max_iter': 5000,
 'lr__penalty': 'elasticnet',
 'lr__solver': 'saga',
 'lr__warm_start': False}

In [None]:
# Coefficients of the logistic-regression step of the refit best pipeline.
lr_cv.best_estimator_['lr'].coef_

array([[-0.15729236, -0.14241479,  0.23615811, -0.13363854,  0.03045618,
         0.14845079, -0.37158272,  0.38359284,  0.06365795, -0.20389454,
        -0.13433287, -0.17512876, -0.20530109,  0.2180699 ,  0.16062828,
         0.12642869, -0.24946554,  0.22028419,  0.2421364 , -0.21351359,
         0.59710643, -0.38273936,  0.01430186,  0.03648273, -0.17473468,
         0.12480357,  0.24350415,  0.15011978, -0.30262154, -0.01780261,
         0.3016732 , -0.09886603,  0.32349003,  0.23396126, -0.11483826,
        -0.20047642, -0.05758193,  0.25891182,  0.12205073]])

## Decision Tree

In [None]:
# Seeded so that the 'random' splitter and tie-breaking are reproducible
# (same seed as the random-forest and SGD models).
dt_model = DecisionTreeClassifier(random_state=42)

# Decision-tree search space.
dt_params = {
    'dt__criterion': ['gini', 'entropy'],
    'dt__splitter': ['best', 'random'],
    'select__k': [5,10,15,20,25],
}

# The selection step must be named 'select' for the 'select__k' grid key to
# resolve (same step name as in the KNN pipeline).
dt_pipeline = Pipeline([('columnTransformer', ct),('select', selector),('dt', dt_model)])



In [None]:
# Grid search over dt_params, ranked by ROC AUC, using the shared `cv`
# splitter and all available cores; fitted on the training split.
dt_cv = GridSearchCV(
    estimator=dt_pipeline,
    param_grid=dt_params,
    scoring='roc_auc',
    n_jobs=-1,
    cv=cv,
)
dt_cv.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('columnTransformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          sparse_threshold=0,
                                                          transformers=[('ordinal',
                                                                         OrdinalEncoder(categories=[['18-22',
                                                                                                     '23-27',
                                                                                                     '28-32',
                                                                                                     '33-37',
                                                                                                     '38-42',
                                                                

In [None]:
# Persist the refit best pipeline and the whole search object separately.
joblib.dump(dt_cv.best_estimator_, os.path.join(modelsdir,"model_dt_auc_ksel_best.pkl"))
joblib.dump(dt_cv, os.path.join(modelsdir,"model_dt_auc_ksel.pkl"))

['/content/drive/MyDrive/recidivism/models/model_dt_sel.pkl']

## Random Forest

In [None]:
# Seeded forest; n_jobs=-1 parallelises tree building inside the model.
rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)

# Random-forest search space.
rf_params = {
    'rf__criterion': ['gini', 'entropy'],
    'rf__max_depth': [5,10,15,25,50],
    'select__k': [5,10,15,20,25],
}

# The selection step must be named 'select' for the 'select__k' grid key to
# resolve (same step name as in the KNN pipeline).
rf_pipeline = Pipeline([('columnTransformer', ct),('select', selector),('rf', rf_model)])



In [None]:
# Grid search over rf_params, ranked by ROC AUC, using the shared `cv`
# splitter; fitted on the training split. No n_jobs is set on the search
# itself.
rf_cv = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_params,
    scoring='roc_auc',
    cv=cv,
)
rf_cv.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('columnTransformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          sparse_threshold=0,
                                                          transformers=[('ordinal',
                                                                         OrdinalEncoder(categories=[['18-22',
                                                                                                     '23-27',
                                                                                                     '28-32',
                                                                                                     '33-37',
                                                                                                     '38-42',
                                                                

In [None]:
# Persist the refit best pipeline and the whole search object separately.
joblib.dump(rf_cv.best_estimator_, os.path.join(modelsdir,"model_rf_auc_ksel_best.pkl"))
joblib.dump(rf_cv, os.path.join(modelsdir,"model_rf_auc_ksel.pkl"))

['/content/drive/MyDrive/recidivism/models/model_rf_sel.pkl']

In [None]:
# Show the pipeline refit with the best random-forest hyper-parameters.
rf_cv.best_estimator_

Pipeline(steps=[('columnTransformer',
                 ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                                   transformers=[('ordinal',
                                                  OrdinalEncoder(categories=[['18-22',
                                                                              '23-27',
                                                                              '28-32',
                                                                              '33-37',
                                                                              '38-42',
                                                                              '43-47',
                                                                              '48 '
                                                                              'or '
                                                                              'older'],
                                           

## SGD Classifier

In [None]:
sgd_model = SGDClassifier(random_state=42)

# Elastic-net SGD search space; l1_ratio sweeps from pure L2 (0) to pure
# L1 (1) in 10 steps.
# NOTE(review): the 'log' loss was renamed 'log_loss' in scikit-learn 1.1
# and the old spelling was removed in 1.3 -- adjust when upgrading.
sgd_params = {
    'sgd__loss': ['modified_huber','log'],
    'sgd__penalty': ['elasticnet'],
    'sgd__alpha': [0.0001,0.001, 0.01, 0.1, 1,10,100],
    'sgd__l1_ratio': np.linspace(0,1,10,True),
    'select__k': [5,10,15,20,25],
}

# The selection step must be named 'select' for the 'select__k' grid key to
# resolve (same step name as in the KNN pipeline).
sgd_pipeline = Pipeline([('columnTransformer', ct),('select', selector),('sgd', sgd_model)])



In [None]:
# Grid search over sgd_params, ranked by ROC AUC, using the shared `cv`
# splitter; fitted on the training split.
sgd_cv = GridSearchCV(
    estimator=sgd_pipeline,
    param_grid=sgd_params,
    scoring='roc_auc',
    cv=cv,
)
sgd_cv.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('columnTransformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          sparse_threshold=0,
                                                          transformers=[('ordinal',
                                                                         OrdinalEncoder(categories=[['18-22',
                                                                                                     '23-27',
                                                                                                     '28-32',
                                                                                                     '33-37',
                                                                                                     '38-42',
                                                                

In [None]:

# Persist the refit best pipeline and the whole search object separately.
# NOTE(review): unlike the other models these file names carry no '.pkl'
# extension -- confirm whatever loads them expects that before renaming.
joblib.dump(sgd_cv.best_estimator_, os.path.join(modelsdir,'model_sgd_auc_ksel_best'))
joblib.dump(sgd_cv, os.path.join(modelsdir,'model_sgd_auc_ksel'))

['/content/drive/MyDrive/recidivism/models/model_sgd_sel']

In [None]:
# Hyper-parameter combination with the best mean cross-validated ROC AUC.
sgd_cv.best_params_

{'sgd__alpha': 0.01,
 'sgd__l1_ratio': 0.0,
 'sgd__loss': 'log',
 'sgd__penalty': 'elasticnet'}

## XGBoost Model


In [None]:
# XGBoost search space; single-value lists pin a setting for every candidate.
# NOTE(review): this grid has 960 combinations of 1000-tree models, each fit
# once per CV fold -- the recorded run took about 17 hours of wall time.
xgb_params = {
    'xgb__n_estimators':[1000],
    'xgb__booster': ['gbtree'],
    'xgb__eta': [.0001, .001, .01, .1], 
    'xgb__max_depth': [3, 6, 8],  
    'xgb__objective': ['binary:logistic'],  
    'xgb__subsample': [0.5, 0.75],
    'xgb__tree_method': ['approx', 'hist'],
    'xgb__eval_metric':['auc'], 
    'xgb__min_child_weight': [1, 5],
    'xgb__max_delta_step': [1, 5],
    'xgb__random_state': [42],
    'select__k':[5,10,15,20,25]} 

xgb_model = xgb.XGBClassifier(use_label_encoder=False)
# The selection step must be named 'select' for the 'select__k' grid key to
# resolve (same step name as in the KNN pipeline).
xgb_pipeline = Pipeline([('columnTransformer', ct),('select', selector), ('xgb', xgb_model)])
xgb_cv = GridSearchCV(xgb_pipeline, xgb_params, scoring='roc_auc', cv=cv)

In [None]:
# Separate train/validation split files for XGBoost.
xtrain_xgb_path = os.path.join(processed_datadir, "Xtrain_xgb.csv")
xval_xgb_path = os.path.join(processed_datadir, "Xval_xgb.csv")
ytrain_xgb_path = os.path.join(processed_datadir, "ytrain_xgb.csv")
yval_xgb_path = os.path.join(processed_datadir, "yval_xgb.csv")

X_train_xgb = pd.read_csv(xtrain_xgb_path)
X_val_xgb = pd.read_csv(xval_xgb_path)
# Squeeze after loading: the `squeeze=` keyword of read_csv was deprecated
# in pandas 1.4 and removed in 2.0.
y_train_xgb = pd.read_csv(ytrain_xgb_path).squeeze("columns")
y_val_xgb = pd.read_csv(yval_xgb_path).squeeze("columns")


# Fit the preprocessing on the XGBoost training split and apply it to the
# validation split.
# NOTE(review): none of these frames is used by the grid search in the
# visible cells, which fits on X_train / y_train -- confirm intent.
ct.fit(X_train_xgb)
xgb_val_x = ct.transform(X_val_xgb)

In [None]:

# Run the XGBoost grid search on the training split. The recorded run
# reports roughly 17 hours of wall time.
xgb_cv.fit(X_train,y_train)

CPU times: user 1d 8h 1min 58s, sys: 9min 30s, total: 1d 8h 11min 29s
Wall time: 17h 12min 53s


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('columnTransformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          sparse_threshold=0,
                                                          transformers=[('ordinal',
                                                                         OrdinalEncoder(categories=[['18-22',
                                                                                                     '23-27',
                                                                                                     '28-32',
                                                                                                     '33-37',
                                                                                                     '38-42',
                                                                

In [None]:

# Persist the refit best pipeline and the whole search object separately.
joblib.dump(xgb_cv.best_estimator_, os.path.join(modelsdir,'model_xgb_auc_ksel_best.pkl'))
joblib.dump(xgb_cv, os.path.join(modelsdir,'model_xgb_auc_ksel.pkl'))

['/content/drive/MyDrive/recidivism/models/model_xgb_sel.pkl']