In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [None]:
import evalml

In [3]:
# Read the dataset from "heart.csv" (path is relative to the working directory).
df = pd.read_csv("heart.csv")

In [5]:
# Encode the target column "output" as integer class labels.
# NOTE: this overwrites df["output"] in place; `le` keeps the fitted mapping.
le = LabelEncoder()
df["output"] = le.fit_transform(df["output"])

In [6]:
# Feature matrix: every column of df except the "output" target.
X = df.drop(columns=["output"])

In [7]:
# Target vector as a plain NumPy array taken from the "output" column.
y = df["output"].to_numpy()

In [8]:
# Split features and target into train/test parts with evalml's splitter,
# configured for a binary classification problem (other options left at defaults).
X_train, X_test, y_train, y_test = evalml.preprocessing.split_data(
    X,
    y,
    problem_type="binary",
)

In [9]:
from evalml.automl import AutoMLSearch

# Configure a pipeline search for a binary problem on the training split;
# all remaining AutoMLSearch options are left at the library defaults.
automl = AutoMLSearch(
    X_train=X_train,
    y_train=y_train,
    problem_type="binary",
)
# Run the search; its return value is displayed as the cell output.
automl.search()

	High coefficient of variation (cv >= 0.5) within cross validation scores.
	Decision Tree Classifier w/ Label Encoder + Imputer + Select Columns Transformer may not perform as estimated on unseen data.


{1: {'Random Forest Classifier w/ Label Encoder + Imputer': 0.5398130416870117,
  'Total time of batch': 0.6526226997375488},
 2: {'Random Forest Classifier w/ Label Encoder + Imputer + RF Classifier Select From Model': 0.6403830051422119,
  'Total time of batch': 0.7594921588897705},
 3: {'Decision Tree Classifier w/ Label Encoder + Imputer + Select Columns Transformer': 0.33592867851257324,
  'LightGBM Classifier w/ Label Encoder + Imputer + Select Columns Transformer': 0.3867180347442627,
  'Extra Trees Classifier w/ Label Encoder + Imputer + Select Columns Transformer': 0.5492141246795654,
  'Elastic Net Classifier w/ Label Encoder + Imputer + Standard Scaler + Select Columns Transformer': 0.4632742404937744,
  'CatBoost Classifier w/ Label Encoder + Imputer + Select Columns Transformer': 0.5209269523620605,
  'XGBoost Classifier w/ Label Encoder + Imputer + Select Columns Transformer': 0.39038515090942383,
  'Logistic Regression Classifier w/ Label Encoder + Imputer + Standard Sca

In [10]:
# Show the table of pipelines evaluated by the first search, with their CV scores.
automl.rankings

Unnamed: 0,id,pipeline_name,search_order,ranking_score,mean_cv_score,standard_deviation_cv_score,percent_better_than_baseline,high_variance_cv,parameters
0,5,Extra Trees Classifier w/ Label Encoder + Impu...,5,0.413358,0.413358,0.029595,97.476877,False,"{'Label Encoder': {'positive_label': None}, 'I..."
1,1,Random Forest Classifier w/ Label Encoder + Im...,1,0.420733,0.420733,0.032101,97.43186,False,"{'Label Encoder': {'positive_label': None}, 'I..."
2,4,LightGBM Classifier w/ Label Encoder + Imputer...,4,0.462099,0.462099,0.066745,97.179366,False,"{'Label Encoder': {'positive_label': None}, 'I..."
3,2,Random Forest Classifier w/ Label Encoder + Im...,2,0.466758,0.466758,0.024267,97.150928,False,"{'Label Encoder': {'positive_label': None}, 'I..."
4,9,Logistic Regression Classifier w/ Label Encode...,9,0.469254,0.469254,0.074869,97.135689,False,"{'Label Encoder': {'positive_label': None}, 'I..."
5,6,Elastic Net Classifier w/ Label Encoder + Impu...,6,0.470037,0.470037,0.075389,97.130913,False,"{'Label Encoder': {'positive_label': None}, 'I..."
6,8,XGBoost Classifier w/ Label Encoder + Imputer ...,8,0.488356,0.488356,0.042491,97.019094,False,"{'Label Encoder': {'positive_label': None}, 'I..."
7,7,CatBoost Classifier w/ Label Encoder + Imputer...,7,0.638074,0.638074,0.006335,96.10522,False,"{'Label Encoder': {'positive_label': None}, 'I..."
8,3,Decision Tree Classifier w/ Label Encoder + Im...,3,6.703388,6.703388,2.507087,59.082784,True,"{'Label Encoder': {'positive_label': None}, 'I..."
9,0,Mode Baseline Binary Classification Pipeline,0,16.382805,16.382805,0.141301,0.0,False,"{'Label Encoder': {'positive_label': None}, 'B..."


In [11]:
# Display the pipeline the first search selected as best.
automl.best_pipeline

pipeline = BinaryClassificationPipeline(component_graph={'Label Encoder': ['Label Encoder', 'X', 'y'], 'Imputer': ['Imputer', 'X', 'Label Encoder.y'], 'Select Columns Transformer': ['Select Columns Transformer', 'Imputer.x', 'Label Encoder.y'], 'Extra Trees Classifier': ['Extra Trees Classifier', 'Select Columns Transformer.x', 'Label Encoder.y']}, parameters={'Label Encoder':{'positive_label': None}, 'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'boolean_impute_strategy': 'most_frequent', 'categorical_fill_value': None, 'numeric_fill_value': None, 'boolean_fill_value': None}, 'Select Columns Transformer':{'columns': ['age', 'cp', 'thalachh', 'exng', 'oldpeak', 'caa', 'thall']}, 'Extra Trees Classifier':{'n_estimators': 100, 'max_features': 'auto', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1}}, random_seed=0)

In [12]:
# Keep a handle on the best pipeline from the first search for scoring and saving.
pipeline = automl.best_pipeline

In [13]:
# Print a description of the pipeline in the first row of the rankings table,
# looked up by its "id" column.
automl.describe_pipeline(automl.rankings.iloc[0]["id"])


**********************************************************************************
* Extra Trees Classifier w/ Label Encoder + Imputer + Select Columns Transformer *
**********************************************************************************

Problem Type: binary
Model Family: Extra Trees

Pipeline Steps
1. Label Encoder
	 * positive_label : None
2. Imputer
	 * categorical_impute_strategy : most_frequent
	 * numeric_impute_strategy : mean
	 * boolean_impute_strategy : most_frequent
	 * categorical_fill_value : None
	 * numeric_fill_value : None
	 * boolean_fill_value : None
3. Select Columns Transformer
	 * columns : ['age', 'cp', 'thalachh', 'exng', 'oldpeak', 'caa', 'thall']
4. Extra Trees Classifier
	 * n_estimators : 100
	 * max_features : auto
	 * max_depth : 6
	 * min_samples_split : 2
	 * min_weight_fraction_leaf : 0.0
	 * n_jobs : -1

Training
Training for binary problems.
Total training time (including CV): 0.5 seconds

Cross Validation
----------------
             Lo

In [14]:
# Score the selected pipeline on the test split using the AUC and F1 objectives.
pipeline.score(X_test, y_test, objectives=["auc", "f1"])

OrderedDict([('AUC', 0.8701298701298702), ('F1', 0.7812499999999999)])

In [15]:
# Second search on the same training split: AUC is the search objective,
# F1 and precision are tracked as additional objectives, the search is capped
# at one batch, and threshold optimization is switched on.
automl_auc = AutoMLSearch(
    X_train=X_train,
    y_train=y_train,
    problem_type="binary",
    objective="auc",
    additional_objectives=["f1", "precision"],
    max_batches=1,
    optimize_thresholds=True,
)
# Run the search; its return value is displayed as the cell output.
automl_auc.search()

{1: {'Random Forest Classifier w/ Label Encoder + Imputer': 0.5190188884735107,
  'Total time of batch': 0.6314430236816406}}

In [16]:
# Show the table of pipelines evaluated by the AUC-objective search.
automl_auc.rankings

Unnamed: 0,id,pipeline_name,search_order,ranking_score,mean_cv_score,standard_deviation_cv_score,percent_better_than_baseline,high_variance_cv,parameters
0,1,Random Forest Classifier w/ Label Encoder + Im...,1,0.891494,0.891494,0.023837,39.149377,False,"{'Label Encoder': {'positive_label': None}, 'I..."
1,0,Mode Baseline Binary Classification Pipeline,0,0.5,0.5,0.0,0.0,False,"{'Label Encoder': {'positive_label': None}, 'B..."


In [17]:
# Print a description of the pipeline in the first row of the AUC search's
# rankings table, looked up by its "id" column.
automl_auc.describe_pipeline(automl_auc.rankings.iloc[0]["id"])


*******************************************************
* Random Forest Classifier w/ Label Encoder + Imputer *
*******************************************************

Problem Type: binary
Model Family: Random Forest

Pipeline Steps
1. Label Encoder
	 * positive_label : None
2. Imputer
	 * categorical_impute_strategy : most_frequent
	 * numeric_impute_strategy : mean
	 * boolean_impute_strategy : most_frequent
	 * categorical_fill_value : None
	 * numeric_fill_value : None
	 * boolean_fill_value : None
3. Random Forest Classifier
	 * n_estimators : 100
	 * max_depth : 6
	 * n_jobs : -1

Training
Training for binary problems.
Total training time (including CV): 0.5 seconds

Cross Validation
----------------
              AUC    F1  Precision # Training # Validation
0           0.911 0.833      0.857        161           81
1           0.899 0.795      0.806        161           81
2           0.865 0.698      0.815        162           80
mean        0.891 0.775      0.826          - 

In [18]:
# Keep a handle on the best pipeline from the AUC-objective search.
pipeline_auc = automl_auc.best_pipeline

In [19]:
# Score the AUC-search pipeline on the test split using the AUC objective.
pipeline_auc.score(X_test, y_test, objectives=["auc"])

OrderedDict([('AUC', 0.9123376623376623)])

In [20]:
# Write `pipeline` (the best pipeline from the first search, not `pipeline_auc`)
# to "pipe.pkl" in the working directory.
pipeline.save("pipe.pkl")

In [21]:
# Reload the pipeline written by `pipeline.save("pipe.pkl")`.
# Use the pipeline loader rather than `automl.load`: the latter is the
# AutoMLSearch loader, intended for files produced by saving an AutoMLSearch
# object, and only happened to work here because it unpickles whatever it finds.
model = evalml.pipelines.PipelineBase.load("pipe.pkl")

In [22]:
# Per-class predicted probabilities for the test split from the reloaded pipeline.
model.predict_proba(X_test)

Unnamed: 0,0,1
24,0.476206,0.523794
67,0.111968,0.888032
13,0.292056,0.707944
112,0.384836,0.615164
80,0.045754,0.954246
...,...,...
160,0.131567,0.868433
234,0.596474,0.403526
110,0.655146,0.344854
190,0.892123,0.107877
