In [1]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
for i in range(2):
  try:
    import autosklearn
    from autosklearn.classification import AutoSklearnClassifier
  except:
    pass
from pathlib import Path
import pickle

In [2]:
upstream = ['section-01-get-data']
product = None

In [3]:
# Parameters
upstream = {
    "section-01-get-data": {
        "train": "/content/Reproducible-Data-Analysis-With-Jupyter-Notebooks-Standard-Bank-Virtual-Internship/output/section-01-get-data-train.pkl",
        "df": "/content/Reproducible-Data-Analysis-With-Jupyter-Notebooks-Standard-Bank-Virtual-Internship/output/section-01-get-data-df.pkl",
        "nb": "/content/Reproducible-Data-Analysis-With-Jupyter-Notebooks-Standard-Bank-Virtual-Internship/output/section-01-get-data.ipynb",
    }
}
product = {
    "categorical_columns": "/content/Reproducible-Data-Analysis-With-Jupyter-Notebooks-Standard-Bank-Virtual-Internship/output/section-04-autosklearn-auto-ml-categorical_columns.pkl",
    "X_train": "/content/Reproducible-Data-Analysis-With-Jupyter-Notebooks-Standard-Bank-Virtual-Internship/output/section-04-autosklearn-auto-ml-X_train.pkl",
    "X": "/content/Reproducible-Data-Analysis-With-Jupyter-Notebooks-Standard-Bank-Virtual-Internship/output/section-04-autosklearn-auto-ml-X.pkl",
    "y_train": "/content/Reproducible-Data-Analysis-With-Jupyter-Notebooks-Standard-Bank-Virtual-Internship/output/section-04-autosklearn-auto-ml-y_train.pkl",
    "y": "/content/Reproducible-Data-Analysis-With-Jupyter-Notebooks-Standard-Bank-Virtual-Internship/output/section-04-autosklearn-auto-ml-y.pkl",
    "nb": "/content/Reproducible-Data-Analysis-With-Jupyter-Notebooks-Standard-Bank-Virtual-Internship/output/section-04-autosklearn-auto-ml.ipynb",
}


In [4]:
train = pickle.loads(Path(upstream['section-01-get-data']['train']).read_bytes())

## 04) Autosklearn  Auto ML


In [5]:
feature_columns = train.columns[1:-1]
print(feature_columns)

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')


In [6]:
## Input data with feature columns 
X = train[feature_columns].copy()

## Convert categorical features to 'category' type
categorical_columns = ['Gender', 'Married','Dependents', 'Education', 'Self_Employed', 'Property_Area']
X[categorical_columns] = X[categorical_columns].astype('category')
X.dtypes

Gender               category
Married              category
Dependents           category
Education            category
Self_Employed        category
ApplicantIncome         int64
CoapplicantIncome     float64
LoanAmount            float64
Loan_Amount_Term      float64
Credit_History        float64
Property_Area        category
dtype: object

In [7]:
## label encode target
y = train['Loan_Status'].map({'N':0,'Y':1}).astype(int)

## train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# train
autoML = AutoSklearnClassifier(time_left_for_this_task=2*30, per_run_time_limit=30, n_jobs=8) # imposing a 1 minute time limit on this
autoML.fit(X_train, y_train)

# predict
predictions_autoML = autoML.predict(X_test)

In [9]:
print('autoML Model Accuracy:', accuracy_score(predictions_autoML, y_test))

autoML Model Accuracy: 0.7967479674796748


In [10]:
print(confusion_matrix(predictions_autoML, y_test))

[[19  1]
 [24 79]]


In [11]:
Path(product['X']).parent.mkdir(exist_ok=True, parents=True)
Path(product['X']).write_bytes(pickle.dumps(X))

Path(product['X_train']).parent.mkdir(exist_ok=True, parents=True)
Path(product['X_train']).write_bytes(pickle.dumps(X_train))

Path(product['categorical_columns']).parent.mkdir(exist_ok=True, parents=True)
Path(product['categorical_columns']).write_bytes(pickle.dumps(categorical_columns))

Path(product['y']).parent.mkdir(exist_ok=True, parents=True)
Path(product['y']).write_bytes(pickle.dumps(y))

Path(product['y_train']).parent.mkdir(exist_ok=True, parents=True)
Path(product['y_train']).write_bytes(pickle.dumps(y_train))

12442