# Automated ML

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [None]:
from azureml.core import Workspace, Experiment,ScriptRunConfig
from azureml.train.automl import AutoMLConfig
from azureml.widgets import RunDetails
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.exceptions import ComputeTargetException
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.core.model import Model
from azureml.core.dataset import Dataset
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.environment import Environment
from azureml.core.webservice import Webservice
from azureml.core.webservice import AciWebservice
from azureml.core.model import Model, InferenceConfig
from azureml.interpret.scoring.scoring_explainer import TreeScoringExplainer, save
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import joblib
import os
import json
import requests

In [None]:
# Connect to the Azure ML workspace via Workspace.from_config().
ws = Workspace.from_config()

# Name under which the AutoML runs of this notebook are grouped.
experiment_name = 'capstone_automl'
experiment = Experiment(workspace=ws, name=experiment_name)

In [None]:
cpu_cluster_name = "cpu-cluster"

# Reuse the cluster when the workspace already has one with this name;
# a ComputeTargetException from the lookup means it has to be created.
try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    print("Creating new cpu-cluster")

    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_V2",
        min_nodes=0,
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=cpu_cluster_name,
        provisioning_configuration=compute_config,
    )
else:
    print("Found existing cpu-cluster")

# Wait for the compute target to finish provisioning, streaming its status.
compute_target.wait_for_completion(show_output=True)

## Prepare Data

In [None]:
def clean_data(data):
    """Split the raw passenger table into model features and the target.

    Processing steps, in order:

    1. Rows containing NaN in *any* column of ``data`` are removed.
    2. The columns ``name``, ``boat``, ``home.dest``, ``body``, ``ticket``,
       ``fare`` and ``cabin`` are discarded.
    3. ``sex`` is encoded as 1 for ``"male"`` and 0 for every other value.
    4. ``embarked`` is mapped to integer codes (C=1, S=2, Q=3); any other
       value becomes NaN.
    5. ``"?"`` placeholders in ``age`` become NaN; other values are kept as-is.
    6. Rows that now contain NaN are removed.

    Args:
        data: pandas DataFrame holding the columns listed above plus
            ``survived``. It is not modified.

    Returns:
        A ``(x_df, y_df)`` tuple: the feature DataFrame (without
        ``survived``) and the ``survived`` Series, sharing the same index.

    Raises:
        KeyError: If one of the expected columns is missing from ``data``.
    """
    embarked_codes = {"C": 1, "S": 2, "Q": 3}
    unused_columns = [
        "name",
        "boat",
        "home.dest",
        "body",
        "ticket",
        "fare",
        "cabin",
    ]

    # NOTE(review): the NaN filter runs before the unused columns are
    # discarded, so a NaN in e.g. "body" or "cabin" also removes the row.
    # drop() returns a new frame, so the caller's DataFrame stays untouched.
    x_df = data.dropna().drop(columns=unused_columns)

    # Ordinal/label encoding of the categorical columns (not one-hot).
    x_df["sex"] = x_df["sex"].apply(lambda s: 1 if s == "male" else 0)
    x_df["embarked"] = x_df["embarked"].map(embarked_codes)

    # "?" is the dataset's missing-value marker. np.nan is used because the
    # np.NaN alias no longer exists in NumPy 2.0.
    x_df["age"] = x_df["age"].apply(lambda s: np.nan if s == "?" else s)

    # Remove the rows whose age or embarked value turned out to be missing.
    x_df = x_df.dropna()
    y_df = x_df.pop("survived")

    return x_df, y_df

In [None]:
# CSV export of the passenger data hosted on OpenML.
titanic_csv_url = "https://www.openml.org/data/get_csv/16826755/phpMYEkMl"

# Fetch the remote CSV as a tabular dataset and materialise it in pandas.
remote_dataset = TabularDatasetFactory.from_delimited_files(path=titanic_csv_url)
ds = remote_dataset.to_pandas_dataframe()

# Clean the raw table, then put features and label back into a single frame
# (the "survived" label ends up as the last column).
x, y = clean_data(ds)
training_data = pd.concat([x, y], axis=1)

In [None]:
# Make sure the local staging directory exists. exist_ok=True makes this a
# no-op when the directory is already there, and unlike a name check against
# os.listdir() it fails right away if "data" is a plain file.
os.makedirs("data", exist_ok=True)

# Write the prepared table only when no local copy exists yet.
# NOTE(review): an existing data/titanic.csv is reused as-is, so it is not
# refreshed when training_data changes - delete the file to regenerate it.
if not os.path.isfile("data/titanic.csv"):
    training_data.to_csv('data/titanic.csv', index=False)

# get the datastore to upload prepared data
datastore = ws.get_default_datastore()

# upload the local file from src_dir to the target_path in datastore
datastore.upload(src_dir='data', target_path='data')

# create a dataset referencing the cloud location
dataset = Dataset.Tabular.from_delimited_files(path=[(datastore, 'data/titanic.csv')])

## AutoML Configuration

We need to predict whether a passenger survived or not, so the task is classification. `primary_metric` is the metric AutoML should optimize, here accuracy. `label_column_name` is the column that should be predicted. `training_data` contains the data used for training the model. `max_concurrent_iterations` is the maximum number of iterations that can execute in parallel. `experiment_timeout_minutes` is the maximum amount of time, in minutes, that all iterations combined can take before the experiment terminates.

In [None]:
# AutoML settings that are unpacked into the AutoMLConfig below.
automl_settings = {
    # Total time budget for all iterations combined. A 1-minute budget leaves
    # no room to train anything; 15 minutes is the minimum the Azure AutoML
    # documentation gives for the experiment timeout.
    "experiment_timeout_minutes": 15,
    # Iterations that may run in parallel. Kept at the cluster's max_nodes
    # (4 for "cpu-cluster"), since each concurrent iteration needs a node.
    "max_concurrent_iterations": 4,
    # Metric AutoML optimizes when ranking models.
    "primary_metric": 'accuracy',
}

# Classification run on the remote cluster: predict the "survived" column of
# the registered tabular dataset.
automl_config = AutoMLConfig(
    compute_target=compute_target,
    task="classification",
    training_data=dataset,
    label_column_name="survived",
    debug_log="automl_errors.log",
    **automl_settings,
)

In [None]:
# Submit the AutoML configuration to the experiment; the returned run object
# is used by the following cells.
remote_run = experiment.submit(config=automl_config)

In [None]:
# Show the run-details widget for the submitted AutoML run.
run_details = RunDetails(remote_run)
run_details.show()

In [None]:
# get_output() returns a pair that is unpacked into the best run and its model.
best_run, model = remote_run.get_output()

In [None]:
# Bare expression: shows the best run as this cell's output.
best_run

## Clear Resources

In [None]:
# Delete the compute cluster so it does not linger after the experiment.
# A ComputeTargetException raised by delete() is reported instead of
# propagated, so this cell does not fail in that case.
try:
    compute_target.delete()
except ComputeTargetException:
    print("cpu-cluster Not Found")