In [None]:
import azureml.core
from azureml.core import Workspace

# Load the workspace via Workspace.from_config() and report the SDK version
# together with the name of the workspace this notebook is now bound to.
ws = Workspace.from_config()
banner = 'Ready to use Azure ML {} to work with {}'
print(banner.format(azureml.core.VERSION, ws.name))

In [None]:
from azureml.core import Dataset

# The workspace's default datastore receives the uploaded CSV.
default_ds = ws.get_default_datastore()

if 'car dataset' in ws.datasets:
    print('Dataset already registered.')
else:
    # Upload the local CSV into the 'car-data/' folder of the datastore,
    # replacing any existing file of the same name.
    default_ds.upload_files(
        files=['./data2/car_prediction2.csv'],
        target_path='car-data/',
        overwrite=True,
        show_progress=True,
    )

    # Build a tabular dataset over every CSV under 'car-data/'
    # (this may take a short while).
    csv_location = (default_ds, 'car-data/*.csv')
    tab_data_set = Dataset.Tabular.from_delimited_files(path=csv_location)

    # Register the dataset under the name checked above. A failure is printed
    # rather than re-raised, so the notebook keeps running.
    try:
        tab_data_set = tab_data_set.register(
            workspace=ws,
            name='car dataset',
            description='car data',
            tags={'format': 'CSV'},
            create_new_version=True,
        )
        print('Dataset registered.')
    except Exception as ex:
        print(ex)

In [None]:
# creating scripts for Pipeline 

import os,shutil

# Local folder that holds the scripts run by the pipeline steps.
pipeline_folder = 'car_pipeline2'

# exist_ok=True makes re-running this cell a no-op when the folder is already there.
os.makedirs(name=pipeline_folder, exist_ok=True)

print(pipeline_folder)

In [None]:
%%writefile $pipeline_folder/car_preprocessing.py
# Pipeline step 1: load the raw dataset attached to the run, encode the
# non-numeric columns, min-max scale the numeric feature columns and write the
# result to <processed-data>/data.csv.
import os
import argparse
import pandas as pd
from azureml.core import Run
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler,LabelEncoder


# Get parameters
parser = argparse.ArgumentParser()
# --input-data is accepted on the command line; the data itself is read from
# the run's named input 'raw_data' below.
parser.add_argument("--input-data", type=str, dest='raw_dataset_id', help='raw dataset')
parser.add_argument('--processed-data', type=str, dest='processed_data', default='processed_data', help='Folder for results')
args = parser.parse_args()
save_folder = args.processed_data

# Get the experiment run context
run = Run.get_context()

print("Loading Data...")
data = run.input_datasets['raw_data'].to_pandas_dataframe()

# Label-encode only the non-numeric columns.
# LabelEncoder maps each column's distinct values to integer codes; applying it
# to columns that are already numeric would replace their real magnitudes with
# rank indices before scaling, so numeric columns are left untouched here.
data1 = data.copy()
categorical_cols = data1.select_dtypes(exclude='number').columns
for col in categorical_cols:
    data1[col] = LabelEncoder().fit_transform(data1[col])

# Normalize the numeric columns to the [0, 1] range
scaler = MinMaxScaler()
num_cols = ['Selling_Price','Present_Price','Kms_Driven','car_age']
data1[num_cols] = scaler.fit_transform(data1[num_cols])

# Save the prepped data
print("Saving Data...")
os.makedirs(save_folder, exist_ok=True)
save_path = os.path.join(save_folder,'data.csv')
data1.to_csv(save_path, index=False, header=True)

# End the run
run.complete()

In [None]:
%%writefile $pipeline_folder/car_training.py
# Pipeline step 2: read the prepared data.csv from the training folder, train a
# decision tree classifier on it, log accuracy / AUC / ROC plot to the run and
# register the saved model in the workspace.
import argparse
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from azureml.core import Run,Model
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--training-folder", type=str, dest='training_folder', help='training data folder')
args = parser.parse_args()
training_folder = args.training_folder

# Get the experiment run context
run = Run.get_context()

# Load the prepared data file in the training folder
print("Loading Data...")
file_path = os.path.join(training_folder,'data.csv')
data = pd.read_csv(file_path)

# Separate features and label ('Owner' is the prediction target)
feature_cols = ['Selling_Price','Present_Price','Kms_Driven','Fuel_Type','Seller_Type','Transmission','car_age']
x = data[feature_cols].values
y = data['Owner'].values

# Split the dataset into training and testing sets (80/20, fixed seed)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=123)

# Train a decision tree model
print('Training a decision tree model...')
model = DecisionTreeClassifier().fit(x_train,y_train)

# Calculate accuracy
y_hat = model.predict(x_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
# Built-in float() is used because the np.float alias no longer exists in
# current NumPy releases.
run.log('Accuracy', float(acc))

# Calculate AUC
# NOTE(review): column 1 of predict_proba is used as the positive-class score,
# which presumes 'Owner' is a binary label -- confirm against the data.
y_scores = model.predict_proba(x_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', float(auc))

# Plot ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])
fig = plt.figure(figsize=(6, 4))
# Plot the diagonal 50% line
plt.plot([0, 1], [0, 1], 'k--')
# Plot the FPR and TPR achieved by our model
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
run.log_image(name = "ROC", plot = fig)
plt.show()

# Save the trained model in the outputs folder
print("Saving model...")
os.makedirs('outputs', exist_ok=True)
model_file = os.path.join('outputs', 'car_model3.pkl')
joblib.dump(value=model, filename=model_file)

# Register the model, recording the evaluation metrics as properties
print('Registering model...')
Model.register(workspace=run.experiment.workspace,
               model_path = model_file,
               model_name = 'car_model3',
               tags={'Training context':'Pipeline'},
               properties={'AUC': float(auc), 'Accuracy': float(acc)})


run.complete()

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "car-cluster"

try:
    # Look the compute target up by name; a missing target raises
    # ComputeTargetException, which routes us to the creation branch.
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # No such cluster yet: provision one and wait until it is ready.
    # Any provisioning error is printed rather than re-raised.
    try:
        compute_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_DS11_V2',  # alternative size noted by the author: STANDARD_D1
            max_nodes=2,
        )
        pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        pipeline_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        print(ex)

In [None]:
# Creating new enviroment for our model

from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration

# Define the Python environment the pipeline steps run in.
car_pipeline_env = Environment("car_pipeline_env")
car_pipeline_env.docker.enabled = True  # run inside a docker container
car_pipeline_env.python.user_managed_dependencies = False  # Azure ML manages the dependencies

# Conda and pip packages required by the step scripts.
packages = CondaDependencies.create(
    conda_packages=['scikit-learn', 'ipykernel', 'matplotlib', 'pandas', 'pip'],
    pip_packages=['azureml-defaults', 'azureml-dataprep[pandas]', 'pyarrow'],
)
car_pipeline_env.python.conda_dependencies = packages

# Register the environment in the workspace, then read the registered copy back.
car_pipeline_env.register(workspace=ws)
car_pipeline_env = Environment.get(ws, 'car_pipeline_env')

# Run configuration shared by the pipeline steps.
pipeline_run_config = RunConfiguration()

print(car_pipeline_env.name, 'defined.')

# Point the run configuration at the compute cluster and the environment.
pipeline_run_config.target = pipeline_cluster
pipeline_run_config.environment = car_pipeline_env

print ("Run configuration created.")

In [None]:
# Creating and running Pipeline

import pandas as pd
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

# Registered dataset that feeds the first step.
car_ds = ws.datasets.get("car dataset")

# Intermediate folder (PipelineData on the default datastore) that carries the
# prepared data from step 1 to step 2.
processed_data_folder = PipelineData("processed_data_folder", datastore=ws.get_default_datastore())

# Settings that are identical for both steps.
shared_step_settings = dict(
    source_directory=pipeline_folder,
    compute_target=pipeline_cluster,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)

# Step 1: run the data prep script; it writes into processed_data_folder.
processing_step = PythonScriptStep(
    name="Prepare Data",
    script_name="car_preprocessing.py",
    arguments=[
        '--input-data', car_ds.as_named_input('raw_data'),
        '--processed-data', processed_data_folder,
    ],
    outputs=[processed_data_folder],
    **shared_step_settings,
)

# Step 2: run the training script; it reads from processed_data_folder.
training_step = PythonScriptStep(
    name="Train and Register Model",
    script_name="car_training.py",
    arguments=['--training-folder', processed_data_folder],
    inputs=[processed_data_folder],
    **shared_step_settings,
)

print("Pipeline steps defined")

In [None]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Assemble the two steps into a pipeline.
pipeline_steps = [processing_step, training_step]
pipeline = Pipeline(steps=pipeline_steps, workspace=ws)
print("Pipeline is built.")

# Submit the pipeline under the 'car-pipeline' experiment.
experiment = Experiment(name='car-pipeline', workspace=ws)
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")

# Show the run widget, then block until the run finishes.
details_widget = RunDetails(pipeline_run)
details_widget.show()
pipeline_run.wait_for_completion(show_output=True)

In [None]:
# Print every metric logged by each child (step) run of the pipeline.
for run in pipeline_run.get_children():
    print(run.name, ':')
    metrics = run.get_metrics()
    for metric_name, metric_value in metrics.items():
        print('\t', metric_name, ":", metric_value)