In [1]:
import azureml
from azureml.core import Workspace
from azureml.core import Dataset

In [2]:
# Connect to the workspace described by the saved config file
ws = Workspace.from_config()
print(f'Ready to use Azure ML {azureml.core.VERSION} to work with {ws.name}')

Ready to use Azure ML 1.38.0 to work with projetcloud


In [3]:
# Get the workspace's default datastore (not used further in this cell)
default_ds = ws.get_default_datastore()

# Look up the dataset named "sample" in the workspace
sample_ds = ws.datasets["sample"]

# Load the whole dataset into a pandas DataFrame (this may take a short while).
# The Dataset object and the DataFrame now have distinct names instead of both
# being called `df`.
df = sample_ds.to_pandas_dataframe()

# Display the first 20 rows
df.head(20)

In [4]:
import os

# Local folder that holds the experiment files
experiment_folder = 'files'

# Create it when missing; an already existing folder is left untouched
os.makedirs(experiment_folder, exist_ok=True)
print(f'{experiment_folder} folder created')

files folder created


In [5]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "myCC1606"

try:
    # Reuse the compute target when one with this name already exists
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # No such compute target yet: provision a new cluster
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        pipeline_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # Report the problem, then re-raise: without a cluster the rest of the
        # notebook cannot run, and swallowing the error here would only surface
        # later as a NameError on `pipeline_cluster`.
        print(ex)
        raise

Found existing cluster, use it.


In [6]:
from azureml.core import Environment
from azureml.core.runconfig import RunConfiguration

# Build the experiment's Python environment from its conda specification file.
# NOTE(review): the .yml file is produced by a %%writefile cell of this notebook;
# that cell has to be executed before this one on a fresh kernel.
env_spec_path = experiment_folder + "/experiment_env.yml"
experiment_env = Environment.from_conda_specification(name="experiment_env", file_path=env_spec_path)

# Register the environment in the workspace, then fetch it back by name
experiment_env.register(workspace=ws)
registered_env = Environment.get(workspace=ws, name='experiment_env')

# Run configuration for the pipeline: it targets the compute cluster selected
# earlier and uses the registered environment
pipeline_run_config = RunConfiguration()
pipeline_run_config.target = pipeline_cluster
pipeline_run_config.environment = registered_env

print("Run configuration created.")

Run configuration created.


In [7]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep

# Get the training dataset and fail fast with an explicit message when it is
# missing: `ws.datasets.get` returns None for an unknown name, which would
# otherwise only show up below as an AttributeError on `ds.as_named_input`.
# (A previous version of this cell used the dataset name "Rhone Alpes Dataset".)
dataset_name = "sample"
ds = ws.datasets.get(dataset_name)
if ds is None:
    raise KeyError("Dataset '{}' was not found in workspace '{}'".format(dataset_name, ws.name))

# Create an OutputFileDatasetConfig (temporary Data Reference) for data passed from step 1 to step 2
prepped_data = OutputFileDatasetConfig("prepped_data")

# Step 1, run the data prep script.
# The dataset is passed as the named input 'raw_data', which is the key
# preprocessing.py reads from `run.input_datasets`.
prep_step = PythonScriptStep(name = "Prepare Data",
                                source_directory = experiment_folder,
                                script_name = "preprocessing.py",
                                arguments = ['--input-data', ds.as_named_input('raw_data'),
                                             '--prepped-data', prepped_data],
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

# Step 2, run the training script on the output of step 1.
# NOTE(review): training.py parses '--age' but does not use its value.
train_step = PythonScriptStep(name = "Train and Register Model",
                                source_directory = experiment_folder,
                                script_name = "training.py",
                                arguments = ['--training-data', prepped_data.as_input(), 
                                            '--age', 14],
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

print("Pipeline steps defined")

Pipeline steps defined


In [12]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Assemble the two steps into a pipeline
pipeline_steps = [prep_step, train_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit the pipeline as a run of the 'projetCloud-pipeline' experiment.
# regenerate_outputs=True asks for every step to be executed again rather than
# reusing outputs of a previous run.
experiment = Experiment(workspace=ws, name='projetCloud-pipeline')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")

# Show the monitoring widget, then block until the pipeline run has finished
run_widget = RunDetails(pipeline_run)
run_widget.show()
pipeline_run.wait_for_completion(show_output=True)

Pipeline is built.
Created step Prepare Data [1404e102][96fe04e3-74f1-427a-8476-1148a3c89aca], (This step will run and generate new outputs)
Created step Train and Register Model [311e7532][685573ab-ac2b-4e29-bdc1-38f0685a082d], (This step will run and generate new outputs)
Submitted PipelineRun dd98fe87-29bd-40da-a3df-b87c06c9b383
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/dd98fe87-29bd-40da-a3df-b87c06c9b383?wsid=/subscriptions/686d0f0b-ae0f-4e57-a751-af55cb836ef2/resourcegroups/projetcloud/workspaces/projetcloud&tid=190ce420-b157-44ae-bc2f-69563baa5a3b
Pipeline submitted for execution.
PipelineRunId: dd98fe87-29bd-40da-a3df-b87c06c9b383
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/dd98fe87-29bd-40da-a3df-b87c06c9b383?wsid=/subscriptions/686d0f0b-ae0f-4e57-a751-af55cb836ef2/resourcegroups/projetcloud/workspaces/projetcloud&tid=190ce420-b157-44ae-bc2f-69563baa5a3b


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRun Status: Running


StepRunId: 0e69d4da-8caa-4a29-b2db-bfbcc95a5f9a
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/0e69d4da-8caa-4a29-b2db-bfbcc95a5f9a?wsid=/subscriptions/686d0f0b-ae0f-4e57-a751-af55cb836ef2/resourcegroups/projetcloud/workspaces/projetcloud&tid=190ce420-b157-44ae-bc2f-69563baa5a3b
StepRun( Prepare Data ) Status: Running

StepRun(Prepare Data) Execution Summary
StepRun( Prepare Data ) Status: Finished
{'runId': '0e69d4da-8caa-4a29-b2db-bfbcc95a5f9a', 'target': 'myCC1606', 'status': 'Completed', 'startTimeUtc': '2022-03-09T02:00:26.0632Z', 'endTimeUtc': '2022-03-09T02:02:15.841495Z', 'services': {}, 'properties': {'ContentSnapshotId': 'db3a1d7e-f161-45d4-877f-9ecf318275e6', 'StepType': 'PythonScriptStep', 'ComputeTargetType': 'AmlCompute', 'azureml.moduleid': '96fe04e3-74f1-427a-8476-1148a3c89aca', 'azureml.moduleName': 'Prepare Data', 'azureml.runsource': 'azureml.StepRun', 'azureml.nodeid': '1404e102', 'azureml.pipelinerunid': 'dd98fe87-29b

'Finished'

In [8]:
%%writefile $experiment_folder/preprocessing.py
# Import libraries
import os
import argparse
from azureml.core import Run, Dataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import joblib


parser = argparse.ArgumentParser()
parser.add_argument("--input-data", type=str, dest='raw_dataset_id', help='raw dataset')
parser.add_argument('--prepped-data', type=str, dest='prepped_data', default='prepped_data', help='Folder for results')
args = parser.parse_args()
save_folder = args.prepped_data

# Get the experiment run context
run = Run.get_context()

# Get the training dataset
print("Loading Data...")
df = run.input_datasets['raw_data'].to_pandas_dataframe()

run.log('raw_df_len', len(df))

# remove some useless columns
df.date_mutation=pd.to_datetime(df.date_mutation)
to_drop = ["id_mutation","numero_disposition","adresse_numero","adresse_nom_voie","adresse_code_voie","code_postal",
           "adresse_suffixe","code_commune","nom_commune","code_departement","ancien_code_commune", "ancien_nom_commune",
           "id_parcelle","ancien_id_parcelle","type_local","nature_culture","nature_culture_speciale","code_nature_culture_speciale",
          "lot1_numero","lot2_numero","lot3_numero","lot4_numero","lot5_numero", "numero_volume", "lot3_surface_carrez", "lot4_surface_carrez",
          "lot5_surface_carrez"]

reduced_df = df.drop(to_drop, axis=1)

# get_dummies
reduced_df = pd.get_dummies(reduced_df, columns=["code_nature_culture", "nature_mutation"])

# feature engineering
reduced_df["year_mutation"] = reduced_df.date_mutation.dt.year
reduced_df["code_type_local"] = 5-reduced_df.code_type_local

reduced_df = reduced_df.drop("date_mutation",axis=1)

# manage missing values
final_df = reduced_df.fillna(reduced_df.mean())


# Log raw row count
row_count = (len(final_df))
run.log('processed_rows', row_count)

# Normalization
X = final_df.drop("valeur_fonciere", axis=1)
cols = X.columns

scaler = MinMaxScaler()
final_df[cols] = scaler.fit_transform(X)

run.log_list('nulls', list((final_df.isna()).sum(axis=0).values))

# Save the prepped data
print("Saving Data...")
os.makedirs(save_folder, exist_ok=True)
save_path = os.path.join(save_folder,'data.csv')
final_df.to_csv(save_path, index=False, header=True)

joblib.dump(value=scaler, filename='outputs/myscaler.scl')
joblib.dump(value=final_df.columns, filename='outputs/cols.cl')

# End the run
run.complete()

Overwriting files/preprocessing.py


In [9]:
%%writefile $experiment_folder/training.py
# Import libraries
import os
import argparse
from azureml.core import Run, Dataset
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


# Get the script arguments (regularization rate and training dataset ID)
parser = argparse.ArgumentParser()
parser.add_argument("--training-data", type=str, dest='training_data', help='training dataset')
parser.add_argument("--age", type=str, dest='age', help='age')
args = parser.parse_args()


# Get the experiment run context
run = Run.get_context()

training_data = args.training_data

# load the prepared data file in the training folder
print("Loading Data...")
file_path = os.path.join(training_data,'data.csv')
df = pd.read_csv(file_path)

run.log('final prep dataset len', len(df))
run.log('cols', len(df.columns))

# Separate features and labels
y = df.valeur_fonciere

run.log("nb null values", (df.isna()).sum(axis=0).sum())
X = df.drop("valeur_fonciere",axis=1)

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a logistic regression model
model = LinearRegression()
model.fit(X_train, y_train)

# calculate metrics
print('train score:', model.score(X_train, y_train))
print('test score:' , model.score(X_test, y_test))

run.log('train_score', model.score(X_train, y_train))
run.log('test_score', model.score(X_test, y_test))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/projetCloud_model.pkl')


print('Model trained and registered.')

run.complete()

Overwriting files/training.py


In [11]:
%%writefile $experiment_folder/experiment_env.yml
# Conda specification of the 'experiment_env' environment; the notebook loads
# this file with Environment.from_conda_specification.
name: experiment_env
dependencies:
# Packages installed by conda
- python=3.6.2
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
# Packages installed by pip
- pip:
  - azureml-defaults
  - pyarrow

Overwriting files/experiment_env.yml


In [None]:
# Register the artefacts produced by each pipeline step.
# Steps are recognised by the metrics they logged ('train_score' for the
# training step, 'processed_rows' for the data preparation step), so the result
# does not depend on the order in which get_children() yields the step runs.
for step_run in pipeline_run.get_children():
    step_metrics = step_run.get_metrics()
    print(step_metrics)
    if 'train_score' in step_metrics:
        # Training step: register the model together with its scores
        step_run.register_model(model_path='outputs/projetCloud_model.pkl', model_name='projetCloud_model',
                                properties={'train_score': step_metrics['train_score'],
                                            'test_score': step_metrics['test_score']})
    elif 'processed_rows' in step_metrics:
        # Data preparation step: register the fitted scaler and the column index
        step_run.register_model(model_path='outputs/myscaler.scl', model_name='projetCloud_scaler')
        step_run.register_model(model_path='outputs/cols.cl', model_name='projetCloud_df_cols')
    else:
        print('Skipping step run without recognised metrics:', step_run.id)

**Model Deployment**

In [28]:
import os
from azureml.core import Model
 

# Look up the three registered artefacts by name
model = ws.models['projetCloud_model']
scaler = ws.models['projetCloud_scaler']
df_columns = ws.models['projetCloud_df_cols']

print(f'{model.name} version {model.version}')

# Folder that holds the deployment files
deployment_folder = './service'
os.makedirs(deployment_folder, exist_ok=True)
print(f'{deployment_folder} folder created.')

# Name and full path of the scoring script inside that folder
script_file = 'projetCloud_api.py'
script_path = os.path.join(deployment_folder, script_file)

In [25]:
%%writefile $script_path
import json
import joblib
import numpy as np
import os

# Called when the service is loaded
def init():
    """Load the model file and keep it in the module-level ``model`` global.

    The file ``projetCloud_model.pkl`` is read from the directory named by the
    ``AZUREML_MODEL_DIR`` environment variable.
    """
    global model
    model_dir = os.getenv('AZUREML_MODEL_DIR')
    model = joblib.load(os.path.join(model_dir, 'projetCloud_model.pkl'))

# Called when a request is received
def run(raw_data):
    """Return the model's predictions for one scoring request.

    Args:
        raw_data: JSON string of the form ``{"data": [[...], ...]}`` with one
            inner list of feature values per sample.

    Returns:
        A JSON string holding the list of predicted values, one per sample.

    NOTE(review): the model was trained on min-max scaled features and no
    scaler is applied here, so callers presumably have to send features that
    are already scaled the same way - confirm.
    """
    # Get the input data as a numpy array
    data = np.array(json.loads(raw_data)['data'])
    # Get a prediction from the model
    predictions = model.predict(data)
    # The model is a regressor: return the numeric predictions themselves.
    # (Indexing a list of class names with these float values raised a
    # TypeError.) tolist() converts numpy values to plain JSON-serialisable
    # Python numbers.
    return json.dumps(np.asarray(predictions).tolist())

Writing ./service/projetCloud-api.py


In [26]:
from azureml.core import Environment
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice

# Scoring environment: a fresh environment with the pip packages listed below.
# NOTE(review): package versions are not pinned here - confirm that the
# scikit-learn version matches the one used to train and pickle the model.
service_env = Environment(name='service-env')
python_packages = ['scikit-learn', 'azureml-defaults', 'azure-ml-api-sdk']
for pip_package in python_packages:
    service_env.python.conda_dependencies.add_pip_package(pip_package)

# Tie together the deployment folder, the entry script and the environment
inference_config = InferenceConfig(
    source_directory=deployment_folder,
    entry_script=script_file,
    environment=service_env,
)

# Configure the web service container
deployment_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)

# Deploy the model as a service, overwriting a service of the same name.
# NOTE(review): the service name looks left over from another project -
# confirm before renaming, since existing consumers may rely on it.
print('Deploying model...')
service_name = "diabetes-service"
service = Model.deploy(
    workspace=ws,
    name=service_name,
    models=[model],
    inference_config=inference_config,
    deployment_config=deployment_config,
    overwrite=True,
)
service.wait_for_deployment(show_output=True)
print(service.state)

Error, invalid name "projetCloud-api" for provided entry script. The script must be importable as a valid python module, and must adhere to standard module naming. This means it must be a valid python identifier and not a part of the set of python standard keywords.



WebserviceException: WebserviceException:
	Message: Error, invalid name "projetCloud-api" for provided entry script. The script must be importable as a valid python module, and must adhere to standard module naming. This means it must be a valid python identifier and not a part of the set of python standard keywords.
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Error, invalid name \"projetCloud-api\" for provided entry script. The script must be importable as a valid python module, and must adhere to standard module naming. This means it must be a valid python identifier and not a part of the set of python standard keywords."
    }
}