In [1]:
# to read the local environment variables and secret keys
import os

import uuid

# to track the model using Azure MLStudio
import mlflow
import mlflow.sklearn

from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment

from azureml.core import Experiment, Workspace
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# import 'Pandas'
import pandas as pd

# import 'Numpy'
import numpy as np

# import the KNN Imputer class
from sklearn.impute import KNNImputer

knn_imputer = KNNImputer()

# Import label encoder
from sklearn.preprocessing import LabelEncoder, StandardScaler

label_encoder = LabelEncoder()
scaler = StandardScaler()

# to suppress warnings
from warnings import filterwarnings

filterwarnings("ignore")

# import train-test split
from sklearn.model_selection import train_test_split

# import StandardScaler to perform scaling
from sklearn.preprocessing import StandardScaler

# import various functions from sklearn
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

os.environ["GIT_PYTHON_REFRESH"] = "quiet"

In [2]:
kidney_disease_df = pd.read_csv("chronic_kidney_disease_full.csv", usecols=range(1, 26))
# Columns with nominal data
categorical_cols = [
    "sg",
    "al",
    "su",
    "rbc",
    "pc",
    "pcc",
    "ba",
    "htn",
    "dm",
    "cad",
    "appet",
    "pe",
    "ane",
    "class",
]

# Columns with numerical data
non_categorical_cols = [
    "age",
    "bp",
    "ba",
    "bgr",
    "bu",
    "sc",
    "sod",
    "pot",
    "hemo",
    "pcv",
    "wbcc",
    "rbcc",
]


def encode(data):
    """function to encode non-null data"""

    data_no_null = np.array(data.dropna())  # retains only non-null values

    encoded_data = label_encoder.fit_transform(data_no_null)  # encode date

    data.loc[data.notnull()] = np.squeeze(
        encoded_data
    )  # Assign back encoded values to non-null values

    return data


def impute(data, col):
    """function to impute null data"""

    result = knn_imputer.fit_transform(data)

    if col in categorical_cols:
        return result.astype(int)

    return np.round(result, 2)


# Rectifying column names
kidney_disease_df.columns = kidney_disease_df.columns.str.replace("'", "")

# Replacing missing values i.e. '?' with NAN
kidney_disease_df = kidney_disease_df.replace("?", np.NaN)


kidney_disease_df[categorical_cols] = kidney_disease_df[categorical_cols].apply(encode)

kidney_disease_df[categorical_cols] = kidney_disease_df[categorical_cols].astype(
    "category"
)


for col in kidney_disease_df.columns:

    kidney_disease_df[[col]] = impute(kidney_disease_df[[col]], col)


# Fit and transform the scaler to the training data
kidney_disease_df[non_categorical_cols] = scaler.fit_transform(
    kidney_disease_df[non_categorical_cols]
)


kidney_disease_df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,-0.205459,0.262336,3,1,0,0,1,0,-0.241249,-0.361993,...,0.62847,-0.240518,0.5859,1,1,0,0,0,0,0
1,-2.623805,-1.966582,3,4,0,0,1,0,-0.241249,4.2e-05,...,-0.108551,-0.954786,0.002055,0,0,0,0,0,0,0
2,0.620318,0.262336,1,2,3,1,1,0,-0.241249,3.681436,...,-0.968408,-0.359563,0.002055,0,1,0,1,0,1,0
3,-0.205459,-0.480637,0,4,0,1,0,1,-0.241249,-0.415548,...,-0.845571,-0.677015,-0.963076,1,0,0,1,1,1,0
4,-0.028507,0.262336,1,2,0,1,1,0,-0.241249,-0.562825,...,-0.477061,-0.438926,-0.129012,0,0,0,0,0,0,0


In [3]:
# split data into train subset and test subset
X_train, X_test, y_train, y_test = train_test_split(
    kidney_disease_df.drop(columns="class", axis=1),
    kidney_disease_df["class"],
    test_size=0.2,
    random_state=42,
    stratify=kidney_disease_df["class"],
)

In [4]:
# authenticate
credential = DefaultAzureCredential()

# Authenticate your tenant to connect to Microsoft CLI
# from azureml.core.authentication import InteractiveLoginAuthentication

# interactive_auth = InteractiveLoginAuthentication(
#     tenant_id="d02378ec-1688-46d5-8540-1c28b5f470f6", force=True
# )

# Get a handle to the workspace
ml_client = MLClient(
    credential=credential,
    subscription_id="a1f8741c-055a-432c-9d86-a66323c2706d",
    resource_group_name="task3p-test-rg",
    workspace_name="task3p-test-ws",
)

ws = Workspace.from_config("config.json")
experiment = Experiment(
    workspace=ws, name="test-Task3-kidney-prediction-using-python-sdk"
)
experiment

Name,Workspace,Report Page,Docs Page
test-Task3-kidney-prediction-using-python-sdk,task3p-test-ws,Link to Azure Machine Learning studio,Link to Documentation


In [5]:
# set name for logging
mlflow.set_experiment(
    experiment_name="s224207854-Task3-kidney-prediction-using-python-sdk"
)

# enable autologging with MLflow
mlflow.sklearn.autolog()

In [9]:
mlflow.start_run()
run = experiment.start_logging()

random_forest_classifier = RandomForestClassifier(
    criterion="gini", max_depth=7, n_estimators=20, random_state=42
)

run.log("num_samples", kidney_disease_df.shape[0])
run.log("num_features", kidney_disease_df.shape[1] - 1)
run.log("criterion", "gini")

# fit the model using fit() on train data
random_forest = random_forest_classifier.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

print("\nClassification Matrix:\n", classification_report(y_test, y_pred))

# Logging all metrics of classification_report
cr = classification_report(y_test, y_pred, output_dict=True)

run.log("accuracy", cr.pop("accuracy"))

for class_or_avg, metrics_dict in cr.items():
    for metric, value in metrics_dict.items():
        run.log(class_or_avg + "_" + metric, value)

model_name = "random_forest_kidney_pred"

print("Registering the model via MLFlow")
mlflow.sklearn.log_model(
    sk_model=random_forest_classifier,
    registered_model_name=model_name,
    artifact_path=model_name,
)

# Saving the model to a file
mlflow.sklearn.save_model(
    sk_model=random_forest_classifier,
    path=os.path.join(model_name, "trained_model"),
)
model = run.register_model(
    model_name=model_name,
    tags={"area": "qna"},
    model_path=os.path.join(model_name, "trained_model"),
)
run.complete()

mlflow.end_run()


Classification Matrix:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        50
           1       1.00      0.97      0.98        30

    accuracy                           0.99        80
   macro avg       0.99      0.98      0.99        80
weighted avg       0.99      0.99      0.99        80

Registering the model via MLFlow


Registered model 'random_forest_kidney_pred' already exists. Creating a new version of this model...
Created version '2' of model 'random_forest_kidney_pred'.


In [10]:
# Import the necessary libraries
from azure.ai.ml.entities import Model
from azure.ai.ml.constants import AssetTypes

# Provide the model details, including the
# path to the model files, if you've stored them locally.
mlflow_model = Model(
    path="random_forest_kidney_pred/",
    type=AssetTypes.MLFLOW_MODEL,
    name="kideny_model",
    description="MLflow Model created from local files.",
)

# Register the model
ml_client.models.create_or_update(mlflow_model)

DefaultAzureCredential failed to retrieve a token from the included credentials.
Attempted credentials:
	EnvironmentCredential: EnvironmentCredential authentication unavailable. Environment variables are not fully configured.
Visit https://aka.ms/azsdk/python/identity/environmentcredential/troubleshoot to troubleshoot this issue.
	ManagedIdentityCredential: ManagedIdentityCredential authentication unavailable, no response from the IMDS endpoint.
	SharedTokenCacheCredential: SharedTokenCacheCredential authentication unavailable. No accounts were found in the cache.
	AzureCliCredential: Azure CLI not found on path
	AzurePowerShellCredential: Az.Account module >= 2.2.0 is not installed
	AzureDeveloperCliCredential: Azure Developer CLI could not be found. Please visit https://aka.ms/azure-dev for installation instructions and then,once installed, authenticate to your Azure account using 'azd auth login'.
To mitigate this issue, please refer to the troubleshooting guidelines here at https:/

ClientAuthenticationError: DefaultAzureCredential failed to retrieve a token from the included credentials.
Attempted credentials:
	EnvironmentCredential: EnvironmentCredential authentication unavailable. Environment variables are not fully configured.
Visit https://aka.ms/azsdk/python/identity/environmentcredential/troubleshoot to troubleshoot this issue.
	ManagedIdentityCredential: ManagedIdentityCredential authentication unavailable, no response from the IMDS endpoint.
	SharedTokenCacheCredential: SharedTokenCacheCredential authentication unavailable. No accounts were found in the cache.
	AzureCliCredential: Azure CLI not found on path
	AzurePowerShellCredential: Az.Account module >= 2.2.0 is not installed
	AzureDeveloperCliCredential: Azure Developer CLI could not be found. Please visit https://aka.ms/azure-dev for installation instructions and then,once installed, authenticate to your Azure account using 'azd auth login'.
To mitigate this issue, please refer to the troubleshooting guidelines here at https://aka.ms/azsdk/python/identity/defaultazurecredential/troubleshoot.

In [9]:
# Create a unique name for the endpoint
online_endpoint_name = "kidney-endpoint-" + str(uuid.uuid4())[:8]


# define an online endpoint
endpoint = ManagedOnlineEndpoint(
    name=online_endpoint_name,
    description="this is an online endpoint",
    auth_mode="key",
    tags={
        "training_dataset": "kidney_defaults",
    },
)

# create the online endpoint
# expect the endpoint to take approximately 2 minutes.
endpoint = ml_client.online_endpoints.begin_create_or_update(endpoint).result()

In [10]:
endpoint = ml_client.online_endpoints.get(name=online_endpoint_name)

print(
    f'Endpoint "{endpoint.name}" with provisioning state "{endpoint.provisioning_state}" is retrieved'
)

Endpoint "kidney-endpoint-f498cfb3" with provisioning state "Succeeded" is retrieved


In [11]:
registered_model_name = "random_forest_kidney_pred"

# Let's pick the latest version of the model
latest_model_version = max(
    [int(m.version) for m in ml_client.models.list(name=registered_model_name)]
)
latest_model_version

2

In [12]:
# Choose the latest version of our registered model for deployment
model = ml_client.models.get(name=registered_model_name, version=latest_model_version)

# define an online deployment
online_blue_deployment = ManagedOnlineDeployment(
    name="blue",
    endpoint_name=online_endpoint_name,
    model=model,
    instance_type="Standard_F4s_v2",
    instance_count=1,
)

# create the online deployment
blue_deployment = ml_client.online_deployments.begin_create_or_update(
    online_blue_deployment
).result()

Check: endpoint kidney-endpoint-f498cfb3 exists


.....................................................................................................

In [13]:
# blue deployment takes 100% traffic
# expect the deployment to take approximately 8 to 10 minutes.
endpoint.traffic = {"blue": 100}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

Readonly attribute principal_id will be ignored in class <class 'azure.ai.ml._restclient.v2022_05_01.models._models_py3.ManagedServiceIdentity'>
Readonly attribute tenant_id will be ignored in class <class 'azure.ai.ml._restclient.v2022_05_01.models._models_py3.ManagedServiceIdentity'>


ManagedOnlineEndpoint({'public_network_access': 'Enabled', 'provisioning_state': 'Succeeded', 'scoring_uri': 'https://kidney-endpoint-f498cfb3.centralindia.inference.ml.azure.com/score', 'openapi_uri': 'https://kidney-endpoint-f498cfb3.centralindia.inference.ml.azure.com/swagger.json', 'name': 'kidney-endpoint-f498cfb3', 'description': 'this is an online endpoint', 'tags': {'training_dataset': 'kidney_defaults'}, 'properties': {'azureml.onlineendpointid': '/subscriptions/a1f8741c-055a-432c-9d86-a66323c2706d/resourcegroups/s224207854_task3_rg/providers/microsoft.machinelearningservices/workspaces/s224207854_task3_ws/onlineendpoints/kidney-endpoint-f498cfb3', 'AzureAsyncOperationUri': 'https://management.azure.com/subscriptions/a1f8741c-055a-432c-9d86-a66323c2706d/providers/Microsoft.MachineLearningServices/locations/centralindia/mfeOperationsStatus/oe:3d47cddc-6c4f-44be-af55-100fec43efed:dbdc6602-2037-4ff2-8223-2ec165447dc6?api-version=2022-02-01-preview'}, 'print_as_yaml': True, 'id': 

In [14]:
# return an object that contains metadata for the endpoint
endpoint = ml_client.online_endpoints.get(name=online_endpoint_name)

# print a selection of the endpoint's metadata
print(
    f"Name: {endpoint.name}\nStatus: {endpoint.provisioning_state}
    \nDescription: {endpoint.description}"
)

# existing traffic details
print(endpoint.traffic)

# Get the scoring URI
print(endpoint.scoring_uri)

Name: kidney-endpoint-f498cfb3
Status: Succeeded
Description: this is an online endpoint
{'blue': 100}
https://kidney-endpoint-f498cfb3.centralindia.inference.ml.azure.com/score
