# Installing kernels, packages and creating TXT and YAML files

# Choosing kernels

In [None]:
# List the Jupyter kernel specs that are currently registered
!jupyter kernelspec list

In [None]:
# Activate the preferred kernel environment.
# NOTE(review): `!` runs the command in its own shell process, so this activation
# presumably does not change the kernel this notebook is running on — the kernel is
# normally chosen in the notebook's kernel picker. Confirm and adjust if needed.
!conda activate python38-azureml
#!jupyter kernelspec cmd install python38-azureml

# Handling packages

In [None]:
# List the packages installed in the environment of the running kernel.
# %pip (unlike !pip) always uses the pip that belongs to the kernel's interpreter,
# so the listing matches what `import` will actually see.
%pip list

In [None]:
# Install a single package into the environment of the running kernel.
# %pip (unlike !pip) targets the kernel's interpreter rather than whichever
# pip happens to be first on the shell PATH.
%pip install pandas

Usually, however, it is better to list the packages to install in a requirements file:

In [None]:
%%writefile Day20-requirements.txt
###### Requirements without a pinned version ######
pandas
numpy
scikit-learn


# Uncomment the lines below if you want version-specific requirements
###### Requirements with package version ######
# NOTE(review): `python` itself is presumably not installable through pip — confirm before uncommenting
#python == 3.8             # exact version
#numpy ~= 1.21             # compatible release
#pip >= 21.2.4             # this version or later (minimum version)
#scikit-learn != 0.24.2    # any version except this one


In [None]:
# Install everything listed in the requirements file into the kernel's environment.
# %pip (unlike !pip) uses the pip that belongs to the running kernel's interpreter.
%pip install -r Day20-requirements.txt

# Creating folder structure

In [None]:
import os

# Folder that receives the environment definition files written below.
dependencies_dir = "./Day20-dependencies"

# exist_ok=True keeps this cell re-runnable when the folder is already there.
os.makedirs(name=dependencies_dir, exist_ok=True)


## Creating YAML files

In [None]:
%%writefile {dependencies_dir}/conda.yml
# Conda environment definition; the environment is named model-test-day20.
name: model-test-day20
channels:
  # conda-forge is the only channel listed.
  - conda-forge
dependencies:
  # Conda-managed packages (conda pins use a single "=").
  - python=3.8
  - numpy=1.21.2
  - pip=21.2.4
  - scikit-learn=0.24.2
  - scipy=1.7.1
  - pandas>=1.1,<1.2
  # Packages installed with pip inside the environment (pip-style specifiers).
  - pip:
    - xlrd==2.0.1
    # NOTE(review): stray space after "==" — presumably tolerated by pip, but
    # inconsistent with the other pins; confirm and normalise.
    - mlflow== 1.26.1
    - azureml-mlflow==1.42.0
    - psutil>=5.8,<5.9
    - tqdm>=4.59,<4.60
    - ipykernel~=6.0
    # matplotlib is the only entry without a version constraint.
    - matplotlib

## Creating Python scripts

In [None]:
import os

# Folder that receives the training script written in the next cell.
train_src_dir = "./Day20-src"

# exist_ok=True keeps this cell re-runnable when the folder is already there.
os.makedirs(name=train_src_dir, exist_ok=True)

In [None]:
%%writefile {train_src_dir}/main.py
import os
import argparse
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

def main():
    """Train a GradientBoostingClassifier on an Excel dataset and register it with MLflow.

    Command-line arguments:
        --data: path to the input Excel file (required).
        --test_train_ratio: value passed as ``test_size`` to ``train_test_split``
            (default 0.25).
        --n_estimators: ``n_estimators`` of the classifier (default 100).
        --learning_rate: ``learning_rate`` of the classifier (default 0.1).
        --registered_model_name: name used to register the model, as the MLflow
            artifact path, and as the parent folder of the saved model (required).

    Raises:
        SystemExit: raised by argparse when a required argument is missing or a
            value cannot be converted to its declared type.
    """

    # input and output arguments
    parser = argparse.ArgumentParser()
    # --data and --registered_model_name are required so that a missing value is
    # reported immediately instead of failing later (when reading the data, or -
    # after the whole training - when building the output path).
    parser.add_argument("--data", type=str, required=True, help="path to input data")
    parser.add_argument("--test_train_ratio", type=float, required=False, default=0.25)
    parser.add_argument("--n_estimators", required=False, default=100, type=int)
    parser.add_argument("--learning_rate", required=False, default=0.1, type=float)
    parser.add_argument(
        "--registered_model_name", type=str, required=True, help="model name"
    )
    args = parser.parse_args()

    # Start logging; the context manager ends the run even if a step below raises
    with mlflow.start_run():
        # enable autologging
        mlflow.sklearn.autolog()

        # Echo the effective arguments
        print(" ".join(f"{k}={v}" for k, v in vars(args).items()))

        print("input data:", args.data)

        # Load the data: header=1 takes the column labels from the second row,
        # index_col=0 uses the first column as the index
        credit_df = pd.read_excel(args.data, header=1, index_col=0)

        # One column is the label, hence "- 1" for the feature count
        mlflow.log_metric("num_samples", credit_df.shape[0])
        mlflow.log_metric("num_features", credit_df.shape[1] - 1)

        train_df, test_df = train_test_split(
            credit_df,
            test_size=args.test_train_ratio,
        )

        # Extracting the label column (pop removes it from the feature frame)
        y_train = train_df.pop("default payment next month")

        # convert the dataframe values to array
        X_train = train_df.values

        # Extracting the label column
        y_test = test_df.pop("default payment next month")

        # convert the dataframe values to array
        X_test = test_df.values

        print(f"Training with data of shape {X_train.shape}")

        clf = GradientBoostingClassifier(
            n_estimators=args.n_estimators, learning_rate=args.learning_rate
        )
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        print(classification_report(y_test, y_pred))

        # Registering the model to the workspace
        print("Registering the model via MLFlow")
        mlflow.sklearn.log_model(
            sk_model=clf,
            registered_model_name=args.registered_model_name,
            artifact_path=args.registered_model_name,
        )

        # Saving the model to a file
        mlflow.sklearn.save_model(
            sk_model=clf,
            path=os.path.join(args.registered_model_name, "trained_model"),
        )


if __name__ == "__main__":
    main()

## Creating test files

In [None]:
# Folder that receives the sample request file written in the next cell.
test_inference_dir = "./Day20-test"

# exist_ok=True keeps this cell re-runnable when the folder is already there.
os.makedirs(name=test_inference_dir, exist_ok=True)

In [None]:
%%writefile {test_inference_dir}/sample-request.json
{
  "input_data": {
    "columns": [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],
    "index": [0, 1],
    "data": [
            [200,2,2,1,24,2,2,-1,-1,-2,-2,33,31,69,0,0,0,0,689,0,0],
            [10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 10]
        ]
  }
}

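The sample request can then be sent to the deployed endpoint to test it (this assumes `ml_client` and `online_endpoint_name` are already defined in the session):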

In [None]:
# Send the sample request file to the "blue" deployment of the online endpoint.
# `ml_client` and `online_endpoint_name` are not defined in the visible cells;
# they must already exist in the kernel session when this cell runs.
ml_client.online_endpoints.invoke(
    deployment_name="blue",
    endpoint_name=online_endpoint_name,
    request_file="./Day20-test/sample-request.json",
)