# Applied Project in Big Data on Industrial Dataset

## MODELS SELECTION TECHNIQUES
## Part IV. MLflow framework to manage experiments

### 1. Install and start MLflow server

You can easily install [MLflow](https://mlflow.org/) for your tasks in the `DataScience environment` with the following script:

In [None]:
# print the contents of the MLflow start script
!cat /home/jovyan/__MANUAL/manutils/start-mlflow.sh

To run the installation, open a terminal and type `cd ~ && __MANUAL/manutils/start-mlflow.sh`; MLflow will then be installed.

### 2. Libraries

In [None]:
import os
import re
import json
import time
import random
import datetime
import numpy as np
import pandas as pd
from joblib import dump, load
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import multiprocessing
from multiprocessing import Pool
from sklearn.feature_extraction.text import (
    TfidfVectorizer, 
    CountVectorizer
)
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score, 
    confusion_matrix, 
    precision_score, 
    recall_score, 
    f1_score,
    roc_curve, 
    auc
)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier, 
    GradientBoostingClassifier
)
from sklearn.model_selection import (
    cross_val_score, 
    train_test_split,
    StratifiedKFold
)
import mlflow
from mlflow import log_metric, log_param, log_params, log_artifacts
from mlflow.models.signature import infer_signature

# show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Number of usable cores: the machine's CPU count capped by the
# `CPU_LIMIT` environment variable. The limit is parsed as a float and
# truncated, so a fractional limit below 1 would give 0 cores; the result
# is therefore clamped to at least 1. If `CPU_LIMIT` is not set at all,
# the machine's CPU count is used instead of raising `KeyError`.
N_CORES = max(1, min(
    multiprocessing.cpu_count(),
    int(float(os.environ.get('CPU_LIMIT', multiprocessing.cpu_count())))
))
print('cores:', N_CORES)

# file-based MLflow tracking store in a per-user folder
mlflow.set_tracking_uri(f'file:///home/jovyan/{os.environ["JUPYTERHUB_USER"]}_mlflow')
# NOTE(review): 50000 is presumably the port the MLflow UI is served on
# behind the JupyterHub proxy - confirm against the start script
print('MLflow UI available at:',
      'https://jhas01.gsom.spbu.ru{}proxy/{}/'.format(
          os.environ['JUPYTERHUB_SERVICE_PREFIX'], 50000))

### 2. Create config and place to store artifacts

It is a good idea to store all of an experiment's artifacts in one place:

In [None]:
start_time = time.time()

# here is our config dictionary
# we can use it to manage model's parameters
# and save it to disk for a history
VER = 'v1'
CONFIG = {
    'version': VER,
    'start_time': str(datetime.datetime.fromtimestamp(start_time)),
    'sample_size': 1500,
    'ngram_range': (1, 1), 
    'max_df': .95, 
    'min_df': 5,
    'clf': 'GradientBoostingClassifier', # 'RandomForestClassifier' or 'GradientBoostingClassifier'
    'folds': 4,
    'seed': 2023,
    'n_iters': 10,
    'comments': 'my first model'
}

# path to store our model:
# one folder per version, created on the first run and reused afterwards.
# `exist_ok=True` keeps the cell safe to re-run and avoids the
# check-then-create race of `os.path.exists` + `os.mkdir`
MDLS_PATH = f'./models_{VER}'
os.makedirs(MDLS_PATH, exist_ok=True)
# NOTE: JSON has no tuple type, so `ngram_range` is written as a list
with open(f'{MDLS_PATH}/config.json', 'w', encoding='utf-8') as file:
    json.dump(CONFIG, file)

# useful trick to fix randomness
def seed_all(seed):
    """
    Pin the random seeds of the current pipeline for reproducibility.

    Seeds Python's `random` module and NumPy's global generator with
    `seed`, and stores `str(seed)` in the `PYTHONHASHSEED` environment
    variable.

    Other libraries need their own call, e.g. for Tensorflow
    you may want to add `tf.random.set_seed(seed)` in the code.

    NOTE(review): `PYTHONHASHSEED` is normally read when the interpreter
    starts, so setting it here presumably only affects processes
    launched afterwards - confirm.

    """
    hash_seed = str(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = hash_seed
    np.random.seed(seed)

# apply the seed from the config before any sampling or training below
seed_all(CONFIG['seed'])

### 2. Dataset for modelling

In [None]:
# Load the articles and draw `sample_size` random rows.
# `random_state` ties the sample to the configured seed, so re-running
# this cell returns the same rows regardless of how much of the global
# random state was consumed before. `reset_index(drop=True)` renumbers
# the rows without first turning the old index into a helper column.
df = pd.read_csv('articles_data.csv')
df = df.sample(
    CONFIG['sample_size'],
    random_state=CONFIG['seed']
).reset_index(drop=True)
print(df.shape)
display(df.head())

In [None]:
# class balance: count of non-null values in every column per `target` value
df.groupby('target').count()

In [None]:
# not necessary but can be helpful
# to reproduce experiments:
# keep a copy of the sampled data next to the model artifacts
# NOTE(review): `to_csv` is called with its defaults, so the row index is
# presumably written as an extra unnamed first column - confirm it is wanted
save_data_path = f'{MDLS_PATH}/data_{CONFIG["version"]}.csv'
df.to_csv(save_data_path)

### 3. Modelling with save of results

In [None]:
def text_features(data, vectorizer):
    """
    Fit `vectorizer` on `data` and return the resulting features.

    Prints the number of texts and the shape, maximum and minimum
    of the feature matrix. Returns the tuple `(features, vectorizer)`,
    where the vectorizer is the same object that was passed in,
    after its `fit_transform` call.

    """
    n_texts = len(data)
    print('total texts:', n_texts)
    matrix = vectorizer.fit_transform(data)
    shape, largest, smallest = matrix.shape, np.max(matrix), np.min(matrix)
    print('features shape:', shape, 'max:', largest, 'min:', smallest)
    return matrix, vectorizer

In [None]:
def cross_val_model(X, y, 
                    folds, clf,
                    vectorizer, ngram_range=(1, 1), 
                    max_df=.2, min_df=8, seed=2022):
    """
    Run stratified K-fold cross-validation and track every fold in MLflow.

    For each fold a separate MLflow run is started. Inside it the
    vectorizer is fitted on the training texts, the classifier is fitted
    on the resulting features, both objects are dumped to `MDLS_PATH`,
    ROC-AUC and f1-score are computed on the held-out part, and the
    model, the vectorizer file, the config and the metrics are logged
    to MLflow. A line with the fold's metrics is also appended to
    a plain-text log file in `MDLS_PATH`.

    The function relies on the module-level `CONFIG` and `MDLS_PATH`.

    Parameters:
        X, y        - texts and labels, indexed with `.iloc`
                      (e.g. pandas Series)
        folds       - number of splits for `StratifiedKFold`
        clf         - classifier with `fit`, `predict` and `predict_proba`;
                      it is refitted in place on every fold
        vectorizer  - text vectorizer with `fit_transform` and `transform`;
                      it is refitted in place on every fold
        ngram_range, max_df, min_df
                    - not used by the function body; kept so that existing
                      calls stay valid (the vectorizer is configured
                      by the caller)
        seed        - random state for the shuffled fold split

    Returns:
        dict with one entry per fold (`'fold 0'`, `'fold 1'`, ...);
        each entry holds the lists `'roc_auc_scores'` and `'f1_scores'`
        with the scores collected up to and including that fold,
        so the last entry contains the scores of all folds.

    NOTE(review): ROC-AUC is computed from column 1 of `predict_proba`
    and `f1_score` is called with its defaults, so the target is
    presumably binary - confirm.

    """
    scores = {}
    roc_auc_scores = []
    f1_scores = []
    skf = StratifiedKFold(n_splits=folds, random_state=seed, shuffle=True)
    for fold, (train_idxs, test_idxs) in enumerate(skf.split(X, y)):
        
        # MLflow run initialization
        name_of_run = f'run_model_{CONFIG["version"]}_{datetime.datetime.now()}'
        with mlflow.start_run(run_name=name_of_run):
            
            # train model:
            # the vectorizer is fitted on the training part only
            # and then applied to the held-out part
            X_train, X_test = X.iloc[train_idxs], X.iloc[test_idxs]
            y_train, y_test = y.iloc[train_idxs], y.iloc[test_idxs]
            X_train, vectorizer = text_features(
                X_train, 
                vectorizer=vectorizer
            )
            X_test = vectorizer.transform(X_test)
            clf.fit(X_train, y_train)

            # saving models
            # more about https://scikit-learn.org/stable/model_persistence.html
            # NOTE - not only model, but vectorizer too!
            # separate names for the two files, so that the artifact
            # logging below does not depend on assignment order
            model_path = f'{MDLS_PATH}/model_fold_{fold}.joblib'
            dump(clf, model_path)
            print('saved to', model_path)
            vectorizer_path = f'{MDLS_PATH}/vectorizer_fold_{fold}.joblib'
            dump(vectorizer, vectorizer_path)
            print('saved to', vectorizer_path)

            # metrics
            y_score = clf.predict_proba(X_test)
            roc_auc_score_ = roc_auc_score(y_test, y_score[:, 1])
            roc_auc_scores.append(roc_auc_score_)
            y_pred = clf.predict(X_test)
            f1_score_ = f1_score(y_test, y_pred)
            f1_scores.append(f1_score_)
            msg = f'fold {fold} - val ROC-AUC score: {roc_auc_score_:.2f}, val f1-score: {f1_score_:.2f}'
            print(msg)

            # store copies of the running lists: without the copy every
            # fold entry would point to the same two list objects and
            # would be changed retroactively by the following folds
            scores[f'fold {fold}'] = {
                'roc_auc_scores': list(roc_auc_scores),
                'f1_scores': list(f1_scores)
            }

            # MLflow tracking model
            signature = infer_signature(X_test, y_pred)
            mlflow.sklearn.log_model(
                clf, 
                'model',
                registered_model_name=f'model {CONFIG["version"]} {CONFIG["clf"]} fold {fold}', 
                signature=signature
            )
            mlflow.log_artifact(
                local_path=vectorizer_path,
                artifact_path='vectorizer',
            )

            # MLflow tracking config (all parameters in one call)
            mlflow.log_params(CONFIG)

            # MLflow tracking metrics
            mlflow.log_metric('ROC-AUC score', roc_auc_score_)
            mlflow.log_metric('f1-score', f1_score_)

            # logging (if needed)
            # read about logging for production cases here https://docs.python.org/3/library/logging.html
            log_file_name = f'{MDLS_PATH}/model_{CONFIG["version"]}.log'
            log_string = f'{str(datetime.datetime.fromtimestamp(time.time()))} - {msg}\n'
            with open(log_file_name, 'a', encoding='utf-8') as file:
                file.write(log_string)
        
    return scores

In [None]:
# build the TF-IDF vectorizer from
# the parameters stored in the config
vectorizer = TfidfVectorizer(
    min_df=CONFIG['min_df'],
    max_df=CONFIG['max_df'],
    ngram_range=CONFIG['ngram_range']
)

# pick the estimator named in the config:
# the two ensemble models take `n_iters` as the number of estimators,
# any other name falls back to logistic regression
ensemble_models = {
    'RandomForestClassifier': RandomForestClassifier,
    'GradientBoostingClassifier': GradientBoostingClassifier
}
model_name = CONFIG['clf']
if model_name in ensemble_models:
    clf = ensemble_models[model_name](n_estimators=CONFIG['n_iters'])
else:
    clf = LogisticRegression()

# run the cross-validated training
# on the `proc` texts and the `target` labels
scores = cross_val_model(
    X=df['proc'],
    y=df['target'],
    folds=CONFIG['folds'],
    clf=clf,
    vectorizer=vectorizer,
    seed=CONFIG['seed']
)

In [None]:
# show the scores dictionary returned by `cross_val_model`
print(scores)

### 5. Fetching a model from the MLflow model registry

In [None]:
# find `run_id` in MLflow interface
run_id = 'af4c1f9f717a485eba82aad544c7aecd'

# Build the artifact location from the same per-user folder that was
# passed to `mlflow.set_tracking_uri`, instead of a hard-coded user name,
# so the cell works for whoever runs the notebook.
# NOTE(review): `0` is presumably the id of the default experiment
# in the file store - confirm in the MLflow interface
mlflow_dir = f'/home/jovyan/{os.environ["JUPYTERHUB_USER"]}_mlflow'
artifacts_dir = f'{mlflow_dir}/0/{run_id}/artifacts'

model = mlflow.pyfunc.load_model(
    model_uri=f'file://{artifacts_dir}/model/'
)

# the vectorizer was logged as a single file into the `vectorizer`
# artifact folder; take the first file found there
vectorizer_path = f'{artifacts_dir}/vectorizer/'
vectorizer_files = sorted(os.listdir(vectorizer_path))
if not vectorizer_files:
    raise FileNotFoundError(f'no vectorizer artifact found in {vectorizer_path}')
vectorizer_path = vectorizer_path + vectorizer_files[0]
vectorizer = load(vectorizer_path)

In [None]:
# it is not a good idea to test our models
# on the data they were trained on
# but it is just a demo of the approach
df_tmp = pd.read_csv('articles_data.csv')
# `drop=True` renumbers the sampled rows without creating (and then
# deleting) a helper `index` column
df_tmp = df_tmp.sample(100).reset_index(drop=True)
print(df_tmp.shape)
display(df_tmp.head())

In [None]:
# vectorize the texts with the vectorizer loaded above
# (`transform` only, the vectorizer is not refitted)
data = vectorizer.transform(df_tmp['proc'])
# NOTE(review): for a scikit-learn classifier loaded through `mlflow.pyfunc`,
# `predict` presumably returns class labels rather than probabilities,
# so `y_score` likely holds hard predictions - confirm
y_score = model.predict(data)

In [None]:
# attach the model outputs to the sampled rows for inspection
df_tmp['predictions'] = y_score
print(df_tmp.shape)
display(df_tmp.head())

In [None]:
# ROC-AUC of the stored predictions against the `target` column
# NOTE(review): if `predictions` holds class labels rather than scores,
# this value is not comparable with the ROC-AUC logged during training,
# which is computed from `predict_proba` - confirm
roc_auc_score(df_tmp.target, df_tmp.predictions)