In [1]:
import os
import warnings
import yaml

from so_tag_classifier_core import (text_prepare, binarize_ys, tokenize_and_stem, transform_y)

import dill
import mlflow
from mlflow.models.signature import infer_signature
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, accuracy_score, average_precision_score, 
                             f1_score, precision_score, make_scorer)
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.multioutput import ClassifierChain
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

warnings.filterwarnings("ignore")

### Read configuration files

In [2]:
# Load notebook settings. An empty YAML document parses to None, so fall back
# to an empty dict to keep the .get() lookups below working.
with open("config.yaml") as f:
    configs = yaml.safe_load(f) or {}

# Export the MLflow credentials only when they are present in the config:
# assigning None to os.environ raises TypeError, and environment values must
# be strings (YAML may parse a purely numeric password as an int).
for _credential_key in ("MLFLOW_TRACKING_USERNAME", "MLFLOW_TRACKING_PASSWORD"):
    _credential_value = configs.get(_credential_key)
    if _credential_value is not None:
        os.environ[_credential_key] = str(_credential_value)

TRACKING_URI = configs.get("TRACKING_URI")
BUCKET = configs.get("BUCKET")

### MLflow config

In [3]:
# Experiment that every run in this notebook is recorded under.
# NOTE(review): the name reads like a misspelling of "stackoverflow"; it is
# kept verbatim because a different name would select a different experiment.
EXPERIMENT_NAME = 'stackoverlow-classifier'

# Point the MLflow client at the configured tracking server, select the
# experiment, and switch on scikit-learn autologging.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.sklearn.autolog()

### Extract conda env

This environment file will later be passed on to the model definition directory.

In [4]:
# Conda environment specification that is attached to the logged pyfunc model
# (passed as `conda_env=` to mlflow.pyfunc.log_model further down).
conda_env = {
    'name': 'so-classifier-env',
    # 'conda-forge' was previously misspelled as 'conda-firge'.
    'channels': ['defaults', 'anaconda', 'conda-forge'],
    'dependencies': [
        'python=3.7.9',
        'scikit-learn>=0.21.3',
        'pip=20.3.1',
        'setuptools=51.0.0',
        {
            # pip requirement specifiers pin with '==' (a single '=' is
            # conda syntax and is rejected by pip).
            'pip': [
                'boto3==1.16.34',
                'cloudpickle==1.6.0',
                'dill==0.3.3',
                'mlflow==1.12.1',
                # NOTE(review): 'stopwords' and 'punkt' look like NLTK data
                # packages rather than pip extras -- confirm these resolve.
                'nltk[stopwords]==3.5',
                'nltk[punkt]==3.5',
                'numpy==1.19.4',
                'pandas==1.1.5',
                'scikit-learn==0.23.2',
                'scikit-optimize==0.8.1',
                'scipy==1.5.4',
            ]
        },
    ],
}

### Custom Python class 

This custom class lets us extract multiple artifacts from the `model` registry, not just the `classifier`. Also, it allows us to define a custom inference function (`.predict`), which will also transform the output data from a 100-element long matrix to just the labels we want to predict, along with their corresponding probabilities. 

In [5]:
class MultiLabelClassifierPipelineWrapper(mlflow.pyfunc.PythonModel):
    """
    Pyfunc wrapper that bundles the fitted pipeline with its label binarizer.

    `load_context` restores both artifacts; `predict` returns, per input text,
    only the predicted labels together with their probabilities.
    """

    @staticmethod
    def _load_artifact(path):
        """Deserialize one dill-pickled artifact from a local file path."""
        # dill.load expects an open binary file object, not a path string.
        # NOTE: unpickling executes arbitrary code -- only load artifacts that
        # come from a trusted model registry.
        with open(path, "rb") as artifact_file:
            return dill.load(artifact_file)

    def load_context(self, context):
        """
        Restore the binarizer and the pipeline from the model's artifacts.

        `context.artifacts` maps each artifact name to the local path it was
        downloaded to, so every entry has to be opened before unpickling.
        """
        self.binarizer = self._load_artifact(context.artifacts["binarizer"])
        self.pipeline = self._load_artifact(context.artifacts["pipeline"])

    def predict(self, context, document):
        """
        Make a label prediction for an arbitrary number of documents/texts

        Parameters
        ----------
        context : unused here; supplied by MLflow.
        document : object exposing a `text` attribute with a `.tolist()`
            method (e.g. a DataFrame with a `text` column).

        Returns
        -------
        list of dict
            One dict per input text mapping each predicted label to the
            probability the pipeline assigned to it.
        """
        vals = document.text.tolist()
        raw_preds = self.pipeline.predict(vals)
        preds = self.binarizer.inverse_transform(raw_preds)

        probs = self.pipeline.predict_proba(vals)
        classes = self.binarizer.classes_

        to_return = []
        for pred, prob in zip(preds, probs):
            probs_dict = dict(zip(classes, prob))
            # Keep only the labels that were actually predicted for this text.
            to_return.append({label: probs_dict[label] for label in probs_dict if label in pred})
        return to_return

### Helper functions

In [6]:
def eval_metrics(y_test, y_preds):
    """
    Score predictions against the ground truth with four scikit-learn metrics.

    Both arguments are forwarded unchanged to the metric functions. F1 and
    precision use weighted averaging; accuracy and average precision use the
    library defaults.

    Returns a dict with the keys "accuracy", "f1", "avg_precision" and
    "precision".
    """
    scores = {}
    scores["accuracy"] = accuracy_score(y_test, y_preds)
    scores["f1"] = f1_score(y_test, y_preds, average="weighted")
    scores["avg_precision"] = average_precision_score(y_test, y_preds)
    scores["precision"] = precision_score(y_test, y_preds, average="weighted")
    return scores

## Training

In [7]:
# Location of the cleaned training CSV. Set the SO_DATA_FILE environment
# variable to run the notebook on another machine; the original local path is
# kept as the fallback so existing setups behave exactly as before.
data_file = os.environ.get(
    "SO_DATA_FILE",
    "/Users/tania/tvasil/stackoverflow-topic-classifier/data/full_body_clean.csv",
)

### Read and pre-process data

In [9]:
# Load the dataset; the 'text' column holds the inputs and 'tags' the labels.
df = pd.read_csv(data_file)
#df = df.sample(10000)

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
texts = df['text'].values
tags = df['tags'].values
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    tags,
    test_size=0.2,
    random_state=42,
)
#X_train_cleaned = pd.Series(text_prepare(X_train, " "))

# binarize_ys hands back the binarizer plus the encoded train and test labels.
binarizer, y_train_binarized, y_test_binarized = binarize_ys(y_train, y_test)

In [25]:
# def loss_fn(y_target, y_prediction):
#     """
#     Custom loss function that maximizes the precision of the model
#     """
#     return 1.0 - precision_score(y_target, y_prediction)

In [26]:
# estim = HyperoptEstimator(classifier=one_vs_rest('clf', estimator=svc('estimator')),  
#                           preprocessing=[tfidf('tfidf', lowercase=True)],
#                           algo=tpe.suggest, 
#                           loss_fn=loss_fn,
#                           trial_timeout=300, 
#                           verbose=True)
# estim.fit(X_train_cleaned, y_train_binarized)

In [11]:
# Search space for BayesSearchCV. Keys address the Pipeline steps ("tfidf",
# "clf") through scikit-learn's `step__param` naming.
bayes_search_space = {
    "tfidf__min_df": Integer(5, 100),
    "tfidf__max_df": Real(0.5, 0.99, prior='log-uniform'),
    # Single-element list: the classifier is fixed, only its parameters vary.
    "clf": [ClassifierChain(LogisticRegression(random_state=42,
                                               dual=False,
                                               solver="liblinear",
                                               max_iter=1000),
                            cv=3)],
    # C ranges over ~11 orders of magnitude. With a uniform prior practically
    # every sample would fall above 1e3, leaving strong regularisation
    # unexplored; a log-uniform prior spreads samples evenly across magnitudes.
    "clf__base_estimator__C": Real(0.000001, 5e5, prior="log-uniform"),
    "clf__base_estimator__penalty": ['l1', 'l2']
}

In [13]:
### Define the pipeline steps and their defaults
# Text cleaning -> TF-IDF features (1- to 3-grams) -> classifier chain. The
# 'clf' step is a placeholder that the search space replaces during tuning.
estimators = [('preprocessor', FunctionTransformer(text_prepare, kw_args={'join_symbol': ' '})),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_and_stem,
                                        ngram_range=(1, 3),
                                        norm='l2')),
              ('clf', ClassifierChain(LogisticRegression()))
             ]

# Earlier, currently unused parameter grid (kept for reference):
# search_space = {"tfidf__min_df": np.arange(5, 100),
#                 "tfidf__max_df": np.arange(0.01, 0.98, step=0.01),
#                 "clf": [ClassifierChain(LogisticRegression(random_state=42,
#                                                            dual=False,
#                                                            max_iter=1000),
#                                              cv=3)],
#                 "clf__base_estimator__C": np.arange(0.000001, 50000, step=1),
#                 "clf__base_estimator__penalty": ['l1', 'l2']}

# Build the weighted-F1 scorer once so the `scoring` dict and the search
# below are guaranteed to optimise the same metric.
f1_weighted_scorer = make_scorer(f1_score, average='weighted')
scoring = {'f1': f1_weighted_scorer,
           'average_precision': 'average_precision'}

### Create the Pipeline and BayesSearchCV objects
training_pipe = Pipeline(estimators, verbose=True)
hyperoptsearch = BayesSearchCV(training_pipe,
                               search_spaces=bayes_search_space,
                               scoring=f1_weighted_scorer,
                               refit=True,
                               return_train_score=True,
                               cv=3,
                               verbose=10,
                               n_jobs=-1,
                               # Seed the optimiser so the search is repeatable,
                               # matching the seeded split and estimator.
                               random_state=42)

In [None]:
# Run the hyper-parameter search inside one MLflow run, evaluate the refitted
# best pipeline on the held-out split, and log metrics plus a pyfunc model.
with mlflow.start_run(run_name="Second attempt at BayesSearch") as run:
    hyperoptsearch.fit(X_train, y_train_binarized)
    best_pipeline = hyperoptsearch.best_estimator_
    signature = infer_signature(X_train, best_pipeline.predict(X_train))
    print("Logged data and model in run: {}".format(run.info.run_id))

    ## CAPTURE METRICS
    y_test_pred_binarized = best_pipeline.predict(X_test)
    class_report = classification_report(
                                y_test_binarized,
                                y_test_pred_binarized,
                                target_names=binarizer.classes_,
                                zero_division=1
                            )
    metrics = eval_metrics(y_test_binarized, y_test_pred_binarized)

#     mlflow.log_params(rs.named_steps) # log pipeline steps -- could be improved
#     mlflow.log_params(params)
    mlflow.log_metrics(metrics)

    ## CREATE AND SAVE ARTIFACTS

    pipeline_path = "models/classifier.pkl"
    binarizer_path = "models/binarizer.pkl"
    class_report_path = "metrics/classification_report.txt"

    # The target directories are not guaranteed to exist on a fresh checkout.
    for artifact_file in (pipeline_path, binarizer_path, class_report_path):
        os.makedirs(os.path.dirname(artifact_file), exist_ok=True)

    # dill.dump needs an open binary file handle, not a path string. The
    # pipeline comes from `hyperoptsearch` (the former `bscv` name is not
    # defined anywhere in this notebook).
    with open(pipeline_path, 'wb') as f:
        dill.dump(best_pipeline, f)
    with open(binarizer_path, 'wb') as f:
        dill.dump(binarizer, f)
    with open(class_report_path, 'w') as f:
        f.write(class_report)

    artifacts = {
        "pipeline": pipeline_path,
        "binarizer": binarizer_path,
        "classification_report": class_report_path
    }

    mlflow_pyfunc_model_path = "so_pyfunc_model"
    mlflow.pyfunc.log_model(
        artifact_path=mlflow_pyfunc_model_path,
        python_model=MultiLabelClassifierPipelineWrapper(),
        artifacts=artifacts,
        conda_env=conda_env,
        signature=signature
    )

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  7.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  7.6min finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


## Inference

In [None]:
# Location of the logged model to load for inference. "XXX" is a placeholder
# and must be replaced with a real model URI before running the cells below.
# NOTE(review): the variable name suggests an S3 location -- confirm.
model_path_s3 = "XXX"

In [None]:
# Load the model from the configured location through MLflow's pyfunc loader.
loaded_model = mlflow.pyfunc.load_model(model_path_s3)

In [None]:
# Two sample questions in a one-column frame; `text` is the column that
# MultiLabelClassifierPipelineWrapper.predict reads.
sample_questions = [
    "I can't figure out how to load a custom model from Tensorflow into my Python function",
    "How to compile the Kotlin code",
]
to_predict = pd.DataFrame({"text": sample_questions}, index=[0, 1])
to_predict

In [None]:
# Show the sample frame serialized as a JSON list of row objects.
# NOTE(review): presumably the request payload for a served model -- confirm.
to_predict.to_json(orient="records")

In [None]:
# Run inference on the sample frame with the loaded model.
loaded_model.predict(data=to_predict)