In [1]:
import os
import warnings
import yaml

from so_tag_classifier_core import (text_prepare, binarize_ys, tokenize_and_stem, transform_y)

import joblib
import mlflow
from mlflow.models.signature import infer_signature
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, average_precision_score, f1_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import ClassifierChain
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

warnings.filterwarnings("ignore")

### Read configuration files

In [2]:
with open("config.yaml") as f:
    configs = yaml.safe_load(f)

os.environ['MLFLOW_TRACKING_USERNAME'] = configs.get("MLFLOW_TRACKING_USERNAME")
os.environ['MLFLOW_TRACKING_PASSWORD'] = configs.get("MLFLOW_TRACKING_PASSWORD")

TRACKING_URI = configs.get("TRACKING_URI")
BUCKET = configs.get("BUCKET")

### MLflow config

In [3]:
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment('stackoverlow-classifier')
#mlflow.sklearn.autolog()

### Extract conda env

This environment file with later be passed on to the model definition directory. 

In [24]:
! conda env export > environment.yml --no-build

In [22]:
with open("environment.yml") as f:
    conda_env = yaml.safe_load(f)
    conda_env.pop("prefix")
    conda_env["channels"].extend(["anaconda", "conda-forge"])

### Custom Python class 

This custom class lets us extract multiple artifacts from the `model` registry, not just the `classifier`. Also, it allows us to define a custom inference function (`.predict`), which will also transform the output data from a 100-element long matrix to just the labels we want to predict, along with their corresponding probabilities. 

In [6]:
class MultiLabelClassifierPipelineWrapper(mlflow.pyfunc.PythonModel):

    def load_context(self, context):
        self.binarizer = joblib.load(context.artifacts["binarizer"])
        self.pipeline = joblib.load(context.artifacts["pipeline"])
        

    def predict(self, context, document):
        """
        Make a label prediction for an arbitrary number of documents/texts
        """
        
        vals = document.text.tolist()
        raw_preds = self.pipeline.predict(vals)
        preds = self.binarizer.inverse_transform(raw_preds)
        
        probs = self.pipeline.predict_proba(vals)
        all_probs_dict = [dict(zip(self.binarizer.classes_, prob)) for prob in probs]
        to_return = []
        for pred, probs_dict in zip(preds, all_probs_dict):
            to_return.append({x:probs_dict[x] for x in probs_dict if x in pred})
        return to_return

### Helper functions

In [7]:
def eval_metrics(y_test, y_preds):
    accuracy = accuracy_score(y_test, y_preds)
    f1 = f1_score(y_test, y_preds, average="weighted")
    avg_precision = average_precision_score(y_test, y_preds)
    precision = precision_score(y_test, y_preds, average="weighted")
    return accuracy, f1, avg_precision, precision

## Training

In [8]:
data_file="/Users/tania/tvasil/stackoverflow-topic-classifier/data/full_body_clean.csv"

In [9]:
params = dict(tfidf_n_gram_range = (1, 3),
              tfidf_max_df=0.9,
              tfidf_min_df=5,
              tfidf_norm="l2",
              clf_c=3,
              clf_penalty="l1",
              clf_dual=False,
              clf_solver="liblinear",
              cv=3
             )

In [10]:
df = pd.read_csv(data_file)
X_train, X_test, y_train, y_test = train_test_split(df['text'].values, 
                                                    df['tags'].values, 
                                                    test_size=0.2, 
                                                    random_state=42)
binarizer, y_train_binarized, y_test_binarized = binarize_ys(y_train, y_test)

estimators = [('preprocessor', FunctionTransformer(text_prepare, kw_args={'join_symbol': ' '})), 
          ('tfidf', TfidfVectorizer(tokenizer=tokenize_and_stem, 
                                    ngram_range=params['tfidf_n_gram_range'], 
                                    max_df=params['tfidf_max_df'], 
                                    min_df=params['tfidf_min_df'],
                                    norm=params['tfidf_norm'])),
          ('clf', ClassifierChain(LogisticRegression(C=params['clf_c'], 
                                                     penalty=params['clf_penalty'], 
                                                     dual=params['clf_dual'], 
                                                     solver=params['clf_solver']), 
                                  random_state=42,
                                  cv=params['cv']))
         ]

training_pipe = Pipeline(estimators, verbose=True)

In [11]:
with mlflow.start_run(run_name="Fix mapping function") as run:
    training_pipe.fit(X_train, y_train_binarized)
    signature = infer_signature(X_train, training_pipe.predict(X_train))
    print("Logged data and model in run: {}".format(run.info.run_id))
    
    ## CAPTURE METRICS
    y_test_pred_binarized = training_pipe.predict(X_test)
    class_report = classification_report(
                                y_test_binarized, 
                                y_test_pred_binarized, 
                                target_names=binarizer.classes_, 
                                zero_division=1
                            )
    accuracy, f1, avg_precision, precision = eval_metrics(y_test_binarized, y_test_pred_binarized)

    mlflow.log_params(training_pipe.named_steps) # log pipeline steps -- could be improved
    mlflow.log_params(params)
    mlflow.log_metric("test_accuracy", accuracy)
    mlflow.log_metric("test_f1", f1)
    mlflow.log_metric("test_avg_precision", avg_precision)
    mlflow.log_metric("precision", precision)
    
    
    ## CREATE AND SAVE ARTIFACTS
    
    pipeline_path = "models/classifier.pkl"
    binarizer_path = "models/binarizer.pkl"
    tags_path = "models/tags.pkl"
    class_report_path = "metrics/classification_report.txt"

    
    
    joblib.dump(training_pipe, pipeline_path)
    joblib.dump(binarizer, binarizer_path)
    with open(class_report_path, 'w') as f:
        f.write(class_report)

    # Create an `artifacts` dictionary that assigns a unique name to the saved model file.
    # This dictionary will be passed to `mlflow.pyfunc.save_model`, which will copy the model file
    # into the new MLflow Model's directory.
    artifacts = {
        "pipeline": pipeline_path,
        "binarizer": binarizer_path,
        "classification_report": class_report_path
    }

    mlflow_pyfunc_model_path = "so_pyfunc_model"
    mlflow.pyfunc.log_model(
        artifact_path=mlflow_pyfunc_model_path, 
        python_model=MultiLabelClassifierPipelineWrapper(), 
        artifacts=artifacts,
        conda_env=conda_env, 
        signature=signature
    )

[Pipeline] ...... (step 1 of 3) Processing preprocessor, total=   3.4s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total= 1.1min
[Pipeline] ............... (step 3 of 3) Processing clf, total= 3.3min
Logged data and model in run: e5b2d5de8e7f483087337483b0e87942


## Inference

In [12]:
model_path_s3 = "s3://tvasil-ml-models/1/ecd8d0a55a264405a3f6e824f17b63ca/artifacts/so_pyfunc_model/"

In [13]:
loaded_model = mlflow.pyfunc.load_model(model_path_s3)

In [14]:
to_predict = pd.DataFrame(data={"text": ["I can't figure out how to load a custom model from Tensorflow into my Python function",
                                         "How to compile the Kotlin code"]}, 
                          index=[0, 1])
to_predict

Unnamed: 0,text
0,I can't figure out how to load a custom model ...
1,How to compile the Kotlin code


In [27]:
to_predict.to_json(orient="records")

'[{"text":"I can\'t figure out how to load a custom model from Tensorflow into my Python function"},{"text":"How to compile the Kotlin code"}]'

In [15]:
loaded_model.predict(data=to_predict)

[{'python': 0.9870509228172972, 'tensorflow': 0.8195176085539446},
 {'android': 0.9373731083606335, 'kotlin': 0.999999999999936}]

In [29]:
print('Encountered an unexpected error while evaluating the model. Verify that the serialized input Dataframe is compatible with the model for inference.", "stack_trace": "Traceback (most recent call last):\n  File \"/miniconda/envs/custom_env/lib/python3.7/site-packages/mlflow/pyfunc/scoring_server/__init__.py\", line 213, in transformation\n    raw_predictions = model.predict(data)\n  File \"/miniconda/envs/custom_env/lib/python3.7/site-packages/mlflow/pyfunc/__init__.py\", line 424, in predict\n    return self._model_impl.predict(data)\n  File \"/miniconda/envs/custom_env/lib/python3.7/site-packages/mlflow/pyfunc/model.py\", line 254, in predict\n    return self.python_model.predict(self.context, model_input)\n  File \"<ipython-input-6-b1024ab02d67>\", line 14, in predict\n  File \"/miniconda/envs/custom_env/lib/python3.7/site-packages/sklearn/utils/metaestimators.py\", line 119, in <lambda>\n    out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)\n  File \"/miniconda/envs/custom_env/lib/python3.7/site-packages/sklearn/pipeline.py\", line 407, in predict\n    Xt = transform.transform(Xt)\n  File \"/miniconda/envs/custom_env/lib/python3.7/site-packages/sklearn/feature_extraction/text.py\", line 1880, in transform\n    X = super().transform(raw_documents)\n  File \"/miniconda/envs/custom_env/lib/python3.7/site-packages/sklearn/feature_extraction/text.py\", line 1250, in transform\n    _, X = self._count_vocab(raw_documents, fixed_vocab=True)\n  File \"/miniconda/envs/custom_env/lib/python3.7/site-packages/sklearn/feature_extraction/text.py\", line 1110, in _count_vocab\n    for feature in analyze(doc):\n  File \"/miniconda/envs/custom_env/lib/python3.7/site-packages/sklearn/feature_extraction/text.py\", line 106, in _analyze\n    doc = tokenizer(doc)\n  File \"/opt/mlflow/src/so-tag-classifier-core/core/so_tag_classifier_core/preprocessing_steps.py\", line 171, in tokenize_and_stem\n    tokenized_list = word_tokenize(text)\n  File \"/miniconda/envs/custom_env/lib/python3.7/site-packages/nltk/tokenize/__init__.py\", line 129, in word_tokenize\n    sentences = [text] if preserve_line else sent_tokenize(text, language)\n  File \"/miniconda/envs/custom_env/lib/python3.7/site-packages/nltk/tokenize/__init__.py\", line 106, in sent_tokenize\n    tokenizer = load(\"tokenizers/punkt/{0}.pickle\".format(language))\n  File \"/miniconda/envs/custom_env/lib/python3.7/site-packages/nltk/data.py\", line 752, in load\n    opened_resource = _open(resource_url)\n  File \"/miniconda/envs/custom_env/lib/python3.7/site-packages/nltk/data.py\", line 877, in _open\n    return find(path_, path + [\"\"]).open()\n  File \"/miniconda/envs/custom_env/lib/python3.7/site-packages/nltk/data.py\", line 585, in find\n    raise LookupError(resource_not_found)\nLookupError: \n**********************************************************************\n  Resource \u001b[93mpunkt\u001b[0m not found.\n  Please use the NLTK Downloader to obtain the resource:\n\n  \u001b[31m>>> import nltk\n  >>> nltk.download('punkt')\n  \u001b[0m\n  For more information see: https://www.nltk.org/data.html\n\n  Attempted to load \u001b[93mtokenizers/punkt/PY3/english.pickle\u001b[0m\n\n  Sea')

SyntaxError: invalid syntax (<ipython-input-29-f923d991bf00>, line 1)