In [1]:
import os
import warnings
import yaml

from so_tag_classifier_core import (text_prepare, binarize_ys, tokenize_and_stem, transform_y)

import dill
import mlflow
from mlflow.models.signature import infer_signature
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, accuracy_score, average_precision_score, 
                             f1_score, precision_score, make_scorer)
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.multioutput import ClassifierChain
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

warnings.filterwarnings("ignore")

### Read configuration files

In [2]:
# Load notebook settings (MLflow credentials, tracking URI, bucket) from YAML.
# An empty config file parses to None, so fall back to an empty dict to keep
# the `.get` lookups below working.
with open("config.yaml") as f:
    configs = yaml.safe_load(f) or {}

# Export the MLflow credentials as environment variables.
# os.environ only accepts str values: skip keys that are absent (assigning None
# raises TypeError) and coerce non-string YAML scalars such as numeric passwords.
for _cred_key in ("MLFLOW_TRACKING_USERNAME", "MLFLOW_TRACKING_PASSWORD"):
    _cred_value = configs.get(_cred_key)
    if _cred_value is not None:
        os.environ[_cred_key] = str(_cred_value)

TRACKING_URI = configs.get("TRACKING_URI")
BUCKET = configs.get("BUCKET")

### MLflow config

In [3]:
# Name of the MLflow experiment every run in this notebook is filed under.
# NOTE: the spelling ("stackoverlow") is kept verbatim because it is the
# identifier passed to the tracking server.
EXPERIMENT_NAME = 'stackoverlow-classifier'

# Point the MLflow client at the tracking server read from config.yaml,
# select the experiment, and turn on scikit-learn autologging.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.sklearn.autolog()

### Custom Python class 

This custom class lets us load multiple artifacts from the `model` registry (the fitted pipeline and the label binarizer), not just the `classifier`. It also lets us define a custom inference function (`.predict`), which transforms the raw output from a 100-element-long matrix into just the labels we want to predict, along with their corresponding probabilities.

In [5]:
class MultiLabelClassifierPipelineWrapper(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper bundling the fitted pipeline with its label binarizer.

    At load time both artifacts are deserialized with dill; at inference time
    the binarized pipeline output is mapped back to label names and paired with
    the per-label probabilities.
    """

    @staticmethod
    def _load_artifact(path):
        """Deserialize one dill artifact from the file at ``path``."""
        # dill, like pickle, can execute arbitrary code while loading:
        # only load artifacts that come from trusted runs.
        with open(path, "rb") as fh:
            return dill.load(fh)

    def load_context(self, context):
        """Load the ``binarizer`` and ``pipeline`` artifacts into the instance.

        ``context.artifacts`` maps artifact names to local file paths, and
        ``dill.load`` needs an open binary file object rather than a path, so
        each file is opened explicitly.
        """
        self.binarizer = self._load_artifact(context.artifacts["binarizer"])
        self.pipeline = self._load_artifact(context.artifacts["pipeline"])

    def predict(self, context, document):
        """
        Make a label prediction for an arbitrary number of documents/texts

        Parameters
        ----------
        context : unused here; part of the pyfunc ``predict`` interface.
        document : object exposing a ``text`` attribute with ``.tolist()``
            (presumably a pandas DataFrame with a ``text`` column -- confirm
            against callers).

        Returns
        -------
        list of dict
            One dict per input text, mapping each predicted label to the
            probability the pipeline assigned to it.
        """
        vals = document.text.tolist()

        # Hard predictions decide WHICH labels are reported ...
        raw_preds = self.pipeline.predict(vals)
        preds = self.binarizer.inverse_transform(raw_preds)

        # ... and the probabilities supply the value reported for each of them.
        probs = self.pipeline.predict_proba(vals)
        all_probs_dict = [dict(zip(self.binarizer.classes_, prob)) for prob in probs]

        # Keep only the probabilities of the labels that were actually predicted.
        return [
            {label: probs_dict[label] for label in probs_dict if label in pred}
            for pred, probs_dict in zip(preds, all_probs_dict)
        ]

### Helper functions

In [6]:
def eval_metrics(y_test, y_preds):
    """Summarize prediction quality as a flat dict of named scores.

    Parameters
    ----------
    y_test : ground-truth targets, passed unchanged to the sklearn metrics.
    y_preds : predicted targets, passed unchanged to the sklearn metrics.

    Returns
    -------
    dict
        Keys ``accuracy``, ``f1``, ``avg_precision`` and ``precision``.
        ``f1`` and ``precision`` use weighted averaging; ``avg_precision``
        uses the metric's default averaging.
    """
    return {
        "accuracy": accuracy_score(y_test, y_preds),
        "f1": f1_score(y_test, y_preds, average="weighted"),
        "avg_precision": average_precision_score(y_test, y_preds),
        "precision": precision_score(y_test, y_preds, average="weighted"),
    }

### Read and pre-process data

In [7]:
# Location of the cleaned dataset. The historical default is an absolute path
# on the original author's machine; set a `DATA_FILE` entry in config.yaml to
# run the notebook elsewhere without editing this cell.
DEFAULT_DATA_FILE = "/Users/tania/tvasil/stackoverflow-topic-classifier/data/full_body_clean.csv"
data_file = configs.get("DATA_FILE") or DEFAULT_DATA_FILE

df = pd.read_csv(data_file)

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# across runs.
X_train, X_test, y_train, y_test = train_test_split(df['text'].values,
                                                    df['tags'].values,
                                                    test_size=0.2,
                                                    random_state=42)

# Project helper: returns the binarizer together with the binarized train and
# test targets (presumably fitted on the training tags -- confirm in
# so_tag_classifier_core).
binarizer, y_train_binarized, y_test_binarized = binarize_ys(y_train, y_test)

## Training 
### 1. BayesSearchCV

In [None]:
# Logistic-regression base learner wrapped in a classifier chain; offered to
# the search as the single candidate for the pipeline's "clf" step.
_bayes_base_lr = LogisticRegression(
    random_state=42,
    dual=False,
    solver="liblinear",
    max_iter=1000,
)
_bayes_chain = ClassifierChain(_bayes_base_lr, cv=3)

# Search space for BayesSearchCV, keyed by pipeline parameter name.
bayes_search_space = {
    # TF-IDF document-frequency cut-offs.
    "tfidf__min_df": Integer(5, 100),
    "tfidf__max_df": Real(0.5, 0.99, prior='log-uniform'),
    # Estimator used for the "clf" step.
    "clf": [_bayes_chain],
    # Regularization strength and penalty of the chained logistic regression.
    "clf__base_estimator__C": Real(0.000001, 5e5, prior="uniform"),
    "clf__base_estimator__penalty": ['l1', 'l2'],
}

In [None]:
### Define the pipeline steps and defaults
# The 'clf' step is a placeholder: the "clf" entry of `bayes_search_space`
# supplies the estimator that is actually tuned.
estimators = [('preprocessor', FunctionTransformer(text_prepare, kw_args={'join_symbol': ' '})),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_and_stem,
                                        ngram_range=(1, 3),
                                        norm='l2')),
              ('clf', ClassifierChain(LogisticRegression()))
             ]

# NOTE: this dict is not passed to the search below, which optimizes the
# weighted F1 scorer only; it is kept so the name stays available.
scoring = {'f1': make_scorer(f1_score, average= 'weighted'),
           'average_precision': 'average_precision'}

### Create the Pipeline and BayesSearchCV objects
training_pipe = Pipeline(estimators, verbose=True)
hyperoptsearch = BayesSearchCV(training_pipe,
                          search_spaces=bayes_search_space,
                          scoring=make_scorer(f1_score, average= 'weighted'),
                          refit=True,
                          return_train_score=True,
                          cv=3,
                          verbose=10,
                          n_jobs=-1,
                          # Seed the optimizer so the sequence of evaluated
                          # candidates is reproducible across kernel restarts.
                          random_state=42)

In [None]:
# Fit the Bayesian hyper-parameter search inside an MLflow run, evaluate the
# refitted best pipeline on the held-out split, and log the pipeline together
# with its binarizer as a custom pyfunc model.
with mlflow.start_run(run_name="Second attempt at BayesSearch") as run:
    hyperoptsearch.fit(X_train, y_train_binarized)
    # NOTE(review): the signature is inferred from the raw pipeline's input and
    # output, while the pyfunc wrapper logged below reads `document.text` and
    # returns a list of dicts -- confirm it matches what the served model accepts.
    signature = infer_signature(X_train, hyperoptsearch.best_estimator_.predict(X_train))
    print("Logged data and model in run: {}".format(run.info.run_id))

    ## CAPTURE METRICS
    y_test_pred_binarized = hyperoptsearch.best_estimator_.predict(X_test)
    class_report = classification_report(
                                y_test_binarized,
                                y_test_pred_binarized,
                                target_names=binarizer.classes_,
                                zero_division=1
                            )
    metrics = eval_metrics(y_test_binarized, y_test_pred_binarized)
    mlflow.log_metrics(metrics)

    ## CREATE AND SAVE ARTIFACTS

    pipeline_path = "models/classifier.pkl"
    binarizer_path = "models/binarizer.pkl"
    class_report_path = "metrics/classification_report.txt"

    # Make sure the output folders exist so a fresh checkout can run this cell.
    for _local_path in (pipeline_path, binarizer_path, class_report_path):
        os.makedirs(os.path.dirname(_local_path), exist_ok=True)

    # dill.dump writes to an open binary file object, not to a path string.
    with open(pipeline_path, 'wb') as f:
        dill.dump(hyperoptsearch.best_estimator_, f)
    with open(binarizer_path, 'wb') as f:
        dill.dump(binarizer, f)
    with open(class_report_path, 'w') as f:
        f.write(class_report)

    artifacts = {
        "pipeline": pipeline_path,
        "binarizer": binarizer_path,
        "classification_report": class_report_path
    }

    # NOTE(review): `conda_env` is not defined in any visible cell of this
    # notebook; it must be defined before this cell runs or the call below
    # raises NameError.
    mlflow_pyfunc_model_path = "so_pyfunc_model"
    mlflow.pyfunc.log_model(
        artifact_path=mlflow_pyfunc_model_path,
        python_model=MultiLabelClassifierPipelineWrapper(),
        artifacts=artifacts,
        conda_env=conda_env,
        signature=signature
    )

### 2. RandomizedSearchCV

In [8]:
from scipy.stats import uniform, randint

In [17]:
# Pipeline steps: text clean-up, TF-IDF over 1- to 3-grams, and a classifier
# chain of liblinear logistic regressions.
estimators = [('preprocessor', FunctionTransformer(text_prepare, kw_args={'join_symbol': ' '})),
              ('tfidf', TfidfVectorizer(tokenizer=tokenize_and_stem,
                                        ngram_range=(1, 3),
                                        norm='l2')),
              ('clf', ClassifierChain(LogisticRegression(random_state=42,
                                                         dual=False,
                                                         max_iter=1000,
                                                         solver='liblinear'),
                                      cv=3))
             ]

# Distributions sampled by the randomized search.
# scipy's uniform(loc, scale) covers [loc, loc + scale], so max_df is drawn
# from [0.01, 0.99] and C from roughly [1e-6, 50000].
search_space = {"tfidf__min_df": randint(5, 100),
                "tfidf__max_df": uniform(0.01, 0.98),
                "clf__base_estimator__C": uniform(0.000001, 50000),
                "clf__base_estimator__penalty": ['l1', 'l2']}


# NOTE: this dict is not passed to the search below, which optimizes the
# weighted F1 scorer only; it is kept so the name stays available.
scoring = {'f1': make_scorer(f1_score, average= 'weighted'),
           'average_precision': 'average_precision'}

### Create the Pipeline and RSCV objects
training_pipe = Pipeline(estimators, verbose=True)
hyperoptsearch = RandomizedSearchCV(training_pipe,
                                    param_distributions=search_space,
                                    scoring=make_scorer(f1_score, average= 'weighted'),
                                    refit=True,
                                    n_iter=3,
                                    return_train_score=True,
                                    cv=3,
                                    verbose=10,
                                    n_jobs=-1,
                                    # Seed the parameter sampling so the same
                                    # candidates are drawn on every run.
                                    random_state=42)

In [18]:
# Fit the randomized hyper-parameter search inside an MLflow run, evaluate the
# refitted best pipeline on the held-out split, and log the pipeline together
# with its binarizer as a custom pyfunc model.
with mlflow.start_run(run_name="Third attempt at RanzomizedSearchCV") as run:
    hyperoptsearch.fit(X_train, y_train_binarized)
    # NOTE(review): the signature is inferred from the raw pipeline's input and
    # output, while the pyfunc wrapper logged below reads `document.text` and
    # returns a list of dicts -- confirm it matches what the served model accepts.
    signature = infer_signature(X_train, hyperoptsearch.best_estimator_.predict(X_train))
    print("Logged data and model in run: {}".format(run.info.run_id))

    ## CAPTURE METRICS
    y_test_pred_binarized = hyperoptsearch.best_estimator_.predict(X_test)
    class_report = classification_report(
                                y_test_binarized,
                                y_test_pred_binarized,
                                target_names=binarizer.classes_,
                                zero_division=1
                            )
    metrics = eval_metrics(y_test_binarized, y_test_pred_binarized)
    mlflow.log_metrics(metrics)

    ## CREATE AND SAVE ARTIFACTS

    pipeline_path = "models/classifier.pkl"
    binarizer_path = "models/binarizer.pkl"
    class_report_path = "metrics/classification_report.txt"

    # Make sure the output folders exist so a fresh checkout can run this cell;
    # open(..., 'w') does not create missing parent directories.
    for _local_path in (pipeline_path, binarizer_path, class_report_path):
        os.makedirs(os.path.dirname(_local_path), exist_ok=True)

    with open(pipeline_path, 'wb') as f:
        dill.dump(hyperoptsearch.best_estimator_, f)
    with open(binarizer_path, 'wb') as f:
        dill.dump(binarizer, f)
    with open(class_report_path, 'w') as f:
        f.write(class_report)

    artifacts = {
        "pipeline": pipeline_path,
        "binarizer": binarizer_path,
        "classification_report": class_report_path
    }

    # NOTE(review): `conda_env` is not defined in any visible cell of this
    # notebook; it must be defined before this cell runs or the call below
    # raises NameError.
    mlflow_pyfunc_model_path = "so_pyfunc_model"
    mlflow.pyfunc.log_model(
        artifact_path=mlflow_pyfunc_model_path,
        python_model=MultiLabelClassifierPipelineWrapper(),
        artifacts=artifacts,
        conda_env=conda_env,
        signature=signature
    )

         steps=[('preprocessor',
                 FunctionTransformer(accept_sparse=False, check_inverse=True,
                                     func=<function text_prepare at 0x7f9d3cbe8170>,
                             ...`


Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   9 | elapsed: 100.0min remaining: 200.0min
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed: 108.6min remaining: 135.8min
[Parallel(n_jobs=-1)]: Done   5 out of   9 | elapsed: 114.0min remaining: 91.2min
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed: 115.9min remaining: 58.0min
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed: 118.3min remaining: 33.8min
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed: 165.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed: 165.7min finished


[Pipeline] ...... (step 1 of 3) Processing preprocessor, total=   3.8s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total= 1.5min
[Pipeline] ............... (step 3 of 3) Processing clf, total= 4.2min


                    func=<function text_prepare at 0x7f9d3cbe8170>,
                    inv_kw_args=None, inverse_func=None,
                    kw_args={'join_symbol':...`
                    func=<function text_prepare at 0x7f9d3cbe8170>,
                    inv_kw_args=None, inverse_func=None,
                    kw_args={'join_symbol': ' '}, validate=Fa...`
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_...`
                                                  dual=False,
                                                  fit_intercept=True,
                                       ...`
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42...`
                    func=<function text_prepare at 0x7f9d3cbe8170>,
                    inv_kw_args=Non

Logged data and model in run: 83ae738e766b4383b124e33bd4415299


