In [40]:
import mlflow
import os
import warnings
import yaml

from so_tag_classifier_core import (text_prepare, binarize_ys, tokenize_and_stem, transform_y, _TAGS_TO_KEEP)

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multioutput import ClassifierChain
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# Silence every Python warning for the rest of the session: with no category,
# message or module filter, the "ignore" action applies to all warnings.
# NOTE(review): this presumably also hides deprecation/convergence warnings
# raised by the libraries used for training - confirm that is intended.
warnings.filterwarnings("ignore")

In [46]:
# Load the tracking credentials and endpoints from the local config file.
with open("config.yaml") as f:
    # An empty YAML document parses to None; fall back to an empty mapping so
    # the checks below report the missing keys instead of failing on `.get`.
    configs = yaml.safe_load(f) or {}

# Export the MLflow credentials through the environment. os.environ only
# accepts str values, so a missing key (None) or a non-string YAML scalar
# would otherwise fail with an opaque "str expected, not ..." TypeError.
for _key in ('MLFLOW_TRACKING_USERNAME', 'MLFLOW_TRACKING_PASSWORD'):
    _value = configs.get(_key)
    if _value is None:
        raise KeyError("config.yaml is missing required key '{}'".format(_key))
    os.environ[_key] = str(_value)

TRACKING_URI = configs.get("TRACKING_URI")
# BUCKET is read here but not referenced by the cells visible in this notebook.
BUCKET = configs.get("BUCKET")
mlflow.set_tracking_uri(TRACKING_URI)
# NOTE(review): the experiment name is spelled 'stackoverlow' (missing "f").
# It is left unchanged on purpose: a different name would select a different
# experiment than the one existing runs were logged to.
mlflow.set_experiment('stackoverlow-classifier')

# Enable scikit-learn autologging for the estimators fitted afterwards.
mlflow.sklearn.autolog()

In [44]:
def train(file: str, test_size: float = 0.2, random_state: int = 42) -> None:
    """Fit the tag-classification pipeline on a CSV file inside an MLflow run.

    The CSV is read with pandas and must provide a ``text`` column (model
    input) and a ``tags`` column (labels). The rows are split into train and
    test partitions, the labels are binarized with ``binarize_ys``, and a
    three-step pipeline (text preprocessing -> TF-IDF -> classifier chain of
    L1-penalised logistic regressions) is fitted on the training partition.
    Only fitting happens here; the test partition is not scored.

    Anything recorded in MLflow relies on autologging having been enabled
    before this function is called; the function itself only opens the run.

    Args:
        file: Path of the CSV file to train on.
        test_size: Forwarded to ``train_test_split`` as the size of the
            held-out partition. Defaults to the previously hard-coded 0.2.
        random_state: Seed for the train/test split and for the classifier,
            so repeated runs on the same data are reproducible. Defaults to
            the previously hard-coded 42.

    Returns:
        None. The id of the MLflow run is printed.
    """
    df = pd.read_csv(file)
    X_train, _X_test, y_train, y_test = train_test_split(
        df['text'].values,
        df['tags'].values,
        test_size=test_size,
        random_state=random_state,
    )
    # binarize_ys takes both label partitions; only the binarized training
    # labels are needed for fitting.
    _binarizer, y_train_binarized, _y_test_binarized = binarize_ys(y_train, y_test)

    # The liblinear solver shuffles the data internally, so the base estimator
    # (and the chain wrapping it) are seeded to keep runs reproducible.
    base_classifier = LogisticRegression(C=1.0,
                                         penalty='l1',
                                         dual=False,
                                         solver='liblinear',
                                         random_state=random_state)

    estimators = [
        ('preprocessor', FunctionTransformer(text_prepare, kw_args={'join_symbol': ' '})),
        ('tfidf', TfidfVectorizer(tokenizer=tokenize_and_stem,
                                  ngram_range=(1, 3),
                                  max_df=0.9,
                                  min_df=5,
                                  norm='l2')),
        ('clf', ClassifierChain(base_classifier, cv=3, random_state=random_state)),
    ]

    # verbose=True makes the pipeline print the elapsed time of each step.
    training_pipe = Pipeline(estimators, verbose=True)

    with mlflow.start_run() as run:
        training_pipe.fit(X_train, y_train_binarized)
        print("Logged data and model in run: {}".format(run.info.run_id))

In [45]:
# The dataset location can be overridden with a TRAINING_DATA entry in
# config.yaml; the original absolute path stays as the fallback so an
# existing setup behaves exactly as before.
TRAINING_DATA = configs.get(
    "TRAINING_DATA",
    "/Users/tania/tvasil/stackoverflow-topic-classifier/data/full_body_clean.csv",
)
train(TRAINING_DATA)

                    kw_args={'join_symbol': ' '})), ('tfidf', TfidfVectorizer(max_df=0.9, min_df=5, ngram_range=(1, 3),
                tokenizer=<function tokenize...`


[Pipeline] ...... (step 1 of 3) Processing preprocessor, total=   3.4s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total= 1.2min
[Pipeline] ............... (step 3 of 3) Processing clf, total= 3.6min




Logged data and model in run: 295c7087ed194ce098aa2b7a06cf9380
