# Building an ML Platform with MLflow
The purpose of this notebook is to walk through all the steps for managing a machine learning model with MLflow, from data ingestion to model deployment and inference.

### Steps
The notebook will cover the following steps:
1. Data Ingestion
2. Data Preprocessing
3. Parameter Tuning
4. Model Training
5. Model Deployment
6. Model Inference


### Data Ingestion

In [1]:
import pandas as pd

# Read the raw feedback dataset from the local data folder
source_csv = './data/feedback.csv'
df = pd.read_csv(source_csv)
# Show the first rows as a quick sanity check of the loaded columns
df.head()

Unnamed: 0,text,label
0,interaction student teacher teachers come teac...,-1
1,teaching not upto mark focus topper student ha...,-1
2,teacher us instructional strategy ineffective ...,-1
3,teacher conveys content inaccuracy contribute ...,-1
4,boring dull lecture,-1


### Data Preprocessing
In this step we apply the following transformations to make sure the dataset is ready for training.
* Drop rows with null values
* Remove special characters
* Remove stop words
* Encode categorical variables

In [2]:
def preprocess_data(df):
    """
    Preprocess data for model training

    Rows with missing values are dropped, the text is lowercased, every
    non-letter character is replaced by a space, and category names in the
    label column are encoded as numeric labels. The input dataframe is not
    modified; a cleaned copy is returned.

    Args:
        df (pd.DataFrame): Dataframe containing text and label columns

    Returns:
        df (pd.DataFrame): Cleaned copy of the input dataframe
    """
    # Drop rows with missing values; copy so the column assignments below
    # operate on an independent frame rather than on a possible view
    df = df.dropna().copy()
    # Lowercasing
    df['text'] = df['text'].str.lower()
    # Remove special characters and numbers. regex=True is passed explicitly
    # because pandas >= 2.0 treats the pattern as a literal string by default
    df['text'] = df['text'].str.replace(r"[^a-zA-Z]", " ", regex=True)
    # Create label to category mapping
    label2cat = {1: 'positive', 0: 'neutral', -1: 'negative'}
    cat2label = {cat: label for label, cat in label2cat.items()}
    # Encode category names as numeric labels; values that are not category
    # names (e.g. labels that are already -1/0/1) are kept as they are
    # instead of being turned into NaN
    df['label'] = df['label'].map(lambda value: cat2label.get(value, value))

    return df

In [None]:
# Clean the raw dataframe with the preprocessing routine defined above
preprocessed_df = preprocess_data(df=df)
# Display the first five cleaned rows for inspection
preprocessed_df.head(n=5)

### Parameter Tuning
In this section we launch several parallel tuning runs and compare them on a predefined performance metric. Here we use accuracy, but any other metric can easily be substituted.

In [None]:
import mlflow
import random
from sklearn.naive_bayes import MultinomialNB 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def run_tuning(tracking_uri: str,
               experiment_name: str,
               run_name: str = None,
               model_params: dict = None,
               tokenizer_params: dict = None,
               data_params: dict = None,
               df: pd.DataFrame = None
    ):

    """
    Runs training as MLFlow experiment

    Splits the data, fits a CountVectorizer and a MultinomialNB classifier on
    the training part, evaluates on the test part and logs the model
    parameters, the metrics and both fitted objects to MLflow.

    Args:
        tracking_uri (str): URI of MLFlow tracking server
        experiment_name (str): Name of experiment
        run_name (str): Name of current mlflow run. The run is named
            ``<run_name>:<timestamp>``, or just ``<timestamp>`` when omitted
        model_params (dict): Dictionary of model parameters (``alpha``,
            ``fit_prior``). Missing keys fall back to scikit-learn's defaults
        tokenizer_params (dict): Dictionary of tokenizer parameters
            (``max_features``, ``ngram_range``, ``max_df``, ``min_df``).
            Missing keys fall back to scikit-learn's defaults
        data_params (dict): Dictionary of data parameters (``test_size``,
            ``random_state``). Missing keys fall back to scikit-learn's defaults
        df (pd.DataFrame): Dataframe containing text and label columns

    Returns:
        results (dict): dictionary with the keys ``accuracy``, ``model``,
            ``tokenizer``, ``model_params``, ``tokenizer_params`` and ``run_id``

    Raises:
        ValueError: If no dataframe is passed
    """
    # Imported locally because the notebook's import cell does not provide it
    import datetime

    # Fail early, before anything is created on the tracking server
    if df is None:
        raise ValueError(
            "run_tuning requires a dataframe with 'text' and 'label' columns, got df=None"
        )
    # Fill in scikit-learn's default values for every parameter not supplied
    model_params = {'alpha': 1.0, 'fit_prior': True, **(model_params or {})}
    tokenizer_params = {
        'max_features': None,
        'ngram_range': (1, 1),
        'max_df': 1.0,
        'min_df': 1,
        **(tokenizer_params or {}),
    }
    data_params = data_params or {}

    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(experiment_name)
    # Split dataset
    x_train, x_test, y_train, y_test = train_test_split(
        df['text'].tolist(),
        df['label'].tolist(),
        test_size=data_params.get('test_size'),
        random_state=data_params.get('random_state')
    )
    # Generate current timestamp
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    # Avoid a run literally called "None:<timestamp>" when no name is given
    run_label = timestamp if run_name is None else f'{run_name}:{timestamp}'
    # Start MLFlow run
    with mlflow.start_run(nested=True, run_name=run_label) as run:
        # Vectorize text
        cv = CountVectorizer(
            max_features=tokenizer_params['max_features'],
            ngram_range=tokenizer_params['ngram_range'],
            max_df=tokenizer_params['max_df'],
            min_df=tokenizer_params['min_df'],
        )
        X = cv.fit_transform(x_train)
        # Initialize and fit model. The sparse count matrix is passed as is:
        # MultinomialNB supports sparse input, so densifying it would only
        # cost memory
        model = MultinomialNB(alpha=model_params['alpha'], fit_prior=model_params['fit_prior'])
        model.fit(X, y_train)
        # Generate predictions on test set
        predictions = model.predict(cv.transform(x_test))
        # Get test metrics
        metrics = {
            "accuracy": accuracy_score(y_test, predictions),
            "precision": precision_score(y_test, predictions, average='weighted'),
            "recall": recall_score(y_test, predictions, average='weighted'),
            "f1": f1_score(y_test, predictions, average='weighted'),
        }
        # Log hyperparameters
        # NOTE(review): only the model parameters are logged; tokenizer and
        # data parameters are not recorded with the run - confirm this is intended
        mlflow.log_params(model_params)
        # Log metrics
        mlflow.log_metrics(metrics)
        # Log model and tokenizer artifacts
        mlflow.sklearn.log_model(
            cv,
            "tokenizer"
        )
        mlflow.sklearn.log_model(
            model,
            "student-model"
        )

    return {
        "accuracy": metrics["accuracy"],
        "model": model,
        "tokenizer": cv,
        "model_params": model_params,
        "tokenizer_params": tokenizer_params,
        "run_id": run.info.run_id,
    }