# Detecção de Anomalias em Contratos

O objetivo desta demonstração é desenvolver um processo para identificar anomalias em ordens de compra.

Para isso, precisamos:
- Preparar uma série de variáveis sobre essas compras
- Armazená-las de forma a promover seu reaproveitamento
- Treinar diversos modelos de ML
- Operacionalizá-los e gerenciá-los em produção

<br><img src='https://www.databricks.com/sites/default/files/inline-images/db-392-blog-img-2.png?v=1669127716' width=70%>

## Conjunto de Dados

In [0]:
%sql USE vr_demo.auditoria_anomalia

In [0]:
%sql SELECT * FROM purchase_orders

Databricks data profile. Run in Databricks to view.

## Preparação dos dados

### Criação das variáveis

In [0]:
# Column that identifies each purchase-order row (used as the feature table key)
id_col = 'key_id'

# Columns handled as numerical features
num_cols = [
  'QUANTITY',
  'UNIT_PRICE',
  'PO_Amount',
  'QUANTITY_BILLED',
  'QUANTITY_CANCELLED',
  'QUANTITY_DELIVERED',
  'QUANTITY_ORDERED',
  'QuantityDifference',
  'POCreationTimeDiff',
  'POCreationDayDiff',
  'PODistinctCount',
  'PotentialSplitPO',
  'TotalPSG',
  'PSGSpend_Percent',
  'CurrentYearSpend',
  'PriorYearSpend',
]

# Columns handled as categorical features
cat_cols = [
  'QuantityVariance',
  'BillingVariance',
]

# Keep only the key plus the selected feature columns
po_df = spark.table('purchase_orders').select(id_col, *num_cols, *cat_cols)

### Registra variáveis no Feature Store

In [0]:
from databricks.feature_engineering import FeatureEngineeringClient
# Client used below to create the feature table, build the training set, log models and score batches
fe = FeatureEngineeringClient()

In [0]:
# Create the `po_features` feature table, keyed by `id_col`, and populate it with `po_df`.
# NOTE(review): re-running this cell once the table exists may fail — confirm and guard if needed.
fe.create_table(
  name="po_features",
  primary_keys=[id_col],
  df=po_df,
  description="Purchase order features",
)

### Recupera variáveis do Feature Store

In [0]:
from databricks.feature_engineering.entities.feature_lookup import FeatureLookup
from pyspark.sql.functions import lit

# Features to fetch from the `po_features` table, joined on the primary key column
feature_lookups = [
  FeatureLookup(
      table_name="po_features",
      lookup_key=id_col
  )
]

# Distinct ids to train on. A constant placeholder label (all zeros) is attached because
# the training set below is created with a `label` column.
lookup_df = spark.table("po_features").select(id_col).distinct().withColumn('pred_anomalia', lit(0))

# Create the training set; the id column is only needed for the lookup, so it is excluded
training_set = fe.create_training_set(
    df=lookup_df,
    feature_lookups=feature_lookups,
    exclude_columns=[id_col],
    label='pred_anomalia'
)

# Load the training data
df = training_set.load_df()
display(df)

### Separação das bases

In [0]:
from sklearn.model_selection import train_test_split

# Split into train (70%) and test (30%) sets after dropping the placeholder label column.
# A fixed seed keeps the split, and therefore the downstream results, reproducible across runs.
X_train_raw, X_test_raw = train_test_split(
  df.drop('pred_anomalia').toPandas(),
  train_size=0.7,
  random_state=42,
)

### Pré-Processamento

In [0]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config
set_config(display="diagram")

# Numerical features: standardize, then fill missing values with the column mean
num = Pipeline(steps=[
    ('std', StandardScaler()),
    ('imp', SimpleImputer(strategy='mean'))
])

# Categorical features: fill missing values with the most frequent category, then one-hot encode.
# handle_unknown='ignore' makes categories that were not seen during fit encode as all zeros
# instead of raising, which can otherwise happen on the test split or at scoring time.
cat = Pipeline(steps=[
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('enc', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessor: route each column group through its own sub-pipeline
pre = Pipeline(steps=[(
  'preprocessor', ColumnTransformer(transformers=[
    ('num', num, num_cols),
    ('cat', cat, cat_cols)
  ])
)])

# Fit on the training split only, then apply the fitted transformation to the test split
X_train = pre.fit_transform(X_train_raw)
X_test = pre.transform(X_test_raw)

# Broadcast the transformed data so it can be read through `.value` during distributed tuning
X_train_broadcast = sc.broadcast(X_train)
X_test_broadcast = sc.broadcast(X_test)

display(pre)

## Treinamento

Para treinarmos nossos modelos mais facilmente, iremos utilizar a biblioteca **`Kakapo`**!

Esta biblioteca integra uma das principais bibliotecas de detecção de anomalias, a **`PyOD`**, com o **`MLflow`**, **`Spark`** e **`HyperOpt`**.

Dessa forma:
- Treinamos os diversos tipos de modelos inclusos na PyOD
- Aceleramos o treinamento através da distribuição em nosso cluster
- Otimizamos o tuning com técnicas bayesianas e genéticas
- Mantemos a rastreabilidade de todos os experimentos 

<br><img src='https://www.databricks.com/sites/default/files/inline-images/logo-kakap.png?v=1679063191' width=60%>

### Define o espaço de busca

In [0]:
%pip install databricks-kakapo

In [0]:
from kakapo import get_default_model_space, get_default_search_space
from hyperopt import hp

# Default Kakapo model space, minus the 'inne' algorithm (dropped to save time)
model_space = get_default_model_space()
model_space.pop('inne')

# HyperOpt chooses among the default search-space entries, again leaving out 'inne'
search_space = hp.choice(
  'model_type',
  [entry for entry in get_default_search_space() if entry['type'] != 'inne'],
)

### Define o experimento

In [0]:
from kakapo import train_outlier_detection

def train_model(params):
  """Objective function for HyperOpt: train and evaluate one candidate configuration.

  Args:
    params: Hyperparameter sample drawn from the search space.

  Returns:
    Whatever `train_outlier_detection` returns for this configuration.
  """
  # No labels are supplied: the label argument is None and the flag is False
  # (presumably this selects Kakapo's unsupervised evaluation — confirm against its docs)
  labels, has_labels = None, False
  return train_outlier_detection(
    params,
    model_space,
    X_train_broadcast.value,
    X_test_broadcast.value,
    labels,
    has_labels,
  )

### Roda o experimento

In [0]:
from hyperopt import fmin, tpe, SparkTrials
import mlflow

# Run the hyperparameter search inside an MLflow run; `run` is kept so that later cells
# can look up the trial runs by their parent run id
with mlflow.start_run(run_name='PyOD-Default') as run:
  best_params = fmin(
    fn=train_model,  # objective evaluated for each sampled configuration
    space=search_space,
    algo=tpe.suggest,  # Tree-structured Parzen Estimator sampling
    max_evals=50,
    trials=SparkTrials(parallelism=8), # set to the number of available cores
    verbose=True
  )

### Registra o melhor modelo

In [0]:
# Fully qualified name under which the model is registered
model_full_name = 'vr_demo.auditoria_anomalia.od_purchase_orders'

# Load the best model
# Kakapo generates either AUC (supervised) or EM (unsupervised) scores to evaluate models
# In both cases, the best model is the one with the highest score
# As HyperOpt can only minimize a loss function, Kakapo returns negative AUC or EM scores
# Thus, we sort the loss metric in ascending order and take the first (lowest-loss) run
runs = mlflow.search_runs(filter_string=f"tags.mlflow.parentRunId = '{run.info.run_id}'", order_by=["metrics.loss ASC"])
best_run_id = runs.loc[0,'run_id']
model = mlflow.pyfunc.load_model(f"runs:/{best_run_id}/model")

# Assemble the inference pipeline: fitted preprocessor followed by the selected detector
# NOTE(review): `model` is loaded as an MLflow pyfunc wrapper rather than a scikit-learn
# estimator — confirm that Pipeline and the mlflow.sklearn flavor accept it as the final step
model_pipeline = Pipeline(steps=[
  ('pre', pre),
  ('od', model)
])

# Log the pipeline together with its feature lookups and register it under `model_full_name`
with mlflow.start_run(run_name='PyOD Final'):
  fe.log_model(
    model=model_pipeline,
    artifact_path="model",
    flavor=mlflow.sklearn,
    training_set=training_set,
    input_example=X_train_raw.head(10),
    registered_model_name=model_full_name,
  )

### Cria e Registra um Ensemble

In [0]:
# Same registered-model name as the single best model, so the ensemble is logged under that name too
model_full_name = 'vr_demo.auditoria_anomalia.od_purchase_orders'

# Load the best ABOD, ECOD and COPOD models
def get_best_model(run, model_type):
  """Load the lowest-loss child model of `run` for a single algorithm.

  Args:
    run: Parent MLflow run whose child runs are searched.
    model_type: Value of the `model_type` tag to filter on (e.g. 'abod').

  Returns:
    The model logged by the best matching child run, loaded through `mlflow.pyfunc`.
  """
  parent_filter = f"tags.mlflow.parentRunId = '{run.info.run_id}'"
  type_filter = f"tags.model_type = '{model_type}'"
  # Ascending loss puts the best candidate in the first row
  candidates = mlflow.search_runs(
    filter_string=f"{parent_filter} and {type_filter}",
    order_by=["metrics.loss ASC"],
  )
  top_run_id = candidates.loc[0, 'run_id']
  return mlflow.pyfunc.load_model(f"runs:/{top_run_id}/model")
# Pick the lowest-loss model of each algorithm that the ensemble combines
abod = get_best_model(run, 'abod')
ecod = get_best_model(run, 'ecod')
copod = get_best_model(run, 'copod')

# Assemble the inference pipeline
class Ensemble(mlflow.pyfunc.PythonModel):
  """Voting ensemble of three outlier detectors behind a shared preprocessor.

  Args:
    pre: Fitted preprocessor exposing `transform`.
    abod: ABOD model exposing `predict`.
    ecod: ECOD model exposing `predict`.
    copod: COPOD model exposing `predict`.
  """

  def __init__(self, pre, abod, ecod, copod):
    self.abod = abod
    self.ecod = ecod
    self.copod = copod
    self.pre = pre

  def predict(self, context, model_input):
    """Flag rows for which more than one of the three models predicts an anomaly.

    Args:
      context: MLflow model context (unused).
      model_input: Raw feature rows, in the format expected by the preprocessor.

    Returns:
      Boolean array-like, True where the summed predictions exceed 1
      (a majority vote, assuming each model returns 0/1 labels).
    """
    # Use the preprocessor stored on the instance, not a notebook global, so the
    # model does not depend on a `pre` variable existing wherever it is loaded
    features = self.pre.transform(model_input)
    preds_abod = self.abod.predict(features)
    preds_ecod = self.ecod.predict(features)
    preds_copod = self.copod.predict(features)
    return (preds_abod + preds_ecod + preds_copod > 1)
# Instantiate the ensemble with the fitted preprocessor and the three selected models
ens = Ensemble(pre, abod, ecod, copod)

# Log the ensemble together with its feature lookups and register it under `model_full_name`
with mlflow.start_run(run_name='PyOD Ensemble Final'):
  fe.log_model(
    model=ens,
    artifact_path="model",
    flavor=mlflow.pyfunc,
    training_set=training_set,
    input_example=X_train_raw.head(10),
    registered_model_name=model_full_name,
  )

### Promove o modelo para produção

In [0]:
# Alias assigned to the model version that batch inference loads
production_alias = "prod"

# Find the highest version number registered under the model name
client = mlflow.MlflowClient()
model_versions = client.search_model_versions(f"name='{model_full_name}'")
latest_version = max(int(mv.version) for mv in model_versions)

# Point the production alias at that version
client.set_registered_model_alias(
  model_full_name,
  production_alias,
  version=latest_version,
)

## Inferência

In [0]:
# Score every id in `lookup_df` with the model behind the production alias
# and overwrite the `po_scored` table with the result
(
  fe.score_batch(
    model_uri=f"models:/{model_full_name}@{production_alias}",
    df=lookup_df,
    result_type="string",
  )
  .write.mode("overwrite")
  .saveAsTable("po_scored")
)

# Show the scored table
display(spark.table("po_scored"))

## Próximos passos

- Ensemble (supervisionado)
- Segmentação
- Interpretabilidade