# Imports

In [13]:
import mlflow
import pandas as pd
from pycaret.classification import *
from sklearn.datasets import load_wine

In [16]:
# Enable MLflow autologging for every supported ML library that is installed,
# so that model training later in the notebook is tracked automatically.
mlflow.autolog()

2024/09/19 21:52:00 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2024/09/19 21:52:03 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2024/09/19 21:52:05 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2024/09/19 21:52:05 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.


# Load and process raw data

In [17]:
def load_wine_dataset() -> pd.DataFrame:
    """
    Fetch the wine dataset and assemble it into a single DataFrame.

    Returns:
        pd.DataFrame: One column per entry in the dataset's ``feature_names``,
        followed by a ``target`` column holding the dataset's target values.
    """
    bunch = load_wine()
    features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    # assign() appends the label column last, giving the same layout as
    # adding it in place.
    return features.assign(target=bunch.target)

In [18]:
def processing_data_raw(df_raw: pd.DataFrame) -> pd.DataFrame:
    """
    Produce a working copy of the raw frame with the slash-containing column renamed.

    The column ``od280/od315_of_diluted_wines`` becomes
    ``od280_od315_of_diluted_wines``; all other columns are kept as they are.

    Args:
        df_raw (pd.DataFrame): The raw DataFrame; it is not modified.

    Returns:
        pd.DataFrame: A copy of the input with the renamed column.
    """
    column_renames = {
        'od280/od315_of_diluted_wines': 'od280_od315_of_diluted_wines',
    }
    renamed = df_raw.rename(columns=column_renames)
    return renamed.copy()

In [19]:
# Keep the raw frame under its own name and derive the renamed working copy from it,
# so re-running this cell always starts from the same input.
df_raw = load_wine_dataset()
df = processing_data_raw(df_raw)

In [20]:
# Preview the first rows of the processed frame.
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280_od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


# Setup Pipeline

In [21]:
# Initialise the PyCaret training environment and its transformation pipeline:
# 70% of the rows go to the train split, the session id is fixed,
# normalisation is enabled, and the experiment is logged under the name 'wine'.
s = setup(
    df,
    target='target',
    train_size=0.7,
    session_id=123,
    normalize=True,
    log_experiment=True,
    experiment_name='wine',
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,target
2,Target type,Multiclass
3,Original data shape,"(178, 14)"
4,Transformed data shape,"(178, 14)"
5,Transformed train set shape,"(124, 14)"
6,Transformed test set shape,"(54, 14)"
7,Numeric features,13
8,Preprocess,True
9,Imputation type,simple


# Compare Models

In [22]:
# Train and evaluate all models in the model library; the result is kept in
# `best` and reused by the prediction and saving cells below.
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.169
svm,SVM - Linear Kernel,0.9917,0.0,0.9917,0.9933,0.9917,0.9874,0.9883,0.051
ridge,Ridge Classifier,0.9917,0.0,0.9917,0.9938,0.9918,0.9875,0.9884,0.055
lda,Linear Discriminant Analysis,0.9917,0.0,0.9917,0.9933,0.9917,0.9874,0.9883,0.04
qda,Quadratic Discriminant Analysis,0.9846,0.0,0.9846,0.9872,0.9843,0.9765,0.9781,0.045
lightgbm,Light Gradient Boosting Machine,0.9846,0.9981,0.9846,0.9874,0.9845,0.9767,0.9783,19.773
nb,Naive Bayes,0.984,0.999,0.984,0.9873,0.9839,0.9758,0.9775,0.047
rf,Random Forest Classifier,0.984,0.999,0.984,0.9876,0.9841,0.976,0.9777,0.196
catboost,CatBoost Classifier,0.984,0.9981,0.984,0.9876,0.9841,0.976,0.9777,3.062
lr,Logistic Regression,0.9833,0.0,0.9833,0.9868,0.983,0.9745,0.9764,0.854




# Best model prediction

In [9]:
# Predict on the hold-out test set with the best model; the returned frame
# carries the input columns plus `prediction_label` and `prediction_score`.
holdout_pred = predict_model(best)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.9815,1.0,0.9815,0.9825,0.9815,0.972,0.9725


In [12]:
# Preview the first hold-out rows together with their predictions.
holdout_pred.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280_od315_of_diluted_wines,proline,target,prediction_label,prediction_score
19,13.64,3.1,2.56,15.2,116.0,2.7,3.03,0.17,1.66,5.1,0.96,3.36,845.0,0,0,0.99
28,13.87,1.9,2.8,19.4,107.0,2.95,2.97,0.37,1.76,4.5,1.25,3.4,915.0,0,0,0.95
58,13.72,1.43,2.5,16.700001,108.0,3.4,3.67,0.19,2.04,6.8,0.89,2.87,1285.0,0,0,0.96
102,12.34,2.45,2.46,21.0,98.0,2.56,2.11,0.34,1.31,2.8,0.8,3.38,438.0,1,1,0.88
48,14.1,2.02,2.4,18.799999,103.0,2.75,2.92,0.32,2.38,6.2,1.07,2.75,1060.0,0,0,0.99


# Save model pipeline

In [10]:
# Save the transformation pipeline together with the trained model
# under the name 'et_pipeline'.
save_model(best, 'et_pipeline')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['alcohol', 'malic_acid', 'ash',
                                              'alcalinity_of_ash', 'magnesium',
                                              'total_phenols', 'flavanoids',
                                              'nonflavanoid_phenols',
                                              'proanthocyanins',
                                              'color_intensity', 'hue',
                                              'od280_od315_of_diluted_wines',
                                              'proline'],
                                     transformer=SimpleImputer(add_indicator=False,...
                  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                       class_weight=None, criterion='gini',
                                       max_depth=None, max

# Load model pipeline 

In [11]:
# Reload the pipeline saved as 'et_pipeline' and display it as the cell output.
loaded_best_pipeline = load_model('et_pipeline')
loaded_best_pipeline

Transformation Pipeline and Model Successfully Loaded


# Get config

In [12]:
# Called without an argument, this lists the names of the config variables
# that can be retrieved (e.g. get_config('X_test')).
get_config()

{'USI',
 'X',
 'X_test',
 'X_test_transformed',
 'X_train',
 'X_train_transformed',
 'X_transformed',
 '_available_plots',
 '_ml_usecase',
 'data',
 'dataset',
 'dataset_transformed',
 'exp_id',
 'exp_name_log',
 'fix_imbalance',
 'fold_generator',
 'fold_groups_param',
 'fold_shuffle_param',
 'gpu_n_jobs_param',
 'gpu_param',
 'html_param',
 'idx',
 'is_multiclass',
 'log_plots_param',
 'logging_param',
 'memory',
 'n_jobs_param',
 'pipeline',
 'seed',
 'target_param',
 'test',
 'test_transformed',
 'train',
 'train_transformed',
 'variable_and_property_keys',
 'variables',
 'y',
 'y_test',
 'y_test_transformed',
 'y_train',
 'y_train_transformed',
 'y_transformed'}

# MLflow

In [None]:
# Start the MLflow UI on port 5001.
# NOTE(review): this shell command appears to run in the foreground, so the cell
# (and any "Run All") stays busy until the kernel is interrupted — confirm, and
# consider launching it from a separate terminal instead.
!mlflow ui --port 5001

# Create JSON files for testing (optional)

In [66]:
import json


def save_json(dataframe, filename):
    """
    Write a DataFrame to a JSON file as an indented list of row records.

    Args:
        dataframe (pd.DataFrame): The frame to export; each row becomes one
            JSON object keyed by column name.
        filename (str): Path of the file to create or overwrite.
    """
    # Round-trip through the json module so the file is pretty-printed
    # with a 4-space indent by the standard library.
    records = json.loads(dataframe.to_json(orient='records'))
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(records, file, indent=4)


def prepare_and_save_data(X_test, y_test, targets=range(3), target_col='target'):
    """
    Split the test data by class and save one JSON file per class.

    The features and labels are concatenated column-wise; for each value in
    ``targets`` the matching rows (without the label column) are written to
    ``target_<value>.json`` in the current working directory. A value with no
    matching rows still produces a file containing an empty list.

    Args:
        X_test (pd.DataFrame): Feature columns of the test split.
        y_test (pd.Series): Labels of the test split; its name must equal
            ``target_col``.
        targets (iterable): Class values to export. Defaults to ``range(3)``.
        target_col (str): Name of the label column. Defaults to ``'target'``.
    """
    combined = pd.concat([X_test, y_test], axis=1)
    for target in targets:
        target_df = combined[combined[target_col] == target].drop(target_col, axis=1)
        save_json(target_df, f'target_{target}.json')

In [63]:
# Kept disabled inside a string literal. Enabling it (by removing the triple
# quotes) pulls the test split from the PyCaret config and calls
# `prepare_and_save_data`, which must be defined first.
'''
X_test = get_config('X_test')
y_test = get_config('y_test')

prepare_and_save_data(X_test, y_test)
'''