In [1]:
import os
import pandas as pd
from mlflow.models import infer_signature
root_path = os.getcwd()
white_wine = pd.read_csv(root_path+"/winequality-white.csv",sep=";")
red_wine = pd.read_csv(root_path+"/winequality-red.csv",sep=";")


In [2]:
red_wine['is_red'] = 1
white_wine['is_red'] = 0

In [3]:
data = pd.concat([red_wine,white_wine],axis=0)
print("data shape: ",data.shape)

data shape:  (6497, 13)


In [4]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,is_red
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1


### Check data Corelation

In [5]:
data.rename(columns=lambda x: x.replace(' ','_'),inplace=True)
data.corr()


Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,is_red
fixed_acidity,1.0,0.219008,0.324436,-0.111981,0.298195,-0.282735,-0.329054,0.45891,-0.2527,0.299568,-0.095452,-0.076743,0.48674
volatile_acidity,0.219008,1.0,-0.377981,-0.196011,0.377124,-0.352557,-0.414476,0.271296,0.261454,0.225984,-0.03764,-0.265699,0.653036
citric_acid,0.324436,-0.377981,1.0,0.142451,0.038998,0.133126,0.195242,0.096154,-0.329808,0.056197,-0.010493,0.085532,-0.187397
residual_sugar,-0.111981,-0.196011,0.142451,1.0,-0.12894,0.402871,0.495482,0.552517,-0.26732,-0.185927,-0.359415,-0.03698,-0.348821
chlorides,0.298195,0.377124,0.038998,-0.12894,1.0,-0.195045,-0.27963,0.362615,0.044708,0.395593,-0.256916,-0.200666,0.512678
free_sulfur_dioxide,-0.282735,-0.352557,0.133126,0.402871,-0.195045,1.0,0.720934,0.025717,-0.145854,-0.188457,-0.179838,0.055463,-0.471644
total_sulfur_dioxide,-0.329054,-0.414476,0.195242,0.495482,-0.27963,0.720934,1.0,0.032395,-0.238413,-0.275727,-0.26574,-0.041385,-0.700357
density,0.45891,0.271296,0.096154,0.552517,0.362615,0.025717,0.032395,1.0,0.011686,0.259478,-0.686745,-0.305858,0.390645
pH,-0.2527,0.261454,-0.329808,-0.26732,0.044708,-0.145854,-0.238413,0.011686,1.0,0.192123,0.121248,0.019506,0.329129
sulphates,0.299568,0.225984,0.056197,-0.185927,0.395593,-0.188457,-0.275727,0.259478,0.192123,1.0,-0.003029,0.038485,0.487218


In [6]:
import seaborn as sns
sns.displot(data.quality)

ModuleNotFoundError: No module named 'seaborn'

In [7]:
high_quality = (data.quality >= 7).astype(int)
data.quality= high_quality


data.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,is_red
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,0,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0,1


### Create Train and Test set

In [8]:
X = data.drop(['quality'],axis=1)
y = data.quality

In [9]:
from sklearn.model_selection import train_test_split


X_train, X_rem , y_train, y_rem = train_test_split(X,y,train_size=0.8,random_state=123)

# Split the remaining data euqally into validation and test
X_val, X_test, y_val, y_test = train_test_split(X_rem,y_rem,test_size=0.5,random_state=123)


print("X_train : ",X_train.shape)
print("X_test : ",X_test.shape)
print("X_val : ",X_val.shape)


X_train :  (5197, 12)
X_test :  (650, 12)
X_val :  (650, 12)


### Create model and train

In [10]:
import mlflow
from sklearn.ensemble import RandomForestRegressor

In [11]:
mlflow_exp_name = "v4_end_to_end_mlflow"
mlflow.set_experiment(mlflow_exp_name)

Traceback (most recent call last):
  File "/opt/anaconda3/envs/ths/lib/python3.10/site-packages/mlflow/store/tracking/file_store.py", line 327, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "/opt/anaconda3/envs/ths/lib/python3.10/site-packages/mlflow/store/tracking/file_store.py", line 421, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "/opt/anaconda3/envs/ths/lib/python3.10/site-packages/mlflow/store/tracking/file_store.py", line 1367, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "/opt/anaconda3/envs/ths/lib/python3.10/site-packages/mlflow/store/tracking/file_store.py", line 1360, in _read_helper
    result = read_yaml(root, file_name)
  File "/opt/anaconda3/envs/ths/lib/python3.10/site-packages/mlflow/utils/file_utils.py", line 309, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' does not exist.")
mlflow.exceptions.MissingCon

<Experiment: artifact_location='file:///Users/tharhtet/Documents/github/ML-in-Prod-batch-1/8_Experiment_Tracking/mlruns/229362213784232011', creation_time=1726948187654, experiment_id='229362213784232011', last_update_time=1726948187654, lifecycle_stage='active', name='v4_end_to_end_mlflow', tags={}>

In [12]:
mlflow.sklearn.autolog()

with mlflow.start_run(run_name="test_run"):
    n_estimators = 110
    max_depth = 10
    max_features = 3

    rf =  RandomForestRegressor(n_estimators=n_estimators,max_depth=max_depth,max_features=max_features)
    rf.fit(X_train,y_train)

    predictions = rf.predict(X_test)
    print("predictions : ", predictions.shape)


    # Infer the model signature
    signature = infer_signature(X_train, rf.predict(X_train))

    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=rf,
        artifact_path="random_forest_model",
        signature=signature,
        input_example=X_train
    )


model_uri = model_info.model_uri
print("model_uri : ",model_uri)



predictions :  (650,)




model_uri :  runs:/21f96653bab44d9080376cd64885263b/random_forest_model


### Register the model

In [13]:
model_name = "wine_model_rf"
model_version = mlflow.register_model(model_uri, model_name)

Successfully registered model 'wine_model_rf'.
Created version '1' of model 'wine_model_rf'.


In [14]:
print("model_version : ",model_version)

model_version :  <ModelVersion: aliases=[], creation_timestamp=1726948193771, current_stage='None', description=None, last_updated_timestamp=1726948193771, name='wine_model_rf', run_id='21f96653bab44d9080376cd64885263b', run_link=None, source='file:///Users/tharhtet/Documents/github/ML-in-Prod-batch-1/8_Experiment_Tracking/mlruns/229362213784232011/21f96653bab44d9080376cd64885263b/artifacts/random_forest_model', status='READY', status_message=None, tags={}, user_id=None, version=1>


In [15]:
# Set alias with production
from mlflow.tracking import MlflowClient
client = MlflowClient()
client.set_registered_model_alias(name=model_name,
                                       version=model_version.version, alias="Production")

### Load model 

In [16]:
from sklearn.metrics import roc_auc_score

In [17]:
loaded_model = mlflow.pyfunc.load_model(f"models:/{model_name}@Production")
result = loaded_model.predict(X_test)
print(result)



[0.29861606 0.13786791 0.04819627 0.02647059 0.50565605 0.06428016
 0.06985619 0.15617322 0.30316569 0.13804845 0.00771873 0.64132093
 0.08151458 0.10869313 0.42734405 0.5705549  0.01718516 0.44790958
 0.04309942 0.0526342  0.02845389 0.22722374 0.32152675 0.01533764
 0.05247321 0.31060014 0.24953981 0.19424935 0.01926043 0.58790577
 0.00886672 0.11227714 0.06702156 0.24141659 0.00553523 0.83128909
 0.31377454 0.56302292 0.38448223 0.06680746 0.10834897 0.28790847
 0.12803176 0.11157222 0.05537459 0.47474572 0.59202297 0.15528236
 0.03024751 0.0712228  0.17905056 0.00695266 0.21672595 0.07193191
 0.03145821 0.04884557 0.02187876 0.01708644 0.15528236 0.0298653
 0.00261744 0.40536883 0.56174247 0.24938997 0.16532011 0.11498662
 0.18250022 0.01996845 0.70304465 0.00225857 0.51632434 0.1425591
 0.02255065 0.59839034 0.50565605 0.04640923 0.64496385 0.48991367
 0.07699492 0.10672695 0.08153402 0.02085532 0.16281884 0.35531793
 0.14101887 0.1340345  0.15211404 0.55323457 0.37698204 0.215302

In [18]:
X_test.shape

(650, 12)

In [19]:
sample_x = X_test.iloc[:2]
print(type(sample_x))
result = loaded_model.predict(sample_x)
print(result)



<class 'pandas.core.frame.DataFrame'>
[0.29861606 0.13786791]


In [20]:
print(f"AUC:{roc_auc_score(y_test, loaded_model.predict(X_test))}")



AUC:0.9207397818744513
