In [7]:
import pickle
import pandas as pd
from pandas import DataFrame
from sklearn.metrics import  root_mean_squared_error, accuracy_score,classification_report
from sklearn.feature_extraction import  DictVectorizer
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import dagshub
import mlflow
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from mlflow.tracking import MlflowClient

In [8]:
# Futuramente primera task leer los datos
def readData(path:str) -> DataFrame:
    return pd.read_csv(path)


# Futuramente segunda task separar los datos en X y Y
def prepareData(df:DataFrame): 
    return df["complaint_what_happened"], df["ticket_classification"]

In [9]:
X,y = prepareData(readData("../data/clean_data/datatransformed.csv"))


# Todo esto sería la tercera task
label_encoder = LabelEncoder()

y_encoded = label_encoder.fit_transform(y)
with open("label_encoder.pkl", 'wb') as file:
    pickle.dump(label_encoder, file)

tfidf = TfidfVectorizer()
X_vec = tfidf.fit_transform(X)
with open("tfidf.pkl", 'wb') as file:
    pickle.dump(tfidf, file)

X_train, X_test, y_train, y_test = train_test_split(X_vec,y_encoded, test_size=0.2, random_state=24, stratify=y_encoded)

In [10]:
dagshub.init(repo_owner='zapatacc', repo_name='final-exam-pcd2024-autumn', mlflow=True)
MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("dafne-tamayo-experiments-prefect")

<Experiment: artifact_location='mlflow-artifacts:/aed4eaf7a4a4473e95a7b7f7630a0b09', creation_time=1732246062368, experiment_id='33', last_update_time=1732246062368, lifecycle_stage='active', name='dafne-tamayo-experiments-prefect', tags={}>

In [85]:
# Declaré models and params
models = {
    "Logistic Regression": (LogisticRegression, {"penalty": "l2", "C": 1.0, "solver": "lbfgs"}),
    "Support Vector Machine": (SVC, {"C": 1.0, "kernel": "rbf", "gamma": "scale"}),
    "K-Nearest Neighbors": (KNeighborsClassifier, {"n_neighbors": 5, "weights": "uniform"}),
    "K-Nearest Neighbors2": (KNeighborsClassifier, {"n_neighbors": 5, "weights": "distance"})
}

In [86]:
for model_name, (model, params) in models.items():
    with mlflow.start_run(run_name=f"dafne-tamayo-{model_name}"):
        instance = model(**params)
        
        instance.fit(X_train, y_train)
        y_pred = instance.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)
        with open ("label_encoder.pkl", 'wb') as file:
            pickle.dump(label_encoder, file)
        mlflow.log_artifact("label_encoder.pkl")
        
        mlflow.log_params(params)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(instance, artifact_path=f"best_model_{model_name}")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


🏃 View run dafne-tamayo-Logistic Regression at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/8/runs/47e3a8eabcea439385b52a2ac7ccfa88
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/8


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


🏃 View run dafne-tamayo-Support Vector Machine at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/8/runs/9a358f2085c04230a44a1611ee222dc5
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/8




🏃 View run dafne-tamayo-K-Nearest Neighbors at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/8/runs/6243d507f19a4a458abeb2b63ed3a741
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/8




🏃 View run dafne-tamayo-K-Nearest Neighbors2 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/8/runs/7a7fae480dbd4a979ce962b905a1deb6
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/8


In [99]:

all_runs = mlflow.search_runs(
    experiment_names=["dafne-tamayo-experiments"],
    order_by=["metrics.accuracy DESC"],
)

bestsRun = all_runs.drop_duplicates(subset="metrics.accuracy").head(2).reset_index()
bestsRun

Unnamed: 0,index,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.accuracy,params.C,params.gamma,params.kernel,params.penalty,params.solver,params.weights,params.n_neighbors,tags.mlflow.source.type,tags.mlflow.log-model.history,tags.mlflow.user,tags.mlflow.source.name,tags.mlflow.runName
0,0,9a358f2085c04230a44a1611ee222dc5,8,FINISHED,mlflow-artifacts:/373176df8ec44033a983493ce72b...,2024-11-21 02:39:41.064000+00:00,2024-11-21 02:42:01.995000+00:00,0.655475,1.0,scale,rbf,,,,,LOCAL,"[{""run_id"": ""9a358f2085c04230a44a1611ee222dc5""...",dafnetamayo,/Users/daftamayo/Documents/EXAMENPROYECTO/fina...,dafne-tamayo-Support Vector Machine
1,4,47e3a8eabcea439385b52a2ac7ccfa88,8,FINISHED,mlflow-artifacts:/373176df8ec44033a983493ce72b...,2024-11-21 02:39:32.384000+00:00,2024-11-21 02:39:40.650000+00:00,0.654567,1.0,,,l2,lbfgs,,,LOCAL,"[{""run_id"": ""47e3a8eabcea439385b52a2ac7ccfa88""...",dafnetamayo,/Users/daftamayo/Documents/EXAMENPROYECTO/fina...,dafne-tamayo-Logistic Regression


In [100]:
client = MlflowClient()

model_uri = f"runs:/{bestsRun.run_id[1]}/model"
model_name = "dafne-model"

registered_model = mlflow.register_model(model_uri=model_uri, name=model_name)

# Agrega un alias "champion" al modelo registrado
model_version = client.get_latest_versions(name=model_name, stages=["None"])[0].version
client.set_registered_model_alias(
    name=model_name,
    alias="challenger",
    version=model_version
)

###### 

model_uri = f"runs:/{bestsRun.run_id[0]}/model"
model_name = "dafne-model"

registered_model = mlflow.register_model(model_uri=model_uri, name=model_name)

# Agrega un alias "champion" al modelo registrado
model_version = client.get_latest_versions(name=model_name, stages=["None"])[0].version
client.set_registered_model_alias(
    name=model_name,
    alias="champion",
    version=model_version
)

Registered model 'dafne-model' already exists. Creating a new version of this model...
2024/11/20 20:55:11 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: dafne-model, version 7
Created version '7' of model 'dafne-model'.
  model_version = client.get_latest_versions(name=model_name, stages=["None"])[0].version
Registered model 'dafne-model' already exists. Creating a new version of this model...
2024/11/20 20:55:12 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: dafne-model, version 8
Created version '8' of model 'dafne-model'.
  model_version = client.get_latest_versions(name=model_name, stages=["None"])[0].version


Realizar búsqueda de hiperparámetros con GridSearch

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import mlflow
import pickle

models = {
    "Logistic Regression": (
        LogisticRegression,
        {"penalty": ["l2"], "C": [0.1, 1.0, 10.0], "solver": ["lbfgs"]}
    ),
    "Support Vector Machine": (
        SVC,
        {"C": [0.1, 1.0], "kernel": ["rbf"], "gamma": ["scale", "auto"]}
    ),
    "K-Nearest Neighbors": (
        KNeighborsClassifier,
        {"n_neighbors": [3, 5], "weights": ["uniform", "distance"]}
    )
}

for model_name, (model, param_grid) in models.items():
    with mlflow.start_run(run_name=f"dafne-tamayo-{model_name}"):
        grid_search = GridSearchCV(estimator=model(), param_grid=param_grid, cv=3, n_jobs=-1, verbose=1)
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

        y_pred = best_model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)
        
        # Registro en MLflow
        mlflow.log_params(best_params)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_artifact("label_encoder.pkl")
        mlflow.log_artifact("tfidf.pkl")
        mlflow.sklearn.log_model(best_model, artifact_path=f"best_model_{model_name}")
        print(f"Model: {model_name} - Best Params: {best_params} - Accuracy: {accuracy}")

Fitting 3 folds for each of 3 candidates, totalling 9 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model: Logistic Regression - Best Params: {'C': 10.0, 'penalty': 'l2', 'solver': 'lbfgs'} - Accuracy: 0.6675741076830006
🏃 View run dafne-tamayo-Logistic Regression at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/33/runs/13988b3686ea4aeeb3d477ad56aaab3e
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/33
Fitting 3 folds for each of 4 candidates, totalling 12 fits


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: Support Vector Machine - Best Params: {'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'} - Accuracy: 0.6554748941318814
🏃 View run dafne-tamayo-Support Vector Machine at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/33/runs/95bc9c71dbc944c395bb8d9587eba715
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/33
Fitting 3 folds for each of 4 candidates, totalling 12 fits




Model: K-Nearest Neighbors - Best Params: {'n_neighbors': 5, 'weights': 'distance'} - Accuracy: 0.5859044162129462
🏃 View run dafne-tamayo-K-Nearest Neighbors at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/33/runs/c1efef64b00a415d8674e199bbde465e
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/33


In [12]:
client = MlflowClient()
all_runs = mlflow.search_runs(
    experiment_names=["dafne-tamayo-experiments-prefect"],
    order_by=["metrics.accuracy DESC"],
)

bestsRun = all_runs.drop_duplicates(subset="metrics.accuracy").head(2).reset_index()
if len(bestsRun) < 2:
    raise ValueError("No hay suficientes modelos para seleccionar champion y challenger.")

# Registrar modelos
model_name = "dafne-model"

for i, alias in enumerate(["champion", "challenger"]):
    model_uri = f"runs:/{bestsRun.run_id[i]}/best_model_Logistic Regression"
    registered_model = mlflow.register_model(model_uri=model_uri, name=model_name)
    model_version = client.get_latest_versions(name=model_name, stages=["None"])[0].version
    client.set_registered_model_alias(name=model_name, alias=alias, version=model_version)

Registered model 'dafne-model' already exists. Creating a new version of this model...
2024/11/22 15:24:56 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: dafne-model, version 3
Created version '3' of model 'dafne-model'.
  model_version = client.get_latest_versions(name=model_name, stages=["None"])[0].version
Registered model 'dafne-model' already exists. Creating a new version of this model...
2024/11/22 15:24:57 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: dafne-model, version 4
Created version '4' of model 'dafne-model'.
