## Import libraries

In [None]:
import pandas as pd
from sklearn.metrics import classification_report
from src.utils.data_loader import read_csv_from_s3
from src.utils.train import object_type_to_category, stratified_split, train_model
from pathlib import Path
from src.utils.evaluate import plot_confusion_matrix

In [None]:
# Label column: read from the validation frame as the ground truth.
TARGET = "AdoptionLikelihood"

# Fixed seed value; not referenced by any cell visible in this notebook.
RANDOM_STATE = 42

# Categorical feature columns (appended after the numeric ones for prediction).
CAT_FEATURES = [
    "PetType",
    "Breed",
    "Color",
    "Size",
]

# Numeric feature columns; the order here is the order handed to the model.
NUM_FEATURES = [
    "AgeMonths", "WeightKg", "Vaccinated", "HealthCondition",
    "TimeInShelterDays", "AdoptionFee", "PreviousOwner",
]

## Load the data from S3

In [None]:
# Where the raw CSV lives: bucket, key prefix and file name.
bucket_name = "pet-adoption-mlops"
s3_data_path = "data"
file_name = "pet_adoption_data.csv"

# The object key is "<prefix>/<file name>".
file_key = f"{s3_data_path}/{file_name}"
data = read_csv_from_s3(bucket_name=bucket_name, file_key=file_key)

In [None]:
# Re-bind `data` to the output of the project helper. Judging by its name it
# casts object-dtype columns to pandas categoricals — TODO confirm in
# src.utils.train.
data = object_type_to_category(df=data)

In [None]:
# Split the full dataset into train / validation / test frames.
# NOTE(review): RANDOM_STATE is defined in this notebook but not passed here —
# confirm that stratified_split seeds itself and which column it stratifies on.
df_train, df_val, df_test = stratified_split(data)

In [None]:
# Report (rows, columns) for each split, in train / validation / test order.
for split_name, split_df in (
    ("Train", df_train),
    ("Validation", df_val),
    ("Test", df_test),
):
    print(f"{split_name} df shape: {split_df.shape}")

In [None]:
# Class balance of the target in the training split; normalize=True yields
# relative frequencies instead of raw counts.
df_train[TARGET].value_counts(normalize=True)

In [None]:
# Class balance of the target in the validation split (relative frequencies).
df_val[TARGET].value_counts(normalize=True)

In [None]:
# Class balance of the target in the test split (relative frequencies).
df_test[TARGET].value_counts(normalize=True)

In [None]:
# Fit the model through the project helper, handing it both the training and
# the validation frame. How df_val is used during fitting is not visible
# here — see src.utils.train.
model = train_model(df_train=df_train, df_val=df_val)

In [None]:
# Predict on the validation split, selecting numeric then categorical columns.
feature_columns = NUM_FEATURES + CAT_FEATURES
y_pred = model.predict(df_val[feature_columns])
y_true = df_val[TARGET]

# Accuracy = share of validation rows where the prediction equals the label.
accuracy = (y_pred == y_true).mean()
print(f"Accuracy: {accuracy}")

In [None]:
# Artifacts are written to an "artifacts" folder located in the parent of the
# current working directory.
current_dir = Path.cwd()
artifacts_dir = current_dir.parent.joinpath("artifacts")
artifacts_dir.mkdir(exist_ok=True)  # no error if the folder is already there

# Destination file for the confusion-matrix image.
plot_name = "confusion_matrix.png"
full_path_plot = artifacts_dir.joinpath(plot_name)

In [None]:
# Display the resolved path the confusion-matrix image is saved to.
full_path_plot

In [None]:
# Plot the validation confusion matrix via the project helper, passing the
# artifact path as save_path. y_true is the validation target column bound
# alongside y_pred earlier in the notebook.
plot_confusion_matrix(
    y_true=y_true,
    y_pred=y_pred,
    save_path=full_path_plot,
)

In [None]:
# Classification metrics for every class plus the aggregate rows, returned as
# a nested dict (output_dict=True) rather than a formatted string.
# NOTE(review): assumes the sorted target labels correspond to
# "Not Adopted" then "Adopted" — confirm against the label encoding.
class_labels = ["Not Adopted", "Adopted"]
report = classification_report(
    y_true,
    y_pred,
    target_names=class_labels,
    output_dict=True,
)

# Transposed so each report entry becomes a row and each metric a column.
report_df = pd.DataFrame(report).T

In [None]:
# Display the metrics table. report_df is bound in the same cell that creates
# `report` and already holds the transposed DataFrame, so the table is shown
# directly instead of being rebuilt; pandas is imported once at the top of the
# notebook, so no re-import is needed here.
report_df