## Import libraries

In [None]:
import mlflow
import os
from sklearn.metrics import classification_report
from src.utils.data_loader import read_csv_from_s3
from src.utils.train import object_type_to_category, stratified_split, train_model

In [None]:
# AWS profile to use; fill in with your own.
# More info: https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/setup.html#setup-credentials
os.environ["AWS_PROFILE"] = "mlops-zoomcamp"

# Public DNS of the EC2 instance that serves MLflow; fill in with your own.
TRACKING_SERVER_HOST = "ec2-3-249-138-11.eu-west-1.compute.amazonaws.com"

# Point the MLflow client at the remote tracking server on port 5000.
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")

# Echo the URI back so a misconfigured host is visible immediately.
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

## Load the data from S3

In [None]:
# Where the raw dataset lives in S3: bucket, key prefix and file name.
bucket_name = "pet-adoption-mlops"
s3_data_path = "data"
file_name = "pet_adoption_data.csv"

# The object key is "<prefix>/<file name>".
file_key = "/".join((s3_data_path, file_name))
data = read_csv_from_s3(bucket_name=bucket_name, file_key=file_key)

In [None]:
# Re-bind `data` to the output of the project helper.
# NOTE(review): presumably casts object-dtype columns to pandas `category`
# dtype — confirm against src.utils.train.object_type_to_category.
data = object_type_to_category(df=data)

In [None]:
# Column the model is trained to predict.
target = "AdoptionLikelihood"

# Columns passed to the model as numerical features.
num_features = [
    "AgeMonths", "WeightKg",
    "Vaccinated", "HealthCondition",
    "TimeInShelterDays", "AdoptionFee",
    "PreviousOwner",
]

# Columns passed to the model as categorical features.
cat_features = [
    "PetType",
    "Breed",
    "Color",
    "Size",
]

# Seed shared by the steps below so runs are repeatable.
random_state = 42

In [None]:
# Split into train / validation / test frames, stratifying on the target.
# Use the shared `random_state` rather than a literal so that changing the
# seed in one place affects both the split and the model training.
df_train, df_val, df_test = stratified_split(data, target_col=target, random_state=random_state)

In [None]:
# Report the (rows, columns) shape of each split as a quick sanity check.
for split_name, split_df in (
    ("Train", df_train),
    ("Validation", df_val),
    ("Test", df_test),
):
    print(f"{split_name} df shape: {split_df.shape}")

In [None]:
# Relative class frequencies of the target in the training split.
# Index by `target` so this cell follows the configured target column.
df_train[target].value_counts(normalize=True)

In [None]:
# Relative class frequencies of the target in the validation split.
# Index by `target` so this cell follows the configured target column.
df_val[target].value_counts(normalize=True)

In [None]:
# Relative class frequencies of the target in the test split.
# Index by `target` so this cell follows the configured target column.
df_test[target].value_counts(normalize=True)

In [None]:
# Gather everything the training helper needs, then fit the model.
# The validation split is handed to the helper alongside the training split.
training_args = {
    "df_train": df_train,
    "df_val": df_val,
    "target": target,
    "num_features": num_features,
    "cat_features": cat_features,
    "random_state": random_state,
}
model = train_model(**training_args)

In [None]:
# Score the model on the held-out test split.
# Ground truth is read once via `target` and reused below, instead of
# re-indexing `df_test` with a hard-coded column name.
y_true = df_test[target]
y_pred = model.predict(df_test[num_features + cat_features])

# Accuracy = fraction of test rows whose prediction equals the true label.
accuracy = (y_pred == y_true).mean()
print(f"Accuracy: {accuracy}")

In [None]:
from src.utils.evaluate import plot_confusion_matrix

# Plot the confusion matrix for the test-set predictions. Reuse `y_true`
# from the accuracy cell so every metric is computed against the same labels.
plot_confusion_matrix(y_true=y_true, y_pred=y_pred)

In [None]:
# Build scikit-learn's text report: per-class precision, recall, F1 and
# support, followed by the aggregate rows.
# NOTE(review): the two display names assume a binary target whose sorted
# labels map to "Class 0" then "Class 1" — confirm against the data.
class_labels = ["Class 0", "Class 1"]
report = classification_report(y_true, y_pred, target_names=class_labels)

# Show the report.
print(report)