In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
sys.path.insert(0, os.path.abspath('..'))

In [3]:
from scripts.get_data import load_data

In [4]:
df = load_data()

2023-04-30 22:40:03,816 [INFO] Loading data from cloud-samples-data/ai-platform-unified/datasets/tabular/petfinder-tabular-classification.csv
2023-04-30 22:40:03,817 [INFO] Creating Google Cloud Storage client
2023-04-30 22:40:05,241 [INFO] Downloading blob 'ai-platform-unified/datasets/tabular/petfinder-tabular-classification.csv' from bucket 'cloud-samples-data'
2023-04-30 22:40:08,311 [INFO] Downloaded blob 'ai-platform-unified/datasets/tabular/petfinder-tabular-classification.csv' successfully
2023-04-30 22:40:08,312 [INFO] Reading CSV content from string
2023-04-30 22:40:08,353 [INFO] Successfully read CSV content and created DataFrame
2023-04-30 22:40:08,353 [INFO] DataFrame shape: (11537, 14)
2023-04-30 22:40:08,354 [INFO] DataFrame columns: Index(['Type', 'Age', 'Breed1', 'Gender', 'Color1', 'Color2', 'MaturitySize',
       'FurLength', 'Vaccinated', 'Sterilized', 'Health', 'Fee', 'PhotoAmt',
       'Adopted'],
      dtype='object')
2023-04-30 22:40:08,361 [INFO] DataFrame's co

In [6]:
df.head().to_dict(orient='list')

{'Type': ['Cat', 'Cat', 'Dog', 'Dog', 'Dog'],
 'Age': [3, 1, 1, 4, 1],
 'Breed1': ['Tabby',
  'Domestic Medium Hair',
  'Mixed Breed',
  'Mixed Breed',
  'Mixed Breed'],
 'Gender': ['Male', 'Male', 'Male', 'Female', 'Male'],
 'Color1': ['Black', 'Black', 'Brown', 'Black', 'Black'],
 'Color2': ['White', 'Brown', 'White', 'Brown', 'No Color'],
 'MaturitySize': ['Small', 'Medium', 'Medium', 'Medium', 'Medium'],
 'FurLength': ['Short', 'Medium', 'Medium', 'Short', 'Short'],
 'Vaccinated': ['No', 'Not Sure', 'Yes', 'Yes', 'No'],
 'Sterilized': ['No', 'Not Sure', 'No', 'No', 'No'],
 'Health': ['Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy'],
 'Fee': [100, 0, 0, 150, 0],
 'PhotoAmt': [1, 2, 7, 8, 3],
 'Adopted': ['Yes', 'Yes', 'Yes', 'Yes', 'Yes']}

In [129]:
import io
import logging

import pandas as pd
from google.cloud import storage

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", handlers=[logging.StreamHandler()],
)


def create_storage_client():
    """
    Creates a client to interact with the Google Cloud Storage API.

    Returns:
        storage.Client: An instance of the storage client.
    """
    logging.info("Creating Google Cloud Storage client")
    return storage.Client()


def download_blob_as_string(storage_client, bucket_name, blob_name):
    """
    Downloads a blob from Google Cloud Storage (GCS) and returns its contents as a string.

    Args:
        storage_client (storage.Client): The storage client instance.
        bucket_name (str): The name of the GCS bucket.
        blob_name (str): The name of the blob to download.

    Returns:
        str: The contents of the blob as a string.
    """
    logging.info(f"Downloading blob '{blob_name}' from bucket '{bucket_name}'")
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    content = blob.download_as_string()
    logging.info(f"Downloaded blob '{blob_name}' successfully")

    return content


def read_csv_from_string(csv_string):
    """
    Reads a CSV file from a string and returns its contents as a pandas DataFrame.

    Args:
        csv_string (str): The string containing the CSV content.

    Returns:
        pd.DataFrame: The parsed CSV content as a pandas DataFrame.
    """
    logging.info("Reading CSV content from string")
    dataframe = pd.read_csv(io.StringIO(csv_string.decode("utf-8")))
    logging.info("Successfully read CSV content and created DataFrame")

    return dataframe


def main():
    """
    Main function to demonstrate the usage of the above functions.
    """
    logging.info("Starting main function")

    bucket_name = "cloud-samples-data"
    file_name = "ai-platform-unified/datasets/tabular/petfinder-tabular-classification.csv"

    storage_client = create_storage_client()
    csv_string = download_blob_as_string(storage_client, bucket_name, file_name)
    df = read_csv_from_string(csv_string)

    # Perform any operations on the DataFrame here
    logging.info(f"DataFrame shape: {df.shape}")
    logging.info(f"DataFrame columns: {df.columns}")
    logging.info("DataFrame's content:\n" + str(df.head()))

    logging.info("Finished main function")
    return df

In [130]:
df = main()

INFO:root:Starting main function
INFO:root:Creating Google Cloud Storage client
INFO:root:Downloading blob 'ai-platform-unified/datasets/tabular/petfinder-tabular-classification.csv' from bucket 'cloud-samples-data'
INFO:root:Downloaded blob 'ai-platform-unified/datasets/tabular/petfinder-tabular-classification.csv' successfully
INFO:root:Reading CSV content from string
INFO:root:Successfully read CSV content and created DataFrame
INFO:root:DataFrame shape: (11537, 14)
INFO:root:DataFrame columns: Index(['Type', 'Age', 'Breed1', 'Gender', 'Color1', 'Color2', 'MaturitySize',
       'FurLength', 'Vaccinated', 'Sterilized', 'Health', 'Fee', 'PhotoAmt',
       'Adopted'],
      dtype='object')
INFO:root:DataFrame's content:
  Type  Age                Breed1  Gender Color1    Color2 MaturitySize  \
0  Cat    3                 Tabby    Male  Black     White        Small   
1  Cat    1  Domestic Medium Hair    Male  Black     Brown       Medium   
2  Dog    1           Mixed Breed    Male  Br

In [131]:
config = {
    "one_hot_encode_columns": ["Type", "Gender"],
    "label_encode_columns": ["Vaccinated", "Sterilized", "Color1", "Color2"],
    "ordinal_encode_columns": {
        "Health": ["Healthy", "Minor Injury", "Serious Injury"],
        "FurLength": ["Short", "Medium", "Long"],
        "MaturitySize": ["Small", "Medium", "Large"],
    },
    "count_encode_column": "Breed1",
    "target_column": "Adopted",
}


In [132]:
import logging

import pandas as pd
# from config import config
# from get_data import main
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


def one_hot_encode_columns(df, columns_to_be_one_hot_encoded):
    """
    One-hot encode the specified columns in the DataFrame.

    Args:
        df: The input DataFrame.
        columns_to_be_one_hot_encoded: A list of column names to be one-hot encoded.

    Returns:
        pd.DataFrame: The DataFrame with specified columns one-hot encoded.
    """
    logging.info("One-hot encoding columns: %s", columns_to_be_one_hot_encoded)
    for column in columns_to_be_one_hot_encoded:
        df = pd.get_dummies(df, columns=[column], prefix=column)
    return df


def label_encode_columns(df, columns_to_be_label_encoded):
    """
    Label encode the specified columns in the DataFrame.

    Args:
        df: The input DataFrame.
        columns_to_be_label_encoded: A list of column names to be label encoded.

    Returns:
        pd.DataFrame: The DataFrame with specified columns label encoded.
    """
    logging.info("Label encoding columns: %s", columns_to_be_label_encoded)
    for column in columns_to_be_label_encoded:
        encoder = LabelEncoder()
        df[column] = encoder.fit_transform(df[column])
    return df


def ordinally_encode_columns(df, columns_to_be_ordinally_encoded):
    """
    Ordinally encode the specified columns in the DataFrame.

    Args:
        df: The input DataFrame.
        columns_to_be_ordinally_encoded: A dictionary where the key is the column name and the value is a list of ordered values for that column.

    Returns:
        pd.DataFrame: The DataFrame with specified columns ordinally encoded.
    """
    logging.info("Ordinally encoding columns: %s", columns_to_be_ordinally_encoded.keys())
    for column, ordered_values in columns_to_be_ordinally_encoded.items():
        encoder = OrdinalEncoder(categories=[ordered_values])
        df[column] = encoder.fit_transform(df[[column]])
    return df


def count_encode_column(df, col="Breed1"):
    """
    Count encode the specified column in the DataFrame.

    Args:
        df: The input DataFrame.
        col: The column name to be count encoded. Default is 'Breed1'.

    Returns:
        pd.DataFrame: The DataFrame with the specified column count encoded.
    """
    logging.info("Count encoding column: %s", col)
    df[col] = df[col].map(df[col].value_counts())
    return df


def convert_target_to_binary(df, target_col="Adopted"):
    """
    Convert the target column to binary values.

    Args:
        df: The input DataFrame.
        target_col: The target column name. Default is 'Adopted'.

    Returns:
        pd.DataFrame: The DataFrame with the target column converted to binary values.
    """
    logging.info("Converting target column '%s' to binary values", target_col)
    df[target_col] = df[target_col].replace({"Yes": 1, "No": 0})
    logging.info("DataFrame's content:\n" + str(df.head()))
    return df


def preprocess_dataframe(df, config):
    """
    Preprocess the input DataFrame using the specified configuration.

    Args:
        df: The input DataFrame.
        config: A dictionary containing the configuration for preprocessing. This should include the following keys:
                - 'one_hot_encode_columns': list of columns to be one-hot encoded
                - 'label_encode_columns': list of columns to be label encoded
                - 'ordinal_encode_columns': dict with column names as keys and lists of ordered values as values
                - 'count_encode_column': name of the column to be count encoded (default: 'Breed1')
                - 'target_column': name of the target column to convert to binary values (default: 'Adopted')

    Returns:
        pd.DataFrame: The preprocessed DataFrame.
    """
    logging.info("Starting preprocessing of the DataFrame")
    df = one_hot_encode_columns(df, config["one_hot_encode_columns"])
    df = label_encode_columns(df, config["label_encode_columns"])
    df = ordinally_encode_columns(df, config["ordinal_encode_columns"])
    df = count_encode_column(df, config["count_encode_column"])

    logging.info("DataFrame's content:\n" + str(df.head()))
    logging.info("Finished preprocessing of the DataFrame")
    return df



In [133]:
config

{'one_hot_encode_columns': ['Type', 'Gender'],
 'label_encode_columns': ['Vaccinated', 'Sterilized', 'Color1', 'Color2'],
 'ordinal_encode_columns': {'Health': ['Healthy',
   'Minor Injury',
   'Serious Injury'],
  'FurLength': ['Short', 'Medium', 'Long'],
  'MaturitySize': ['Small', 'Medium', 'Large']},
 'count_encode_column': 'Breed1',
 'target_column': 'Adopted'}

In [134]:
df.head()

Unnamed: 0,Type,Age,Breed1,Gender,Color1,Color2,MaturitySize,FurLength,Vaccinated,Sterilized,Health,Fee,PhotoAmt,Adopted
0,Cat,3,Tabby,Male,Black,White,Small,Short,No,No,Healthy,100,1,Yes
1,Cat,1,Domestic Medium Hair,Male,Black,Brown,Medium,Medium,Not Sure,Not Sure,Healthy,0,2,Yes
2,Dog,1,Mixed Breed,Male,Brown,White,Medium,Medium,Yes,No,Healthy,0,7,Yes
3,Dog,4,Mixed Breed,Female,Black,Brown,Medium,Short,Yes,No,Healthy,150,8,Yes
4,Dog,1,Mixed Breed,Male,Black,No Color,Medium,Short,No,No,Healthy,0,3,Yes


In [139]:
df = convert_target_to_binary(df, config["target_column"])
preprocessed_df = preprocess_dataframe(df, config)

INFO:root:Converting target column 'Adopted' to binary values
INFO:root:DataFrame's content:
  Type  Age                Breed1  Gender Color1    Color2 MaturitySize  \
0  Cat    3                 Tabby    Male  Black     White        Small   
1  Cat    1  Domestic Medium Hair    Male  Black     Brown       Medium   
2  Dog    1           Mixed Breed    Male  Brown     White       Medium   
3  Dog    4           Mixed Breed  Female  Black     Brown       Medium   
4  Dog    1           Mixed Breed    Male  Black  No Color       Medium   

  FurLength Vaccinated Sterilized   Health  Fee  PhotoAmt  Adopted  
0     Short         No         No  Healthy  100         1        1  
1    Medium   Not Sure   Not Sure  Healthy    0         2        1  
2    Medium        Yes         No  Healthy    0         7        1  
3     Short        Yes         No  Healthy  150         8        1  
4     Short         No         No  Healthy    0         3        1  
INFO:root:Starting preprocessing of the Da

In [144]:
import logging
import xgboost as xgb
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.model_selection import train_test_split
import os

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()],
)

def split_dataset(X, y):
    """
    Splits the dataset into training, validation, and test sets.

    Args:
        X (pd.DataFrame): The feature matrix.
        y (pd.Series): The target vector.

    Returns:
        tuple: The train, validation, and test feature matrices and target vectors.
    """
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)
    
    assert X_val.shape == X_test.shape, "X_val and X_test should have the same shape"
    logging.info("Shape of validation and test sets are the same as expected.")
    
    return X_train, X_val, X_test, y_train, y_val, y_test

def train_xgboost_classifier(X_train, y_train, X_val, y_val, params=None):
    """
    Trains an XGBoost classifier on the given training set.

    Args:
        X_train (pd.DataFrame): The training feature matrix.
        y_train (pd.Series): The training target vector.
        X_val (pd.DataFrame): The validation feature matrix.
        y_val (pd.Series): The validation target vector.
        params (dict, optional): The parameters for the XGBoost classifier. Defaults to None.

    Returns:
        xgb.Booster: The trained XGBoost classifier.
    """
    logging.info("Starting training...")
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    xgb_params = params or {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'eta': 0.1,
        'max_depth': 6,
        'seed': 42
    }

    watchlist = [(dtrain, 'train'), (dval, 'validation')]
    num_rounds = 1000

    model = xgb.train(xgb_params, dtrain, num_rounds, watchlist, early_stopping_rounds=10)
    logging.info("Training complete.")
    return model

def predict_with_xgboost_classifier(model, X):
    """
    Predicts the target using the given XGBoost classifier.

    Args:
        model (xgb.Booster): The trained XGBoost classifier.
        X (pd.DataFrame): The feature matrix.

    Returns:
        np.array: The predicted target values.
    """
    logging.info("Starting prediction...")
    dtest = xgb.DMatrix(X, enable_categorical=True)
    y_pred = model.predict(dtest)
    logging.info("Prediction complete.")
    return y_pred


def convert_probabilities_to_binary(y_pred, threshold=0.5):
    """
    Converts the predicted probabilities to binary predictions using the given threshold.

    Args:
        y_pred (List[float]): The list of predicted probabilities.
        threshold (float, optional): The threshold value to be used for converting probabilities to binary predictions. Defaults to 0.5.

    Returns:
        List[int]: The list of binary predictions.
    """
    return [1 if p >= threshold else 0 for p in y_pred]


def evaluate_xgboost_classifier(model, X_test, y_test):
    """
    Evaluates the XGBoost classifier on the test set.

    Args:
        model (xgb.Booster): The trained XGBoost classifier.
        X_test (pd.DataFrame): The test feature matrix.
        y_test (pd.Series): The test target vector.

    Returns:
        tuple: The accuracy, F1 score, and recall of the classifier.
    """
    logging.info("Starting evaluation...")
    y_pred = predict_with_xgboost_classifier(model, X_test)
    y_pred_binary = convert_probabilities_to_binary(y_pred)

    accuracy = accuracy_score(y_test, y_pred_binary)
    f1 = f1_score(y_test, y_pred_binary)
    recall = recall_score(y_test, y_pred_binary)

    logging.info("Evaluation complete.")
    return accuracy, f1, recall

def save_model(model, model_dir="artifacts/model", model_filename="xgboost_classifier.model"):
    """
    Saves the trained model to the specified directory.

    Args:
        model (xgb.Booster): The trained XGBoost classifier.
        model_dir (str, optional): The directory where the model will be saved. Defaults to "artifacts/model".
        model_filename (str, optional): The filename for the saved model. Defaults to "xgboost_classifier.model".

    Returns:
        None
    """
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    model.save_model(os.path.join(model_dir, model_filename))
    logging.info(f"Model saved to {model_dir}/{model_filename}")


def train_and_evaluate(df):
    """
    Trains and evaluates an XGBoost classifier on the given DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame containing the feature matrix and target vector.

    Returns:
        None
    """
    if 'Adopted' not in df.columns:
        raise ValueError("The 'Adopted' column is missing in the input DataFrame")

    X = df.drop(columns=['Adopted'])
    y = df['Adopted']

    try:
        X_train, X_val, X_test, y_train, y_val, y_test = split_dataset(X, y)
        classifier_model = train_xgboost_classifier(X_train, y_train, X_val, y_val)
        accuracy, f1, recall = evaluate_xgboost_classifier(classifier_model, X_test, y_test)
        
        logging.info(f'Accuracy: {accuracy}')
        logging.info(f'F1 Score: {f1}')
        logging.info(f'Recall: {recall}')

        save_model(classifier_model)
        
    except Exception as e:
        logging.error(f"An error occurred during the training and evaluation process: {e}")



In [141]:
train_and_evaluate(preprocessed_df)

INFO:root:Shape of validation and test sets are the same as expected.
INFO:root:Starting training...


[0]	train-logloss:0.65936	validation-logloss:0.66154
[1]	train-logloss:0.63166	validation-logloss:0.63589
[2]	train-logloss:0.60862	validation-logloss:0.61479
[3]	train-logloss:0.58940	validation-logloss:0.59746
[4]	train-logloss:0.57301	validation-logloss:0.58302
[5]	train-logloss:0.55882	validation-logloss:0.57092
[6]	train-logloss:0.54654	validation-logloss:0.55901
[7]	train-logloss:0.53625	validation-logloss:0.55060
[8]	train-logloss:0.52713	validation-logloss:0.54311
[9]	train-logloss:0.51885	validation-logloss:0.53689
[10]	train-logloss:0.51185	validation-logloss:0.53101
[11]	train-logloss:0.50573	validation-logloss:0.52606
[12]	train-logloss:0.50018	validation-logloss:0.52234
[13]	train-logloss:0.49504	validation-logloss:0.51913
[14]	train-logloss:0.49086	validation-logloss:0.51616
[15]	train-logloss:0.48672	validation-logloss:0.51390
[16]	train-logloss:0.48320	validation-logloss:0.51172
[17]	train-logloss:0.47980	validation-logloss:0.50956
[18]	train-logloss:0.47682	validation-



[26]	train-logloss:0.45847	validation-logloss:0.49898
[27]	train-logloss:0.45675	validation-logloss:0.49826
[28]	train-logloss:0.45504	validation-logloss:0.49772
[29]	train-logloss:0.45348	validation-logloss:0.49754
[30]	train-logloss:0.45186	validation-logloss:0.49721
[31]	train-logloss:0.45041	validation-logloss:0.49640
[32]	train-logloss:0.44887	validation-logloss:0.49583
[33]	train-logloss:0.44737	validation-logloss:0.49562
[34]	train-logloss:0.44588	validation-logloss:0.49529
[35]	train-logloss:0.44470	validation-logloss:0.49479
[36]	train-logloss:0.44337	validation-logloss:0.49452
[37]	train-logloss:0.44184	validation-logloss:0.49359
[38]	train-logloss:0.44068	validation-logloss:0.49305
[39]	train-logloss:0.43990	validation-logloss:0.49296
[40]	train-logloss:0.43895	validation-logloss:0.49276
[41]	train-logloss:0.43765	validation-logloss:0.49264
[42]	train-logloss:0.43659	validation-logloss:0.49240
[43]	train-logloss:0.43547	validation-logloss:0.49184
[44]	train-logloss:0.43473	v

INFO:root:Training complete.
INFO:root:Starting evaluation...
INFO:root:Starting prediction...
INFO:root:Prediction complete.
INFO:root:Evaluation complete.
INFO:root:Accuracy: 0.7608318890814558
INFO:root:F1 Score: 0.85
INFO:root:Recall: 0.9243498817966903
INFO:root:Model saved to artifacts/model/xgboost_classifier.model


In [87]:
X = df.drop(columns=['Adopted'])
y_actual =  df['Adopted']

X_preprocessed = preprocess_dataframe(X, config)

y_pred = predict_with_xgboost_classifier(model_path="artifacts/model/xgboost_classifier.model", X=X_preprocessed)

INFO:root:Starting preprocessing of the DataFrame
INFO:root:One-hot encoding columns: ['Type', 'Gender']
INFO:root:Label encoding columns: ['Vaccinated', 'Sterilized', 'Color1', 'Color2']
INFO:root:Ordinally encoding columns: dict_keys(['Health', 'FurLength', 'MaturitySize'])
INFO:root:Count encoding column: Breed1
INFO:root:Converting target column 'Adopted' to binary values


KeyError: 'Adopted'