In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import xgboost as xgb

def load_xgboost_model(model_path):
    """
    Read a previously saved XGBoost booster back from disk.

    Args:
        model_path (str): Location of the serialized XGBoost model file.

    Returns:
        xgb.Booster: A booster populated from the file at ``model_path``.
    """
    # Start from an empty booster, then fill it from the file.
    booster = xgb.Booster()
    booster.load_model(model_path)
    return booster

In [29]:
# Relative path to the saved classifier; it is resolved against the kernel's
# current working directory when the model is loaded.
model_path = "artifacts/model/xgboost_classifier.model"
model = load_xgboost_model(model_path)

In [4]:
import os
import sys
# Put the parent directory first on sys.path; presumably this is what lets the
# `scripts.*` imports in the following cells resolve — confirm project layout.
sys.path.insert(0, os.path.abspath('..'))

In [6]:
from scripts.get_data import load_data

In [7]:
from scripts.preprocessing import preprocess_dataframe

In [12]:
from scripts.train import predict_with_xgboost_classifier

In [8]:
# Load the raw dataset through the project's load_data helper (per the captured
# log below, it downloads a CSV blob from a Cloud Storage bucket).
df = load_data()

2023-04-28 23:23:52,467 [INFO] Starting main function
2023-04-28 23:23:52,468 [INFO] Creating Google Cloud Storage client
2023-04-28 23:23:53,854 [INFO] Downloading blob 'ai-platform-unified/datasets/tabular/petfinder-tabular-classification.csv' from bucket 'cloud-samples-data'
2023-04-28 23:23:56,745 [INFO] Downloaded blob 'ai-platform-unified/datasets/tabular/petfinder-tabular-classification.csv' successfully
2023-04-28 23:23:56,746 [INFO] Reading CSV content from string
2023-04-28 23:23:56,822 [INFO] Successfully read CSV content and created DataFrame
2023-04-28 23:23:56,823 [INFO] DataFrame shape: (11537, 14)
2023-04-28 23:23:56,824 [INFO] DataFrame columns: Index(['Type', 'Age', 'Breed1', 'Gender', 'Color1', 'Color2', 'MaturitySize',
       'FurLength', 'Vaccinated', 'Sterilized', 'Health', 'Fee', 'PhotoAmt',
       'Adopted'],
      dtype='object')
2023-04-28 23:23:56,834 [INFO] DataFrame's content:
  Type  Age                Breed1  Gender Color1    Color2 MaturitySize  \
0  Cat

In [9]:
# Split the frame: every column except "Adopted" goes to X, "Adopted" alone to y.
X = df.drop(columns=["Adopted"])
y = df["Adopted"]

In [10]:
# Encode the feature columns with the project's preprocessing helper; the
# captured log below lists the encodings applied to each column.
preprocess_df = preprocess_dataframe(X)

2023-04-28 23:27:34,543 [INFO] Starting preprocessing of the DataFrame
2023-04-28 23:27:34,544 [INFO] One-hot encoding columns: ['Type', 'Gender']
2023-04-28 23:27:34,561 [INFO] Label encoding columns: ['Vaccinated', 'Sterilized', 'Color1', 'Color2']
2023-04-28 23:27:34,572 [INFO] Ordinally encoding columns: dict_keys(['Health', 'FurLength', 'MaturitySize'])
2023-04-28 23:27:34,583 [INFO] Count encoding column: Breed1
2023-04-28 23:27:34,587 [INFO] DataFrame shape: (11537, 15)
2023-04-28 23:27:34,587 [INFO] DataFrame columns: Index(['Age', 'Breed1', 'Color1', 'Color2', 'MaturitySize', 'FurLength',
       'Vaccinated', 'Sterilized', 'Health', 'Fee', 'PhotoAmt', 'Type_Cat',
       'Type_Dog', 'Gender_Female', 'Gender_Male'],
      dtype='object')
2023-04-28 23:27:34,592 [INFO] DataFrame's content:
   Age  Breed1  Color1  Color2  MaturitySize  FurLength  Vaccinated  \
0    3     242       0       5           0.0        0.0           0   
1    1     865       0       0           1.0       

In [13]:
# Score the preprocessed features with the loaded model; the displayed result
# in the next cell is a float32 array (presumably probabilities — confirm).
y_pred = predict_with_xgboost_classifier(model, preprocess_df)

2023-04-28 23:29:15,563 [INFO] Starting prediction...
2023-04-28 23:29:15,609 [INFO] Prediction complete.


In [14]:
y_pred

array([0.6296387 , 0.81424767, 0.9719507 , ..., 0.94739354, 0.73697793,
       0.911344  ], dtype=float32)

In [18]:
def convert_prob_to_target(y_pred, threshold=0.5):
    """
    Map predicted scores onto "Yes"/"No" labels.

    Args:
        y_pred (iterable): Predicted scores, one per sample.
        threshold (float): Cut-off; a score greater than or equal to it
            becomes "Yes", anything else becomes "No".

    Returns:
        list of str: One label per element of ``y_pred``, in the same order.
    """
    labels = []
    for score in y_pred:
        if score >= threshold:
            labels.append("Yes")
        else:
            labels.append("No")
    return labels

In [19]:
# Turn the scores into "Yes"/"No" labels using the function's default 0.5 threshold.
convert_y_pred_to_target = convert_prob_to_target(y_pred)

In [20]:
convert_y_pred_to_target

['Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'No',
 'Yes',
 'No',
 'Yes',
 'No',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'No',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 

In [23]:
from pathlib import Path
import pandas as pd

def get_root_dir():
    """
    Locate the project root, taken to be the parent of the current working directory.

    Returns:
        pathlib.Path: Resolved absolute path of the directory one level above
        the current working directory.
    """
    working_dir = Path.cwd()
    return working_dir.parent.resolve()

def create_output_dir(root_dir):
    """
    Create the 'output' directory in the root directory if it doesn't exist.

    Args:
        root_dir (pathlib.Path or str): The path to the root directory.
    """
    # Path() leaves Path inputs unchanged and lets plain string paths work too.
    output_dir = Path(root_dir) / 'output'
    # parents=True also creates a missing root directory; exist_ok=True makes
    # repeated calls (e.g. re-running the cell) a no-op instead of an error.
    output_dir.mkdir(parents=True, exist_ok=True)

def save_results(data, output_path):
    """
    Write ``data`` out as a CSV file and report where it went.

    The index is not written to the file. After the write completes, a
    confirmation line naming the destination is printed.

    Args:
        data (pandas.DataFrame): The frame to write.
        output_path (pathlib.Path): Destination of the CSV file.
    """
    data.to_csv(output_path, index=False)
    confirmation = f"Results saved to {output_path}"
    print(confirmation)


In [27]:
# Attach the "Yes"/"No" labels to the loaded frame as a new column.
# Note: this mutates df in place, so re-running earlier cells that display df will differ.
df["Adopted_prediction"] = convert_y_pred_to_target

In [28]:
# Root is the parent of the kernel's working directory.
root_dir = get_root_dir()

# Make sure <root>/output exists before writing into it.
create_output_dir(root_dir)

output_path = root_dir / 'output' / 'results.csv'

# Write df to <root>/output/results.csv without the index.
save_results(df, output_path)


Results saved to /Users/emmanuelsekyi/projects/virgin_media_test/output/results.csv


In [30]:
root_dir = get_root_dir()


In [31]:
root_dir

PosixPath('/Users/emmanuelsekyi/projects/virgin_media_test')