In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, recall_score
import logging

logging.basicConfig(level=logging.INFO)

In [2]:
# Functions for splitting dataset, training model, and evaluating model
def split_dataset(X, y):
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)
    return X_train, X_val, X_test, y_train, y_val, y_test

In [44]:
def train_xgboost_classifier(X_train, y_train, X_val, y_val, feature_types, params=None):
    logging.info("Starting training...")
    dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True, feature_types=feature_types)
    dval = xgb.DMatrix(X_val, label=y_val, enable_categorical=True, feature_types=feature_types)

    xgb_params = params or {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'eta': 0.1,
        'max_depth': 6,
        'seed': 42
    }

    watchlist = [(dtrain, 'train'), (dval, 'validation')]
    num_rounds = 1000

    model = xgb.train(xgb_params, dtrain, num_rounds, watchlist, early_stopping_rounds=10)
    logging.info("Training complete.")
    return model

def predict_with_xgboost_classifier(model, X, feature_types):
    logging.info("Starting prediction...")
    dtest = xgb.DMatrix(X, enable_categorical=True, feature_types=feature_types)
    y_pred = model.predict(dtest)
    logging.info("Prediction complete.")
    return y_pred

def evaluate_xgboost_classifier(model, X_test, y_test, feature_types):
    logging.info("Starting evaluation...")
    y_pred = predict_with_xgboost_classifier(model, X_test, feature_types)
    y_pred_binary = [1 if p >= 0.5 else 0 for p in y_pred]

    accuracy = accuracy_score(y_test, y_pred_binary)
    f1 = f1_score(y_test, y_pred_binary)
    recall = recall_score(y_test, y_pred_binary)

    logging.info("Evaluation complete.")
    return accuracy, f1, recall



In [45]:
import pandas as pd
from google.cloud import storage
import io

def create_storage_client():
    """
    Creates a client to interact with the Google Cloud Storage API.
    """
    return storage.Client()

def download_csv_from_gcs(storage_client, bucket_name, file_name):
    """
    Downloads a CSV file from Google Cloud Storage (GCS) and returns its contents as a string.
    """
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(file_name)

    file_content = blob.download_as_string()

    return file_content

def read_csv_from_string(file_content):
    """
    Reads a CSV file from a string and returns its contents as a pandas DataFrame.
    """
    df = pd.read_csv(io.StringIO(file_content.decode("utf-8")))

    return df

def convert_to_categorical(data, categorical_columns):
    for col in categorical_columns:
        data[col] = data[col].astype('category')
    return data

def convert_target_to_binary(y):
    y = y.replace({'Yes': 1, 'No': 0})
    return y

def create_feature_types(X, categorical_columns):
    feature_types = ['categorical' if col in categorical_columns else 'continuous' for col in X.columns]
    return feature_types

In [46]:
storage_client = create_storage_client()

bucket_name = "cloud-samples-data"
file_name = "ai-platform-unified/datasets/tabular/petfinder-tabular-classification.csv"

file_content = download_csv_from_gcs(storage_client, bucket_name, file_name)

data = read_csv_from_string(file_content)

In [47]:
data

Unnamed: 0,Type,Age,Breed1,Gender,Color1,Color2,MaturitySize,FurLength,Vaccinated,Sterilized,Health,Fee,PhotoAmt,Adopted
0,Cat,3,Tabby,Male,Black,White,Small,Short,No,No,Healthy,100,1,Yes
1,Cat,1,Domestic Medium Hair,Male,Black,Brown,Medium,Medium,Not Sure,Not Sure,Healthy,0,2,Yes
2,Dog,1,Mixed Breed,Male,Brown,White,Medium,Medium,Yes,No,Healthy,0,7,Yes
3,Dog,4,Mixed Breed,Female,Black,Brown,Medium,Short,Yes,No,Healthy,150,8,Yes
4,Dog,1,Mixed Breed,Male,Black,No Color,Medium,Short,No,No,Healthy,0,3,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11532,Dog,24,Poodle,Male,Brown,Golden,Medium,Medium,Not Sure,No,Healthy,0,0,No
11533,Cat,1,Domestic Short Hair,Female,Cream,Gray,Medium,Short,No,No,Healthy,0,1,Yes
11534,Dog,6,Schnauzer,Female,Black,White,Small,Long,Yes,No,Healthy,0,1,Yes
11535,Cat,9,Domestic Short Hair,Female,Yellow,White,Small,Short,Yes,Yes,Healthy,0,3,No


In [48]:
X = data.drop(columns=['Adopted'])
y = data['Adopted']

In [49]:
y = convert_target_to_binary(y)

In [50]:
categorical_columns = ['Type', 'Breed1', 'Gender', 'Color1', 'Color2', 'MaturitySize',
       'FurLength', 'Vaccinated', 'Sterilized', 'Health']

In [51]:
X = convert_to_categorical(X, categorical_columns)

In [52]:
#feature_types = create_feature_types(X, categorical_columns)

In [53]:
X_train, X_val, X_test, y_train, y_val, y_test = split_dataset(X, y)

In [54]:
classifier_model = train_xgboost_classifier(X_train, y_train, X_val, y_val, feature_types)


INFO:root:Starting training...


XGBoostError: [19:00:15] /Users/runner/miniforge3/conda-bld/xgboost-split_1679035096581/work/src/data/data.cc:254: All feature_types must be one of {int, float, i, q, c}.
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x0000000179c8c804 dmlc::LogMessageFatal::~LogMessageFatal() + 116
  [bt] (1) 2   libxgboost.dylib                    0x0000000179d25b05 xgboost::LoadFeatureType(std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>> const&, std::__1::vector<xgboost::FeatureType, std::__1::allocator<xgboost::FeatureType>>*) + 517
  [bt] (2) 3   libxgboost.dylib                    0x0000000179d3383f xgboost::MetaInfo::SetFeatureInfo(char const*, char const**, unsigned long long) + 815
  [bt] (3) 4   libxgboost.dylib                    0x0000000179ca3d56 XGDMatrixSetStrFeatureInfo + 150
  [bt] (4) 5   libffi.8.dylib                      0x0000000109f15a22 ffi_call_unix64 + 82
  [bt] (5) 6   ???                                 0x000000030313de90 0x0 + 12936535696



In [10]:
X_train, X_val, X_test, y_train, y_val, y_test = split_dataset(X, y)

In [11]:
X_val.shape

(1154, 13)

In [12]:
X_test.shape

(1154, 13)

In [17]:
classifier_model = train_xgboost_classifier(X_train, y_train, X_val, y_val, feature_types)

INFO:root:Starting training...


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:Type: object, Breed1: object, Gender: object, Color1: object, Color2: object, MaturitySize: object, FurLength: object, Vaccinated: object, Sterilized: object, Health: object

In [None]:
if __name__ == "__main__":

    # Specify categorical columns (use column names)
    categorical_columns = ['column1', 'column2', 'column3']  # Replace with the actual column names

    # Replace categorical column names with their corresponding types: 'categorical'
    feature_types = X.dtypes.replace({col: 'categorical' for col in categorical_columns})

    X_train, X_val, X_test, y_train, y_val, y_test = split_dataset(X, y)

    classifier_model = train_xgboost_classifier(X_train, y_train, X_val, y_val, feature_types)
    accuracy, f1, recall = evaluate_xgboost_classifier(classifier_model, X_test, y_test, feature_types)

    logging.info(f'Accuracy: {accuracy}')
    logging.info(f'F1 Score: {f1}')
    logging.info(f'Recall: {recall}')