# Installs


In [None]:
!pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


# LOAD AND PREPROCESS DATA

In [None]:
import pandas as pd
import numpy as np
import re

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.model_selection import KFold, RandomizedSearchCV
from scipy.stats import randint

# Step 1 banner, then read the raw training and test tables (train first,
# test second) from the working directory.
print("=== Step 1: LOAD AND PREPROCESS DATA ===")
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

def bucket_age(age_str):
    """Convert an age string (e.g. '2 years', '4 weeks') into an age bucket.

    The leading "<number> <unit>" of the string is converted to an approximate
    number of days (year=365, month=30, week=7, day=1) and mapped onto
    contiguous ranges:

        days <= 180          -> "Baby"
        181 <= days <= 730   -> "Child"
        731 <= days < 4015   -> "Adult"
        days >= 4015         -> "Senior"

    Args:
        age_str: Age description, or a missing value (None / NaN).

    Returns:
        One of "Baby", "Child", "Adult", "Senior", or "Unknown" when the value
        is missing or does not start with "<number> <unit>".
    """
    if pd.isna(age_str):
        return "Unknown"

    conversion = {"year": 365, "month": 30, "week": 7, "day": 1}
    # str() keeps a stray non-string value from crashing on .lower().
    normalized = str(age_str).lower().strip()
    match = re.match(r"(\d+)\s*(year|month|week|day)s?", normalized)
    if not match:
        return "Unknown"

    days = int(match.group(1)) * conversion[match.group(2)]
    if days <= 180:
        return "Baby"
    if days <= 730:
        return "Child"
    # Previously "Adult" started at 1096 days, so 731-1095 days (including
    # "3 years" == 1095) matched no branch and was reported as "Unknown".
    if days < 4015:
        return "Adult"
    return "Senior"

# Derive the categorical age feature from the free-text intake age.
train_df["Age Bucket"] = train_df["Age upon Intake"].apply(bucket_age)

# Get rid of wildlife record, since there is only one.
# .copy() makes the filtered frame independent of the original, so the column
# edits below no longer raise pandas' SettingWithCopyWarning.
train_df = train_df[train_df["Intake Type"] != "Wildlife"].copy()

# Columns that are not used as model features. The same list is applied to
# the test set later in the notebook.
columns_to_drop = [
    'Name', 'Id', 'Intake Time', 'Outcome Time', 'Age upon Intake',
    'Date of Birth', 'Color', 'Found Location', 'Breed'
]
# errors="ignore" skips any listed column that is absent from the frame.
train_df = train_df.drop(columns=columns_to_drop, errors="ignore")

# Group rare categories into "Other": every intake condition seen fewer than
# min_count times in the training data is collapsed into a single label.
# rare_conditions is kept as an Index because the test preprocessing reuses it.
min_count = 50
rare_conditions = train_df['Intake Condition'].value_counts()
rare_conditions = rare_conditions[rare_conditions < min_count].index
train_df['Intake Condition'] = train_df['Intake Condition'].replace(rare_conditions, 'Other')

print("Finished Step 1: Data loaded and preprocessed.\n")


=== Step 1: LOAD AND PREPROCESS DATA ===


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.drop(


Finished Step 1: Data loaded and preprocessed.

=== Step 2: BALANCE THE TRAINING DATA (Oversampling) ===
Finished Step 2: Training data balanced.

=== Step 3: ENCODE AND SCALE FEATURES ===
Finished Step 3: Features encoded and scaled.

=== Step 4: NESTED CROSS-VALIDATION + STACKING ===
  Outer Fold 1 start...
  Outer Fold 1 score: 0.4957
  Best params for Outer Fold 1: {'final_estimator__max_depth': None, 'final_estimator__n_estimators': 78, 'rf1__max_depth': 5, 'rf1__n_estimators': 83, 'rf2__max_depth': None, 'rf2__n_estimators': 83}

  Outer Fold 2 start...
  Outer Fold 2 score: 0.4968
  Best params for Outer Fold 2: {'final_estimator__max_depth': None, 'final_estimator__n_estimators': 78, 'rf1__max_depth': 5, 'rf1__n_estimators': 83, 'rf2__max_depth': None, 'rf2__n_estimators': 83}

Outer fold accuracies: [0.4956689194099266, 0.4968025579536371]
Mean accuracy: 0.49623573868178183
Std of accuracies: 0.0005668192718552567
Finished Step 4: Nested CV complete.

=== Step 5: FINAL MODEL O

# BALANCE THE TRAINING DATA (Oversampling)

In [None]:
print("=== Step 2: BALANCE THE TRAINING DATA (Oversampling) ===")

# The five outcome classes that are kept; rows with any other outcome value
# are not carried into the balanced frame.
outcome_labels = ["Adoption", "Transfer", "Euthanasia", "Return to Owner", "Died"]
class_frames = [
    train_df[train_df["Outcome Type"] == label] for label in outcome_labels
]
adopted, transferred, euthanasia, return_to_owner, died = class_frames

# Every class is resampled up to the size of the largest one.
max_count = max(len(frame) for frame in class_frames)

# Sample with replacement, using the same seed for each class.
# NOTE(review): the duplicated rows created here are later split by KFold in
# Step 4, so copies of one record can land on both sides of a fold - consider
# oversampling inside each training fold instead.
oversampled_frames = [
    resample(frame, replace=True, n_samples=max_count, random_state=42)
    for frame in class_frames
]
(
    adopted_oversampled,
    transferred_oversampled,
    euthanasia_oversampled,
    return_to_owner_oversampled,
    died_oversampled,
) = oversampled_frames

balanced_train_df = pd.concat(oversampled_frames)

print("Finished Step 2: Training data balanced.\n")

# ENCODE AND SCALE FEATURES

In [None]:
print("=== Step 3: ENCODE AND SCALE FEATURES ===")

# Replace the outcome strings with integer codes; label_encoder is kept so the
# predictions can be mapped back to outcome names later.
label_encoder = LabelEncoder()
encoded_outcome = label_encoder.fit_transform(balanced_train_df["Outcome Type"])
balanced_train_df["Outcome Type"] = encoded_outcome

# One-hot encode the remaining categorical columns.
balanced_train_df = pd.get_dummies(balanced_train_df)

# Split target from features.
y = balanced_train_df["Outcome Type"]
X = balanced_train_df.drop(columns=["Outcome Type"])

# Standardise the feature matrix; the fitted scaler is reused on the test set.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

print("Finished Step 3: Features encoded and scaled.\n")

# NESTED CROSS-VALIDATION + STACKING

In [None]:
print("=== Step 4: NESTED CROSS-VALIDATION + STACKING ===")
# Nested CV: the outer loop estimates generalisation accuracy, the inner loop
# (inside RandomizedSearchCV) picks hyper-parameters on the outer-train part.
# Fewer folds for faster training:
outer_cv = KFold(n_splits=2, shuffle=True, random_state=42)  # Outer
inner_cv = KFold(n_splits=2, shuffle=True, random_state=42)  # Inner

# Base learners + meta-learner (differently seeded random forests)
rf1     = RandomForestClassifier(random_state=42)
rf2     = RandomForestClassifier(random_state=123)
meta_rf = RandomForestClassifier(random_state=999)

# passthrough=False: the meta-learner sees only the base learners' outputs,
# not the original features.
stack_clf = StackingClassifier(
    estimators=[('rf1', rf1), ('rf2', rf2)],
    final_estimator=meta_rf,
    passthrough=False,
    n_jobs=-1
)

# Narrow param space + fewer random draws for speed
param_dist = {
    # random integers in [50, 100] (randint's upper bound is exclusive)
    'rf1__n_estimators': randint(50, 101),
    'rf1__max_depth': [None, 5],
    'rf2__n_estimators': randint(50, 101),
    'rf2__max_depth': [None, 5],
    'final_estimator__n_estimators': randint(50, 101),
    'final_estimator__max_depth': [None, 5]
}

outer_scores = []

for fold_count, (train_idx, test_idx) in enumerate(outer_cv.split(X_scaled, y), start=1):
    print(f"  Outer Fold {fold_count} start...")
    # X_scaled is a NumPy array (positional indexing); y is a Series whose
    # index contains duplicates after oversampling, hence .iloc.
    X_train_fold, X_test_fold = X_scaled[train_idx], X_scaled[test_idx]
    y_train_fold, y_test_fold = y.iloc[train_idx], y.iloc[test_idx]

    rand_search = RandomizedSearchCV(
        estimator=stack_clf,
        param_distributions=param_dist,
        n_iter=3,  # only 3 random draws
        cv=inner_cv,
        scoring='accuracy',
        n_jobs=-1,
        random_state=999
    )

    rand_search.fit(X_train_fold, y_train_fold)
    best_model = rand_search.best_estimator_

    # Score the tuned model on the held-out outer fold.
    fold_score = best_model.score(X_test_fold, y_test_fold)
    outer_scores.append(fold_score)

    print(f"  Outer Fold {fold_count} score: {fold_score:.4f}")
    print(f"  Best params for Outer Fold {fold_count}: {rand_search.best_params_}\n")

print("Outer fold accuracies:", outer_scores)
print("Mean accuracy:", np.mean(outer_scores))
print("Std of accuracies:", np.std(outer_scores))
print("Finished Step 4: Nested CV complete.\n")

# Steps 5-7 (final fit, test preprocessing, predictions) are run by the
# dedicated cells below. They are intentionally not repeated in this cell:
# Step 6 drops "Age upon Intake" from test_df, so executing it a second time
# fails with a KeyError.

# FINAL MODEL ON ALL DATA AND TEST DATA PREPROCESSING

In [None]:
print("=== Step 5: FINAL MODEL ON ALL DATA ===")
# One more RandomizedSearchCV on the entire (balanced, scaled) training set.
# best_estimator_ is refit on all of it and used for the final predictions.
final_rand_search = RandomizedSearchCV(
    estimator=stack_clf,
    param_distributions=param_dist,
    n_iter=3,   # same # of draws
    cv=2,       # fewer folds for final
    scoring='accuracy',
    n_jobs=-1,
    random_state=999
)
final_rand_search.fit(X_scaled, y)
final_best_model = final_rand_search.best_estimator_

print("Final model best params:", final_rand_search.best_params_)
print("Final model CV accuracy:", final_rand_search.best_score_)
print("Finished Step 5: Final model trained on entire dataset.\n")

print("=== Step 6: PREPROCESS TEST DATA ===")
# Build the test features in a separate frame instead of mutating and
# rebinding test_df, so this cell can be re-run without the
# "Age upon Intake" column having disappeared.
test_features = test_df.drop(columns=columns_to_drop, errors="ignore")
test_features["Age Bucket"] = test_df["Age upon Intake"].apply(bucket_age)
# Collapse the intake conditions that were rare in the training data.
test_features['Intake Condition'] = test_features['Intake Condition'].replace(rare_conditions, 'Other')
test_features = pd.get_dummies(test_features)
# Align to the training feature columns: missing dummies are filled with 0 and
# columns unseen during training are discarded.
test_features = test_features.reindex(columns=X.columns, fill_value=0)
test_df_scaled = scaler.transform(test_features)

print("Finished Step 6: Test data preprocessed.\n")

# MAKE PREDICTIONS

In [None]:
print("=== Step 7: MAKE PREDICTIONS ===")
# Take the most probable class per row and map its integer code back to the
# original outcome name.
pred_probs_stack = final_best_model.predict_proba(test_df_scaled)
final_pred_indices = pred_probs_stack.argmax(axis=1)
final_pred_classes = label_encoder.inverse_transform(final_pred_indices)

# Ids are generated as 1..n in row order.
# NOTE(review): assumes the expected submission ids are sequential - confirm
# against the "Id" column of test.csv.
submission_ids = range(1, len(final_pred_classes) + 1)
test_predictions = pd.DataFrame(
    {"Id": submission_ids, "Outcome Type": final_pred_classes}
)
test_predictions.to_csv("test_predictions_random_forest.csv", index=False)

print("Predictions saved to test_predictions_random_forest.csv")
print("Finished Step 7: End of script execution.")