# Balance dataset

In [None]:
!pip install keras-tuner



In [None]:
import pandas as pd
import numpy as np
import re

# scikit-learn tools
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.model_selection import KFold, GridSearchCV

# -----------------------------
# 1. LOAD AND PREPROCESS DATA
# -----------------------------
# Read the training and test tables (in that order) from the working directory.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

def bucket_age(age_str):
    """Convert an age string (e.g., '2 years', '4 weeks') into an age bucket.

    The leading "<number> <unit>" of the string is converted to days using
    approximate unit lengths (year=365, month=30, week=7, day=1) and mapped to:

    * "Baby"   -- up to 180 days
    * "Child"  -- 181 to 730 days
    * "Adult"  -- 731 to 4014 days
    * "Senior" -- 4015 days or more

    Args:
        age_str: Age text such as "3 years" or "10 Months"; may be missing
            (None / NaN). Matching is case-insensitive and ignores
            surrounding whitespace.

    Returns:
        One of "Baby", "Child", "Adult", "Senior", or "Unknown". "Unknown"
        is returned for missing values and for strings that do not start
        with a non-negative integer followed by a recognised unit.
    """
    if pd.isna(age_str):
        return "Unknown"

    conversion = {"year": 365, "month": 30, "week": 7, "day": 1}
    match = re.match(
        r"(\d+)\s*(year|month|week|day)s?", age_str.lower().strip()
    )
    if match is None:
        return "Unknown"

    days = int(match.group(1)) * conversion[match.group(2)]
    if days <= 180:
        return "Baby"
    if days <= 730:
        return "Child"
    # The Adult range previously started at 1096 days, so ages of 731-1095
    # days (including "3 years" == 1095 days) matched no branch and were
    # reported as "Unknown". The buckets are now contiguous.
    if days < 4015:
        return "Adult"
    return "Senior"

# Derive the categorical "Age Bucket" feature from the raw intake-age text.
train_df["Age Bucket"] = train_df["Age upon Intake"].map(bucket_age)

# Columns that are not used as model features. The same list is reused for
# the test set later, so it stays a module-level name.
columns_to_drop = [
    'Name', 'Id', 'Intake Time', 'Outcome Time', 'Age upon Intake',
    'Date of Birth', 'Color', 'Found Location', 'Breed'
]
# errors='ignore' skips any listed column that is absent from the frame.
train_df = train_df.drop(columns=columns_to_drop, errors='ignore')

# Collapse intake conditions seen fewer than `min_count` times in the
# training data into a single "Other" category.
min_count = 50
condition_counts = train_df['Intake Condition'].value_counts()
rare_conditions = condition_counts[condition_counts < min_count].index
train_df['Intake Condition'] = train_df['Intake Condition'].replace(rare_conditions, 'Other')

# -----------------------------
# 2. BALANCE THE TRAINING DATA (Oversampling)
# -----------------------------
# One subset of the training frame per outcome class. Rows whose
# "Outcome Type" is missing or is not one of these five labels end up in no
# subset and therefore do not reach `balanced_train_df`.
outcome_column = train_df["Outcome Type"]
adopted, transferred, euthanasia, return_to_owner, died = [
    train_df[outcome_column == outcome_name]
    for outcome_name in (
        "Adoption", "Transfer", "Euthanasia", "Return to Owner", "Died"
    )
]
class_subsets = [adopted, transferred, euthanasia, return_to_owner, died]

# Size of the largest class; every class is resampled up to this size.
max_count = max(len(subset) for subset in class_subsets)

# Sample each class with replacement up to `max_count` rows.
# NOTE(review): this happens before the cross-validation splits further down
# in the script, so copies of one original row can fall into both the
# training and the held-out fold -- the reported accuracies may be optimistic.
(
    adopted_oversampled,
    transferred_oversampled,
    euthanasia_oversampled,
    return_to_owner_oversampled,
    died_oversampled,
) = [
    resample(subset, replace=True, n_samples=max_count, random_state=42)
    for subset in class_subsets
]

# Stack the resampled classes back into a single, class-balanced frame.
balanced_train_df = pd.concat([
    adopted_oversampled,
    transferred_oversampled,
    euthanasia_oversampled,
    return_to_owner_oversampled,
    died_oversampled,
])

# -----------------------------
# 3. ENCODE AND SCALE FEATURES
# -----------------------------
# Map the outcome labels to integer codes; the fitted encoder is kept so the
# predictions can be translated back to label strings later.
label_encoder = LabelEncoder().fit(balanced_train_df["Outcome Type"])
balanced_train_df["Outcome Type"] = label_encoder.transform(balanced_train_df["Outcome Type"])

# One-hot encode the categorical feature columns.
balanced_train_df = pd.get_dummies(balanced_train_df)

# Target vector (y) and feature matrix (X).
y = balanced_train_df["Outcome Type"]
X = balanced_train_df.drop(columns="Outcome Type")

# Standardise the features; the fitted scaler is reused for the test set.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# -----------------------------
# 4. NESTED CROSS-VALIDATION + STACKING
# -----------------------------
# Outer CV: only two folds, to keep the runtime down.
outer_cv = KFold(n_splits=2, shuffle=True, random_state=42)

# Inner CV used for hyperparameter tuning inside every outer fold. It is
# configured identically for each fold, so a single instance is shared.
inner_cv = KFold(n_splits=2, shuffle=True, random_state=42)

# Stacking ensemble: two random-forest base learners whose predictions feed a
# random-forest meta-learner (passthrough=False: the meta-learner does not
# see the original features).
rf1 = RandomForestClassifier(random_state=42)
rf2 = RandomForestClassifier(random_state=123)
meta_rf = RandomForestClassifier(random_state=999)
base_learners = [('rf1', rf1), ('rf2', rf2)]

stack_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_rf,
    passthrough=False,
    n_jobs=-1,
)

# Deliberately tiny grid (one value per parameter) to reduce runtime.
param_grid = {
    'rf1__n_estimators': [50],
    'rf1__max_depth': [5],
    'rf2__n_estimators': [50],
    'rf2__max_depth': [5],
    'final_estimator__n_estimators': [50],
    'final_estimator__max_depth': [5],
}

# Accuracy of the tuned model on each outer held-out fold.
outer_scores = []

for train_idx, test_idx in outer_cv.split(X_scaled, y):
    # Tune on the outer training portion only (X_scaled is a NumPy array,
    # y is a pandas Series, hence the positional .iloc lookup).
    grid_search = GridSearchCV(
        stack_clf,
        param_grid,
        scoring='accuracy',
        cv=inner_cv,
        n_jobs=-1,
    )
    grid_search.fit(X_scaled[train_idx], y.iloc[train_idx])

    # Score the refitted best configuration on the outer held-out portion.
    best_model = grid_search.best_estimator_
    fold_score = best_model.score(X_scaled[test_idx], y.iloc[test_idx])
    outer_scores.append(fold_score)

    print(f"Outer fold score: {fold_score:.4f}, best params: {grid_search.best_params_}")

# Summary over all outer folds.
outer_mean = np.mean(outer_scores)
outer_std = np.std(outer_scores)
print("Outer fold accuracies:", outer_scores)
print("Mean accuracy:", outer_mean)
print("Std of accuracies:", outer_std)

# -----------------------------
# 5. FINAL MODEL ON ALL DATA (provides `final_best_model`, which section 7 uses for prediction)
# -----------------------------
# Fit the final stacking model on the complete (scaled) training set. One
# more 2-fold grid search over the same grid is run on all data; the refitted
# best estimator is the model used for prediction.
final_grid_search = GridSearchCV(
    stack_clf,
    param_grid,
    scoring='accuracy',
    cv=2,
    n_jobs=-1,
).fit(X_scaled, y)
final_best_model = final_grid_search.best_estimator_

print("Final model best params:", final_grid_search.best_params_)
print("Final model CV accuracy:", final_grid_search.best_score_)

# -----------------------------
# 6. PREPROCESS THE TEST DATA
# -----------------------------
# Apply the training-time feature engineering to the test set.
test_df["Age Bucket"] = test_df["Age upon Intake"].map(bucket_age)

# Remove the same unused columns as for training; errors='ignore' skips any
# listed column that the test frame does not have.
test_df = test_df.drop(columns=columns_to_drop, errors='ignore')

# Reuse the rare-condition list computed on the training data.
test_df['Intake Condition'] = test_df['Intake Condition'].replace(rare_conditions, 'Other')

# One-hot encode, then align to the training feature columns: dummy columns
# missing from the test set are added as zeros and columns not present in
# training are discarded.
test_df = pd.get_dummies(test_df).reindex(columns=X.columns, fill_value=0)

# Scale with the statistics learned from the training features.
test_df_scaled = scaler.transform(test_df)

# -----------------------------
# 7. MAKE PREDICTIONS
# -----------------------------
# Class probabilities for every test row, reduced to the most likely class.
# NOTE(review): using the argmax position as the encoded label assumes the
# model's classes are exactly the LabelEncoder codes 0..k-1 -- confirm.
pred_probs_stack = final_best_model.predict_proba(test_df_scaled)
final_pred_indices = pred_probs_stack.argmax(axis=1)
final_pred_classes = label_encoder.inverse_transform(final_pred_indices)

# Ids are generated as 1..n in row order rather than read from test.csv.
n_predictions = len(final_pred_classes)
test_predictions = pd.DataFrame({
    "Id": range(1, n_predictions + 1),
    "Outcome Type": final_pred_classes,
})
test_predictions.to_csv("test_predictions.csv", index=False)
print("Predictions saved to test_predictions.csv")


Outer fold score: 0.6025, best params: {'final_estimator__max_depth': 5, 'final_estimator__n_estimators': 50, 'rf1__max_depth': 5, 'rf1__n_estimators': 50, 'rf2__max_depth': 5, 'rf2__n_estimators': 50}
Outer fold score: 0.6338, best params: {'final_estimator__max_depth': 5, 'final_estimator__n_estimators': 50, 'rf1__max_depth': 5, 'rf1__n_estimators': 50, 'rf2__max_depth': 5, 'rf2__n_estimators': 50}
Outer fold accuracies: [0.6025236593059937, 0.6337805840568271]
Mean accuracy: 0.6181521216814104
Std of accuracies: 0.015628462375416707
Final model best params: {'final_estimator__max_depth': 5, 'final_estimator__n_estimators': 50, 'rf1__max_depth': 5, 'rf1__n_estimators': 50, 'rf2__max_depth': 5, 'rf2__n_estimators': 50}
Final model CV accuracy: 0.6193394441276867
Predictions saved to test_predictions.csv
