# Only change: colors are categorized as light, dark, mixed, or other


In [None]:
!pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
import re
import numpy as np

# If you have not already installed keras_tuner:
# pip install keras-tuner --upgrade
import keras_tuner as kt

# -----------------------------
# 1. LOAD AND PREPROCESS DATA
# -----------------------------
# Work on a reproducible random subsample of at most 50,000 training rows.
# The sample size is capped at the number of available rows because
# DataFrame.sample raises ValueError when n exceeds the population size
# (sampling is done without replacement).
train_df = pd.read_csv('train.csv')
train_df = train_df.sample(n=min(50000, len(train_df)), random_state=42)
test_df = pd.read_csv('test.csv')

def bucket_age(age_str):
    """Convert an age string (e.g., '2 years', '4 weeks') into an age bucket.

    The leading "<number> <unit>" of the string is converted to an approximate
    number of days (year=365, month=30, week=7, day=1) and mapped onto
    contiguous ranges:

      - "Baby":   0 to 180 days
      - "Child":  181 to 730 days
      - "Adult":  731 to 4014 days
      - "Senior": 4015 days and above

    Returns "Unknown" for missing values and for strings that do not start
    with a non-negative integer followed by year/month/week/day.
    """
    if pd.isna(age_str):
        return "Unknown"

    # str() keeps the function usable if a non-string value slips through.
    age_str = str(age_str).lower().strip()
    conversion = {"year": 365, "month": 30, "week": 7, "day": 1}

    match = re.match(r"(\d+)\s*(year|month|week|day)s?", age_str)
    if not match:
        return "Unknown"

    days = int(match.group(1)) * conversion[match.group(2)]

    # Upper bounds are tested in ascending order so that every day count lands
    # in exactly one bucket; there is no gap between "Child" and "Adult"
    # (e.g. "3 years" = 1095 days is an Adult).
    if days <= 180:
        return "Baby"
    if days <= 730:
        return "Child"
    if days < 4015:
        return "Adult"
    return "Senior"

# Derive the age bucket for every training row
train_df["Age Bucket"] = train_df["Age upon Intake"].map(bucket_age)

# Normalize breed names: lowercase, remove the substring "mix", trim whitespace
breed_lower = train_df['Breed'].str.lower()
train_df['Breed_Clean'] = breed_lower.str.replace('mix', '', regex=False).str.strip()

# --- UPDATED COLOR GROUPING SECTION ---
# Normalize the color string, then split "primary/secondary" pairs on the slash
train_df['Color_Clean'] = train_df['Color'].str.lower().str.strip()
color_split = train_df['Color_Clean'].str.split('/', expand=True)
train_df['Primary_Color'] = color_split[0]
if color_split.shape[1] > 1:
    train_df['Secondary_Color'] = color_split[1]
else:
    # No row contained a slash, so there is no secondary color column to take
    train_df['Secondary_Color'] = np.nan

# --- ADDITION: Create Simplified Primary Color Feature (Top 20) ---
# Keep the most frequent primary colors and collapse everything else into "Other"
top_n = 20
top_colors = train_df['Primary_Color'].value_counts().nlargest(top_n).index.tolist()
print("Top 20 Primary Colors:", top_colors)
is_top_color = train_df['Primary_Color'].isin(top_colors)
train_df['Simplified_Primary_Color'] = train_df['Primary_Color'].where(is_top_color, "Other")
# --- END ADDITION ---

# --- ADDITION: Create Color_Category Feature (Light, Dark, Mixed, Other) for Training Data ---
def assign_color_category(color):
    """
    Classify a color description by the light/dark keywords it contains.

    Matching is case-insensitive and substring-based. Returns:
      - 'Unknown' if the value is missing (NaN/None),
      - 'Mixed' if at least one light and one dark keyword are present,
      - 'Light' if only light keywords are present,
      - 'Dark' if only dark keywords are present,
      - 'Other' if no keyword is present.
    """
    if pd.isna(color):
        return "Unknown"

    text = str(color).lower()
    light_keywords = ("white", "cream", "beige", "tan", "yellow", "gold", "light")
    dark_keywords = ("black", "brown", "dark", "gray", "grey", "navy", "blue")

    has_light = any(text.find(kw) != -1 for kw in light_keywords)
    has_dark = any(text.find(kw) != -1 for kw in dark_keywords)

    # Lookup keyed by (light keyword found, dark keyword found)
    category_by_flags = {
        (True, True): "Mixed",
        (True, False): "Light",
        (False, True): "Dark",
        (False, False): "Other",
    }
    return category_by_flags[(has_light, has_dark)]

# Label every training row with its light/dark color category
train_df['Color_Category'] = train_df['Color_Clean'].map(assign_color_category)
# --- END ADDITION ---
# --- END UPDATED COLOR GROUPING SECTION ---

# Columns excluded from modelling (only those actually present are dropped)
columns_to_drop = [
    'Name', 'Id', 'Intake Time', 'Outcome Time', 'Age upon Intake',
    'Date of Birth', 'Color', 'Found Location', 'Breed'
]
present_drop_columns = [col for col in columns_to_drop if col in train_df.columns]
train_df = train_df.drop(columns=present_drop_columns)

# Collapse intake conditions seen fewer than min_count times into "Other"
min_count = 50  # Adjust this threshold as you see fit
condition_counts = train_df['Intake Condition'].value_counts()
rare_conditions = condition_counts.loc[condition_counts < min_count].index
train_df['Intake Condition'] = train_df['Intake Condition'].replace(rare_conditions, 'Other')

# -----------------------------
# 2. ENCODE CATEGORICAL LABELS
# -----------------------------
# Only rows with one of these five outcomes are used for modelling.
outcome_classes = ["Adoption", "Transfer", "Euthanasia", "Return to Owner", "Died"]
labelled_df = train_df[train_df["Outcome Type"].isin(outcome_classes)].copy()

label_encoder = LabelEncoder()
labelled_df["Outcome Type"] = label_encoder.fit_transform(labelled_df["Outcome Type"])

# Convert categorical variables to dummy variables. The target is already
# integer-encoded at this point, so it is left as a single column.
labelled_df = pd.get_dummies(labelled_df)

# -----------------------------
# 3. TRAIN/VALIDATION SPLIT
# -----------------------------
# Split BEFORE oversampling: resampling with replacement duplicates rows, and
# splitting afterwards would put copies of the same row in both the training
# and the validation set, inflating the validation accuracy.
train_part, val_part = train_test_split(
    labelled_df,
    test_size=0.2,
    random_state=42,
    stratify=labelled_df["Outcome Type"]
)

# -----------------------------
# 4. BALANCE THE TRAINING DATA (Oversampling)
# -----------------------------
# Randomly oversample every class of the training part up to the size of the
# largest class. The validation part keeps its natural class distribution.
max_count = train_part["Outcome Type"].value_counts().max()
balanced_train_df = pd.concat([
    resample(class_rows, replace=True, n_samples=max_count, random_state=42)
    for _, class_rows in train_part.groupby("Outcome Type")
])

# The concatenation above is ordered by class. Shuffle it so that consumers
# that slice off the tail of the data (e.g. Keras `validation_split`) see all
# classes.
balanced_train_df = balanced_train_df.sample(frac=1, random_state=42)

# Prepare features (X) and labels (y)
X = balanced_train_df.drop(columns=["Outcome Type"])
y = balanced_train_df["Outcome Type"]
X_val_features = val_part.drop(columns=["Outcome Type"])

# Standardize features: statistics are learned from the training data only
# and then applied unchanged to the validation data.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, y_train = X_scaled, y
X_val, y_val = scaler.transform(X_val_features), val_part["Outcome Type"]

# ----------------------------------------------------
# 5. HYPERPARAMETER TUNING FOR THE NEURAL NETWORK (KerasTuner)
# ----------------------------------------------------
def build_model(hp):
    """
    Build and compile a Keras classifier from KerasTuner hyperparameters.

    Search space:
      - num_hidden_layers: total number of Dense/ReLU hidden layers (1 to 3)
      - units_<i>: width of hidden layer i (32 to 128, step 32)
      - learning_rate: Adam learning rate, one of 1e-3, 5e-4, 1e-4

    Args:
        hp: the HyperParameters object passed in by the tuner.

    Returns:
        A compiled keras.Sequential model. Its input width is
        X_train.shape[1] and its softmax output has one unit per entry of
        label_encoder.classes_ (both read from module scope).
    """
    model = keras.Sequential()

    # Explicit input layer (width must match the number of features)
    model.add(keras.Input(shape=(X_train.shape[1],)))

    # Choose the number of hidden layers
    num_hidden_layers = hp.Int('num_hidden_layers', min_value=1, max_value=3, step=1)

    # Exactly num_hidden_layers hidden layers: units_0 .. units_{n-1}
    for i in range(num_hidden_layers):
        model.add(
            layers.Dense(
                units=hp.Int(f'units_{i}', min_value=32, max_value=128, step=32),
                activation='relu'
            )
        )

    # Output layer
    model.add(
        layers.Dense(
            len(label_encoder.classes_),
            activation='softmax'
        )
    )

    # Choose an optimizer
    hp_learning_rate = hp.Choice('learning_rate', [1e-3, 5e-4, 1e-4])
    optimizer = keras.optimizers.Adam(learning_rate=hp_learning_rate)

    # Labels are integer class ids, hence the sparse loss
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

# Random search over the build_model search space, scored on validation accuracy.
tuner_settings = dict(
    hypermodel=build_model,
    objective='val_accuracy',
    max_trials=5,  # Increase or decrease based on how exhaustive you want your search
    executions_per_trial=1,
    overwrite=True,
    directory='my_dir',
    project_name='nn_tuning_demo',
)
tuner = kt.RandomSearch(**tuner_settings)

# Run the search; a short epoch budget keeps each trial fast
search_fit_settings = dict(epochs=10, validation_split=0.2, batch_size=32)
tuner.search(X_train, y_train, **search_fit_settings)

# Take the top-ranked model from the search
best_models = tuner.get_best_models(num_models=1)
best_hps_model = best_models[0]

# Optional: keep training that model for more epochs, now monitoring the held-out split
history = best_hps_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=20,
)

# Report the metrics recorded for the last epoch
epoch_metrics = history.history
train_accuracy_nn = epoch_metrics['accuracy'][-1]
val_accuracy_nn = epoch_metrics['val_accuracy'][-1]
print(f"Neural Network - Final Training Accuracy: {train_accuracy_nn:.4f}")
print(f"Neural Network - Final Validation Accuracy: {val_accuracy_nn:.4f}")

# ----------------------------------------------------
# 6. HYPERPARAMETER TUNING FOR RANDOM FOREST (GridSearchCV)
# ----------------------------------------------------
# Candidate values tried in every combination by the grid search
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5],
)

rf_base = RandomForestClassifier(random_state=42)

# 3-fold cross-validated accuracy, evaluated in parallel on all CPU cores
grid_search = GridSearchCV(
    rf_base,
    param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# The estimator refit with the best parameter combination
best_rf_model = grid_search.best_estimator_

train_accuracy_rf, val_accuracy_rf = (
    best_rf_model.score(features, labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)
print(f"Random Forest - Best Params: {grid_search.best_params_}")
print(f"Random Forest - Final Training Accuracy: {train_accuracy_rf:.4f}")
print(f"Random Forest - Final Validation Accuracy: {val_accuracy_rf:.4f}")

# -----------------------------
# 7. PREPROCESS THE TEST DATA
# -----------------------------
# The test set has to go through the same feature engineering as the training
# set: any engineered column missing here is filled with 0 by the reindex
# below, so the models would see no signal for that feature.
test_df["Age Bucket"] = test_df["Age upon Intake"].apply(bucket_age)

# For the test set, apply the same breed cleaning logic used on training data
test_df['Breed_Clean'] = (
    test_df['Breed']
    .str.lower()
    .str.replace('mix', '', regex=False)
    .str.strip()
)

# Same color features as the training data. `top_colors` is the list learned
# from the training set, so both sets share one definition of "Other".
if 'Color' in test_df.columns:
    test_df['Color_Clean'] = test_df['Color'].str.lower().str.strip()
    test_color_split = test_df['Color_Clean'].str.split('/', expand=True)
    test_df['Primary_Color'] = test_color_split[0]
    test_df['Secondary_Color'] = (
        test_color_split[1] if test_color_split.shape[1] > 1 else np.nan
    )
    test_df['Simplified_Primary_Color'] = test_df['Primary_Color'].apply(
        lambda x: x if x in top_colors else "Other"
    )
    test_df['Color_Category'] = test_df['Color_Clean'].apply(assign_color_category)

test_df.drop(
    columns=[col for col in columns_to_drop if col in test_df.columns],
    inplace=True
)

# Replace the intake conditions that were rare in the training set with "Other"
test_df['Intake Condition'] = test_df['Intake Condition'].replace(rare_conditions, 'Other')

# Convert categorical variables to numerical
test_df = pd.get_dummies(test_df)

# Align columns with training data (categories unseen in training are dropped,
# training categories absent from the test set become all-zero columns)
test_df = test_df.reindex(columns=X.columns, fill_value=0)

# Scale the test data with the scaler fitted on the training features, so the
# same means AND standard deviations are applied. Fitting a fresh scaler on
# the single row of training means would only subtract the mean.
test_df_scaled = scaler.transform(test_df)

# ----------------------------------------------------
# 8. MAKE PREDICTIONS & SOFT VOTING
# ----------------------------------------------------

# Per-class probabilities from each tuned model
pred_probs_nn = best_hps_model.predict(test_df_scaled)      # neural network
pred_probs_rf = best_rf_model.predict_proba(test_df_scaled)  # random forest

# Soft voting: average the two probability matrices and pick the most likely class per row
avg_probs = 0.5 * (pred_probs_nn + pred_probs_rf)
winning_class_ids = avg_probs.argmax(axis=1)

# Map the integer class ids back to the original outcome labels
final_pred_classes = label_encoder.inverse_transform(winning_class_ids)

# Assemble the submission table with sequential ids starting at 1
n_predictions = len(final_pred_classes)
test_predictions = pd.DataFrame({
    "Id": range(1, n_predictions + 1),
    "Outcome Type": final_pred_classes,
})

test_predictions.to_csv("test_predictions.csv", index=False)
print("Predictions saved to test_predictions.csv")


Trial 5 Complete [00h 02m 32s]
val_accuracy: 0.6604897975921631

Best val_accuracy So Far: 0.6777581572532654
Total elapsed time: 00h 14m 39s


  saveable.load_own_variables(weights_store.get(inner_path))


Epoch 1/20
[1m3095/3095[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 7ms/step - accuracy: 0.7036 - loss: 0.7053 - val_accuracy: 0.6892 - val_loss: 0.7609
Epoch 2/20
[1m3095/3095[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 7ms/step - accuracy: 0.7084 - loss: 0.6885 - val_accuracy: 0.6848 - val_loss: 0.7621
Epoch 3/20
[1m3095/3095[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 7ms/step - accuracy: 0.7122 - loss: 0.6746 - val_accuracy: 0.6875 - val_loss: 0.7600
Epoch 4/20
[1m3095/3095[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 7ms/step - accuracy: 0.7188 - loss: 0.6616 - val_accuracy: 0.6894 - val_loss: 0.7562
Epoch 5/20
[1m3095/3095[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 6ms/step - accuracy: 0.7189 - loss: 0.6548 - val_accuracy: 0.6975 - val_loss: 0.7517
Epoch 6/20
[1m3095/3095[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 7ms/step - accuracy: 0.7226 - loss: 0.6522 - val_accuracy: 0.6935 - val_loss: 0.7496
Epoch 7/20



[1m869/869[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
Predictions saved to test_predictions.csv
