# Balance dataset

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
import re
import numpy as np

# Read the training and test splits (train.csv first, then test.csv) from the
# current working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Function to convert age strings to buckets
def bucket_age(age_str):
    """Convert an age string (e.g., '2 years', '4 weeks') into an age bucket.

    The leading "<number> <unit>" of the string is converted to approximate
    days (year=365, month=30, week=7, day=1) and mapped to one of:

    * "Baby"   -- up to 180 days
    * "Child"  -- 181 to 730 days
    * "Adult"  -- 731 to 4014 days
    * "Senior" -- 4015 days or more

    Args:
        age_str: Age description; may be missing (None/NaN).

    Returns:
        The bucket name, or "Unknown" when the value is missing or does not
        start with a recognised "<number> <unit>" pattern.
    """
    if pd.isna(age_str):
        return "Unknown"

    conversion = {"year": 365, "month": 30, "week": 7, "day": 1}

    # str() keeps a stray non-string value from raising AttributeError on .lower().
    normalized = str(age_str).lower().strip()
    match = re.match(r"(\d+)\s*(year|month|week|day)s?", normalized)
    if not match:
        return "Unknown"

    days = int(match.group(1)) * conversion[match.group(2)]

    # The thresholds are contiguous so that every parsed age lands in a bucket.
    # Previously 731-1095 days (e.g. "3 years") matched no branch and were
    # reported as "Unknown".
    if days <= 180:
        return "Baby"
    if days <= 730:
        return "Child"
    if days < 4015:
        return "Adult"
    return "Senior"

# Derive the age-bucket feature for every training row
train_df["Age Bucket"] = train_df["Age upon Intake"].apply(bucket_age)

# Breed: lowercase, remove the substring "mix", trim surrounding whitespace
train_df['Breed_Clean'] = (
    train_df['Breed']
    .str.lower()
    .str.replace('mix', '')
    .str.strip()
)

# Color: lowercase and trim, then split on "/" into primary and secondary parts
train_df['Color_Clean'] = train_df['Color'].str.lower().str.strip()
color_split = train_df['Color_Clean'].str.split('/', expand=True)
train_df['Primary_Color'] = color_split[0]
if color_split.shape[1] > 1:
    train_df['Secondary_Color'] = color_split[1]
else:
    # No row contained a "/", so there is no second column to take
    train_df['Secondary_Color'] = np.nan

# Raw columns that are removed before modelling; names missing from the frame
# are skipped rather than raising.
columns_to_drop = [
    'Name', 'Id', 'Intake Time', 'Outcome Time', 'Age upon Intake',
    'Date of Birth', 'Color', 'Found Location', 'Breed',
]
train_df.drop(columns=columns_to_drop, errors='ignore', inplace=True)

# Collapse infrequent 'Intake Condition' values into a single "Other" category.
# rare_conditions is reused later when the test set is preprocessed.
min_count = 50  # Adjust this threshold as needed
condition_counts = train_df['Intake Condition'].value_counts()
rare_conditions = condition_counts[condition_counts < min_count].index
train_df['Intake Condition'] = train_df['Intake Condition'].replace(rare_conditions, 'Other')

# Balance the dataset
# "Adoption" and "Transfer" are each down-sampled to the combined row count of
# the other three outcome groups; those three groups are kept in full.
adopted = train_df[train_df["Outcome Type"] == "Adoption"]
transferred = train_df[train_df["Outcome Type"] == "Transfer"]
euthanasia = train_df[train_df["Outcome Type"] == "Euthanasia"]
return_to_owner = train_df[train_df["Outcome Type"] == "Return to Owner"]
died = train_df[train_df["Outcome Type"] == "Died"]

minority_class = pd.concat([euthanasia, return_to_owner, died])
target_size = len(minority_class)

# Sampling without replacement cannot draw more rows than a group contains
# (resample raises ValueError), so cap the request at each group's own size.
adopted_undersampled = resample(
    adopted, replace=False, n_samples=min(target_size, len(adopted)), random_state=42
)
transferred_undersampled = resample(
    transferred, replace=False, n_samples=min(target_size, len(transferred)), random_state=42
)

balanced_train_df = pd.concat([adopted_undersampled, transferred_undersampled, minority_class])

# Turn the outcome names into integer class ids; label_encoder is kept so the
# predictions can be mapped back to names later.
label_encoder = LabelEncoder()
label_encoder.fit(balanced_train_df["Outcome Type"])
balanced_train_df["Outcome Type"] = label_encoder.transform(balanced_train_df["Outcome Type"])

# One-hot encode the remaining categorical columns
balanced_train_df = pd.get_dummies(balanced_train_df)

# Separate the target from the feature matrix
y = balanced_train_df["Outcome Type"]
X = balanced_train_df.drop("Outcome Type", axis=1)

# Standardize features
# NOTE(review): the scaler is fit on all rows before the split below, so the
# validation rows contribute to the mean/variance used for scaling.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Hold out 20% of the rows for validation
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a fully connected neural network model
# The input width is declared with an explicit Input layer rather than the
# `input_shape=` argument on the first Dense layer, which recent Keras
# versions warn about for Sequential models.
model_nn = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    # One softmax unit per encoded outcome class (multi-class classification)
    layers.Dense(len(label_encoder.classes_), activation='softmax'),
])

# Compile the neural network model
# sparse_categorical_crossentropy matches the integer-encoded labels in y_train.
model_nn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the neural network; `history` holds the per-epoch metrics read later.
history = model_nn.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_val, y_val))

# Fit a 100-tree Random Forest on the same training split as the network
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Report the network's accuracy from the last recorded epoch
nn_metrics = history.history
train_accuracy_nn = nn_metrics['accuracy'][-1]
val_accuracy_nn = nn_metrics['val_accuracy'][-1]
print(f"Neural Network - Final Training Accuracy: {train_accuracy_nn:.4f}")
print(f"Neural Network - Final Validation Accuracy: {val_accuracy_nn:.4f}")

# Score the forest on the training split, then on the validation split
train_accuracy_rf, val_accuracy_rf = (
    rf_model.score(features, labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)
print(f"Random Forest - Final Training Accuracy: {train_accuracy_rf:.4f}")
print(f"Random Forest - Final Validation Accuracy: {val_accuracy_rf:.4f}")

# Preprocess test dataset
# Every derived feature built for the training frame has to be rebuilt here:
# the reindex below fills any column the test frame lacks with 0, so a
# feature that is skipped would silently be all-zero at prediction time.
test_df["Age Bucket"] = test_df["Age upon Intake"].apply(bucket_age)

# Clean Breed column: remove "mix" and extra whitespace
test_df['Breed_Clean'] = test_df['Breed'].str.lower().str.replace('mix', '').str.strip()

# Process Color column the same way as for training (lowercase, trim, split on "/").
# This must happen before 'Color' is dropped below.
test_df['Color_Clean'] = test_df['Color'].str.lower().str.strip()
test_color_split = test_df['Color_Clean'].str.split('/', expand=True)
test_df['Primary_Color'] = test_color_split[0]
test_df['Secondary_Color'] = test_color_split[1] if test_color_split.shape[1] > 1 else np.nan

# Drop unnecessary columns only if they exist
test_df.drop(columns=[col for col in columns_to_drop if col in test_df.columns], inplace=True)

# Group rare categories into "Other" in the test set, using the rare values
# identified on the training data
test_df['Intake Condition'] = test_df['Intake Condition'].replace(rare_conditions, 'Other')

# Convert categorical variables to numerical (align with training set)
test_df = pd.get_dummies(test_df)
# Keep exactly the training feature columns, in the same order: columns only
# seen in the test set are discarded and missing ones are filled with 0.
test_df = test_df.reindex(columns=X.columns, fill_value=0)
# Reuse the scaler fitted on the training features
test_df_scaled = scaler.transform(test_df)

# Make predictions with both models
# Each model is run once over the test set and the result is reused; the
# earlier version of this cell repeated the same inference calls several times.
pred_probs_nn = model_nn.predict(test_df_scaled)  # softmax output: one probability per class
pred_probs_rf = rf_model.predict_proba(test_df_scaled)  # per-class probabilities from the forest

# Per-model outputs, kept under their existing names
predictions_nn = pred_probs_nn
predicted_classes_nn = predictions_nn.argmax(axis=1)
# Hard Random Forest labels, derived from the probabilities already computed
predictions_rf = rf_model.classes_[pred_probs_rf.argmax(axis=1)]

# Combine predictions using soft voting (average of predicted probabilities)
# NOTE(review): this assumes both probability matrices list the classes in the
# same order, i.e. every encoded class appears in y_train -- confirm.
avg_probs = (pred_probs_nn + pred_probs_rf) / 2
final_pred_classes = np.argmax(avg_probs, axis=1)  # Get the class with the highest average probability

# Convert numeric predictions back to the original class labels
final_pred_classes = label_encoder.inverse_transform(final_pred_classes)

# Assemble the submission table: a 1-based running Id next to the predicted label.
# NOTE(review): the Ids are generated here rather than read from the test file
# ('Id' is among the dropped columns) -- confirm test.csv is numbered 1..n in row order.
submission_ids = range(1, len(final_pred_classes) + 1)
test_predictions = pd.DataFrame(
    {
        "Id": submission_ids,
        "Outcome Type": final_pred_classes,
    }
)

# Write the submission without the DataFrame index
test_predictions.to_csv("test_predictions.csv", index=False)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m1582/1582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.4884 - loss: 1.2195 - val_accuracy: 0.5303 - val_loss: 1.0655
Epoch 2/20
[1m1582/1582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.5688 - loss: 0.9907 - val_accuracy: 0.5377 - val_loss: 1.0714
Epoch 3/20
[1m1582/1582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 6ms/step - accuracy: 0.5839 - loss: 0.9546 - val_accuracy: 0.5381 - val_loss: 1.0908
Epoch 4/20
[1m1582/1582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.5889 - loss: 0.9384 - val_accuracy: 0.5482 - val_loss: 1.0824
Epoch 5/20
[1m1582/1582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.5944 - loss: 0.9224 - val_accuracy: 0.5450 - val_loss: 1.0834
Epoch 6/20
[1m1582/1582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.5989 - loss: 0.9131 - val_accuracy: 0.5469 - val_loss: 1.0920
Epoch 7/20
