In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load data: both CSV files are read from paths relative to the working directory.
TRAIN_CSV_PATH = 'train.csv'
TEST_CSV_PATH = 'test.csv'
df_train = pd.read_csv(TRAIN_CSV_PATH)
df_test = pd.read_csv(TEST_CSV_PATH)

def convert_age_to_days(age_str):
    """Convert an age description such as ``"2 years"`` into a number of days.

    The text is expected to look like ``"<integer> <unit>"``, where the unit
    contains ``year``, ``month``, ``week`` or ``day`` (matched
    case-insensitively). Years count as 365 days and months as 30 days, so
    the result is an approximation. Tokens after the first two are ignored.

    Args:
        age_str: Age text to parse. Non-string values (including ``None``
            and NaN) are treated as missing.

    Returns:
        The age in days as an ``int``, or ``np.nan`` when the value is
        missing or cannot be parsed (fewer than two tokens, a non-integer
        count, or an unrecognised unit).
    """
    if not isinstance(age_str, str):
        return np.nan

    tokens = age_str.split()
    if len(tokens) < 2:
        # e.g. "" or "unknown": nothing to unpack into (count, unit).
        return np.nan

    try:
        count = int(tokens[0])
    except ValueError:
        # Non-integer count such as "two" or "1.5".
        return np.nan

    unit = tokens[1].lower()
    # Checked in this order; the first keyword contained in the unit wins.
    days_per_unit = (('year', 365), ('month', 30), ('week', 7), ('day', 1))
    for keyword, multiplier in days_per_unit:
        if keyword in unit:
            return count * multiplier
    return np.nan

def bucket_age(days):
    """Map an age in days onto a coarse life-stage label.

    Args:
        days: Age in days; may be missing (``None`` / NaN).

    Returns:
        ``'Unknown'`` for a missing value, ``'Baby'`` below 180 days,
        ``'Young'`` below 730 days, ``'Adult'`` below 2555 days and
        ``'Senior'`` otherwise.
    """
    if pd.isnull(days):
        return 'Unknown'

    # Upper bounds are exclusive and are checked in ascending order.
    stage_limits = ((180, 'Baby'), (730, 'Young'), (2555, 'Adult'))
    for upper_bound, stage in stage_limits:
        if days < upper_bound:
            return stage
    return 'Senior'

def is_in_austin_travis(location):
    """Flag whether a location string mentions Austin or Travis.

    Args:
        location: Free-text location; may be missing (``None`` / NaN).

    Returns:
        ``1`` if the lower-cased text contains ``'austin'`` or ``'travis'``,
        ``0`` otherwise (including for missing values).
    """
    if pd.isnull(location):
        return 0

    lowered = location.lower()
    mentions_area = any(place in lowered for place in ('austin', 'travis'))
    return 1 if mentions_area else 0

def preprocess_data(df, is_train=True):
    """Derive model features from a raw intake/outcome DataFrame.

    The work is done on a copy, so the caller's frame is not modified.

    Columns added: ``AgeInDays``, ``AgeGroup``, ``IntakeHour``,
    ``IntakeMonth``, ``IntakeWeekday``, ``StayDuration``, ``IsNamed``,
    ``IsMixedBreed`` and ``Found_In_Austin_Travis``. ``Intake Time`` (and
    ``Outcome Time`` when it is converted) are replaced by datetimes, with
    unparseable values becoming ``NaT``.

    Args:
        df: Raw data. Must contain ``Age upon Intake``, ``Intake Time``,
            ``Name``, ``Breed`` and ``Found Location``. Training frames must
            also contain ``Outcome Time`` and ``Outcome Type``.
        is_train: When true, ``StayDuration`` is computed from the outcome
            and intake times (missing values filled with the frame's median)
            and rows without an ``Outcome Type`` are dropped. When false,
            ``StayDuration`` is left as NaN and ``Outcome Time`` is optional.

    Returns:
        A new DataFrame with the derived columns.

    Raises:
        KeyError: If a required column is missing.
    """
    # Copy first so the column assignments below never leak into the caller.
    df = df.copy()

    df['AgeInDays'] = df['Age upon Intake'].apply(convert_age_to_days)
    # Bucket before imputing so that missing ages land in the 'Unknown' group.
    df['AgeGroup'] = df['AgeInDays'].apply(bucket_age)

    df['Intake Time'] = pd.to_datetime(df['Intake Time'], errors='coerce')
    # Outcome times are only needed for training; tolerate their absence in
    # frames that are not used for training.
    if is_train or 'Outcome Time' in df.columns:
        df['Outcome Time'] = pd.to_datetime(df['Outcome Time'], errors='coerce')

    df['IntakeHour'] = df['Intake Time'].dt.hour
    df['IntakeMonth'] = df['Intake Time'].dt.month
    df['IntakeWeekday'] = df['Intake Time'].dt.dayofweek

    if is_train:
        stay_days = (df['Outcome Time'] - df['Intake Time']).dt.days
        df['StayDuration'] = stay_days.fillna(stay_days.median())
    else:
        # No stay duration is derived here; the column stays entirely NaN.
        df['StayDuration'] = np.nan

    df['IsNamed'] = df['Name'].notnull().astype(int)
    df['IsMixedBreed'] = df['Breed'].str.contains("Mix", case=False, na=False).astype(int)
    df['Found_In_Austin_Travis'] = df['Found Location'].apply(is_in_austin_travis)
    df['AgeInDays'] = df['AgeInDays'].fillna(df['AgeInDays'].median())

    if is_train:
        df = df.dropna(subset=['Outcome Type'])
    return df

# ---- Feature engineering on both data sets ----
df_train = preprocess_data(df_train, is_train=True)
df_test = preprocess_data(df_test, is_train=False)

# ---- Column selection ----
target = 'Outcome Type'
features = [
    'AgeGroup',
    'StayDuration',
    'IntakeHour',
    'IntakeMonth',
    'IntakeWeekday',
    'IsNamed',
    'IsMixedBreed',
    'Found_In_Austin_Travis',
    'Sex upon Intake',
    'Animal Type',
    'Intake Condition',
    'Intake Type',
    'Breed',
]

X_train = df_train[features]
X_test = df_test[features]

# ---- Target encoding: class names -> integer ids ----
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train[target])
n_classes = len(label_encoder.classes_)

# ---- Per-column-type preprocessing ----
categorical_features = [
    'AgeGroup',
    'Sex upon Intake',
    'Animal Type',
    'Intake Condition',
    'Intake Type',
    'Breed',
]

# Every selected feature that is not categorical is numeric; iterating over
# `features` keeps the columns in their listed order.
numerical_features = [
    name for name in features if name not in categorical_features
]

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode (categories not seen during fit are ignored at transform time).
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Numeric columns: fill gaps with the median, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features),
    ('num', numeric_transformer, numerical_features),
])

# Fit on the training data only and reuse the fitted transform for the test data.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# One-hot targets to match the categorical cross-entropy loss used below.
y_train = keras.utils.to_categorical(y_train, num_classes=n_classes)

# ---- Model: two ReLU hidden layers with dropout, softmax output ----
n_inputs = X_train_processed.shape[1]
network_layers = [layers.Input(shape=(n_inputs,))]
for units in (128, 64):
    network_layers.append(layers.Dense(units, activation='relu'))
    network_layers.append(layers.Dropout(0.3))
network_layers.append(layers.Dense(n_classes, activation='softmax'))
model = keras.Sequential(network_layers)

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.fit(X_train_processed, y_train, epochs=10, batch_size=32)

# ---- Prediction: most probable class per test row ----
y_pred_probs = model.predict(X_test_processed)
y_pred = np.argmax(y_pred_probs, axis=1)

# Ids are assigned sequentially from 1 in test-row order.
test_predictions_df = pd.DataFrame({
    'id': range(1, len(y_pred) + 1),
    'Outcome Type': label_encoder.inverse_transform(y_pred),
})

test_predictions_df.to_csv('test_predictions_nn.csv', index=False)
print("Test predictions have been saved to 'test_predictions_nn.csv'.")

  df['Outcome Time'] = pd.to_datetime(df['Outcome Time'], errors='coerce')


Epoch 1/10
[1m6881/6881[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 7ms/step - accuracy: 0.5345 - loss: 1.1259 - val_accuracy: 0.5932 - val_loss: 1.0228
Epoch 2/10
[1m6881/6881[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 7ms/step - accuracy: 0.6575 - loss: 0.8701 - val_accuracy: 0.6539 - val_loss: 0.8969
Epoch 3/10
[1m6881/6881[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 7ms/step - accuracy: 0.6847 - loss: 0.8006 - val_accuracy: 0.6244 - val_loss: 0.9660
Epoch 4/10
[1m6881/6881[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 6ms/step - accuracy: 0.6991 - loss: 0.7652 - val_accuracy: 0.6380 - val_loss: 0.9505
Epoch 5/10
[1m6881/6881[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 7ms/step - accuracy: 0.7098 - loss: 0.7429 - val_accuracy: 0.6494 - val_loss: 0.9218
Epoch 6/10
[1m6881/6881[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 7ms/step - accuracy: 0.7180 - loss: 0.7212 - val_accuracy: 0.6456 - val_loss: 0.9292
Epoch 7/10