In [None]:
import pandas as pd
import numpy as np

# Scikit-learn and imblearn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, IterativeImputer
from imblearn.over_sampling import SMOTE

# Ensemble Modeling
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

# Metrics
from sklearn.metrics import accuracy_score, classification_report

# ----------------------------
# Helper Functions
# ----------------------------

def convert_age_to_days(age_str):
    """Convert an age string such as ``"2 years"`` to an approximate day count.

    The string is expected to look like ``"<integer> <unit>"`` where the unit
    contains one of ``day``, ``week``, ``month`` or ``year`` (singular or
    plural, any letter case). Months count as 30 days and years as 365 days.

    Args:
        age_str: The raw age value; may be missing (``None``/NaN).

    Returns:
        The age in days as an ``int``, or ``np.nan`` when the value is
        missing, is not a string, cannot be parsed, or uses an unknown unit.
    """
    if pd.isnull(age_str):
        return np.nan

    try:
        parts = age_str.split()
        num = int(parts[0])
        # Normalise case so "Years" / "WEEKS" are recognised as well.
        unit = parts[1].lower()
    except (AttributeError, IndexError, ValueError):
        # AttributeError: not a string; IndexError: fewer than two tokens;
        # ValueError: the first token is not an integer.
        return np.nan

    days_per_unit = (("day", 1), ("week", 7), ("month", 30), ("year", 365))
    for keyword, factor in days_per_unit:
        if keyword in unit:
            return num * factor
    return np.nan

def bucket_age(days):
    """Map an age in days onto a coarse life-stage label.

    Missing values yield ``'Unknown'``. Otherwise the first band whose
    exclusive upper bound exceeds ``days`` wins: under 180 is ``'Baby'``,
    under 730 is ``'Young'``, under 2555 is ``'Adult'``; anything at or
    above 2555 is ``'Senior'``.
    """
    if pd.isnull(days):
        return 'Unknown'

    # (exclusive upper bound in days, label), in ascending order.
    age_bands = ((180, 'Baby'), (730, 'Young'), (2555, 'Adult'))
    for upper_bound, label in age_bands:
        if days < upper_bound:
            return label
    return 'Senior'

def is_in_austin_travis(location):
    """Return 1 if the location text mentions 'austin' or 'travis', else 0.

    The match is a case-insensitive substring test; a missing location
    yields 0.
    """
    if pd.isnull(location):
        return 0

    lowered = location.lower()
    return 1 if ('austin' in lowered or 'travis' in lowered) else 0

# ----------------------------
# Load and Preprocess Training Data
# ----------------------------

# Read the full training set from disk.
df_train = pd.read_csv('train.csv')

# Age features: parse the free-text intake age into days, then bucket it.
df_train = df_train.assign(
    AgeInDays=lambda frame: frame['Age upon Intake'].apply(convert_age_to_days),
    AgeGroup=lambda frame: frame['AgeInDays'].apply(bucket_age),
)

# Calendar features: unparseable timestamps become NaT (errors='coerce'),
# which in turn gives NaN for the derived hour/month/weekday.
df_train = df_train.assign(
    **{'Intake Time': pd.to_datetime(df_train['Intake Time'], errors='coerce')}
)
df_train = df_train.assign(
    IntakeHour=lambda frame: frame['Intake Time'].dt.hour,
    IntakeMonth=lambda frame: frame['Intake Time'].dt.month,
    IntakeWeekday=lambda frame: frame['Intake Time'].dt.dayofweek,
)

# Binary indicator features. If there is no 'Name' column at all, the
# all-missing fallback Series makes IsNamed 0 for every row.
df_train = df_train.assign(
    IsNamed=lambda frame: (
        frame.get('Name', pd.Series(index=frame.index)).notnull().astype(int)
    ),
    IsMixedBreed=lambda frame: (
        frame['Breed'].str.contains("Mix", case=False, na=False).astype(int)
    ),
    Found_In_Austin_Travis=lambda frame: (
        frame['Found Location'].apply(is_in_austin_travis)
    ),
)

# Rows without a label cannot be used for supervised training.
df_train = df_train.dropna(subset=['Outcome Type'])

# ----------------------------
# Define Features and Target
# ----------------------------

# Label column to predict.
target = 'Outcome Type'

# Model inputs: engineered columns first, then the raw categorical columns.
features = [
    'AgeGroup',
    'IntakeHour',
    'IntakeMonth',
    'IntakeWeekday',
    'IsNamed',
    'IsMixedBreed',
    'Found_In_Austin_Travis',
    'Sex upon Intake',
    'Animal Type',
    'Intake Condition',
    'Intake Type',
    'Breed',
]

X_full, y_full = df_train[features], df_train[target]

# Hold out 20% for validation; stratifying on the label keeps the class
# proportions the same in both splits.
X_train, X_val, y_train, y_val = train_test_split(
    X_full,
    y_full,
    test_size=0.2,
    stratify=y_full,
    random_state=42,
)

# ----------------------------
# Preprocessing Pipeline
# ----------------------------

# Columns that are one-hot encoded.
categorical_features = [
    'AgeGroup',
    'Sex upon Intake',
    'Animal Type',
    'Intake Condition',
    'Intake Type',
    'Breed',
]
# Columns that are median-imputed and standardised.
numerical_features = [
    'IntakeHour',
    'IntakeMonth',
    'IntakeWeekday',
    'IsNamed',
    'IsMixedBreed',
    'Found_In_Austin_Travis',
]

# Categoricals: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform() from failing on categories that
# were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerics: fill gaps with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features),
    ('num', numeric_transformer, numerical_features),
])

# Fit on the training split only; the validation split is just transformed
# so that no statistics leak from it into the preprocessing.
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)

# ----------------------------
# Handle Imbalance with SMOTE
# ----------------------------

# Oversample the training split only; the validation split keeps its
# original class distribution.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train_processed,
    y_train,
)

# ----------------------------
# Encode Target Labels
# ----------------------------

# Learn the label -> integer mapping from the resampled training labels,
# then apply that same mapping to both label vectors.
label_encoder = LabelEncoder()
label_encoder.fit(y_train_resampled)
y_train_resampled_encoded = label_encoder.transform(y_train_resampled)
y_val_encoded = label_encoder.transform(y_val)

# ----------------------------
# Stacking with 3 Random Forests
# ----------------------------

# Three random forests of increasing size and depth serve as base learners.
base_models = [
    ('rf1', RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)),
    ('rf2', RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=5, random_state=42)),
    ('rf3', RandomForestClassifier(n_estimators=300, max_depth=15, min_samples_split=10, random_state=42))
]

# Meta-learner that combines the base models' predictions. With the default
# max_iter=100 the lbfgs solver stopped at its iteration limit and emitted a
# ConvergenceWarning, so the cap is raised. Increase it further if the
# warning reappears.
meta_model = LogisticRegression(max_iter=1000, random_state=42)
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)

# Train on the SMOTE-balanced, integer-encoded training data.
stacking_model.fit(X_train_resampled, y_train_resampled_encoded)

# Evaluate on the untouched (non-resampled) validation split.
y_pred = stacking_model.predict(X_val_processed)

accuracy = accuracy_score(y_val_encoded, y_pred)
print(f'Validation Accuracy: {accuracy:.4f}')
# target_names maps the integer classes back to the original outcome labels.
print(classification_report(y_val_encoded, y_pred, target_names=label_encoder.classes_))

# ----------------------------
# Save Predictions to CSV
# ----------------------------

# Read the unlabeled test set.
df_test = pd.read_csv('test.csv')

# Rebuild the same engineered features as for training. Outcome-time and
# stay-duration features were previously sketched here but are not part of
# the feature list, so they are not computed.
df_test = df_test.assign(
    AgeInDays=lambda frame: frame['Age upon Intake'].apply(convert_age_to_days),
    AgeGroup=lambda frame: frame['AgeInDays'].apply(bucket_age),
)
df_test = df_test.assign(
    **{'Intake Time': pd.to_datetime(df_test['Intake Time'], errors='coerce')}
)
df_test = df_test.assign(
    IntakeHour=lambda frame: frame['Intake Time'].dt.hour,
    IntakeMonth=lambda frame: frame['Intake Time'].dt.month,
    IntakeWeekday=lambda frame: frame['Intake Time'].dt.dayofweek,
    IsNamed=lambda frame: (
        frame.get('Name', pd.Series(index=frame.index)).notnull().astype(int)
    ),
    IsMixedBreed=lambda frame: (
        frame['Breed'].str.contains("Mix", case=False, na=False).astype(int)
    ),
    Found_In_Austin_Travis=lambda frame: (
        frame['Found Location'].apply(is_in_austin_travis)
    ),
)

# Reuse the preprocessor fitted on the training split (transform only).
X_test = df_test[features]
X_test_processed = preprocessor.transform(X_test)

# Predict integer classes, then map them back to the outcome labels.
test_predictions_encoded = stacking_model.predict(X_test_processed)
predicted_outcomes = label_encoder.inverse_transform(test_predictions_encoded)

# Ids are the 1-based row positions of the test file.
predictions_df = pd.DataFrame(
    {
        'Id': df_test.index + 1,
        'Outcome Type': predicted_outcomes,
    }
)
predictions_df.to_csv('predictions.csv', index=False)
print("Predictions have been saved to 'predictions.csv'.")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation Accuracy: 0.5781
                 precision    recall  f1-score   support

       Adoption       0.71      0.61      0.66     11009
           Died       0.03      0.14      0.05       208
     Euthanasia       0.28      0.53      0.37       690
Return to Owner       0.49      0.70      0.58      3320
       Transfer       0.60      0.48      0.53      7005

       accuracy                           0.58     22232
      macro avg       0.42      0.49      0.44     22232
   weighted avg       0.62      0.58      0.59     22232



  df_test['Intake Time'] = pd.to_datetime(df_test['Intake Time'], errors='coerce')


Predictions have been saved to 'predictions.csv'.
