In [122]:
import pandas as pd
df = pd.read_excel('Dataset (2).xlsx')

In [123]:
# Filter out 'unknown' race instances
df = df[~df['Race'].isin(['Unknown'])]
del df['Gender']
print(df)

                     Age       Race  CatsInHouse  \
0       Less than 1 year     Birman            3   
1       Less than 1 year     Birman            1   
2             2-10 years   European            4   
3       Less than 1 year   European            1   
4              1-2 years     Birman            2   
...                  ...        ...          ...   
3138          2-10 years    Persian            1   
3139    Less than 1 year  MaineCoon            3   
3140  More than 10 years      Other            1   
3141    Less than 1 year     Bengal            1   
3142    Less than 1 year     Bengal            5   

                            HousingType       Zone  TimeOutside  \
0             Apartment without balcony      Urban            0   
1     Apartment with balcony or terrace      Urban            0   
2                House in a subdivision      Urban            0   
3                House in a subdivision      Rural            2   
4                 Individual house zone 

In [124]:
# Calculate the mean of numeric values
numeric_mask = df['NaturalAreasAbundance'] != 'Unknown'
mean_value = pd.to_numeric(df[numeric_mask]['NaturalAreasAbundance']).mean()

# Replace 'Unknown' with the mean value and round to nearest integer
df['NaturalAreasAbundance'] = df['NaturalAreasAbundance'].replace('Unknown', mean_value)
df['NaturalAreasAbundance'] = df['NaturalAreasAbundance'].astype(float).round().astype(int)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 3063 entries, 0 to 3142
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Age                    3063 non-null   object
 1   Race                   3063 non-null   object
 2   CatsInHouse            3063 non-null   int64 
 3   HousingType            3063 non-null   object
 4   Zone                   3063 non-null   object
 5   TimeOutside            3063 non-null   int64 
 6   TimeWithOwner          3063 non-null   int64 
 7   Shy                    3063 non-null   int64 
 8   Calm                   3063 non-null   int64 
 9   Skittish               3063 non-null   int64 
 10  Intelligent            3063 non-null   int64 
 11  Vigilant               3063 non-null   int64 
 12  Tenacious              3063 non-null   int64 
 13  Affectionate           3063 non-null   int64 
 14  Friendly               3063 non-null   int64 
 15  Loner                  306

In [125]:
from sklearn.model_selection import train_test_split
import pandas as pd

def create_stratified_split(df, stratify_col='Race', test_size=0.1, random_state=42):
    # Create the train/test split while maintaining the same proportions of Race
    train_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df[stratify_col]
    )
    
    # Verify the proportions
    print("\nRace proportions in original dataset:")
    print(df[stratify_col].value_counts(normalize=True))
    
    print("\nRace proportions in larger split (90%):")
    print(train_df[stratify_col].value_counts(normalize=True))
    
    print("\nRace proportions in smaller split (10%):")
    print(test_df[stratify_col].value_counts(normalize=True))
    
    return train_df, test_df

df, test_df = create_stratified_split(df)

# Print the sizes of the splits
print(f"\nSplit sizes:")
print(f"Larger split (90%): {len(df)} rows")
print(f"Smaller split (10%): {len(test_df)} rows")


Race proportions in original dataset:
Race
European            0.333660
NoBreed             0.157689
Bengal              0.078028
Ragdoll             0.070846
MaineCoon           0.064643
Birman              0.062684
Persian             0.062684
BritishShorthair    0.054195
Other               0.044074
Sphynx              0.024812
Siamese             0.018936
Chartreux           0.010121
TurkishAngora       0.009141
Savannah            0.008488
Name: proportion, dtype: float64

Race proportions in larger split (90%):
Race
European            0.333817
NoBreed             0.157837
Bengal              0.078012
Ragdoll             0.070755
MaineCoon           0.064586
Persian             0.062772
Birman              0.062772
BritishShorthair    0.054064
Other               0.044267
Sphynx              0.024673
Siamese             0.018868
Chartreux           0.010160
TurkishAngora       0.009071
Savannah            0.008345
Name: proportion, dtype: float64

Race proportions in smaller spl

In [126]:
import pandas as pd

def transform_dataset(df):
    """
    Transform the dataset by:
    1. Mapping age values to numeric
    2. One-hot encoding HousingType, Zone, and Race
    
    Args:
    df (pandas.DataFrame): Input DataFrame
    
    Returns:
    pandas.DataFrame: Transformed DataFrame
    """
    # Create a copy of the DataFrame
    df_transformed = df.copy()
    
    # Age mapping
    age_mapping = {
        'Less than 1 year': 0.5,
        '1-2 years': 1.5,
        '2-10 years': 6,
        'More than 10 years': 12
    }
    
    # Apply age mapping
    df_transformed['Age'] = df_transformed['Age'].map(age_mapping)
    
    # One-hot encode categorical variables
    categorical_columns = ['HousingType', 'Zone', 'Race']
    
    # Create one-hot encoded columns
    for column in categorical_columns:
        one_hot = pd.get_dummies(df_transformed[column], prefix=column)
        
        # Add one-hot encoded columns to the transformed DataFrame
        df_transformed = pd.concat([df_transformed, one_hot], axis=1)
        
        # Drop the original categorical column
        df_transformed = df_transformed.drop(column, axis=1)
    
    return df_transformed

# Example usage:
df = transform_dataset(df)
test_df = transform_dataset(test_df)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2756 entries, 1053 to 1321
Data columns (total 43 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Age                                            2756 non-null   float64
 1   CatsInHouse                                    2756 non-null   int64  
 2   TimeOutside                                    2756 non-null   int64  
 3   TimeWithOwner                                  2756 non-null   int64  
 4   Shy                                            2756 non-null   int64  
 5   Calm                                           2756 non-null   int64  
 6   Skittish                                       2756 non-null   int64  
 7   Intelligent                                    2756 non-null   int64  
 8   Vigilant                                       2756 non-null   int64  
 9   Tenacious                                      2756 no

In [127]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from collections import Counter
import random

def balance_cat_races(df, min_class_size=None, max_class_size=None):
    """
    Balance cat race classes using SMOTE for underrepresented classes and random undersampling
    for overrepresented classes.
    
    Parameters:
    df (pandas.DataFrame): Input dataframe with cat features and race columns
    min_class_size (int): Minimum size for each race class (default: None)
    max_class_size (int): Maximum size for each race class (default: None)
    
    Returns:
    pandas.DataFrame: Balanced dataframe
    dict: Class distribution before and after balancing
    """
    # Get race columns
    race_cols = [col for col in df.columns if col.startswith('Race_')]
    
    # Get features (all columns except race columns)
    feature_cols = [col for col in df.columns if col not in race_cols]
    
    # Create y label (convert one-hot encoded races to single column)
    y = np.argmax([df[col].values for col in race_cols], axis=0)
    X = df[feature_cols].copy()
    
    # Get initial class distribution
    initial_distribution = dict(Counter(y))
    
    # If min_class_size is not specified, use the median class size
    if min_class_size is None:
        min_class_size = int(np.median([count for count in initial_distribution.values()]))
    
    # If max_class_size is not specified, use the median class size * 2
    if max_class_size is None:
        max_class_size = min_class_size * 2
    
    # Apply SMOTE to classes below min_class_size
    small_classes = {k: v for k, v in initial_distribution.items() if v < min_class_size}
    if small_classes:
        sampling_strategy = {k: min_class_size for k in small_classes.keys()}
        smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
        X_resampled, y_resampled = smote.fit_resample(X, y)
    else:
        X_resampled, y_resampled = X.copy(), y.copy()
    
    # Random undersample classes above max_class_size
    indices_to_keep = []
    for class_label in np.unique(y_resampled):
        class_indices = np.where(y_resampled == class_label)[0]
        if len(class_indices) > max_class_size:
            selected_indices = random.sample(list(class_indices), max_class_size)
            indices_to_keep.extend(selected_indices)
        else:
            indices_to_keep.extend(class_indices)
    
    # Create final balanced dataset
    X_balanced = X_resampled.iloc[indices_to_keep].reset_index(drop=True)
    y_balanced = y_resampled[indices_to_keep]
    
    # Convert y back to one-hot encoding
    race_df = pd.DataFrame(0, index=range(len(y_balanced)), columns=race_cols)
    for i, label in enumerate(y_balanced):
        race_df.iloc[i, label] = 1
    
    # Combine features with balanced races
    final_df = pd.concat([X_balanced, race_df], axis=1)
    
    # Get final class distribution
    final_distribution = dict(Counter(y_balanced))
    
    class_changes = {
        'initial_distribution': initial_distribution,
        'final_distribution': final_distribution
    }
    
    return final_df, class_changes

balanced_df, distribution = balance_cat_races(
    df, 
    min_class_size=100,  
    max_class_size=200
)

# Print class distribution changes
print("Before:", distribution['initial_distribution'])
print("After:", distribution['final_distribution'])


Before: {2: 149, 8: 173, 4: 920, 7: 122, 6: 435, 9: 195, 0: 215, 5: 178, 3: 28, 10: 23, 1: 173, 11: 52, 13: 25, 12: 68}
After: {0: 200, 1: 173, 2: 149, 3: 100, 4: 200, 5: 178, 6: 200, 7: 122, 8: 173, 9: 195, 10: 100, 11: 100, 12: 100, 13: 100}


In [129]:
import tensorflow as tf
from tensorflow.keras import layers, models
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def prepare_data(df):
    # Separate features and labels
    # Get all race columns
    race_columns = [col for col in df.columns if col.startswith('Race_')]
    
    # Get all non-race columns except the race columns
    feature_columns = [col for col in df.columns if not col.startswith('Race_')]
    
    X = df[feature_columns].copy()
    y = df[race_columns].copy()
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Scale the numerical features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    return X_train, X_test, y_train, y_test, scaler

def create_model(input_shape, num_classes):
    model = models.Sequential([
        layers.Dense(128, activation='relu', input_shape=input_shape),
        layers.Dropout(0.3),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(64, activation='relu'),
        layers.Dense(num_classes, activation='sigmoid')
    ])
    
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

def train_model(df):
    # Prepare the data
    X_train, X_test, y_train, y_test, scaler = prepare_data(df)
    
    # Create the model
    input_shape = (X_train.shape[1],)
    num_classes = y_train.shape[1]
    model = create_model(input_shape, num_classes)
    
    history = model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=32,
        validation_split=0.2,
        verbose=2
    )
    
    # Evaluate the model
    test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"\nTest accuracy: {test_accuracy:.4f}")
    
    return model, scaler, history

def predict_breed(model, scaler, input_data):
    # Scale the input data
    scaled_input = scaler.transform(input_data)
    
    # Make predictions
    predictions = model.predict(scaled_input)
    
    return predictions

# Example usage:
# Load your data into a pandas DataFrame called 'df'
model, scaler, history = train_model(df)



Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


56/56 - 3s - 49ms/step - accuracy: 0.2377 - loss: 0.3480 - val_accuracy: 0.3537 - val_loss: 0.2164
Epoch 2/50
56/56 - 0s - 3ms/step - accuracy: 0.3193 - loss: 0.2241 - val_accuracy: 0.3560 - val_loss: 0.2115
Epoch 3/50
56/56 - 0s - 3ms/step - accuracy: 0.3279 - loss: 0.2175 - val_accuracy: 0.3628 - val_loss: 0.2078
Epoch 4/50
56/56 - 0s - 3ms/step - accuracy: 0.3443 - loss: 0.2119 - val_accuracy: 0.3537 - val_loss: 0.2067
Epoch 5/50
56/56 - 0s - 4ms/step - accuracy: 0.3415 - loss: 0.2088 - val_accuracy: 0.3605 - val_loss: 0.2076
Epoch 6/50
56/56 - 0s - 3ms/step - accuracy: 0.3539 - loss: 0.2078 - val_accuracy: 0.3537 - val_loss: 0.2076
Epoch 7/50
56/56 - 0s - 3ms/step - accuracy: 0.3528 - loss: 0.2059 - val_accuracy: 0.3560 - val_loss: 0.2059
Epoch 8/50
56/56 - 0s - 3ms/step - accuracy: 0.3596 - loss: 0.2035 - val_accuracy: 0.3605 - val_loss: 0.2067
Epoch 9/50
56/56 - 0s - 3ms/step - accuracy: 0.3613 - loss: 0.2025 - val_accuracy: 0.3605 - val_loss: 0.2070
Epoch 10/50
56/56 - 0s - 3ms/