### 1. Data preprocessing

In [150]:
import pandas as pd
import sys
import os

def create_dataframe(path):
    """Read the Excel workbook located at ``path`` and return it as a DataFrame."""
    return pd.read_excel(path)

def delete_duplicates(df):
    """Remove fully duplicated rows from ``df`` and return that same object.

    The frame is modified in place (the first occurrence of each duplicated
    row is kept), so the caller's DataFrame changes as well.
    """
    df.drop_duplicates(keep='first', inplace=True)
    return df

def remove_unknown_gender_instances(df):
    """Return only the rows of ``df`` whose 'Gender' value differs from 'Unknown'."""
    gender_is_known = df['Gender'] != 'Unknown'
    return df[gender_is_known]

def remove_unknown_race_instances(df):
    """Return only the rows of ``df`` whose 'Race' value differs from 'Unknown'."""
    race_is_known = df['Race'] != 'Unknown'
    return df[race_is_known]

def replace_unknown_with_median(x):
    """Replace every "Unknown" entry of the Series ``x`` with the median of the rest.

    The remaining entries are converted to numeric before the median is taken.
    """
    known_values = x[x != "Unknown"]
    fill_value = pd.to_numeric(known_values).median()
    return x.replace("Unknown", fill_value)

# Per-breed imputation of the 'The abundance of natural areas' column
def edit_unknown_values_for_natural_area(df):
    """Fill "Unknown" natural-area values with the median of the same 'Race' group.

    The column is overwritten on ``df`` itself, then cast to ``int``; the same
    DataFrame object is returned.
    """
    column = "The abundance of natural areas"

    # Each 'Race' group is handed to replace_unknown_with_median as one Series
    imputed = df.groupby("Race")[column].transform(replace_unknown_with_median)
    df[column] = imputed

    # Cast to int once no "Unknown" strings remain
    df[column] = df[column].astype(int)
    return df

def encode_age(df):
    """Return a copy of ``df`` whose 'Age' labels are replaced by numeric values.

    Labels missing from the mapping become NaN (the behaviour of ``Series.map``).
    The input frame is left untouched.
    """
    numeric_age_by_label = {
        'Less than 1 year': 0.5,
        '1-2 years': 1.5,
        '2-10 years': 6,
        'More than 10 years': 12
    }
    return df.assign(Age=df['Age'].map(numeric_age_by_label))

def encode_categorical_columns(df, categorical_columns):
    """One-hot encode each column named in ``categorical_columns``.

    Columns are processed in the given order: the indicator columns (prefixed
    with the source column name) are appended on the right and the source
    column is removed. The input frame is not modified.
    """
    encoded = df
    for name in categorical_columns:
        indicators = pd.get_dummies(encoded[name], prefix=name)
        encoded = pd.concat([encoded.drop(columns=name), indicators], axis=1)
    return encoded


def preprocessing(path='./Dataset/Dataset.xlsx'):
    """Load the raw workbook and run the cleaning steps in order.

    Parameters:
    path: location of the Excel file to load; defaults to the path that was
          previously hard-coded, so existing ``preprocessing()`` calls behave
          as before.

    Returns:
    pandas.DataFrame with duplicates removed, rows with unknown 'Gender' or
    'Race' dropped, "Unknown" natural-area values imputed per breed, and 'Age'
    mapped to numeric values.
    """
    # Load the dataset
    df = create_dataframe(path)

    # Drop duplicated rows
    df = delete_duplicates(df)

    # Drop rows where Gender == 'Unknown'
    df = remove_unknown_gender_instances(df)

    # Drop rows where Race == 'Unknown'
    df = remove_unknown_race_instances(df)

    # Replace 'Unknown' in 'The abundance of natural areas' with the per-breed median
    df = edit_unknown_values_for_natural_area(df)

    # Encode the 'Age' column numerically
    df = encode_age(df)

    return df

# Build the cleaned DataFrame and print its column summary.
df = preprocessing()
df.info()



<class 'pandas.core.frame.DataFrame'>
Index: 3055 entries, 0 to 3142
Data columns (total 26 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Gender                              3055 non-null   object 
 1   Age                                 3055 non-null   float64
 2   Race                                3055 non-null   object 
 3   Number of cats in the household     3055 non-null   int64  
 4   Type of housing                     3055 non-null   object 
 5   Zone                                3055 non-null   object 
 6   Time spent outside each day         3055 non-null   int64  
 7   Time spent with the owner each day  3055 non-null   int64  
 8   Shy                                 3055 non-null   int64  
 9   Calm                                3055 non-null   int64  
 10  Skittish                            3055 non-null   int64  
 11  Intelligent                         3055 non-nul

### 3. Training the model

> Antrenarea pe setul de date cu modificări minime.

In [151]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import pickle


class Dropout:
    """Dropout that rescales the surviving activations during training.

    In training mode each element is kept with probability
    ``1 - dropout_rate`` and the kept elements are divided by that same
    probability. In evaluation mode the input passes through unchanged.
    """

    def __init__(self, dropout_rate):
        # A rate of 1 (or more) would divide by zero when building the mask.
        if not 0 <= dropout_rate < 1:
            raise ValueError(f"dropout_rate must be in [0, 1), got {dropout_rate}")
        self.dropout_rate = dropout_rate
        self.mask = None

    def forward(self, input_data, training=True):
        """Apply dropout to ``input_data`` when ``training`` is true."""
        if training:
            keep_probability = 1 - self.dropout_rate
            self.mask = np.random.binomial(1, keep_probability, input_data.shape) / keep_probability
            return input_data * self.mask
        # Forget any mask from an earlier training pass so that backward()
        # cannot apply a stale mask to this evaluation pass.
        self.mask = None
        return input_data

    def backward(self, grad_output):
        """Propagate ``grad_output`` through the mask of the latest forward pass."""
        if self.mask is None:
            # No mask was applied (evaluation mode), so the gradient is unchanged.
            return grad_output
        return grad_output * self.mask

class Layer:
    """Dense layer with a ReLU or softmax activation, L2 weight penalty and dropout.

    Dropout is applied to the output of ReLU layers only; softmax layers skip it.
    """

    SUPPORTED_ACTIVATIONS = ('relu', 'softmax')

    def __init__(self, input_size, output_size, activation='relu', l2_lambda=0.005, dropout_rate=0.2):
        # Fail early: forward()/backward() only implement these two activations,
        # and any other value would otherwise surface later as an unrelated error.
        if activation not in self.SUPPORTED_ACTIVATIONS:
            raise ValueError(
                f"Unsupported activation {activation!r}; expected one of {self.SUPPORTED_ACTIVATIONS}"
            )
        # Standard-normal weights scaled by sqrt(2 / input_size); zero biases
        self.weights = np.random.randn(input_size, output_size) * np.sqrt(2.0/input_size)
        self.bias = np.zeros((1, output_size))
        self.activation = activation
        self.l2_lambda = l2_lambda
        self.dropout = Dropout(dropout_rate)
        # Values cached by forward() for use in backward()
        self.input = None
        self.output = None
        self.z = None

    def get_weights(self):
        """Return copies of the weight matrix and bias vector in a dict."""
        return {
            'weights': self.weights.copy(),
            'bias': self.bias.copy()
        }

    def set_weights(self, weights):
        """Load copies of ``weights['weights']`` and ``weights['bias']``."""
        self.weights = weights['weights'].copy()
        self.bias = weights['bias'].copy()

    def forward(self, input_data, training=True):
        """Compute the layer output for ``input_data`` and cache the intermediates."""
        self.input = input_data
        self.z = np.dot(input_data, self.weights) + self.bias

        if self.activation == 'relu':
            self.output = np.maximum(0, self.z)
        elif self.activation == 'softmax':
            # Subtract the row maximum before exponentiating for numerical stability
            exp_values = np.exp(self.z - np.max(self.z, axis=1, keepdims=True))
            self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)

        if self.activation != 'softmax':
            self.output = self.dropout.forward(self.output, training)

        return self.output

    def backward(self, grad_output, learning_rate):
        """Update the parameters from ``grad_output`` and return the input gradient.

        For a softmax layer ``grad_output`` is used directly as the gradient of
        the pre-activation (NeuralNetwork.backward passes ``output - y``).
        """
        if self.activation != 'softmax':
            grad_output = self.dropout.backward(grad_output)

        if self.activation == 'relu':
            grad_z = grad_output * (self.z > 0)
        elif self.activation == 'softmax':
            grad_z = grad_output

        grad_weights = np.dot(self.input.T, grad_z) + self.l2_lambda * self.weights
        grad_bias = np.sum(grad_z, axis=0, keepdims=True)
        # Computed before the update so it uses the weights of the forward pass
        grad_input = np.dot(grad_z, self.weights.T)

        self.weights -= learning_rate * grad_weights
        self.bias -= learning_rate * grad_bias

        return grad_input

class NeuralNetwork:
    """Three-layer classifier: two ReLU hidden layers (128, 64) and a softmax output."""

    def __init__(self, input_dim, output_dim, l2_lambda=0.005):
        first_hidden = Layer(input_dim, 128, 'relu', l2_lambda, dropout_rate=0.1)
        second_hidden = Layer(128, 64, 'relu', l2_lambda, dropout_rate=0.1)
        output_layer = Layer(64, output_dim, 'softmax', l2_lambda, dropout_rate=0.0)
        self.layers = [first_hidden, second_hidden, output_layer]

    def get_weights(self):
        """Collect a snapshot of every layer's parameters, in layer order."""
        snapshot = []
        for layer in self.layers:
            snapshot.append(layer.get_weights())
        return snapshot

    def set_weights(self, weights):
        """Load one parameter snapshot per layer, pairing them in order."""
        for layer, layer_weights in zip(self.layers, weights):
            layer.set_weights(layer_weights)

    def forward(self, X, training=True):
        """Feed ``X`` through every layer and return the last layer's output."""
        activations = X
        for layer in self.layers:
            activations = layer.forward(activations, training)
        return activations

    def backward(self, X, y, learning_rate):
        """Back-propagate from the cached output of the last forward pass.

        The starting gradient is ``output - y``; each layer updates its own
        parameters while the gradient travels from the last layer to the first.
        """
        gradient = self.layers[-1].output - y
        for layer in self.layers[::-1]:
            gradient = layer.backward(gradient, learning_rate)

    def train_step(self, X_batch, y_batch, learning_rate):
        """Run one forward/backward pass; return (loss, predictions) from before the update."""
        batch_predictions = self.forward(X_batch, training=True)
        self.backward(X_batch, y_batch, learning_rate)
        batch_loss = self.compute_loss(batch_predictions, y_batch)
        return batch_loss, batch_predictions

    def predict(self, X):
        """Forward pass in evaluation mode (``training=False``)."""
        return self.forward(X, training=False)

    def compute_loss(self, predictions, y_true):
        """Mean categorical cross-entropy between ``predictions`` and one-hot ``y_true``."""
        tiny = 1e-15
        # Keep probabilities away from 0 and 1 so the logarithm stays finite
        safe_predictions = np.clip(predictions, tiny, 1 - tiny)
        per_sample = np.sum(y_true * np.log(safe_predictions), axis=1)
        return -np.mean(per_sample)

    def compute_accuracy(self, predictions, y_true):
        """Fraction of rows whose arg-max prediction matches the arg-max of ``y_true``."""
        predicted_classes = np.argmax(predictions, axis=1)
        true_classes = np.argmax(y_true, axis=1)
        return np.mean(predicted_classes == true_classes)

def plot_training_history(histories, n_splits):
    """Draw per-fold training loss (top) and validation accuracy (bottom).

    ``histories`` is indexed by fold position for ``n_splits`` folds; each entry
    provides the 'train_loss' and 'val_acc' sequences to plot.
    """
    plt.figure(figsize=(15, 10))

    # (subplot position, history key, panel title, y-axis label)
    panels = (
        (1, 'train_loss', 'Training Loss per Fold', 'Loss'),
        (2, 'val_acc', 'Validation Accuracy per Fold', 'Accuracy'),
    )

    for position, history_key, panel_title, y_label in panels:
        plt.subplot(2, 1, position)
        for fold in range(n_splits):
            plt.plot(histories[fold][history_key], label=f'Fold {fold+1}')
        plt.title(panel_title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.show()
    
def train_and_evaluate_model(X, y, n_splits=6, epochs=1000, batch_size=32, learning_rate=0.0001, patience=100):
    """Train one NeuralNetwork per stratified fold and return the best one.

    Parameters:
    X: pandas DataFrame of features (rows are selected with ``.iloc``)
    y: pandas DataFrame of one-hot encoded targets (read through ``.values``)
    n_splits: number of stratified folds
    epochs: maximum number of epochs per fold
    batch_size: mini-batch size
    learning_rate: step size passed to every training step
    patience: epochs without validation-accuracy improvement before stopping

    Returns:
    (best_model, fold_results, histories, scaler) where ``scaler`` is the
    StandardScaler fitted on the training split of the fold that produced
    ``best_model``, so the pair can be saved and reused together. Each entry of
    ``fold_results`` also carries its own fold's scaler under 'scaler'.
    """
    y_encoded = y.values
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    fold_results = []
    histories = []

    n_classes = y_encoded.shape[1]
    # Stratification needs class indices rather than one-hot rows
    class_labels = np.argmax(y_encoded, axis=1)

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, class_labels), 1):
        print(f"\nFold {fold}/{n_splits}")

        X_train, X_val = X.iloc[train_idx].values, X.iloc[val_idx].values
        y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]

        # One scaler per fold, fitted on that fold's training split only, so it
        # stays paired with the model trained on the data it scaled.
        fold_scaler = StandardScaler()
        X_train_scaled = fold_scaler.fit_transform(X_train)
        X_val_scaled = fold_scaler.transform(X_val)

        model = NeuralNetwork(X_train.shape[1], n_classes, 0.001)

        history = {
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': []
        }

        best_val_accuracy = 0
        best_weights = None
        best_epoch = 0
        patience_counter = 0

        for epoch in range(epochs):
            # Reshuffle the training split at the start of every epoch
            indices = np.random.permutation(len(X_train_scaled))
            X_train_shuffled = X_train_scaled[indices]
            y_train_shuffled = y_train[indices]

            epoch_train_loss = 0
            epoch_train_acc = 0
            n_batches = 0

            for i in range(0, len(X_train_scaled), batch_size):
                X_batch = X_train_shuffled[i:i+batch_size]
                y_batch = y_train_shuffled[i:i+batch_size]

                loss, predictions = model.train_step(X_batch, y_batch, learning_rate)
                accuracy = model.compute_accuracy(predictions, y_batch)

                epoch_train_loss += loss
                epoch_train_acc += accuracy
                n_batches += 1

            avg_train_loss = epoch_train_loss / n_batches
            avg_train_acc = epoch_train_acc / n_batches

            val_predictions = model.predict(X_val_scaled)
            val_loss = model.compute_loss(val_predictions, y_val)
            val_accuracy = model.compute_accuracy(val_predictions, y_val)

            history['train_loss'].append(avg_train_loss)
            history['train_acc'].append(avg_train_acc)
            history['val_loss'].append(val_loss)
            history['val_acc'].append(val_accuracy)

            # Early stopping logic
            if val_accuracy > best_val_accuracy:
                best_val_accuracy = val_accuracy
                best_weights = model.get_weights()
                best_epoch = epoch
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= patience:
                print(f"\nEarly stopping triggered at epoch {epoch + 1}")
                print(f"No improvement in validation accuracy for {patience} epochs")
                break

            if (epoch + 1) % 5 == 0:
                print(f"Epoch {epoch + 1}/{epochs}")
                print(f"Train Loss: {avg_train_loss:.4f}, Train Acc: {avg_train_acc:.4f}")
                print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

        # Restore best weights for this fold. No snapshot exists when the
        # validation accuracy never rose above 0 (or epochs == 0); the model
        # then keeps its current weights.
        if best_weights is not None:
            model.set_weights(best_weights)
        histories.append(history)

        fold_results.append({
            'fold': fold,
            'best_val_accuracy': best_val_accuracy,
            'best_epoch': best_epoch,
            'model': model,
            'scaler': fold_scaler
        })

        print(f"\nFold {fold} Best Results:")
        print(f"Best Epoch: {best_epoch}")
        print(f"Best Validation Accuracy: {best_val_accuracy:.4f}")

    # Find the best overall model
    best_fold = max(fold_results, key=lambda x: x['best_val_accuracy'])

    print("\nOverall Cross-validation results:")
    print(f"Mean validation accuracy: {np.mean([r['best_val_accuracy'] for r in fold_results]):.4f}")
    print(f"Standard deviation: {np.std([r['best_val_accuracy'] for r in fold_results]):.4f}")
    print(f"\nBest model from fold {best_fold['fold']} at epoch {best_fold['best_epoch']}")
    print(f"Best validation accuracy: {best_fold['best_val_accuracy']:.4f}")

    # Plot training history
    plot_training_history(histories, n_splits)

    return best_fold['model'], fold_results, histories, best_fold['scaler']

def save_model(model, scaler, filepath='cat_breed_model.pkl'):
    """Pickle the model's weights together with the scaler into ``filepath``."""
    payload = dict(weights=model.get_weights(), scaler=scaler)
    with open(filepath, 'wb') as handle:
        pickle.dump(payload, handle)

def prepare_data(data):
    """Split ``data`` into features and targets.

    Targets are the columns whose name starts with 'Race_'; every other column
    is a feature. Returns the pair ``(X, y)``.
    """
    target_names = [name for name in data.columns if name.startswith('Race_')]
    features = data.drop(columns=target_names)
    targets = data[target_names]
    return features, targets

def train(data):
    """Train on ``data``, save the best model with its scaler, and return the training results.

    Returns the same four values as train_and_evaluate_model:
    (best_model, fold_results, histories, scaler).
    """
    features, targets = prepare_data(data)
    outcome = train_and_evaluate_model(features, targets)
    best_model, fold_results, histories, scaler = outcome
    save_model(best_model, scaler)
    return best_model, fold_results, histories, scaler


In [152]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def test_model(best_model, test_df, scaler=None):
    """
    Test the trained model on a new test dataset.

    Parameters:
    best_model: trained model exposing ``predict(X)`` that returns class probabilities
    test_df: pandas DataFrame containing the test data, with one-hot target
             columns named 'Race_*'
    scaler: optional scaler already fitted on the training data. When given,
            the test features are only transformed with it. When omitted, a
            new StandardScaler is fitted on the test features themselves
            (the previous behaviour), which does not reproduce the scaling the
            model was trained with.

    Returns:
    dict containing test accuracy, predictions, and evaluation metrics
    """
    # Prepare test data
    race_columns = [col for col in test_df.columns if col.startswith('Race_')]
    X_test = test_df.drop(columns=race_columns)
    y_test = test_df[race_columns]

    # Scale features
    if scaler is None:
        X_test_scaled = StandardScaler().fit_transform(X_test)
    else:
        # Pass a plain array, matching how the training code feeds its scaler
        X_test_scaled = scaler.transform(X_test.values)

    # Make predictions
    y_pred_proba = best_model.predict(X_test_scaled)
    y_pred = np.argmax(y_pred_proba, axis=1)
    y_test_classes = np.argmax(y_test.values, axis=1)

    # Calculate accuracy
    test_accuracy = np.mean(y_pred == y_test_classes)

    # Generate classification report. Passing every class index keeps the
    # report aligned with class_names even when a class is absent from the
    # test set and from the predictions.
    class_names = [col.replace('Race_', '') for col in race_columns]
    class_indices = list(range(len(class_names)))
    report = classification_report(
        y_test_classes, y_pred, labels=class_indices, target_names=class_names
    )

    # Generate confusion matrix with one row/column per class so it matches the tick labels
    cm = confusion_matrix(y_test_classes, y_pred, labels=class_indices)

    # Plot confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.tight_layout()

    # Calculate per-class metrics
    per_class_metrics = {}
    for i, class_name in enumerate(class_names):
        true_class = (y_test_classes == i)
        pred_class = (y_pred == i)

        true_positives = np.sum(true_class & pred_class)
        false_positives = np.sum(~true_class & pred_class)
        false_negatives = np.sum(true_class & ~pred_class)

        # Each ratio falls back to 0 when its denominator is 0
        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        per_class_metrics[class_name] = {
            'precision': precision,
            'recall': recall,
            'f1_score': f1
        }

    return {
        'test_accuracy': test_accuracy,
        'classification_report': report,
        'confusion_matrix': cm,
        'per_class_metrics': per_class_metrics,
        'predictions': y_pred,
        'prediction_probabilities': y_pred_proba
    }

# Example usage
def evaluate_test_results(test_df, best_model):
    """
    Run test_model on ``test_df`` and print its metrics as a readable summary.

    Returns the results dict produced by test_model.
    """
    results = test_model(best_model, test_df)
    overall_accuracy = results['test_accuracy']
    per_class = results['per_class_metrics']

    print("\n=== Model Evaluation on Test Data ===")
    print(f"\nOverall Test Accuracy: {overall_accuracy:.4f}")

    print("\nClassification Report:")
    print(results['classification_report'])

    print("\nPer-Class Metrics:")
    for class_name in per_class:
        print(f"\n{class_name}:")
        class_metrics = per_class[class_name]
        for metric_name in class_metrics:
            value = class_metrics[metric_name]
            print(f"  {metric_name}: {value:.4f}")

    return results

In [153]:
# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3055 entries, 0 to 3142
Data columns (total 26 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Gender                              3055 non-null   object 
 1   Age                                 3055 non-null   float64
 2   Race                                3055 non-null   object 
 3   Number of cats in the household     3055 non-null   int64  
 4   Type of housing                     3055 non-null   object 
 5   Zone                                3055 non-null   object 
 6   Time spent outside each day         3055 non-null   int64  
 7   Time spent with the owner each day  3055 non-null   int64  
 8   Shy                                 3055 non-null   int64  
 9   Calm                                3055 non-null   int64  
 10  Skittish                            3055 non-null   int64  
 11  Intelligent                         3055 non-nul

In [154]:
import numpy as np

# Possible coat lengths for each breed; assign_coat_length draws one of the
# listed values at random for a given breed.
hair_length_mapping = {
    'Sphynx': ['No hair'],
    'Siamese': ['Short hair', 'Medium hair'],
    'British Shorthair': ['Short hair'],
    'Bengal': ['Short hair'],
    'Chartreux': ['Short hair'],
    'Savannah': ['Short hair'],
    'European': ['Short hair', 'Medium hair'],
    'Birman': ['Medium hair', 'Long hair'],
    'Turkish angora': ['Medium hair', 'Long hair'],
    'Persian': ['Long hair'],
    'Maine coon': ['Long hair'],
    'Ragdoll': ['Medium hair', 'Long hair'],
    'No breed': ['Short hair', 'Medium hair'],
    'Other': ['Short hair', 'Medium hair']
}

# Random coat-length selection for a given breed
def assign_coat_length(race):
    """Return a random coat length listed for ``race``, or "Unknown" for an unlisted breed."""
    options = hair_length_mapping.get(race)
    if options is None:
        # Breed has no entry in hair_length_mapping
        return "Unknown"
    return np.random.choice(options)

# Apply the function to every row: one independent random draw per cat.
# The draws use NumPy's global random state, which this cell does not seed.
df['Coat Length'] = df['Race'].apply(assign_coat_length)

In [155]:
# Coat patterns listed for each breed
coat_patterns = {
    "European": ["Tabby", "Solid", "Bicolor", "Tortoiseshell"],
    "Bengal": ["Tabby"],
    "Ragdoll": ["Colorpoint"],
    "Maine coon": ["Tabby", "Bicolor"],
    "Birman": ["Colorpoint"],
    "Persian": ["Colorpoint"],
    "British Shorthair": ["Solid", "Tabby", "Tortoiseshell"],
    "Sphynx": ["Solid", "Tabby"],
    "Siamese": ["Colorpoint"],
    "Chartreux": ["Solid"],
    "Turkish angora": ["Solid", "Tabby"],
    "Savannah": ["Tabby"],
    "No breed": ["Tabby", "Solid", "Bicolor", "Tricolor", "Tortoiseshell"],
    "Other": ["Tabby", "Solid", "Bicolor"]
}

# Add the feature by drawing uniformly from the breed's list.
# Unlike assign_coat_length, a breed missing from coat_patterns raises KeyError here.
df["Coat Pattern"] = df["Race"].apply(
    lambda breed: np.random.choice(coat_patterns[breed])  # randomly assign one pattern from the matching list
)

print(df)



     Gender   Age        Race  Number of cats in the household  \
0         F   0.5      Birman                                3   
1         F   0.5      Birman                                1   
2         F   6.0    European                                4   
3         F   0.5    European                                1   
4         F   1.5      Birman                                2   
...     ...   ...         ...                              ...   
3138      F   6.0     Persian                                1   
3139      F   0.5  Maine coon                                3   
3140      M  12.0       Other                                1   
3141      M   0.5      Bengal                                1   
3142      F   0.5      Bengal                                5   

                        Type of housing       Zone  \
0             Apartment without balcony      Urban   
1     Apartment with balcony or terrace      Urban   
2                House in a subdivision      

In [156]:
# One-hot encode categorical variables
categorical_columns = ['Type of housing', 'Zone', 'Race', 'Gender', 'Coat Length', 'Coat Pattern']
df = encode_categorical_columns(df, categorical_columns)

# Add FlatFace attribute (True for Persian cats)
# NOTE(review): FlatFace is an exact copy of the target column 'Race_Persian'.
# prepare_data only removes columns starting with 'Race_', so FlatFace reaches
# the model as an input feature — confirm this is intended.
df['FlatFace'] = df['Race_Persian']

# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3055 entries, 0 to 3142
Data columns (total 56 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   Age                                                3055 non-null   float64
 1   Number of cats in the household                    3055 non-null   int64  
 2   Time spent outside each day                        3055 non-null   int64  
 3   Time spent with the owner each day                 3055 non-null   int64  
 4   Shy                                                3055 non-null   int64  
 5   Calm                                               3055 non-null   int64  
 6   Skittish                                           3055 non-null   int64  
 7   Intelligent                                        3055 non-null   int64  
 8   Vigilant                                           3055 non-null   int64  
 9   Tenacious    

In [157]:
import pandas as pd
import numpy as np

def remove_race_outliers(df, percentage=5):
    """
    Remove outliers for each cat race based on numerical columns.
    Shows distribution of races before and after outlier removal.

    Within each race, every row gets the mean of its absolute z-scores over
    the int64/float64 columns that vary inside that race. Rows whose score
    exceeds the ``1 - percentage/100`` quantile of the race's scores are
    dropped. Races with no rows or no varying numerical column are left
    untouched.

    Parameters:
    df (pandas.DataFrame): Input dataframe with one-hot 'Race_*' columns
                           (boolean, or 0/1 integers)
    percentage (float): Percentage of outliers to remove (default 5%)

    Returns:
    pandas.DataFrame: Copy of the input with the outlier rows removed
    """
    # Get list of race columns and numerical columns
    race_columns = [col for col in df.columns if col.startswith('Race_')]
    numerical_cols = [
        col for col in df.select_dtypes(include=['int64', 'float64']).columns
        if not col.startswith('Race_')
    ]
    total_rows = len(df)

    # Calculate initial race distribution
    print("\nInitial Race Distribution:")
    initial_dist = {}
    for race_col in race_columns:
        race_name = race_col.replace('Race_', '')
        race_count = int(df[race_col].sum())
        race_pct = (race_count / total_rows) * 100 if total_rows else 0.0
        initial_dist[race_name] = (race_count, race_pct)
        print(f"{race_name:15} {race_count:5d} cats ({race_pct:5.1f}%)")

    # Positional keep-mask: independent of the index labels of df
    keep_mask = np.ones(total_rows, dtype=bool)

    for race_col in race_columns:
        # Cast to bool so 0/1 integer dummies select rows instead of being
        # interpreted as column labels
        race_mask = df[race_col].to_numpy().astype(bool)
        race_data = df.loc[race_mask, numerical_cols]

        if len(race_data) == 0:
            continue

        # Only columns with some variation inside this race yield a usable z-score
        varying_cols = [col for col in numerical_cols if race_data[col].nunique() > 1]
        if not varying_cols:
            continue

        values = race_data[varying_cols]
        z_scores = ((values - values.mean()) / values.std()).abs()

        # Mean outlier score across all columns, then the cut-off for this race
        mean_outlier_score = z_scores.mean(axis=1)
        threshold = mean_outlier_score.quantile(1 - percentage/100)

        keep_mask[race_mask] = (mean_outlier_score <= threshold).to_numpy()

    # Apply final mask to remove outliers
    cleaned_df = df[keep_mask].copy()
    cleaned_rows = len(cleaned_df)

    # Print summary of removed records
    removed_count = total_rows - cleaned_rows
    removed_pct = (removed_count / total_rows * 100) if total_rows else 0.0
    print(f"\nRemoved {removed_count} records ({removed_pct:.1f}% of total)")

    # Calculate final race distribution
    print("\nFinal Race Distribution:")
    print("Race            Initial Count (%)    Final Count (%)    Change")
    print("-" * 65)
    for race_col in race_columns:
        race_name = race_col.replace('Race_', '')
        final_count = int(cleaned_df[race_col].sum())
        final_pct = (final_count / cleaned_rows) * 100 if cleaned_rows else 0.0
        initial_count, initial_pct = initial_dist[race_name]
        change_pct = final_pct - initial_pct
        print(f"{race_name:15} {initial_count:5d} ({initial_pct:5.1f}%)    {final_count:5d} ({final_pct:5.1f}%)    {change_pct:+5.1f}%")

    return cleaned_df

# Drop the top 5% highest-scoring rows within each race; `df` is rebound to the
# filtered copy, so re-running this cell filters the already-filtered frame again.
df = remove_race_outliers(df, percentage=5)


Initial Race Distribution:
Bengal            238 cats (  7.8%)
Birman            192 cats (  6.3%)
British Shorthair   166 cats (  5.4%)
Chartreux          31 cats (  1.0%)
European         1018 cats ( 33.3%)
Maine coon        197 cats (  6.4%)
No breed          482 cats ( 15.8%)
Other             135 cats (  4.4%)
Persian           192 cats (  6.3%)
Ragdoll           216 cats (  7.1%)
Savannah           26 cats (  0.9%)
Siamese            58 cats (  1.9%)
Sphynx             76 cats (  2.5%)
Turkish angora     28 cats (  0.9%)

Removed 158 records (5.2% of total)

Final Race Distribution:
Race            Initial Count (%)    Final Count (%)    Change
-----------------------------------------------------------------
Bengal            238 (  7.8%)      226 (  7.8%)     +0.0%
Birman            192 (  6.3%)      182 (  6.3%)     -0.0%
British Shorthair   166 (  5.4%)      157 (  5.4%)     -0.0%
Chartreux          31 (  1.0%)       29 (  1.0%)     -0.0%
European         1018 ( 33.3%)      

> Training a model using a dataset which is balanced using SMOTE

In [158]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from collections import Counter

def balance_dataset_smote(df, race_counts_dict):
    """
    Resample the dataset so each race reaches a caller-chosen number of rows.

    Each race is treated as its own binary problem (this race vs. the rest).
    A race with fewer rows than requested is grown with SMOTE, falling back to
    random sampling with replacement when SMOTE raises ValueError. A race that
    already has enough rows (or none at all) is sampled without replacement.

    Parameters:
    df (pandas.DataFrame): Input DataFrame with boolean 'Race_*' columns
    race_counts_dict (dict): Dictionary with races as keys and desired counts as values
                            Example: {'European': 500, 'Sphynx': 100, ...}

    Returns:
    pandas.DataFrame: Concatenation of the per-race samples, with a fresh index
    """
    # Targets are the 'Race_*' columns; everything else is a feature
    race_columns = [col for col in df.columns if col.startswith('Race_')]
    feature_columns = [col for col in df.columns if not col.startswith('Race_')]
    X = df[feature_columns]

    per_race_frames = []

    for race, desired_count in race_counts_dict.items():
        race_col = f'Race_{race}'

        if race_col not in df.columns:
            print(f"Warning: {race} not found in dataset. Skipping...")
            continue

        # Binary target for the current race, and how many rows it has now
        y = df[race_col].astype(int)
        available = Counter(y)[1]

        if not (0 < available < desired_count):
            # Enough rows already (or none to learn from): plain sampling
            n_samples = min(desired_count, available)
            per_race_frames.append(df[df[race_col]].sample(n=n_samples, random_state=42))
            continue

        try:
            # Oversample only the positive class up to the requested count
            smote = SMOTE(sampling_strategy={1: desired_count}, random_state=42)
            X_resampled, y_resampled = smote.fit_resample(X, y)

            race_frame = pd.DataFrame(X_resampled, columns=feature_columns)

            # Every race flag starts False; only the current race is switched on
            for other_race_col in race_columns:
                race_frame[other_race_col] = False
            race_frame[race_col] = y_resampled == 1

            # Keep the positive rows only
            race_frame = race_frame[race_frame[race_col]]

        except ValueError:
            print(f"Warning: SMOTE failed for {race}, using random oversampling instead")
            # Fallback to random oversampling
            race_frame = df[df[race_col]].sample(n=desired_count, replace=True, random_state=42)

        per_race_frames.append(race_frame)

    # Combine all per-race samples
    balanced_df = pd.concat(per_race_frames, ignore_index=True)

    # Print summary of new race distribution
    print("\nNew race distribution:")
    for race in race_counts_dict:
        race_col = f'Race_{race}'
        if race_col not in balanced_df.columns:
            continue
        count = balanced_df[race_col].sum()
        print(f"{race}: {int(count)}")

    return balanced_df

# Target number of rows per race after balancing; keys must match the suffix of
# the corresponding 'Race_*' column.
desired_counts = {
    'European': 800,
    'No breed': 600,
    'Bengal': 500,
    'Ragdoll': 400,
    'Maine coon': 350,
    'Birman': 300,
    'Persian': 300,
    'British Shorthair': 300,
    'Other': 300,
    'Sphynx': 150,
    'Siamese': 100,
    'Chartreux': 60,
    'Turkish angora': 60,
    'Savannah': 50
}

# Balance the dataset; `df` is rebound to the resampled frame
df = balance_dataset_smote(df, desired_counts)


New race distribution:
European: 800
No breed: 600
Bengal: 500
Ragdoll: 400
Maine coon: 350
Birman: 300
Persian: 300
British Shorthair: 300
Other: 300
Sphynx: 150
Siamese: 100
Chartreux: 60
Turkish angora: 60
Savannah: 50


In [None]:
# NOTE(review): this rebinds `df` to the workbook read from 'Dataset.xlsx',
# replacing the encoded and balanced frame built by the cells above. The cells
# below use `df`, so on a top-to-bottom run they would receive this frame
# instead — confirm whether this cell is still needed.
df = pd.read_excel('Dataset.xlsx')

# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()

In [161]:

# Write the current `df` to an Excel file, without the index column.
df.to_excel('Dataset_Preprocessed.xlsx', index=False)

In [160]:
# Train the model: runs the cross-validated training and saves the best model.
# Note that the second returned value is the list of per-fold result dicts
# (fold_results), here bound to the name `accuracies`.
best_model, accuracies, histories, scaler = train(df)


Fold 1/6
Epoch 5/1000
Train Loss: 1.8295, Train Acc: 0.4051
Val Loss: 1.6840, Val Acc: 0.4551
Epoch 10/1000
Train Loss: 1.3956, Train Acc: 0.5441
Val Loss: 1.2978, Val Acc: 0.5997
Epoch 15/1000
Train Loss: 1.1857, Train Acc: 0.5974
Val Loss: 1.0981, Val Acc: 0.6180
Epoch 20/1000
Train Loss: 1.0556, Train Acc: 0.6287
Val Loss: 0.9818, Val Acc: 0.6489
Epoch 25/1000
Train Loss: 0.9726, Train Acc: 0.6376
Val Loss: 0.9052, Val Acc: 0.6713
Epoch 30/1000
Train Loss: 0.9171, Train Acc: 0.6540
Val Loss: 0.8544, Val Acc: 0.6713


KeyboardInterrupt: 