In [1]:
%pip install tensorflow
%pip install numpy
%pip install pandas
%pip install ast
%pip install logging
%pip install ast
%pip install seaborn
%pip install mlflow
%pip install imbalanced-learn


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting ast
  Using cached AST-0.0.2.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[8 lines of output][0m
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File "<string>", line 2, in <module>
  [31m   [0m   File "<pip-setuptools-caller>", line 34, in <module>
  [31m   [0m   File "/private/var/folders/d8/yhrnl4gn78vbcvy365vfz_q00000gp/T/pip-install-0xoloo4l/ast_66319348514d4c698ea23b1c544fa017/setup.py", line 6, in <module>
  [31m   [0m     README = codecs.open(os.path.join(here, 'AST/README'), encoding='utf8').read()
  [31m   [0m              ^^^^^^^^^^^^^^^^

In [1]:
import logging
import numpy as np
import pandas as pd
import ast
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report


2024-11-19 11:39:03.126037: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Initialize logging: configure the root logger at INFO level so the
# logging.info(...) progress messages emitted by the functions below are shown.
logging.basicConfig(level=logging.INFO)

def preprocess_data(data_path, min_length=50):
    """
    Load the babble CSV, validate it, and convert it into model-ready arrays.

    Steps: read the CSV, check that the required columns exist, drop rows with
    a missing value in those columns, parse each 'Babbles' cell from its string
    form into a Python list, keep only sequences with at least ``min_length``
    elements, zero-pad the survivors (at the end) to a common length, and
    integer-encode the 'Sex' column.

    Args:
        data_path (str): Path to the CSV file containing data.
        min_length (int): Minimum number of elements a sequence must have to
            be kept. Defaults to 50.

    Returns:
        tuple: (padded_sequences, labels, classes) where ``padded_sequences``
        is a 2-D float32 array, ``labels`` holds the integer-encoded 'Sex'
        values and ``classes`` is the encoder's ``classes_`` array.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist.
        ValueError: If a required column is missing, or if no sequence
            survives the length filter.
    """
    logging.info('Starting data preprocessing')

    try:
        # Load the data
        df = pd.read_csv(data_path)

        # Validate that every required column is present
        required_columns = ['Babbles', 'Sex']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            raise ValueError(f"Missing columns: {missing_columns}")

        # Rows with a missing sequence or label cannot be parsed/encoded;
        # drop them up front (and say so) instead of failing mid-pipeline.
        incomplete_rows = df[required_columns].isna().any(axis=1)
        if incomplete_rows.any():
            logging.warning(
                f'Dropping {int(incomplete_rows.sum())} rows with missing values in {required_columns}'
            )
            df = df[~incomplete_rows]

        # Convert strings to lists and filter by length.
        # literal_eval only evaluates Python literals, never arbitrary code.
        df = df.assign(Babbles=df['Babbles'].apply(ast.literal_eval))
        df = df[df['Babbles'].map(len) >= min_length]

        # Fail with a clear message rather than an obscure error from padding
        if df.empty:
            raise ValueError(
                f"No sequences with at least {min_length} elements found in {data_path}"
            )

        # Pad sequences (zeros appended at the end, up to the longest sequence)
        sequences = df['Babbles'].values
        padded_sequences = tf.keras.utils.pad_sequences(sequences, padding='post', dtype='float32')

        # Encode labels
        le = LabelEncoder()
        labels = le.fit_transform(df['Sex'])

        logging.info(f'Processed {len(padded_sequences)} valid sequences')

        return padded_sequences, labels, le.classes_

    except FileNotFoundError:
        logging.error(f"File {data_path} not found.")
        raise
    except Exception as e:
        logging.error(f"Error during preprocessing: {e}")
        raise


def create_model(input_length, num_classes):
    """
    Build a stacked two-layer GRU classifier and compile it.

    Args:
        input_length (int): Number of time steps per input sequence; every
            step carries a single feature.
        num_classes (int): Number of units in the softmax output layer.

    Returns: tf.keras.Model: The compiled model.
    """
    logging.info('Creating the model')

    layers = tf.keras.layers
    layer_stack = [
        layers.Input(shape=(input_length, 1)),
        # The first GRU emits its output at every step so the second GRU
        # still receives a sequence.
        layers.GRU(64, return_sequences=True),
        layers.Dropout(0.3),
        layers.GRU(32),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu'),
        layers.Dense(num_classes, activation='softmax'),
    ]
    model = tf.keras.Sequential(layer_stack)

    adam = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

    logging.info('Model created successfully')
    return model


def plot_confusion_matrix(y_true, y_pred, classes):
    """
    Render the confusion matrix of a set of predictions as an annotated heatmap.

    Args:
        y_true (array): True labels.
        y_pred (array): Predicted labels.
        classes (list): Class names, used as tick labels on both axes.
    """
    matrix = confusion_matrix(y_true, y_pred)

    _, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',  # cells hold integer counts
        cmap='Blues',
        xticklabels=classes,
        yticklabels=classes,
        ax=ax,
    )
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    ax.set_title('Confusion Matrix')
    plt.show()


# Custom callback for detailed training progress
class TrainingCallback(tf.keras.callbacks.Callback):
    """
    Print a one-line metric summary every ``every_n_epochs`` epochs.

    Only metrics actually present in the epoch's ``logs`` are printed, so the
    callback also works when the model is fitted without validation data or
    without the accuracy metric.

    Args:
        every_n_epochs (int): Reporting interval in epochs. Defaults to 5.

    Raises:
        ValueError: If ``every_n_epochs`` is smaller than 1.
    """

    # Metrics reported, in output order
    _METRICS = ('loss', 'accuracy', 'val_loss', 'val_accuracy')

    def __init__(self, every_n_epochs=5):
        super().__init__()
        if every_n_epochs < 1:
            raise ValueError("every_n_epochs must be at least 1")
        self.every_n_epochs = every_n_epochs

    def on_epoch_end(self, epoch, logs=None):
        # `epoch` is zero-based; report on the 5th, 10th, ... epoch by default
        if (epoch + 1) % self.every_n_epochs != 0:
            return
        logs = logs or {}  # logs defaults to None in the signature
        summary = ', '.join(
            f'{name} = {logs[name]:.4f}' for name in self._METRICS if name in logs
        )
        print(f'Epoch {epoch + 1}: {summary}')


def handle_class_imbalance(X_train, y_train):
    """
    Rebalance the training set with SMOTE, or fall back to class weights.

    If any class makes up less than 10% of the training samples, the data is
    oversampled with SMOTE and no class weights are returned. Otherwise the
    data is returned unchanged together with 'balanced' class weights.

    Args:
        X_train (array): Training features; the first axis indexes samples.
            Inputs with more than two dimensions (e.g. samples x steps x 1)
            are flattened for SMOTE and restored to their shape afterwards.
        y_train (array): One-hot encoded training labels, shape
            (n_samples, n_classes).

    Returns:
        tuple: (X_train, y_train, class_weights). ``class_weights`` is None
        when SMOTE was applied, otherwise a dict mapping class index to weight.
    """
    num_classes = y_train.shape[1]

    # Convert one-hot encoded labels back to integer labels
    y_train_labels = np.argmax(y_train, axis=1)

    # Log initial class distribution. minlength keeps one slot per class even
    # when the highest-numbered classes are absent from the training split.
    class_counts = np.bincount(y_train_labels, minlength=num_classes)
    logging.info(f"Initial class distribution in training data: {dict(enumerate(class_counts.tolist()))}")

    # A class below 10% of the samples counts as imbalanced (adjustable threshold)
    if np.any(class_counts < 0.1 * len(y_train_labels)):
        logging.info("Class imbalance detected. Applying SMOTE oversampling.")
        # NOTE(review): SMOTE's default k_neighbors=5 needs more than 5 samples
        # in the smallest class; confirm the dataset satisfies this.
        smote = SMOTE(sampling_strategy='auto', random_state=42)

        # SMOTE only accepts 2-D (n_samples, n_features) input, so flatten any
        # trailing dimensions and restore them after resampling.
        sample_shape = tuple(np.shape(X_train)[1:])
        needs_flatten = len(sample_shape) > 1
        X_for_smote = np.reshape(X_train, (len(X_train), -1)) if needs_flatten else X_train

        X_train_resampled, labels_resampled = smote.fit_resample(X_for_smote, y_train_labels)
        if needs_flatten:
            X_train_resampled = X_train_resampled.reshape((-1,) + sample_shape)

        # Convert back to one-hot encoding
        y_train_resampled = tf.keras.utils.to_categorical(labels_resampled, num_classes=num_classes)

        # Log the new class distribution, counted from the integer labels
        new_class_counts = np.bincount(labels_resampled, minlength=num_classes)
        logging.info(f"New class distribution after SMOTE oversampling: {dict(enumerate(new_class_counts.tolist()))}")

        return X_train_resampled, y_train_resampled, None

    # No class is below the threshold: keep the data and compensate for any
    # remaining skew with 'balanced' class weights.
    present_classes = np.unique(y_train_labels)
    class_weights = compute_class_weight('balanced', classes=present_classes, y=y_train_labels)
    # Plain Python ints/floats keep the dict and its log output clean
    class_weights_dict = {
        int(label): float(weight) for label, weight in zip(present_classes, class_weights)
    }

    # Log class weights distribution
    logging.info(f"Class weights: {class_weights_dict}")
    return X_train, y_train, class_weights_dict


def main(data_path='./CMBabble_Master_Sex_scm.csv'):
    """
    Run the full pipeline: preprocess, split, rebalance, train, and evaluate.

    Args:
        data_path (str): Path to the CSV file to train on. Defaults to
            './CMBabble_Master_Sex_scm.csv'.

    Returns:
        tuple: (model, history) on success. Returns None when preprocessing
        fails (the error is logged).
    """
    # Preprocess data
    try:
        X, y, classes = preprocess_data(data_path)
    except Exception as e:
        logging.error(f"Preprocessing failed: {e}")
        return

    # Add a trailing feature axis for the GRU and one-hot encode the labels
    X = X.reshape(X.shape[0], X.shape[1], 1)
    y = tf.keras.utils.to_categorical(y)

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Hold out the validation set BEFORE any resampling. Keras' validation_split
    # takes the last samples of the arrays it is given, and oversampling appends
    # its synthetic samples at the end, so validating on a post-resampling tail
    # would measure mostly synthetic data.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    # Handle class imbalance on the training portion only (SMOTE or class weights)
    X_train, y_train, class_weights = handle_class_imbalance(X_train, y_train)

    # Create and train model
    model = create_model(X.shape[1], len(classes))

    callbacks = [
        TrainingCallback(),
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=25,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=10,
            min_lr=0.0001
        ),
        tf.keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=1)
    ]

    history = model.fit(
        X_train, y_train,
        epochs=50,  # Adjust for practical training time
        batch_size=32,
        validation_data=(X_val, y_val),
        callbacks=callbacks,
        verbose=1,
        class_weight=class_weights  # None when SMOTE was applied
    )

    # Evaluate on the test set
    test_loss, test_accuracy = model.evaluate(X_test, y_test)
    print(f"\nTest Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}\n")

    # Get predicted classes for all test samples
    predictions = model.predict(X_test)
    predicted_classes = np.argmax(predictions, axis=1)
    true_classes = np.argmax(y_test, axis=1)

    plot_confusion_matrix(true_classes, predicted_classes, classes)
    # Passing labels explicitly keeps the report aligned with target_names even
    # if a class is absent from both the test labels and the predictions.
    print(classification_report(
        true_classes,
        predicted_classes,
        labels=np.arange(len(classes)),
        target_names=classes,
        zero_division=1
    ))

    return model, history


# Entry point: run the full pipeline when this code is executed directly.
# main() returns (model, history) on success, but the result is discarded here.
if __name__ == "__main__":
    main()



INFO:root:Starting data preprocessing
ERROR:root:File ./Jupyter_Notebooks/CMBabble_Master_Sex_scm.csv not found.
ERROR:root:Preprocessing failed: [Errno 2] No such file or directory: './Jupyter_Notebooks/CMBabble_Master_Sex_scm.csv'
