In [1]:
%pip install tensorflow
%pip install numpy
%pip install pandas
%pip install ast
%pip install logging
%pip install matplotlib
%pip install seaborn

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting ast
  Using cached AST-0.0.2.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[8 lines of output][0m
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File "<string>", line 2, in <module>
  [31m   [0m   File "<pip-setuptools-caller>", line 34, in <module>
  [31m   [0m   File "/private/var/folders/d8/yhrnl4gn78vbcvy365vfz_q00000gp/T/pip-install-jh1cmrs9/ast_9bd2d60360bc44aaba7372e609bfd20c/setup.py", line 6, in <module>
  [31m   [0m     README = codecs.open(os.path.join(here, 'AST/README'), encoding='utf8').read()
  [31m   [0m              ^^^^^^^^^^^^^^^^

In [6]:
import tensorflow as tf
import numpy as np
import pandas as pd
import ast
import logging
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix


2024-11-19 11:25:48.032187: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Configure the root logger at INFO level so that the row-count message
# emitted via logging.info() inside preprocess_data shows up in the cell output.
logging.basicConfig(level=logging.INFO)

# Data preprocessing function
def preprocess_data(data_path, min_length=50):
    """Load the CSV at ``data_path`` and build padded sequences and labels.

    Parameters
    ----------
    data_path : str or path-like
        CSV file containing a 'Babbles' column (each cell is the string form
        of a Python list) and a 'Sex' column used as the class label.
    min_length : int, optional
        Rows whose parsed 'Babbles' list has fewer than this many elements are
        dropped. Defaults to 50, the threshold that used to be hard-coded.

    Returns
    -------
    padded_sequences : numpy.ndarray
        float32 array with one row per kept sample; shorter sequences are
        padded at the end ('post') up to the longest kept sequence.
    labels : numpy.ndarray
        Integer-encoded 'Sex' values produced by ``LabelEncoder``.
    classes : numpy.ndarray
        ``LabelEncoder.classes_``: the original label for each encoded integer.

    Raises
    ------
    ValueError
        If no row satisfies ``min_length`` (nothing would be left to pad).
    """
    df = pd.read_csv(data_path)

    # Convert string representations of lists to actual lists.
    # literal_eval only parses Python literals; it does not execute code.
    df['Babbles'] = df['Babbles'].apply(ast.literal_eval)

    # Keep only sequences that are long enough to be useful.
    df = df[df['Babbles'].apply(len) >= min_length]
    logging.info('%d rows remain after filtering', len(df))

    # Fail early with a clear message instead of an opaque error from padding.
    if df.empty:
        raise ValueError(
            f'No rows in {data_path!r} have at least min_length={min_length} babbles'
        )

    # Pad every sequence (at the end) to the length of the longest one.
    padded_sequences = tf.keras.utils.pad_sequences(
        df['Babbles'].tolist(), padding='post', dtype='float32'
    )

    # Encode the string labels as consecutive integers.
    le = LabelEncoder()
    labels = le.fit_transform(df['Sex'])

    return padded_sequences, labels, le.classes_

# Create the LSTM model
def create_lstm_model(max_length, num_classes):
    """Assemble the (uncompiled) stacked-LSTM sequence classifier.

    Parameters
    ----------
    max_length : int
        Number of time steps per input sequence; the model input shape is
        ``(max_length, 1)``.
    num_classes : int
        Width of the final softmax layer.

    Returns
    -------
    tf.keras.Sequential
        Two LSTM stages followed by a dense head. Every stage is followed by
        batch normalization and dropout. The stack is strictly sequential
        (there are no skip/residual connections).
    """
    layers = tf.keras.layers

    # Input: one scalar feature per time step.
    stack = [layers.Input(shape=(max_length, 1))]

    # Recurrent stage 1 keeps the full sequence so stage 2 can consume it.
    stack += [
        layers.LSTM(128, return_sequences=True),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
    ]

    # Recurrent stage 2 collapses the sequence to its final state.
    stack += [
        layers.LSTM(64),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
    ]

    # Dense head ending in per-class probabilities.
    stack += [
        layers.Dense(32, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.2),
        layers.Dense(num_classes, activation='softmax'),
    ]

    return tf.keras.Sequential(stack)

# Custom callback for detailed training progress
class TrainingCallback(tf.keras.callbacks.Callback):
    """Print a one-line metric summary at the end of every fifth epoch.

    Only metrics that are actually present in ``logs`` are printed, so the
    callback also works when training runs without validation data or when
    ``logs`` is not supplied.
    """

    # Metric keys reported, in print order.
    _REPORTED_METRICS = ('loss', 'accuracy', 'val_loss', 'val_accuracy')

    def on_epoch_end(self, epoch, logs=None):
        """Report metrics for 1-based epoch numbers divisible by 5.

        Parameters
        ----------
        epoch : int
            Zero-based epoch index supplied by Keras.
        logs : dict or None
            Metric name -> value for the epoch that just finished.
        """
        if (epoch + 1) % 5 != 0:
            return

        # `logs` defaults to None in the signature; treat that as "no metrics".
        logs = logs or {}
        summary = ', '.join(
            f'{name} = {logs[name]:.4f}'
            for name in self._REPORTED_METRICS
            if name in logs
        )
        print(f'Epoch {epoch + 1}: {summary}')

def main(data_path='../CMBabble_Master_Sex_scm.csv'):
    """Train, evaluate and demo the LSTM classifier end to end.

    Steps: seed TensorFlow and NumPy, load the data via ``preprocess_data``,
    hold out 20% as a test set, train with early stopping and learning-rate
    reduction on plateau (a further 20% of the training data is used for
    validation), print test loss/accuracy, then print predictions for up to
    the first five test samples.

    Parameters
    ----------
    data_path : str or path-like, optional
        CSV file passed to ``preprocess_data``. Defaults to the path that was
        previously hard-coded.

    Returns
    -------
    model : tf.keras.Model
        The trained model.
    history : tf.keras.callbacks.History
        Object returned by ``model.fit``.
    predict_sequence : callable
        ``predict_sequence(sequence) -> (predicted_class, confidence, probabilities)``
        for a single raw (unpadded) sequence.
    """
    # Set random seed for reproducibility
    tf.random.set_seed(42)
    np.random.seed(42)

    # Load and preprocess data
    X, y, classes = preprocess_data(data_path)

    # Reshape input for LSTM (samples, time steps, features)
    X = X.reshape(X.shape[0], X.shape[1], 1)
    max_length = X.shape[1]

    # Convert labels to one-hot encoding; fix the width to the number of
    # known classes so it always matches the model's output layer.
    y = tf.keras.utils.to_categorical(y, num_classes=len(classes))

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Create and compile the model
    model = create_lstm_model(max_length, len(classes))

    # Fixed initial learning rate; ReduceLROnPlateau lowers it during training
    initial_learning_rate = 0.001

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=initial_learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    # Define callbacks
    callbacks = [
        TrainingCallback(),
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=20,
            restore_best_weights=True,
            min_delta=0.001
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=10,
            min_lr=0.00001,
            verbose=1
        )
    ]

    # Train the model
    history = model.fit(
        X_train, y_train,
        epochs=150,  # Upper bound only; early stopping normally ends training sooner
        batch_size=32,
        validation_split=0.2,
        callbacks=callbacks,
        verbose=1
    )

    # Evaluate on test set
    test_loss, test_accuracy = model.evaluate(X_test, y_test)
    print(f'Test loss: {test_loss:.4f}\n')
    print(f'\nTest accuracy: {test_accuracy:.4f}')

    def predict_sequence(sequence):
        """Classify one raw sequence.

        Returns ``(predicted_class, confidence, class_probabilities)``.
        """
        # Pad exactly like the training data. dtype must be given explicitly:
        # pad_sequences defaults to int32, which would silently truncate
        # fractional values before they reach the model.
        padded = tf.keras.utils.pad_sequences(
            [np.asarray(sequence, dtype='float32')],
            maxlen=max_length,
            padding='post',
            dtype='float32'
        )
        batch = padded.reshape(1, max_length, 1)

        # Make prediction
        prediction = model.predict(batch)
        predicted_class = classes[np.argmax(prediction)]
        confidence = np.max(prediction)

        return predicted_class, confidence, prediction[0]

    # Example prediction
    print("\nExample prediction:")
    # Show up to five test samples; guard against a test split smaller than 5.
    for i in range(min(5, len(X_test))):
        sample_sequence = X_test[i].reshape(-1).tolist()
        pred_class, confidence, class_probabilities = predict_sequence(sample_sequence)
        print(f"Sample {i + 1}:")
        print(f"Predicted class: {pred_class}")
        print(f"Confidence: {confidence:.4f}")
        print("Class probabilities:", {c: f"{p:.4f}" for c, p in zip(classes, class_probabilities)})

        # Print the true class for comparison
        true_class = classes[np.argmax(y_test[i])]
        print(f"True class: {true_class}\n")

    return model, history, predict_sequence

# Run the whole pipeline when this cell/script is executed directly.
# NOTE(review): main() returns (model, history, predict_sequence), but the
# return value is discarded here, so none of them is available afterwards.
if __name__ == "__main__":
    main()


INFO:root:561 rows remain after filtering


Epoch 1/150
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 2s/step - accuracy: 0.4763 - loss: 1.0530 - val_accuracy: 0.4444 - val_loss: 0.6933 - learning_rate: 0.0010
Epoch 2/150
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 1s/step - accuracy: 0.5134 - loss: 1.0243 - val_accuracy: 0.5556 - val_loss: 0.6923 - learning_rate: 0.0010
Epoch 3/150
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 1s/step - accuracy: 0.4441 - loss: 1.0382 - val_accuracy: 0.5556 - val_loss: 0.6924 - learning_rate: 0.0010
Epoch 4/150
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 1s/step - accuracy: 0.5736 - loss: 0.8115 - val_accuracy: 0.5556 - val_loss: 0.6914 - learning_rate: 0.0010
Epoch 5/150
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.4826 - loss: 0.8653Epoch 5: loss = 0.8767, accuracy = 0.4916, val_loss = 0.6891, val_accuracy = 0.5556
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15