In [2]:
import polars as pl
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
from tensorflow.keras.layers import LSTM, Dense
import helpers.input_processor as ip
import tqdm
import tensorflow as tf
from tensorflow.data import Dataset as tfds
import tensorflow_io as tfio
import math, random
import numpy as np

from matplotlib import pyplot as plt
import tensorflow.python.util as util

In [5]:
def pl_to_tf(df):
    """Convert a polars DataFrame into a tensorflow dataset.

    Every column of ``df`` is exported with ``Series.to_numpy()`` and the
    resulting arrays are handed to ``tfds.from_tensor_slices`` as one tuple,
    in column order.

    Args:
        df: polars DataFrame to convert.

    Returns:
        The dataset returned by ``tfds.from_tensor_slices`` for the tuple of
        column arrays.
    """
    #export each column directly; the previous select(pl.all().map(...)).row(0)
    #form depended on how polars wraps a non-Series value returned from map,
    #which is not something to rely on across polars versions
    columns = tuple(df[name].to_numpy() for name in df.columns)
    return tfds.from_tensor_slices(columns)


In [3]:
#point at the raw training recordings and pick the column used as prediction target
data_dir = "data/raw_training/training_data/"
target_label = 'murmur_in_recording'

#rows whose target is 'Unknown' are removed before ip.encodeData is applied
known_label = pl.col(target_label) != 'Unknown'
#prefix every audio file name with data_dir
audio_path = pl.col('audio_file').apply(lambda name: os.path.join(data_dir, name))

#load -> drop unknown targets -> encode -> keep only the path and target columns
labelled = ip.loadTrainingData(data_dir).filter(known_label)
df = ip.encodeData(labelled).select([audio_path, pl.col(target_label)])

#balance the data so that there is an equal number of murmur positive and murmur negative samples
#do this by duplicating random rows of whichever group (pos or neg) is smaller
#NOTE(review): the duplicates are added before the train/test split further down,
#so copies of one recording can end up on both sides of that split
neg_df = df.filter(pl.col(target_label)==0.0)
pos_df = df.filter(pl.col(target_label)==1.0)
numNeg = neg_df.height
numPos = pos_df.height

#an empty class has no rows to duplicate: the balancing loop would never finish
#(and the percentage below would divide by zero), so stop with a clear error
if numNeg == 0 or numPos == 0:
    raise ValueError(
        f"cannot balance '{target_label}': {numPos} positive and {numNeg} negative samples"
    )

#fixed seed so the same rows are duplicated on every run for a given input frame
BALANCE_SEED = 42
minority_df = neg_df if numNeg < numPos else pos_df
deficit = abs(numPos - numNeg)  #rows still missing from the smaller class
while deficit > 0:
    #sampling is without replacement, so one round adds at most minority_df.height rows
    take = min(deficit, minority_df.height)
    df.vstack(minority_df.sample(n=take, shuffle=True, seed=BALANCE_SEED), in_place=True)
    deficit -= take

#check number of positive and negative samples
numNeg = df.filter(pl.col(target_label)==0.0).height
numPos = df.filter(pl.col(target_label)==1.0).height
print('Total Samples:       ', df.height)
print('Positive Samples:    ', numPos)
print('Negative Samples:    ', numNeg)
print('Percent Positive Samples:    ', numPos/(numPos+numNeg))

#split into train and test sets
split_ratio = 0.8
total_size = df.height
train_size = round(split_ratio * total_size)
test_size = total_size - train_size

#NOTE(review): df was balanced above by duplicating rows, so the same recording can
#appear in both train and test; consider splitting first and balancing only the train part
#fixed seed so the shuffle (and therefore the split) is repeatable for a given df
SPLIT_SEED = 42
df = df.sample(frac=1.0, shuffle=True, seed=SPLIT_SEED)  #shuffle rows in dataframe
train_df = df.head(train_size)
test_df = df.tail(test_size)
#the two parts must account for every row exactly once
assert train_df.height + test_df.height == total_size

#convert train and test sets from polars dataframe to tensorflow dataset
train_ds = pl_to_tf(train_df)
test_ds = pl_to_tf(test_df)

loading data from save file:  cache/ingested_data.json
Total Samples:        5328
Positive Samples:     2664
Negative Samples:     2664
Percent Positive Samples:     0.5


Andres tensorflow code