In [1]:
import os

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Path of the input CSV loaded with pd.read_csv (placeholder value - replace before running).
CSV_PATH = 'path_to_csv_file'
# Root folder for the generated spectrogram images; train/val x 0/1 sub-folders are created under it.
OUTPUT_ROOT = 'path_to_output_folder_for_spectrograms'
# Used to size the matplotlib figure: each entry is divided by 100 and rendered at dpi=100.
IMG_SIZE = (224, 224)
# Passed as test_size to train_test_split (fraction of rows held out for validation).
TEST_SIZE = 0.2
# Passed as random_state to train_test_split.
SEED = 42



In [None]:
# Class folders: '0' holds non-defaulters, '1' holds defaulters.
# One folder per (split, class) pair, created in the order train/0, train/1, val/0, val/1.
for _split in ('train', 'val'):
    for _class_dir in ('0', '1'):
        os.makedirs(os.path.join(OUTPUT_ROOT, _split, _class_dir), exist_ok=True)


# Blank strings and the usual "not available" spellings are read as missing values.
df = pd.read_csv(
    CSV_PATH,
    na_values=['', ' ', 'NA', 'N/A'],
)
def clean_data(df):
    """Return a cleaned copy of ``df``; the frame passed in is left untouched.

    Cleaning steps, in order:
      1. Every column whose name contains 'bureau_enquiry' is coerced to
         numeric; values that cannot be parsed become NaN.
      2. +inf / -inf anywhere in the frame are replaced by NaN.
      3. NaN in numeric columns is filled with 0. Non-numeric columns keep
         their missing values.

    NOTE(review): step 3 also zero-fills a numeric label column (e.g.
    ``bad_flag``) if one is present - confirm that is intended.

    Args:
        df: Raw DataFrame.

    Returns:
        A new DataFrame with the steps above applied.
    """
    # Work on a copy: assigning columns directly on the argument would
    # silently modify the caller's raw frame and make re-runs non-idempotent.
    cleaned = df.copy()

    for col in cleaned.columns:
        # str() keeps the substring test valid for non-string column labels.
        if 'bureau_enquiry' in str(col):
            cleaned[col] = pd.to_numeric(cleaned[col], errors='coerce')

    # Must run after the coercion above, which can itself produce inf
    # (for example from the string 'inf').
    cleaned = cleaned.replace([np.inf, -np.inf], np.nan)

    numeric_cols = cleaned.select_dtypes(include=np.number).columns
    cleaned[numeric_cols] = cleaned[numeric_cols].fillna(0)

    return cleaned

# Cleaned frame that the feature/target split is built from.
df_clean = clean_data(df)



In [None]:
# The label column is bad_flag; account_number and bad_flag are both
# excluded from the feature matrix.
target = df_clean['bad_flag']
features = df_clean.drop(columns=['account_number', 'bad_flag'])

# Stratified hold-out split so both parts keep the same label proportions.
X_train, X_val, y_train, y_val = train_test_split(
    features,
    target,
    stratify=target,
    test_size=TEST_SIZE,
    random_state=SEED,
)

# STFT settings shared by every spectrogram.
N_FFT = 256
HOP_LENGTH = 128
# Signals shorter than this are padded before the transform.
MIN_SIGNAL_LENGTH = 2 * N_FFT

In [None]:
def safe_stft_conversion(signal, output_path):
    """Render one 1-D signal as an STFT spectrogram image.

    The signal is cast to float32, padded (repeating its last value) up to
    MIN_SIGNAL_LENGTH samples if shorter, transformed with ``librosa.stft``
    using N_FFT / HOP_LENGTH, converted to dB relative to its peak magnitude
    and drawn with the viridis colormap on axes that fill the whole figure,
    so the saved image is IMG_SIZE pixels with no margins.

    Args:
        signal: 1-D array-like of numeric values. The caller's data is not
            modified.
        output_path: Destination file path handed to ``Figure.savefig``.

    Returns:
        True if the image was written; False if any step raised. The error
        is printed together with ``output_path`` and is not re-raised.
    """
    dpi = 100
    fig = None
    try:
        # astype() always copies, so the in-place noise below never touches
        # the caller's array.
        signal = np.asarray(signal).astype(np.float32)

        # Add tiny noise so an all-zero row is not a perfectly flat signal.
        if np.all(signal == 0):
            signal += np.random.normal(0, 1e-6, signal.shape)

        if len(signal) < MIN_SIGNAL_LENGTH:
            pad_width = MIN_SIGNAL_LENGTH - len(signal)
            signal = np.pad(signal, (0, pad_width), mode='edge')

        stft = librosa.stft(signal, n_fft=N_FFT, hop_length=HOP_LENGTH)
        spectrogram = librosa.amplitude_to_db(np.abs(stft), ref=np.max)

        fig = plt.figure(figsize=(IMG_SIZE[0] / dpi, IMG_SIZE[1] / dpi), dpi=dpi)
        # Axes spanning the full figure: with default subplot margins plus
        # bbox_inches='tight' the saved image is cropped to the axes area and
        # ends up smaller than IMG_SIZE.
        ax = fig.add_axes([0, 0, 1, 1])
        ax.axis('off')
        librosa.display.specshow(spectrogram, cmap='viridis', ax=ax)
        fig.savefig(output_path, dpi=dpi)
        return True

    except Exception as e:
        print(f"Failed to process {output_path}: {str(e)}")
        return False

    finally:
        # Close the figure even when an error occurs after it was created;
        # otherwise figures pile up in memory over a large dataset.
        if fig is not None:
            plt.close(fig)

In [None]:
from tqdm import tqdm

def process_dataset(X, y, dataset_type):
    """Write one spectrogram image per row of ``X``.

    Row ``i`` is converted with ``safe_stft_conversion`` and saved as
    ``OUTPUT_ROOT/<dataset_type>/<class>/<dataset_type>_<i>.png``, where
    ``<class>`` is '1' when the matching label equals 1 and '0' otherwise.
    Rows and labels are matched by position, not by index value.

    Args:
        X: Feature DataFrame; each row's values form one signal.
        y: Label Series aligned positionally with ``X``.
        dataset_type: Sub-folder name and file-name prefix ('train', 'val').

    Returns:
        The number of rows whose conversion failed (0 when every image was
        written).
    """
    n_rows = min(len(X), len(y))
    if len(X) != len(y):
        # Report a mismatch once up front instead of once per surplus row.
        print(
            f"Length mismatch for {dataset_type}: {len(X)} feature rows vs "
            f"{len(y)} labels; only the first {n_rows} rows are processed."
        )

    rows = X.iloc[:n_rows].iterrows()
    labels = y.iloc[:n_rows]
    progress = tqdm(
        zip(rows, labels),
        total=n_rows,
        desc=f'Processing {dataset_type} dataset',
    )

    failures = 0
    for idx, ((_, row), label) in enumerate(progress):
        class_dir = '1' if label == 1 else '0'
        output_path = os.path.join(
            OUTPUT_ROOT,
            dataset_type,
            class_dir,
            f'{dataset_type}_{idx}.png'
        )
        # Count failures so a partially written dataset does not go unnoticed.
        if not safe_stft_conversion(row.values, output_path):
            failures += 1

    if failures:
        print(f"{failures} of {n_rows} {dataset_type} rows could not be converted.")
    return failures

# Generate the images for both splits, training data first.
for _features, _labels, _split_name in (
    (X_train, y_train, 'train'),
    (X_val, y_val, 'val'),
):
    process_dataset(_features, _labels, _split_name)

print(f"Dataset successfully created at {OUTPUT_ROOT}")

Processing train dataset:   0%|          | 0/153113 [00:00<?, ?it/s]

Processing train dataset: 100%|██████████| 153113/153113 [41:22<00:00, 61.68it/s]   
Processing val dataset: 100%|██████████| 38279/38279 [16:37<00:00, 38.37it/s]  

Dataset successfully created at /home/csgrad/yadvende/Finance_Vision/CreditCard_MVIT/dataset/



