**Preprocessing and baseline model**

*https://www.kaggle.com/competitions/hms-harmful-brain-activity-classification/overview*

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, log_loss
from scipy.stats import entropy
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from tqdm import tqdm
import time
from sklearn.base import BaseEstimator, TransformerMixin
import random

In [2]:
# Colab-only setup: mount Google Drive so the files under /content/drive used below are readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Directory with one `<spectrogram_id>.parquet` file per spectrogram (read by load_parquet_files).
base_path = '/content/drive/My Drive/HMS/train_spectrograms/'

In [4]:
# CSV with the training metadata and labels; loaded into `df`.
train_file_path = '/content/drive/My Drive/HMS/train.csv'

In [5]:
# Training metadata: later cells read its spectrogram ids, sub-ids, label offsets and expert_consensus labels.
df = pd.read_csv(train_file_path)

In [6]:
# Distinct spectrogram ids referenced by the metadata; each id names one parquet file under base_path.
list_unique_values = list(df['spectrogram_id'].unique())
print(f"number of unique values:{len(list_unique_values)}")

number of unique values:11138


In [7]:
# Reproducible random subset of 100 spectrogram ids, used when a smaller sample
# than the full id list is required (the seed keeps the selection stable across runs).
random.seed(42)
list_unique_value_random = random.sample(list_unique_values, 100)
print(f"number of random unique values:{len(list_unique_value_random)}")

number of random unique values:100


Load the spectrogram parquet files for the selected unique values; each file is named after its spectrogram id (e.g. `4653464.parquet`).<br>

In [8]:
def load_parquet_files(unique_values, base_path):
    """
    Load one parquet file per id from a base directory into a dictionary.

    For every value in ``unique_values`` the file ``<base_path>/<value>.parquet`` is read
    with ``pd.read_parquet`` and stored under that value. Loading is best-effort: a file
    that cannot be read is reported on stdout and skipped, so the returned dictionary may
    contain fewer entries than ``unique_values``. The total elapsed time (and the number
    of failed files, if any) is printed at the end.

    Parameters:
    - unique_values (iterable): Identifiers of the parquet files to load. Each value is
                                used both as the file stem and as the dictionary key.
    - base_path (str or path-like): Directory that holds the parquet files. It may be
                                    given with or without a trailing path separator.

    Returns:
    - dict: Maps each successfully loaded value to the pandas DataFrame read from its file.

    Example usage:
    unique_values = [924234, 1219001, 353733]
    base_path = '/content/drive/My Drive/HMS/train_spectrograms/'
    parquet_dict = load_parquet_files(unique_values, base_path)
    """
    import os  # local import so this cell does not depend on editing the import cell

    parquet_dict = {}
    attempted = 0
    failed = 0
    # perf_counter is monotonic, so the reported duration cannot be skewed by clock changes
    start_time = time.perf_counter()
    for value in tqdm(unique_values, desc="Loading Parquet Files"):
        attempted += 1
        # os.path.join avoids the doubled separator the plain f-string produced when
        # base_path already ends with one, and works for an empty base_path as well.
        file_path = os.path.join(base_path, f"{value}.parquet")
        try:
            parquet_dict[value] = pd.read_parquet(file_path)
        except Exception as e:
            # Deliberately best-effort: report the problem and continue with the next file.
            failed += 1
            print(f"Error loading {file_path}: {e}")
    elapsed = time.perf_counter() - start_time
    if failed:
        print(f"Failed to load {failed} of {attempted} files")
    print(f"Completed in {elapsed:.2f} seconds")
    return parquet_dict
# Full run over every spectrogram id — swap with the line below when a complete load is wanted:
# parquet_dict = load_parquet_files(list_unique_values, base_path )
parquet_dict = load_parquet_files(list_unique_value_random, base_path ) # smaller random subset of ids


Loading Parquet Files: 100%|██████████| 100/100 [02:18<00:00,  1.39s/it]

Completed in 138.89 seconds





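Expand the loaded parquet files into per-label subsets keyed as `<spectrogram_id>_<spectrogram_sub_id>` (e.g. `87687_2`).<br>
Each subset holds a 51-row slice of its spectrogram; the model later uses 50 of these rows (the first row is dropped).<br>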


In [9]:

def expand_parquet_dict(parquet_dict, df, window_rows=51):
    """
    Cut every loaded spectrogram into the per-label subsets described by ``df``.

    For each row of ``df`` whose 'spectrogram_id' is a key of ``parquet_dict``, a slice of
    ``window_rows`` consecutive rows is taken from that spectrogram, starting at row
    position ``int(spectrogram_label_offset_seconds) + 1``. The slice is stored under the
    key "<spectrogram_id>_<spectrogram_sub_id>". Rows of ``df`` whose spectrogram was not
    loaded are skipped. A slice that runs past the end of the spectrogram is simply
    shorter (possibly empty).

    NOTE(review): the offset in seconds is used directly as a row position, which is only
    right if the parquet files hold one row per second — TODO confirm against the data.

    Parameters:
    - parquet_dict (dict): Maps spectrogram ids to the DataFrames loaded from parquet files.
    - df (pd.DataFrame): Must contain the columns 'spectrogram_id', 'spectrogram_sub_id'
                         and 'spectrogram_label_offset_seconds'.
    - window_rows (int): Number of rows per subset. Defaults to 51, the value that was
                         previously hard-coded.

    Returns:
    - dict: Maps "<spectrogram_id>_<spectrogram_sub_id>" to the selected rows (a DataFrame).
            If the same key occurs more than once in ``df``, the last occurrence wins.

    Example usage:
    expanded_parquet_dict = expand_parquet_dict(parquet_dict, df)
    """
    expanded_parquet_dict = {}
    # Iterate the three columns directly instead of df.iterrows(): iterrows builds one
    # Series per row and upcasts mixed int/float rows to float, which turns the ids into
    # "123.0"-style strings in the key. Column iteration keeps each column's own dtype.
    columns = zip(
        df['spectrogram_id'],
        df['spectrogram_sub_id'],
        df['spectrogram_label_offset_seconds'],
    )
    progress = tqdm(columns, total=df.shape[0], desc="Expanding Parquet Dict")
    for spectrogram_id, spectrogram_sub_id, label_offset_seconds in progress:
        if spectrogram_id not in parquet_dict:
            continue  # spectrogram was not loaded (e.g. only a random subset is in memory)

        start_row = int(label_offset_seconds) + 1
        # .iloc makes the positional row slice explicit, whatever index the frame has
        selected_data = parquet_dict[spectrogram_id].iloc[start_row:start_row + window_rows]

        expanded_parquet_dict[f"{spectrogram_id}_{spectrogram_sub_id}"] = selected_data

    return expanded_parquet_dict

# Build the "<spectrogram_id>_<sub_id>" -> rows subsets for every metadata row whose spectrogram was loaded.
after_expand_parquet_dict = expand_parquet_dict(parquet_dict, df)

Expanding Parquet Dict: 100%|██████████| 106800/106800 [00:12<00:00, 8609.30it/s] 


Encode the target labels (`expert_consensus`) as integers.

In [10]:
# Encode the string labels in `expert_consensus` as integers.
# Note: this adds the `expert_consensus_encoded` column to `df` in place.
le = LabelEncoder()

df['expert_consensus_encoded'] = le.fit_transform(df['expert_consensus'])

In [11]:
# The position of a label in `classes_` is its encoded integer (classes_[k] is encoded as k).
le.classes_

array(['GPD', 'GRDA', 'LPD', 'LRDA', 'Other', 'Seizure'], dtype=object)

Create the DataFrame to be split: one row per subset, with its `unique_id` and `expert_consensus_encoded` label.

In [12]:
def create_matched_dataframe(expand_parquet_dict, df):
    """
    Build the (unique_id, label) table that is later handed to train_test_split.

    Every key of ``expand_parquet_dict`` has the form "<spectrogram_id>_<spectrogram_sub_id>".
    The keys are split back into their two integer parts, left-merged with ``df`` on those
    two columns, and the encoded expert consensus of each match is kept. Row order follows
    the key order of the dictionary.

    Parameters:
    - expand_parquet_dict (dict): Keys are "<spectrogram_id>_<spectrogram_sub_id>" strings;
                                  the values are not read here. Typically the output of
                                  the `expand_parquet_dict` function.
    - df (pd.DataFrame): Must contain 'spectrogram_id', 'spectrogram_sub_id' and
                         'expert_consensus_encoded'; other columns are ignored.

    Returns:
    - pd.DataFrame: Two columns, 'unique_id' (the id and sub-id joined by '_') and
                    'expert_consensus_encoded'. Because the merge is a left join, a key
                    without a match in ``df`` yields a missing label.

    Example usage:
    df_to_split = create_matched_dataframe(expand_parquet_dict, df)
    """
    id_columns = ['spectrogram_id', 'spectrogram_sub_id']
    label_column = 'expert_consensus_encoded'

    # Split each key once; only the first two pieces are used.
    key_parts = [
        key.split('_')
        for key in tqdm(list(expand_parquet_dict.keys()), desc="Creating Matched DataFrame")
    ]
    key_frame = pd.DataFrame({
        'spectrogram_id': [parts[0] for parts in key_parts],
        'spectrogram_sub_id': [parts[1] for parts in key_parts],
    }).astype({'spectrogram_id': int, 'spectrogram_sub_id': int})  # match df's integer ids

    labelled = key_frame.merge(df, on=id_columns, how='left')[id_columns + [label_column]]

    # Re-assemble the identifier from the merged integer columns.
    unique_ids = (
        labelled['spectrogram_id'].astype(str)
        + '_'
        + labelled['spectrogram_sub_id'].astype(str)
    )
    return labelled.assign(unique_id=unique_ids)[['unique_id', label_column]]

# Table of (unique_id, expert_consensus_encoded) for the expanded subsets; this is what gets split below.
df_to_split = create_matched_dataframe(after_expand_parquet_dict, df)

Creating Matched DataFrame: 100%|██████████| 613/613 [00:00<00:00, 309660.17it/s]


splitting:

In [13]:
# X holds only the subset ids; the spectrogram values are looked up from the ids later by DataMatcher.
X = df_to_split['unique_id']
y = df_to_split['expert_consensus_encoded']

In [14]:
# Sanity check: ids and labels must have the same length.
X.shape, y.shape

((613,), (613,))

In [15]:

# Stratified 80/20 split over individual subsets (rows), seeded for reproducibility.
# NOTE(review): subsets that share a spectrogram_id can land on both sides of this split,
# which may make the test scores optimistic — consider a group-aware split; confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [16]:
# Sanity check on the sizes of the train and test parts.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((490,), (123,), (490,), (123,))

In [17]:

# Default length of every flattened sample; DataMatcher truncates or zero-pads to this size.
# presumably 50 rows x 400 value columns per subset — TODO confirm against the parquet schema
EXPECTED_NUM_FEATURES = 20000

class DataMatcher(BaseEstimator, TransformerMixin):
    """
    Transformer that turns subset ids into fixed-length feature vectors.

    For every id passed to ``transform`` the matching DataFrame is looked up in
    ``expand_parquet_dict``, its first row and first column are dropped, and the remaining
    values are flattened row by row into a 1D array. The array is then truncated or
    right-padded with zeros so that every sample has exactly ``expected_num_features``
    values. An id that is not in the dictionary becomes an all-zero vector. NaNs present
    in the data are passed through unchanged (a later pipeline step may impute them).

    Parameters:
    - expand_parquet_dict (dict): Maps unique ids to DataFrames with the spectrogram data
                                  of that subset.
    - expected_num_features (int): Length of each output vector. Defaults to the
                                   module-level EXPECTED_NUM_FEATURES.

    Methods:
    - fit(self, X, y=None): Learns nothing; present for compatibility with the sklearn
                            transformer API. Returns the transformer itself.
    - transform(self, X): Returns a numpy array with one row per id in X.

    Example usage:
    data_matcher = DataMatcher(expand_parquet_dict=after_expand_parquet_dict)
    X_transformed = data_matcher.transform(X_train_ids)
    """
    def __init__(self, expand_parquet_dict, expected_num_features=EXPECTED_NUM_FEATURES):
        # sklearn convention: store init arguments unmodified under the same names so that
        # get_params / set_params / clone keep working.
        self.expand_parquet_dict = expand_parquet_dict
        self.expected_num_features = expected_num_features

    def fit(self, X, y=None):
        """Stateless: nothing is learned from X; returns self."""
        return self

    def transform(self, X):
        """Return an array of shape (len(X), expected_num_features) for the ids in X."""
        n_features = self.expected_num_features
        matched_data = []
        for unique_id in X:
            data = self.expand_parquet_dict.get(unique_id)
            if data is None:
                # Unknown id: stand in with an all-zero sample.
                matched_data.append(np.zeros(n_features))
                continue

            # Drop the first row and the first column, then flatten row by row.
            flattened_data = data.iloc[1:, 1:].values.flatten()
            if len(flattened_data) > n_features:
                flattened_data = flattened_data[:n_features]
            elif len(flattened_data) < n_features:
                missing = n_features - len(flattened_data)
                flattened_data = np.pad(flattened_data, (0, missing), 'constant')
            matched_data.append(flattened_data)

        if not matched_data:
            # np.vstack raises on an empty list; return a well-formed empty matrix instead.
            return np.empty((0, n_features))
        return np.vstack(matched_data)



pipeline and Random Forest Classifier

In [18]:
# Instantiate the custom transformer with the expanded parquet dictionary
data_matcher = DataMatcher(expand_parquet_dict=after_expand_parquet_dict)

# Preprocessing steps: ids -> fixed-length feature vectors -> NaN imputation.
# Two imputation options were considered: a constant 0 (commented out) or the column mean (active).
pipeline_steps = [
    ('data_matcher', data_matcher),
    # ('imputer', SimpleImputer(strategy='constant', fill_value=0)) # alternative: fill NaNs with 0
    ('imputer', SimpleImputer(strategy='mean'))  # fill NaNs with the per-feature mean

]

# Full model: the preprocessing steps followed by a seeded random forest.
model_pipeline = Pipeline(steps=pipeline_steps + [('classifier', RandomForestClassifier(random_state=42))])

# The pipeline is fed the subset ids; DataMatcher resolves them to spectrogram values.
X_train_ids = X_train.to_numpy()
X_test_ids = X_test.to_numpy()

model_pipeline.fit(X_train_ids, y_train)

# Hard class predictions (encoded labels) for the held-out ids.
y_pred = model_pipeline.predict(X_test_ids)
print(y_pred)

[5 4 4 0 5 4 0 3 3 2 4 0 4 0 4 4 1 4 4 5 5 4 4 4 0 0 0 4 4 5 5 0 4 1 5 0 1
 4 4 4 5 1 1 4 4 4 2 1 4 5 5 4 4 4 4 3 4 4 1 0 4 4 0 4 1 4 4 5 4 1 5 4 5 4
 4 3 4 4 0 2 4 0 4 0 4 4 4 0 2 4 2 1 1 0 1 0 4 2 4 5 4 5 4 5 0 3 2 4 5 4 2
 5 0 2 4 4 4 0 4 4 4 1 3]


metrics:

In [19]:

# Hard-label metrics on the held-out split; the class rows in the report are the encoded labels (see le.classes_).
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Accuracy: 0.6585365853658537
              precision    recall  f1-score   support

           0       0.90      0.75      0.82        24
           1       0.54      0.64      0.58        11
           2       0.78      0.39      0.52        18
           3       1.00      0.86      0.92         7
           4       0.51      0.81      0.62        36
           5       0.78      0.52      0.62        27

    accuracy                           0.66       123
   macro avg       0.75      0.66      0.68       123
weighted avg       0.71      0.66      0.66       123



In [20]:
# Class distribution of the test split, by encoded label.
y_test.value_counts()

4    36
5    27
0    24
2    18
1    11
3     7
Name: expert_consensus_encoded, dtype: int64

In [21]:
# Class probabilities for the held-out ids; the columns follow model_pipeline.classes_.
y_probs = model_pipeline.predict_proba(X_test_ids)

# Pass the classifier's label order explicitly: without `labels`, log_loss infers the
# classes from y_test alone and fails if a class is absent from the test split.
logloss = log_loss(y_test, y_probs, labels=model_pipeline.classes_)
print(f"Log Loss: {logloss}")

Log Loss: 0.8373231601295982
