Same as [CatBoost Starter - [LB 0.67]](https://www.kaggle.com/code/cdeotte/catboost-starter-lb-0-67)

**V3**
- Modified targets as Other vs All Seizures

**V5**
- Modified target weights as 0.95 and 0.05

# Load Libraries

In [None]:
import os
# Make CUDA devices 0 and 1 visible to this process (set before the GPU
# library used for training is imported further down).
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
import pandas as pd, numpy as np
import matplotlib.pyplot as plt

# Version tag embedded in the saved model file names (`CAT_v{VER}_f{fold}.cat`).
# NOTE(review): the notebook header lists changes up to V5 while VER is 2 --
# confirm whether this tag is meant to track the notebook version.
VER = 2

In [None]:
# Peek at how the raw train rows are distributed over the expert consensus labels.
df = pd.read_csv(
    '/kaggle/input/hms-harmful-brain-activity-classification/train.csv'
)
df.loc[:, 'expert_consensus'].hist()

# Load Train Data

In [None]:
df = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/train.csv')

# The last six columns of train.csv are the columns being regrouped
# (presumably the per-class expert vote counts -- verify against the CSV).
vote_columns = list(df.columns[-6:])

# Group 2 is the "other" class; group 1 is every remaining vote column.
# A list comprehension (instead of a set difference) keeps the CSV column
# order, so the group composition is listed identically on every run.
TARGETS_GROUP2 = ['other_vote']
TARGETS_GROUP1 = [c for c in vote_columns if c not in TARGETS_GROUP2]

# Per-row vote totals for each group. Vectorised column sums replace the
# row-wise `df.apply`; `skipna=False` keeps NaN propagation the same as the
# plain Python `sum` used before.
df['all_target_group1'] = df[TARGETS_GROUP1].sum(axis=1, skipna=False)
df['all_target_group2'] = df[TARGETS_GROUP2].sum(axis=1, skipna=False)

# From here on the model targets are the two grouped columns.
TARGETS = ['all_target_group1','all_target_group2']
print('Train shape:', df.shape )
print('Targets', list(TARGETS))
df.head()

# Create Non-Overlapping Eeg Id Train Data
The competition data description says that test data does not have multiple crops from the same `eeg_id`. Therefore we will train and validate using only 1 crop per `eeg_id`. There is a discussion about this [here][1].

[1]: https://www.kaggle.com/competitions/hms-harmful-brain-activity-classification/discussion/467021

In [None]:
# Collapse the per-crop rows of `df` into a single row per eeg_id.
by_eeg = df.groupby('eeg_id')

# First spectrogram id plus the smallest / largest label offset of each eeg_id.
train = by_eeg[['spectrogram_id','spectrogram_label_offset_seconds']].agg(
    {'spectrogram_id':'first','spectrogram_label_offset_seconds':'min'})
train.columns = ['spec_id','min']
train['max'] = by_eeg['spectrogram_label_offset_seconds'].max()
train['patient_id'] = by_eeg['patient_id'].first()

# Vote-based soft targets: 0.95 when a group received any votes, else 0.05,
# then normalised so each row sums to 1.
# (These two columns are replaced by the consensus-based values at the end
# of this cell; they are created here so the column order stays the same.)
vote_totals = by_eeg[TARGETS].sum()
for t in TARGETS:
    train[t] = np.where(vote_totals[t] >= 0.5, 0.95, 0.05)
y_data = train[TARGETS].values
y_data = y_data / y_data.sum(axis=1,keepdims=True)
train[TARGETS] = y_data

# Original expert consensus label of the first crop of each eeg_id.
train['org_target'] = by_eeg['expert_consensus'].first()

# Lower-cased class names (vote column names without the "_vote" suffix)
# that belong to group 1; everything else falls into group 2.
TARGETS_GROUP1_NAMES = [x.lower().replace("_vote","").strip() for x in TARGETS_GROUP1]
train['target'] = train['org_target'].map(
    lambda label: "Group1" if label.lower() in TARGETS_GROUP1_NAMES else "Group2")

train = train.reset_index()
print('Train non-overlapp eeg_id shape:', train.shape )

# Final soft targets derived from the consensus group: 0.95 for the row's
# own group and 0.05 for the other one.
train['all_target_group1'] = train['target'].map({'Group1': 0.95, 'Group2': 0.05})
train['all_target_group2'] = train['target'].map({'Group1': 0.05, 'Group2': 0.95})
train.head()

In [None]:
# Distribution of the original expert consensus labels, one row per eeg_id.
train['org_target'].hist()

In [None]:
# Distribution of the binary Group1 / Group2 target, one row per eeg_id.
train['target'].hist()

# Feature Engineer
In this section, we create features for our CatBoost model. 

First we need to read in all 11k train spectrogram files. Reading thousands of files takes 11 minutes with Pandas. Instead, we can read a single file from my [Kaggle dataset here][1], which contains all 11k spectrograms and loads in less than 1 minute! To use my [Kaggle dataset][1], set variable `READ_SPEC_FILES = False`. Don't forget to upvote this helpful [dataset][1] :-)

Next we need to engineer features for our CatBoost model. In this notebook, we just take the mean (over time) of each of the 400 spectrogram frequencies (using middle 10 minutes). This produces 400 features (per each unique eeg id). We can improve CV and LB score by engineering new features (and/or tuning CatBoost).

UPDATE: Version 2 creates features from `means` and `mins`. And version 2 uses `10 minute windows` and `20 second windows`.

[1]: https://www.kaggle.com/datasets/cdeotte/brain-spectrograms

In [None]:
# True: read every spectrogram parquet individually; False: load the single
# pre-built `specs.npy` file instead.
READ_SPEC_FILES = False
# True: compute the features in this notebook; False: load the pre-built
# `train.pqt` instead.
FEATURE_ENGINEER = True

In [None]:
%%time
# READ ALL SPECTROGRAMS
# Builds `spectrograms`, a dict keyed by integer spectrogram id.
PATH = '/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/'
files = os.listdir(PATH)
print(f'There are {len(files)} spectrogram parquets')

if not READ_SPEC_FILES:
    # Fast path: one .npy file holding a pickled dict (unwrapped via .item()),
    # which is why allow_pickle=True is required. Only load from a trusted source.
    spectrograms = np.load('/kaggle/input/brain-spectrograms/specs.npy',allow_pickle=True).item()
else:
    # Slow path: read each parquet, dropping its first column, and key the
    # array by the numeric part of the file name.
    spectrograms = {}
    for i,f in enumerate(files):
        if i%100==0:
            print(i,', ',end='')
        spec_id = int(f.split('.')[0])
        spectrograms[spec_id] = pd.read_parquet(f'{PATH}{f}').iloc[:,1:].values

In [None]:
%time
# ENGINEER FEATURES
import warnings
from tqdm import tqdm
warnings.filterwarnings('ignore')

# FEATURE NAMES
SPEC_COLS = pd.read_parquet(f'{PATH}1000086677.parquet').columns[1:]
FEATURES = [f'{c}_mean_10m' for c in SPEC_COLS]
FEATURES += [f'{c}_min_10m' for c in SPEC_COLS]
FEATURES += [f'{c}_mean_20s' for c in SPEC_COLS]
FEATURES += [f'{c}_min_20s' for c in SPEC_COLS]
print(f'We are creating {len(FEATURES)} features for {len(train)} rows... ',end='')

if FEATURE_ENGINEER:
    data = np.zeros((len(train),len(FEATURES)))
    for k in tqdm(range(len(train)),  total=len(train)):
        #if k%100==0: print(k,', ',end='')
        row = train.iloc[k]
        r = int( (row['min'] + row['max'])//4 ) 
        
        # 10 MINUTE WINDOW FEATURES (MEANS and MINS)
        x = np.nanmean(spectrograms[row.spec_id][r:r+300,:],axis=0)
        data[k,:400] = x
        x = np.nanmin(spectrograms[row.spec_id][r:r+300,:],axis=0)
        data[k,400:800] = x
        
        # 20 SECOND WINDOW FEATURES (MEANS and MINS)
        x = np.nanmean(spectrograms[row.spec_id][r+145:r+155,:],axis=0)
        data[k,800:1200] = x
        x = np.nanmin(spectrograms[row.spec_id][r+145:r+155,:],axis=0)
        data[k,1200:1600] = x

    train[FEATURES] = data
else:
    train = pd.read_parquet('/kaggle/input/brain-spectrograms/train.pqt')
print()
print('New train shape:',train.shape)

In [None]:
# Display the train frame after the feature columns have been attached.
train

# Train CatBoost
We use the default settings for CatBoost which are pretty good. We can tune CatBoost manually to improve CV and LB score. Note that CatBoost will automatically use both Kaggle T4 GPUs (when we add parameter `task_type='GPU'`)  for super fast training!

In [None]:
# CatBoost (model + Pool container) and gc for freeing memory between folds.
import catboost as cat, gc
from catboost import CatBoostClassifier, Pool
print('CatBoost version',cat.__version__)

In [None]:
# Display the two soft-target columns (0.95 / 0.05 per group).
train[TARGETS]

In [None]:
from sklearn.model_selection import KFold, GroupKFold

# Out-of-fold accumulators: predicted probabilities, soft targets and the
# row positions of the validation samples, appended fold by fold.
all_oof, all_true, all_index = [], [], []

# Integer class id for each group label.
TARS = {'Group1':0, 'Group2':1}

# 5 folds grouped by patient, so a patient never appears in both the train
# and the validation part of the same fold.
gkf = GroupKFold(n_splits=5)
fold_splits = gkf.split(train, train.target, train.patient_id)
for i, (train_index, valid_index) in enumerate(fold_splits):

    banner = '#'*25
    print(banner)
    print(f'### Fold {i+1}')
    print(f'### train size {len(train_index)}, valid size {len(valid_index)}')
    print(banner)

    model = CatBoostClassifier(
        loss_function='MultiClass',
        task_type='GPU',
    )

    # Hard class labels (0 / 1) are used for fitting.
    train_pool = Pool(
        data=train.loc[train_index, FEATURES],
        label=train.loc[train_index, 'target'].map(TARS),
    )
    valid_pool = Pool(
        data=train.loc[valid_index, FEATURES],
        label=train.loc[valid_index, 'target'].map(TARS),
    )

    model.fit(
        train_pool,
        eval_set=valid_pool,
        verbose=100,
    )
    model.save_model(f'CAT_v{VER}_f{i}.cat')

    # Keep this fold's validation predictions together with the soft targets
    # and row positions they belong to.
    oof = model.predict_proba(valid_pool)
    all_oof.append(oof)
    all_true.append(train.loc[valid_index, TARGETS].values)
    all_index.extend(valid_index)

    # `model` is deliberately kept: the last fold's model is inspected below.
    del train_pool, valid_pool, oof
    gc.collect()

# Stack the per-fold arrays into single (n_samples, 2) arrays.
all_oof = np.concatenate(all_oof)
all_true = np.concatenate(all_true)

# Feature Importance
Below we display the top 25 CatBoost feature importances for the last fold we trained.

In [None]:
# Number of most important features to plot.
TOP = 25

# Importances of the model left over from the last trained fold, ascending.
feature_importance = model.feature_importances_
sorted_idx = np.argsort(feature_importance)
top_idx = sorted_idx[-TOP:]
bar_positions = np.arange(len(sorted_idx))[-TOP:]

fig = plt.figure(figsize=(10, 8))
plt.barh(bar_positions, feature_importance[top_idx], align='center')
plt.yticks(bar_positions, np.array(FEATURES)[top_idx])
plt.title(f'Feature Importance - Top {TOP}')
plt.show()

In [None]:
# Sanity check: class ids (argmax of the soft targets) for the rows in
# `valid_index`, i.e. the validation rows of the last fold that was trained.
check_all_true_classes = np.argmax(train.loc[valid_index, TARGETS].values, axis=1)
check_all_true_classes

In [None]:
# First rows of the last fold's validation split.
train.iloc[valid_index].head(3)

In [None]:
# Same sanity check via the string target: map Group1/Group2 to 0/1 for the
# last fold's validation rows and show the first three ids.
# NOTE: this `all_true_classes` covers the last fold only and is overwritten
# with the full out-of-fold version in the metrics cell below.
mappings = {'Group1':0, 'Group2': 1}
all_true_classes = [mappings[x] for x in list(train.iloc[valid_index]['target'])]
str(all_true_classes[0:3])

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, log_loss
from sklearn.metrics import confusion_matrix

# Hard class ids over all out-of-fold rows: argmax of the soft targets and
# of the predicted probabilities.
all_true_classes = all_true.argmax(axis=1)
all_oof_classes = all_oof.argmax(axis=1)

# Overall accuracy and support-weighted F1 of the out-of-fold predictions.
accuracy = accuracy_score(all_true_classes, all_oof_classes)
f1 = f1_score(all_true_classes, all_oof_classes, average='weighted')
accuracy, f1

In [None]:
# Confusion matrix of the out-of-fold predictions (rows: true, columns: predicted).
conf_matrix = confusion_matrix(all_true_classes, all_oof_classes)

# One axis label per group: its vote column names with "_vote" replaced by a
# space, joined by commas.
TARGETS_NAMES = [
    ','.join(col.replace("_vote", " ") for col in group)
    for group in (TARGETS_GROUP1, TARGETS_GROUP2)
]

plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=TARGETS_NAMES,
    yticklabels=TARGETS_NAMES,
)
plt.title('Can we split well?')
plt.xlabel('Predicted Target')
plt.ylabel('True Target')
plt.show()

In [None]:
# Compare how often each class occurs in the true labels vs. the predictions.
from collections import Counter
Counter(all_true_classes), Counter(all_oof_classes)

In [None]:
# Row positions of the samples whose true class is 0 (Group1) but which were
# predicted as class 1 (Group2).
focus_ids = [
    row_pos
    for row_pos, true_class, oof_class in zip(all_index, all_true_classes, all_oof_classes)
    if true_class == 0 and oof_class == 1
]

In [None]:
# Number of Group1 samples that were predicted as Group2.
len(focus_ids)

In [None]:
# Which original consensus labels the misclassified Group1 samples carry.
train.iloc[focus_ids]['org_target'].hist()

In [None]:
# Sweep the decision threshold on the class-1 probability from 0.00 to 0.99
# and report every threshold that improves on the best accuracy so far.
best_accuracy = 0
all_true_classes = np.argmax(all_true, axis=1)  # does not depend on the threshold
for threshold_pct in range(0,100):
    t = threshold_pct/100
    # Predict class 1 only when its probability is strictly above the threshold.
    all_oof_classes = [1 if probs[1] > t else 0 for probs in all_oof]
    accuracy = accuracy_score(all_true_classes, all_oof_classes)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        print("Best found", t, best_accuracy)