In [None]:
import pandas as pd, numpy as np, os
import torch
import torch.nn as nn
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Root folder of the competition dataset; the other input paths are built from it.
base_path = "/kaggle/input/hms-harmful-brain-activity-classification/"

# Label table.
train_csv = pd.read_csv(base_path + "train.csv")

# One sample EEG and one sample spectrogram; later cells read their column
# names to build the feature names.
train_eeg1 = pd.read_parquet(base_path + "train_eegs/1628180742.parquet")
train_spectrogram1 = pd.read_parquet(base_path + "train_spectrograms/2147388374.parquet")

# The last six columns of train.csv are used as the target columns.
targets = train_csv.columns[-6:]

# Integer class id for each label string (position in this list == id).
tars = {
    label: class_id
    for class_id, label in enumerate(['Seizure', 'LPD', 'GPD', 'LRDA', 'GRDA', 'Other'])
}

In [None]:
# Collapse train_csv to one row per eeg_id.
by_eeg = train_csv.groupby('eeg_id')

# First spectrogram id, min/max label offset and first patient id per EEG.
train = by_eeg.agg(
    spec_id=('spectrogram_id', 'first'),
    min_time=('spectrogram_label_offset_seconds', 'min'),
    max_time=('spectrogram_label_offset_seconds', 'max'),
    patient_id=('patient_id', 'first'),
)

# Sum the six target columns over all rows of each EEG ...
target_totals = by_eeg[targets].sum()
for t in targets:
    train[t] = target_totals[t].to_numpy()

# ... then rescale every row so the six values sum to 1.
y_train = train[targets].to_numpy()
y_train = y_train / y_train.sum(axis=1, keepdims=True)
train[targets] = y_train

# Hard label: the first expert_consensus value seen for the EEG.
train['target'] = by_eeg['expert_consensus'].first()

# Turn eeg_id from the index back into an ordinary column.
train = train.reset_index()

In [None]:
# Preview the first rows of the per-EEG `train` table.
train.head()

In [None]:
%%time
GEN_FEATS=False
from tqdm.notebook import trange
from tqdm import tqdm
import time
from scipy.stats import kurtosis, skew, entropy
import gc
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

PATH = "/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/"

if GEN_FEATS:
    # Feature names: for each statistic, one name per spectrogram column
    # (the first column of the sample spectrogram is skipped).
    stat_names = ["mean", "std", "kurtosis", "skew", "min", "max", "energy", "entropy"]
    cols = train_spectrogram1.columns[1:]
    feats = [f"{c}_{stat}" for stat in stat_names for c in cols]
    print(f"Loading {len(feats)} Spectogram Features....")
    del cols; gc.collect()

    # Column-wise (axis=0) NaN-aware reducers, in the same order as the first
    # seven entries of stat_names. Entropy is handled after the loop over
    # reducers because it is computed on the NaN-filled array.
    reducers = [
        lambda s: np.nanmean(s, axis=0, keepdims=True),
        lambda s: np.nanstd(s, axis=0, keepdims=True),
        lambda s: kurtosis(s, nan_policy = "omit", axis=0, keepdims=True),
        lambda s: skew(s, nan_policy = "omit", axis=0, keepdims=True),
        lambda s: np.nanmin(s, axis=0, keepdims=True),
        lambda s: np.nanmax(s, axis=0, keepdims=True),
        lambda s: np.nansum(s**2, axis=0, keepdims=True)/len(s),
    ]
    block = 400  # columns written per statistic

    data = np.zeros((len(train), len(feats)))
    for i in tqdm(range(len(train)), desc="Generating features"):
        spec_id = str(train.iloc[i,:]['spec_id'])
        spec = pd.read_parquet(PATH+spec_id+".parquet")
        spec = spec.drop(['time'], axis=1).values

        for k, reduce_fn in enumerate(reducers):
            data[i, k*block:(k+1)*block] = reduce_fn(spec)

        # entropy: NaNs are replaced by 0 first
        spec = np.nan_to_num(spec)
        data[i, 7*block:8*block] = entropy(spec, axis=0 )

    train_data = pd.DataFrame(data, columns=feats)
    train_data.to_parquet("./heavydata.parquet", compression="gzip")

LOAD_SPEC = True
if LOAD_SPEC:
    print("Loading 3200 Spectogram Features....")
    data = pd.read_parquet("/kaggle/input/spectogram-data/heavydata.parquet")
    feats = data.columns.values
    data = data.values
    # Fill NaNs only in the kurtosis, skew and entropy column blocks.
    for lo, hi in ((800, 1200), (1200, 1600), (2800, 3200)):
        data[:, lo:hi] = np.nan_to_num(data[:, lo:hi])

    print("Loaded all the features from spectograms")

# Statistics used above: np.nanmean / nanstd / nanmin / nanmax / nansum and
# scipy.stats kurtosis / skew / entropy.
# kurtosis and skew are called with nan_policy='omit' and keepdims=True.
# TODO: confirm that axis=0 is the intended reduction axis.

In [None]:
def get_eeg(eeg_id, test=False):
    """Load one EEG parquet file and return its central 10,000-row window.

    Parameters
    ----------
    eeg_id : str or int
        File stem of the recording; ``<eeg_id>.parquet`` is read.
    test : bool, default False
        Read from the ``test_eegs`` folder instead of ``train_eegs``.

    Returns
    -------
    numpy.ndarray
        The centred window with the last column of the file dropped.
        Recordings shorter than 10,000 rows are returned whole.
    """
    window = 10_000
    folder = "test_eegs" if test else "train_eegs"
    eeg_dir = "/kaggle/input/hms-harmful-brain-activity-classification/" + folder + "/"
    eeg = pd.read_parquet(eeg_dir + str(eeg_id) + ".parquet")

    # Clamp at 0: for a recording shorter than the window the unclamped start
    # is negative, and a negative iloc start silently selects a short
    # fragment from the END of the recording instead of the whole thing.
    start = max(0, (eeg.shape[0] - window) // 2)
    return eeg.iloc[start:start + window, :-1].values

In [None]:
%%time
GEN_EEG_FEATS = False
if GEN_EEG_FEATS:
    # Feature names: for each statistic, one name per EEG column (the last
    # column of the sample EEG is skipped, matching get_eeg).
    cols = train_eeg1.columns[:-1]
    stat_names = ("mean", "std", "kurtosis", "skew", "min", "max", "energy", "entropy")
    feats = [f"{c}_{stat}" for stat in stat_names for c in cols]

    eeg_data = np.zeros((len(train), len(feats)))

    for i in trange(len(train)):
        eeg_id = str(train.iloc[i,:]['eeg_id'])
        # Deliberately NOT named `data`: `data` is the global feature matrix
        # the training cells read, and reusing the name here would replace it
        # with the last raw EEG window whenever this branch runs.
        eeg_window = get_eeg(eeg_id)

        # The slices assume 19 columns per statistic — TODO confirm against
        # len(train_eeg1.columns) - 1.
        #mean
        eeg_data[i, :19] = np.nanmean(eeg_window, axis=0, keepdims=True)
        #std
        eeg_data[i, 19:19*2] = np.nanstd(eeg_window, axis=0, keepdims=True)
        #kurtosis
        eeg_data[i, 19*2:19*3] = kurtosis(eeg_window, axis=0,
                                          nan_policy='omit', keepdims=True)
        #skew
        eeg_data[i, 19*3:19*4] = skew(eeg_window, axis=0,
                                      nan_policy='omit', keepdims=True)
        #min
        eeg_data[i, 19*4:19*5] = np.nanmin(eeg_window, axis=0, keepdims=True)
        #max
        eeg_data[i, 19*5:19*6] = np.nanmax(eeg_window, axis=0, keepdims=True)
        #energy
        eeg_data[i, 19*6:19*7] = np.nansum(eeg_window**2, axis=0, keepdims=True)/len(eeg_window)
        #entropy (computed after NaNs are replaced by 0)
        eeg_data[i, 19*7:19*8] = entropy(np.nan_to_num(eeg_window), axis=0)

    train_data = pd.DataFrame(eeg_data)
    train_data.columns = feats
    train_data.to_parquet("./eeg_data.parquet", compression='gzip')

LOAD_EEG = False
if LOAD_EEG:
    print(f"Loading {19*8} EEG features....")
    data = pd.read_parquet("/kaggle/input/spectogram-data/eeg_data.parquet")
    feats = data.columns.values
    data = data.values
    # nan_to_num replaces NaN with 0 and +/-inf with large finite numbers.
    data = np.nan_to_num(data)
    print("Loaded all the features")

In [None]:
# Integer class label per EEG: each `target` string looked up in `tars`.
Y = train['target'].map(tars).to_numpy()

In [None]:
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier

In [None]:
def KL_loss(p,q):
    """Mean over rows of sum(p * (log p - log_softmax(q))).

    Parameters
    ----------
    p : torch.Tensor
        Target probabilities, one distribution per row. Values are clamped
        to [1e-15, 1 - 1e-15] before the logarithm is taken.
    q : torch.Tensor
        Unnormalised scores (logits); ``log_softmax`` over dim 1 is applied
        to them here, so callers must NOT pass plain probabilities.

    Returns
    -------
    torch.Tensor
        Scalar tensor holding the batch-mean divergence.
    """
    tiny = 1e-15
    target = torch.clamp(p, min=tiny, max=1 - tiny)
    log_pred = q.log_softmax(dim=1)
    per_row = (target * (target.log() - log_pred)).sum(dim=1)
    return per_row.mean()

def AOS(total_error):
    """Return the reciprocal of ``total_error`` (smaller error -> larger value)."""
    inverse = 1 / total_error
    return inverse

In [None]:
%%time
from tqdm.auto import tqdm
from sklearn.model_selection import GroupKFold
train_cat=True
if train_cat:
    print("Training CatBoost model")
    n_splits=5
    # Folds are grouped by patient_id so one patient never appears in both
    # the training and the validation part of a fold.
    gkf = GroupKFold(n_splits=n_splits)
    device = 'GPU' if torch.cuda.is_available() else 'CPU'
    probs=[]   # per-fold predicted class probabilities
    true=[]    # per-fold soft targets (rows of y_train)
    best_score = np.inf
    print(f"Running on {device}")
    folds = gkf.split(data, Y, train.patient_id)
    for i, (train_index, valid_index) in enumerate(tqdm(folds, total=n_splits)):
        cat_model = CatBoostClassifier(task_type=device, loss_function="MultiClass")
        train_pool = Pool(data = data[train_index, :], label = Y[train_index])
        valid_pool = Pool(data = data[valid_index, :], label = Y[valid_index])

        cat_model.fit(train_pool,
                 verbose=100,
                 eval_set=valid_pool)

        prob = cat_model.predict_proba(valid_pool)
        probs.append(prob)
        true.append(y_train[valid_index,:])

        # KL_loss(p, q) treats p as the TARGET distribution and applies
        # log_softmax to q. So the soft targets go first, and the predicted
        # probabilities are passed as log-probabilities (log_softmax of
        # log(prob) is log(prob) again when each row sums to 1). The clip
        # keeps log() finite for zero probabilities.
        fold_target = torch.tensor(y_train[valid_index, :])
        fold_log_prob = torch.log(torch.clip(torch.tensor(prob), 1e-15, 1.0))
        score = float(KL_loss(fold_target, fold_log_prob))
        print(f"Score : {score}")
        if score<best_score:
            # Only the best-scoring fold model is kept on disk.
            cat_model.save_model("./bestCATboost.cat")
            print(f"saved the model!")
            best_score=score

        del train_pool, valid_pool, prob
        gc.collect()

In [None]:
import matplotlib.pyplot as plt
TOP = 25
if train_cat:
    # `cat_model` is whichever fold model the training loop fitted last.
    feature_importance = cat_model.feature_importances_
    sorted_idx = np.argsort(feature_importance)

    # Keep the TOP largest importances, in ascending order for the bar chart.
    top_idx = sorted_idx[-TOP:]
    bar_pos = np.arange(len(sorted_idx))[-TOP:]

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(bar_pos, feature_importance[top_idx], align='center')
    ax.set_yticks(bar_pos)
    ax.set_yticklabels(np.array(feats)[top_idx])
    ax.set_title(f'Feature Importance - Top {TOP}')
    plt.show()

In [None]:
def gen_valid_pool(X, y):
    """Pair every row of ``X`` with its label.

    Parameters
    ----------
    X : 2-D array
        Indexed as ``X[i, :]``.
    y : sequence
        One label per row of ``X``.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(len(X), 2)``; column 0 holds the feature
        row, column 1 the label.
    """
    n_rows = len(X)
    # Build the object array explicitly. np.array() on a list of
    # (row, label) tuples is a ragged nested sequence, which NumPy >= 1.24
    # rejects with ValueError.
    pool = np.empty((n_rows, 2), dtype=object)
    for i in range(n_rows):
        pool[i, 0] = X[i,:]
        pool[i, 1] = y[i]
    return pool

In [None]:
train_xgb=False
if train_xgb:
    print("Training XGBoost")
    probs=[]   # per-fold predicted class probabilities
    true=[]    # per-fold soft targets (rows of y_train)
    n_splits=5
    gkf = GroupKFold(n_splits=n_splits)
    tree_method = 'gpu_hist' if torch.cuda.is_available() else 'hist'
    print(f"Running on {tree_method}")
    # Folds are grouped by patient_id.
    fold_iter = tqdm(gkf.split(data, Y, train.patient_id))
    for i, (train_index, valid_index) in enumerate(fold_iter):
        xgb_model = XGBClassifier(tree_method=tree_method)

        # Fit on the training part of the fold ...
        xgb_model.fit(X = data[train_index, :], y = Y[train_index],
                 verbose=100)

        # ... and store out-of-fold probabilities next to their soft targets.
        prob = xgb_model.predict_proba(data[valid_index, :])
        probs.append(prob)
        true.append(y_train[valid_index, :])

        gc.collect()
    

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib
train_rf = False
if train_rf:
    print("Training Random Forest")
    probs=[]   # per-fold predicted class probabilities
    true=[]    # per-fold soft targets (rows of y_train)
    n_splits=5
    # Folds are grouped by patient_id.
    gkf = GroupKFold(n_splits=n_splits)
    device = 'GPU' if torch.cuda.is_available() else 'CPU'
    best_score = np.inf
    folds = gkf.split(data, Y, train.patient_id)
    for i, (train_index, valid_index) in enumerate(tqdm(folds, total=n_splits)):
        rfc_model = RandomForestClassifier(n_estimators=200, 
                                     n_jobs=-1, 
                                    verbose=100,
                                     random_state=17)

        train_x, val_x, train_y, val_y = data[train_index,:], data[valid_index, :], Y[train_index], Y[valid_index]

        rfc_model.fit(train_x, train_y)
        prob = rfc_model.predict_proba(val_x)
        probs.append(prob)
        true.append(y_train[valid_index, :])

        # KL_loss(p, q) treats p as the TARGET distribution and applies
        # log_softmax to q. So the soft targets go first and the predictions
        # are passed as log-probabilities. A forest can output exact zeros,
        # hence the clip before the log.
        fold_target = torch.tensor(y_train[valid_index, :])
        fold_log_prob = torch.log(torch.clip(torch.tensor(prob), 1e-15, 1.0))
        score = float(KL_loss(fold_target, fold_log_prob))
        print(f"Score : {score}")
        if score<best_score:
            # Only the best-scoring fold model is kept on disk.
            joblib.dump(rfc_model, "./rfc_model.joblib")
            print(f"saved the model!")
            best_score=score

        del train_x, val_x, train_y, val_y
        gc.collect()
            

In [None]:
def _fold_kl(fold):
    """KL divergence of fold `fold`'s stored predictions from its soft targets."""
    # KL_loss(p, q) expects the target distribution first and logits second,
    # so predictions are passed as clipped log-probabilities.
    log_prob = torch.log(torch.clip(torch.tensor(probs[fold]), 1e-15, 1.0))
    return float(KL_loss(torch.tensor(true[fold]), log_prob))


# NOTE(review): every training cell re-creates `probs` and `true`, so these
# lists only hold the folds of the model trained most recently. The
# "cat then xgb" indexing below therefore finds no folds past n_splits unless
# the training cells are changed to keep separate lists — confirm before
# enabling more than one model.
if train_cat:
    cat_err = sum(_fold_kl(i) for i in range(n_splits)) / n_splits

if train_xgb and train_cat:
    xgb_err = sum(_fold_kl(i) for i in range(n_splits, len(probs))) / n_splits

    cat_AOS, xgb_AOS = AOS(cat_err), AOS(xgb_err)

if train_rf:
    rf_error = sum(_fold_kl(i) for i in range(n_splits)) / n_splits

In [None]:
# Show the mean per-fold CatBoost score; `cat_err` exists only when train_cat is True.
cat_err

In [None]:
if LOAD_SPEC:
    test_path = "/kaggle/input/hms-harmful-brain-activity-classification/test_spectrograms/"
    test_csv = pd.read_csv(base_path+"test.csv")
    test_data = np.zeros((len(test_csv), len(feats)))

    # (first column, reducer) for each 400-column statistic block except
    # entropy, which is computed on the NaN-filled array below.
    spec_stats = [
        (0,    lambda s: np.nanmean(s, axis=0, keepdims=True)),
        (400,  lambda s: np.nanstd(s, axis=0, keepdims=True)),
        (800,  lambda s: kurtosis(s, nan_policy = "omit", axis=0)),
        (1200, lambda s: skew(s, nan_policy = "omit", axis=0)),
        (1600, lambda s: np.nanmin(s, axis=0, keepdims=True)),
        (2000, lambda s: np.nanmax(s, axis=0, keepdims=True)),
        (2400, lambda s: np.nansum(s**2, axis=0, keepdims=True)/len(s)),
    ]

    for k in range(len(test_csv)):
        spec_id = str(test_csv.iloc[k]['spectrogram_id'])
        spec = pd.read_parquet(test_path+spec_id+".parquet")
        spec = spec.drop(['time'], axis=1).values

        for first_col, reduce_fn in spec_stats:
            test_data[k, first_col:first_col+400] = reduce_fn(spec)

        #entropy
        spec = np.nan_to_num(spec)
        test_data[k, 2800:3200] = entropy(spec, axis=0)

    # NOTE(review): every NaN is filled here, while the training matrix only
    # has its kurtosis/skew/entropy blocks filled — confirm this is intended.
    test_data = np.nan_to_num(test_data)

    sample_sub = pd.read_csv("/kaggle/input/hms-harmful-brain-activity-classification/sample_submission.csv", index_col=False)

In [None]:
if LOAD_EEG:
    test_path = "/kaggle/input/hms-harmful-brain-activity-classification/test_eegs/"
    test_csv = pd.read_csv(base_path+"test.csv")
    test_data = np.zeros((len(test_csv), len(feats)))

    for i in range(len(test_csv)):
        eeg_id = str(test_csv.iloc[i]['eeg_id'])
        # Deliberately NOT named `data`: `data` is the global training
        # feature matrix, and reusing the name here would replace it with a
        # raw EEG window, breaking any later re-run of a training cell.
        eeg_window = get_eeg(eeg_id, test=True)

        # The slices assume 19 columns per statistic — TODO confirm.
        #mean
        test_data[i, :19] = np.nanmean(eeg_window, axis=0, keepdims=True)
        #std
        test_data[i, 19:19*2] = np.nanstd(eeg_window, axis=0, keepdims=True)
        #kurtosis
        test_data[i, 19*2:19*3] = kurtosis(eeg_window, axis=0,
                                           nan_policy='omit')
        #skew
        test_data[i, 19*3:19*4] = skew(eeg_window, axis=0,
                                       nan_policy='omit')
        #min
        test_data[i, 19*4:19*5] = np.nanmin(eeg_window, axis=0, keepdims=True)
        #max
        test_data[i, 19*5:19*6] = np.nanmax(eeg_window, axis=0, keepdims=True)
        #energy
        test_data[i, 19*6:19*7] = np.nansum(eeg_window**2, axis=0, keepdims=True)/len(eeg_window)
        #entropy (computed after NaNs are replaced by 0)
        test_data[i, 19*7:19*8] = entropy(np.nan_to_num(eeg_window), axis=0)

    # Replace any remaining NaN / +/-inf with finite numbers.
    test_data = np.nan_to_num(test_data)

    sample_sub = pd.read_csv("/kaggle/input/hms-harmful-brain-activity-classification/sample_submission.csv", index_col=False)
    

In [None]:
# Sanity check: expect (number of rows in test.csv, number of features).
test_data.shape

In [None]:
if train_cat:
    # Only one CatBoost checkpoint is written during training (the best
    # fold), so there is nothing to ensemble: load it once and predict once
    # instead of averaging n_splits copies of the same prediction.
    model = CatBoostClassifier(task_type=device)
    model.load_model("./bestCATboost.cat")

    test_pool = Pool(data = test_data)
    cat_pred = model.predict_proba(test_pool)

    print(cat_pred)

In [None]:
if train_xgb:
    # `xgb_model` is the single model left over from the last training fold;
    # calling it n_splits times and averaging returns the same prediction,
    # so predict once.
    xgb_pred = xgb_model.predict_proba(test_data)
    print(xgb_pred)

In [None]:
if train_rf:
    # One model file exists on disk, so load it and predict a single time.
    best_rf_model = joblib.load("./rfc_model.joblib")

    # Work on a copy so the shared `test_data` matrix is left untouched for
    # the other models and for cell re-runs.
    rf_features = test_data.copy()
    rf_features[np.isneginf(rf_features)] = np.nan
    # NOTE(review): columns 19*7:19*8 are the entropy block only in the
    # 152-column EEG layout; with the 3200-column spectrogram layout the same
    # slice falls inside the mean block — confirm before enabling train_rf.
    rf_features[:, 19*7:19*8] = 0

    rf_pred = best_rf_model.predict_proba(rf_features)
    print(rf_pred)

In [None]:
# Choose the prediction matrix to submit. Later branches override earlier ones.
if train_cat:
    final_pred = cat_pred

if train_xgb and train_cat:
    # Equal-weight blend of the two models.
    final_pred = (cat_pred + xgb_pred)/2
    # Renormalise each row so it sums to 1. The matrix-wide sum is about the
    # number of rows, not 1, so correcting a single cell by `sum() - 1` would
    # corrupt the first row whenever there is more than one test row.
    final_pred = final_pred / final_pred.sum(axis=1, keepdims=True)
    print(final_pred.sum(axis=1))

if train_rf:
    final_pred = rf_pred


In [None]:
import shutil
# Remove training artefacts. Both removals tolerate a missing target so the
# cell can be re-run without raising.
if train_cat:
    shutil.rmtree('/kaggle/working/catboost_info', ignore_errors=True)
    #os.remove("./bestCATboost.cat")

if train_rf:
    if os.path.exists("./rfc_model.joblib"):
        os.remove("./rfc_model.joblib")

# Write the predictions into the target columns of the sample submission.
sample_sub[targets]=final_pred
# Each row sums to 1, so this total should be close to the number of rows.
print(final_pred.sum())
sample_sub.to_csv("./submission.csv", index=False)
print("Shape: ", sample_sub.shape)
sample_sub.head()