In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import *
from statistics import quantiles
# Turn off pandas' chained-assignment warning for the rest of the notebook.
pd.set_option('mode.chained_assignment', None)

# Folder holding the competition csv files read below.
p = '../input/hms-harmful-brain-activity-classification/'

# Columns kept from train.csv: the two ids, the six vote counts and the consensus label.
col = [
    'eeg_id', 'spectrogram_id',
    'seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote',
    'expert_consensus',
]

# One row per eeg_id (the first occurrence wins), re-indexed from 0.
train = (
    pd.read_csv(p + 'train.csv')[col]
    .drop_duplicates(subset=['eeg_id'])
    .reset_index(drop=True)
)
test = pd.read_csv(p + 'test.csv')
sub = pd.read_csv(p + 'sample_submission.csv')

In [None]:
# Load the recording of the first eeg_id in `train` and keep rows 4000-5999,
# the same window getStats uses. (Original author's note on this window: "200 * 5".)
df = pd.read_parquet(
    f'../input/hms-harmful-brain-activity-classification/train_eegs/{train.eeg_id[0]}.parquet'
).iloc[4_000:6_000]
# One subplot per column; the return value is bound to `_`.
_ = df.plot(subplots=True)

In [None]:
def getStats(path, ids, idname, start=4_000, stop=6_000):
    """Build one row of summed-channel EEG samples per id.

    For every id the file ``path + str(id) + '.parquet'`` is read, rows
    ``start:stop`` are selected by position, and the 19 listed channel
    columns are summed per row.

    Parameters
    ----------
    path : str
        Directory prefix (including the trailing separator) of the parquet files.
    ids : iterable
        Identifiers; each one is turned into a file name with ``str(id)``.
    idname : str
        Name of the first output column, which holds the id.
    start, stop : int
        Positional row window to extract (defaults keep the original 4000-5999).

    Returns
    -------
    pandas.DataFrame
        One row per id with columns ``[idname, 'mean_<start>', ..., 'mean_<stop-1>']``.
        Despite the ``mean_`` prefix the values are per-row *sums* over the
        channels; the names are kept so previously saved outputs stay compatible.
        Ids whose file cannot be read or processed get an all-NaN row, and
        recordings shorter than the window are NaN-padded at the end.
    """
    import warnings

    # Loop-invariant: the channel list and the header do not depend on the file,
    # so the header exists even when `ids` is empty or every read fails.
    cols = ['Fp1', 'F3', 'C3', 'P3', 'F7', 'T3', 'T5', 'O1', 'Fz', 'Cz', 'Pz',
            'Fp2', 'F4', 'C4', 'P4', 'F8', 'T4', 'T6', 'O2']  # , 'EKG'
    n_samples = stop - start
    head = [idname] + ['mean_' + str(i) for i in range(start, stop)]

    all_data = []
    n_failed = 0
    for id_ in tqdm(ids):
        values = np.full(n_samples, np.nan)
        try:
            df = pd.read_parquet(path + str(id_) + '.parquet').iloc[start:stop]
            summed = df[cols].sum(axis=1).values
            values[:len(summed)] = summed
        except Exception:
            # Best effort: keep the id with NaN features. Catching Exception
            # (not a bare except) lets Ctrl-C still interrupt the loop.
            values[:] = np.nan
            n_failed += 1
        all_data.append([id_] + list(values))

    if n_failed:
        warnings.warn(
            f'getStats: {n_failed} of {len(all_data)} files could not be processed; '
            'their features are NaN',
            stacklevel=2,
        )
    return pd.DataFrame(all_data, columns=head)

def getStats2(path, ids, idname):
    """Build one row of flattened spectrogram values per id.

    For every id the file ``path + str(id) + '.parquet'`` is read, the rows
    whose ``time`` value lies in ``range(295, 306)`` are kept, and the first
    six of those rows (all columns except the first one) are flattened
    row by row into a single feature vector.

    Parameters
    ----------
    path : str
        Directory prefix (including the trailing separator) of the parquet files.
    ids : iterable
        Identifiers; each one is turned into a file name with ``str(id)``.
    idname : str
        Name of the first output column, which holds the id.

    Returns
    -------
    pandas.DataFrame
        One row per id with columns ``[idname, '<col>_0', ..., '<col>_5']``
        (row index varies slowest). The header is taken from the first file
        that is read successfully. Ids whose file cannot be read or processed
        get an all-NaN row; files with fewer than six matching rows are
        NaN-padded at the end. For empty ``ids`` a frame with only the
        ``idname`` column is returned.

    Raises
    ------
    ValueError
        If ``ids`` is non-empty but no file could be processed, because the
        feature names are then unknown.
    """
    head = None
    collected = []  # (id, flattened values) or (id, None) when the file failed
    for id_ in tqdm(ids):
        try:
            df = pd.read_parquet(path + str(id_) + '.parquet')
            df = df[df['time'].isin(range(295, 306))].reset_index(drop=True)
            cols = df.columns[1:]  # remove time column
            values = list(df[:6][cols].values.flatten())
            if head is None:
                head = [idname] + ['_'.join([c, str(i)]) for i in range(6) for c in cols]
        except Exception:
            # Best effort: remember the id and fill it with NaN once the row
            # width is known. Exception (not a bare except) keeps Ctrl-C working.
            values = None
        collected.append((id_, values))

    if head is None:
        if collected:
            raise ValueError(
                'getStats2: none of the %d files under %r could be processed'
                % (len(collected), path)
            )
        return pd.DataFrame(columns=[idname])

    # The width comes from the data instead of a hard-coded column count, so
    # the NaN rows always match the header.
    n_features = len(head) - 1
    all_data = []
    for id_, values in collected:
        if values is None:
            values = []
        padding = [np.nan] * (n_features - len(values))
        all_data.append([id_] + values + padding)
    return pd.DataFrame(all_data, columns=head)

In [None]:
%%time
# EEG window features, one row per eeg_id, for the train and test sets.
trainx = getStats(
    path='../input/hms-harmful-brain-activity-classification/train_eegs/',
    ids=train['eeg_id'].to_numpy(),
    idname='eeg_id',
)
testx = getStats(
    path='../input/hms-harmful-brain-activity-classification/test_eegs/',
    ids=test['eeg_id'].to_numpy(),
    idname='eeg_id',
)

# Spectrogram features, one row per spectrogram_id, in the same row order.
trainx2 = getStats2(
    path='../input/hms-harmful-brain-activity-classification/train_spectrograms/',
    ids=train['spectrogram_id'].to_numpy(),
    idname='spectrogram_id',
)
testx2 = getStats2(
    path='../input/hms-harmful-brain-activity-classification/test_spectrograms/',
    ids=test['spectrogram_id'].to_numpy(),
    idname='spectrogram_id',
)

In [None]:
# Place the EEG and spectrogram features side by side, matched on the row index.
# NOTE: `trainx` / `testx` are overwritten with the combined frame, so running
# this cell a second time appends the spectrogram columns again.
trainx = pd.concat([trainx, trainx2], axis=1)

# Use output with different models no rerun required
trainx.to_csv('trainx.csv', index=False)
print(trainx.shape)

testx = pd.concat([testx, testx2], axis=1)

In [None]:
# Lower/upper clipping margin applied to predicted probabilities.
# NOTE(review): 10e-15 equals 1e-14, not 1e-15 - confirm which value is intended.
epsilon = 10e-15


def kl_divergence(solution, submission, micro=True):
    """Kullback-Leibler divergence between target and predicted distributions.

    Each cell contributes ``y * log(y / p)``, where ``y`` is the value from
    ``solution`` and ``p`` the matching value from ``submission`` clipped to
    ``[epsilon, 1 - epsilon]``. Cells with ``y == 0`` contribute 0.

    Parameters
    ----------
    solution : pandas.DataFrame
        Target values; its columns define which columns are scored.
    submission : pandas.DataFrame
        Predicted values; must contain every column of ``solution`` and is
        matched to it on the index.
    micro : bool
        True: sum the contributions per row, then average over the rows.
        False: average per column, then average over the columns.

    Returns
    -------
    float
        The averaged divergence. Neither input frame is modified.
    """
    # Work on converted copies so the caller's frames stay intact.
    truth = solution.astype(float)
    pred = submission[truth.columns].astype(float).clip(lower=epsilon, upper=1 - epsilon)

    nonzero = truth != 0
    # Where the target is 0 the ratio is forced to 1, so the term is 0 * log(1) = 0
    # and log(0) is never evaluated.
    ratio = (truth / pred).where(nonzero, 1.0)
    divergence = truth * np.log(ratio)

    # Aggregate only after every column has been transformed.
    if micro:
        return np.average(divergence.sum(axis=1))
    return np.average(divergence.mean())

In [None]:
# Feature columns: every column of trainx except the ids / label name.
xcol = [c for c in trainx.columns if c not in ['eeg_id','spectrogram_id', 'expert_consensus']]
# Target columns: the six vote columns of train.
ycol = [c for c in train.columns if c not in ['eeg_id','spectrogram_id', 'expert_consensus']]

# Consensus label -> name of the vote column it corresponds to.
cd = {'Seizure':'seizure_vote', 'GPD':'gpd_vote', 'LRDA':'lrda_vote', 'Other':'other_vote', 'GRDA':'grda_vote', 'LPD':'lpd_vote'}
consensus_col = train['expert_consensus'].map(cd)
if consensus_col.isna().any():
    # Also triggers when this cell is run twice: the labels are then already
    # vote-column names and no longer keys of `cd`. `train` is left untouched.
    raise ValueError('expert_consensus contains labels that are not keys of cd '
                     '(has this cell already been run?)')
train['expert_consensus'] = consensus_col

# adding weight to expert consensus: +10 votes on the consensus class of each row.
# One .loc mask per class replaces the row-by-row chained assignment
# `train[c][i] = ...`, which writes to a temporary under pandas Copy-on-Write.
for c in ycol:
    train.loc[train['expert_consensus'] == c, c] += 10

# Turn the vote counts into per-row proportions.
ysum = train[ycol].sum(axis=1)
for c in ycol:
    train[c] = (train[c] / ysum).astype(np.float64)

In [None]:
# Hold out 30% of the rows, stratified on the consensus label, for validation.
features = trainx[xcol].fillna(0)
x1, x2, y1, y2 = model_selection.train_test_split(
    features,
    train[ycol],
    test_size=0.3,
    random_state=11,
    stratify=train.expert_consensus,
)

model = ensemble.ExtraTreesRegressor(
    n_estimators=400,
    max_depth=None,
    max_features='sqrt',
    n_jobs=-1,
    random_state=1,
    verbose=0,
)
model.fit(x1, y1)

# Validation predictions, rescaled so that every row sums to 1, then scored.
pred = pd.DataFrame(model.predict(x2), columns=ycol)
ysum = pred.sum(axis=1)
pred = pred.div(ysum, axis=0).astype(np.float64)
score = kl_divergence(y2.reset_index(drop=True), pred)
print(score)

# Refit on all training rows and predict the test set, rescaled the same way.
model.fit(features, train[ycol])
sub = pd.DataFrame(model.predict(testx[xcol].fillna(0)), columns=ycol)
ysum = sub.sum(axis=1)
sub = sub.div(ysum, axis=0).astype(np.float64)
sub['eeg_id'] = test['eeg_id']
sub.to_csv('submission.csv', index=False)
sub.head()

In [None]:
# The 40 features with the highest importance in the fitted model, as horizontal bars.
fe = (
    pd.DataFrame({'features': xcol, 'importance': model.feature_importances_})
    .sort_values(by=['importance'], ascending=False)
    .head(40)
)
fe.plot(kind='barh', x='features', y='importance')

# Ｈ𝐀𝑷𝑷𝓎 🇰𝗮𝘨𝘨🇱𝖎Ｎɢ 💯