In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
'''for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))'''

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# A first look at the data

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [None]:
train = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/train.csv')
print(f'Train data shape = {train.shape}')
train.head()

In [None]:
# Report how many distinct patients, spectrograms and EEG recordings appear in `train`.
for noun, id_column in (('patients', 'patient_id'),
                        ('spectrograms', 'spectrogram_id'),
                        ('EEGs', 'eeg_id')):
    print(f'Count of unique {noun}: {train[id_column].nunique(dropna=False)}')

In [None]:
# Identifier / offset columns of the train table (everything except the vote columns).
train_columns = [
    'eeg_id',
    'eeg_sub_id',
    'eeg_label_offset_seconds',
    'spectrogram_id',
    'spectrogram_sub_id',
    'spectrogram_label_offset_seconds',
    'label_id',
    'patient_id',
]

In [None]:
# Vote columns of the train table and the human-readable class names, in the same order.
TARGET_COLUMNS = ['seizure_vote','lpd_vote','gpd_vote','lrda_vote','grda_vote','other_vote']
CLASS_NAMES = ['Seizure', 'LPD', 'GPD', 'LRDA','GRDA', 'Other']
LABEL2NAME = dict(enumerate(CLASS_NAMES))          # class index -> class name
NAME2LABEL = {v:k for k, v in LABEL2NAME.items()}  # class name  -> class index

# Directory prefixes; a file is addressed as f'{PREFIX}{id}.parquet'.
EEG_PATH_TEMPL = '/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/'
SP_PATH_TEMPL = '/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/'

WIN_SIZE =  10 # length of the EEG window that is plotted, in seconds
EEG_FR = 200 # EEG sampling rate: 200 samples per second
EEG_T = WIN_SIZE*EEG_FR # number of EEG samples in one WIN_SIZE window
# Electrode pairs per chain; each EEG trace is the difference first - second.
CHAINS = {
    'LL' : [("Fp1","F7"),("F7","T3"),("T3","T5"),("T5","O1")],
    'RL' : [("Fp2","F8"),("F8","T4"),("T4","T6"),("T6","O2")],
    'LP' : [("Fp1","F3"),("F3","C3"),("C3","P3"),("P3","O1")],
    'RP' : [("Fp2","F4"),("F4","C4"),("C4","P4"),("P4","O2")]
}
SP_WIN = 600 # spectrogram window: 10 minutes = 600 seconds
EEG_WIN = 50 # EEG window: 50 seconds
EGG_WIN = EEG_WIN # misspelled original name, kept as an alias for backward compatibility

LABELED_SECS = 10


## Data visualization

In [None]:
def get_eeg_sp_data(train_row):
    """Load the EEG traces and spectrogram window referenced by one train row.

    Parameters
    ----------
    train_row
        A row of the train table. The fields read are ``eeg_id``,
        ``spectrogram_id``, ``eeg_label_offset_seconds``,
        ``spectrogram_label_offset_seconds`` and ``TARGET_COLUMNS``.

    Returns
    -------
    eeg : dict
        Chain name -> list with one Series per electrode pair in ``CHAINS``
        (first electrode minus second, forward-filled).
    sp : dict
        Chain name -> DataFrame holding the spectrogram columns whose names
        start with that chain name, restricted to the ``SP_WIN``-second window.
    targets : numpy.ndarray
        ``train_row[TARGET_COLUMNS].values``.
    """
    eeg_id = train_row.eeg_id
    sp_id = train_row.spectrogram_id

    eeg_frame = pd.read_parquet(f'{EEG_PATH_TEMPL}{eeg_id}.parquet')
    sp_frame = pd.read_parquet(f'{SP_PATH_TEMPL}{sp_id}.parquet')

    # Only the central 10 s of the 50 s EEG window are labelled, hence the +20 s.
    eeg_start = int(train_row.eeg_label_offset_seconds + 20)
    sp_start = int(train_row.spectrogram_label_offset_seconds)

    # Spectrogram: keep the rows inside the window, drop the time column,
    # then split the remaining columns by chain prefix.
    inside_window = (sp_frame.time >= sp_start) & (sp_frame.time < sp_start + SP_WIN)
    sp_window = sp_frame.loc[inside_window]
    sp_window = sp_window.loc[:, sp_window.columns != 'time']
    sp = {
        prefix: sp_window.filter(regex=f'^{prefix}', axis=1)
        for prefix in ('LL', 'RL', 'RP', 'LP')
    }

    # EEG: slice the labelled samples and build one differential trace per pair.
    first_sample = eeg_start * EEG_FR
    last_sample = (eeg_start + WIN_SIZE) * EEG_FR
    eeg_window = eeg_frame.iloc[first_sample:last_sample]

    eeg = {
        chain: [(eeg_window[first] - eeg_window[second]).ffill() for first, second in pairs]
        for chain, pairs in CHAINS.items()
    }

    return eeg, sp, train_row[TARGET_COLUMNS].values

In [None]:
example_id = 4567
exp_row = train.iloc[example_id]
exp_row

In [None]:
eeg_data, sp_data, targets = get_eeg_sp_data(exp_row)

In [None]:
eeg_data['LL'][0].shape, sp_data['LL'].shape, targets

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
 
# create objects

def plot_data(eeg_data, sp_data):
    """Plot the EEG traces (left column) next to the chain spectrograms (right column).

    Parameters
    ----------
    eeg_data : dict
        Chain name -> list of 1-D signals, one per electrode pair in
        ``CHAINS[chain]`` (as returned by ``get_eeg_sp_data``).
    sp_data : dict
        Chain name -> 2-D spectrogram for that chain.

    The figure has four rows per chain. Each EEG trace gets its own row in the
    left column; the four right-column cells of a chain are merged into one
    large axes that shows the log-scaled spectrogram.
    """
    fig, axes = plt.subplots(ncols=2, nrows=len(CHAINS)*4,figsize=(30, 40))

    time_x = np.arange(-5,5,1/200)
    x_ticks = np.arange(-5,5,1)

    for i, chain in enumerate(CHAINS):
        # plot eeg raw signals
        for j, dt in enumerate(eeg_data[chain]):
            ax = sns.lineplot(x=time_x, y=dt, ax=axes[i*4+j, 0])
            ax.set_xticks(x_ticks)
            # The title must use the pair index j; the original indexed with
            # the chain index i and so labelled every trace of a chain alike.
            first, second = CHAINS[chain][j]
            ax.set_title(f'{first}-{second}')
            ax.grid(True)

        # plot spectrogram
        gs = axes[i*4, 1].get_gridspec()
        # Remove the four small axes so they do not show through underneath
        # the spanning axes that replaces them.
        for small_ax in axes[i*4:(i+1)*4, 1]:
            small_ax.remove()
        axsbig = fig.add_subplot(gs[i*4:(i+1)*4, -1])
        # eps avoids log(0)
        log_spec = np.log(sp_data[chain].T + np.finfo(float).eps)
        height = log_spec.shape[0]
        width = log_spec.shape[1]
        X = np.linspace(0, np.size(sp_data[chain]), num=width, dtype=int)
        Y = range(height)
        axsbig.pcolormesh(X, Y, log_spec)
        axsbig.set_title(chain)
    fig.tight_layout()
    plt.show()

In [None]:
print(f'TARGET = {targets}')
plot_data(eeg_data, sp_data)

In [None]:
def plot_spectrogram(spectrogram, ax):
    """Draw a 2-D spectrogram on ``ax`` as a log-scaled colour mesh.

    The input is transposed so that its first axis runs along the x-axis of
    the plot, and machine epsilon is added before the logarithm so that zero
    entries do not produce ``-inf``.
    """
    assert len(spectrogram.shape) == 2
    eps = np.finfo(float).eps
    log_spec = np.log(spectrogram.T + eps)
    n_rows, n_cols = log_spec.shape[0], log_spec.shape[1]
    x_coords = np.linspace(0, np.size(spectrogram), num=n_cols, dtype=int)
    y_coords = range(n_rows)
    ax.pcolormesh(x_coords, y_coords, log_spec)

In [None]:
# One stacked panel per chain, each showing that chain's log-spectrogram.
fig, axes = plt.subplots(4, figsize=(20, 35))
for chain_ax, chain in zip(axes, CHAINS):
    plot_spectrogram(sp_data[chain], chain_ax)
    chain_ax.set_title(chain)
plt.show()


# Model 1
This model uses only the spectrogram data.

## Read all spectrograms

In [None]:
print(f'Shape of a spectrogram is {sp_data["LL"].shape}')

In [None]:
## Read all spectrograms
# Set to True to rebuild the {spectrogram_id: ndarray} cache from the raw
# parquet files; otherwise the prebuilt cache is loaded.
READ_SPEC_FILES = False

# READ ALL SPECTROGRAMS
files = os.listdir(SP_PATH_TEMPL)
print(f'There are {len(files)} spectrogram parquets')

if READ_SPEC_FILES:
    spectrograms = {}
    # Iterate over the list itself: wrapping enumerate() hides the length
    # from tqdm, so no total / ETA would be shown.
    for file_name in tqdm(files):
        spec_frame = pd.read_parquet(f'{SP_PATH_TEMPL}{file_name}')
        sp_id = int(file_name.split('.')[0])
        # drop the first column and keep the rest as a plain array
        spectrograms[sp_id] = spec_frame.iloc[:,1:].values
    # To persist the cache once the loop has finished:
    # np.save('/kaggle/working/specs.npy', spectrograms)
else:
    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
    # files from a trusted source.
    spectrograms = np.load('/kaggle/input/all-spectrograms/specs.npy',allow_pickle=True).item()

In [None]:
len(spectrograms)

## Create a data generator

In [None]:
SPECTROGRAM_SHAPE = (300,400)
OUTPUT_SHAPE = 6

In [None]:
# Convert the raw vote counts to probabilities: each row is divided by its
# total number of votes so that the six target columns sum to 1.
vote_counts = train[TARGET_COLUMNS].to_numpy()
y_data = vote_counts / vote_counts.sum(axis=1, keepdims=True)
train[TARGET_COLUMNS] = y_data
train.iloc[1000:1010]

In [None]:
class DataGenerator:
    """Yield single ``(spectrogram, target)`` samples, one train row at a time.

    An instance is a zero-argument callable returning a Python generator, so
    it can be handed directly to ``tf.data.Dataset.from_generator``. Batching
    is left to the dataset pipeline; every item is ONE sample.

    Parameters
    ----------
    eeg_data : pandas.DataFrame
        Metadata table; each row must provide ``spectrogram_id`` and, unless
        ``mode == 'test'``, the ``TARGET_COLUMNS``. In ``'train'`` mode
        ``spectrogram_label_offset_seconds`` is read as well.
    specs : mapping
        ``spectrogram_id`` -> 2-D array of spectrogram values.
    mode : str
        ``'train'`` applies the row's label offset; any other value starts at
        row 0. ``'test'`` additionally returns an all-zero target.
    specs_shape : tuple
        ``(height, width)`` of the window cut out of each spectrogram.
    output_shape : int
        Length of the target vector.
    """
    def __init__(self, eeg_data, specs, mode='train', specs_shape = SPECTROGRAM_SHAPE, output_shape = OUTPUT_SHAPE):

        self.eeg_data = eeg_data
        self.mode = mode
        self.specs = specs
        self.specs_shape = specs_shape
        self.height = self.specs_shape[0]
        self.width = self.specs_shape[1]
        self.output_shape = output_shape
        self.indexes = np.arange( len(self.eeg_data) )
        self.on_epoch_end()

    def __len__(self):
        """Number of samples (rows of ``eeg_data``)."""
        return len(self.eeg_data)

    def __call__(self):
        """Yield every sample once in the current shuffled order, then reshuffle."""
        for i in self.indexes:
            yield self.__getitem__(i)
        # Only reached when the pass was fully consumed: prepare a new order
        # for the next pass.
        self.on_epoch_end()

    def __getitem__(self, index):
        """Return the ``(X, y)`` pair for the row at position ``index``."""
        return self.__data_generation(index)

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        np.random.shuffle(self.indexes)

    def __data_generation(self, index):
        """Build one sample.

        Returns
        -------
        X : float32 array of shape ``(height, width, 1)``
            Spectrogram window with NaNs replaced by 0. If the spectrogram has
            fewer rows/columns than requested, the missing part stays 0.
        y : float32 array of shape ``(output_shape,)``
            The row's target values, or zeros in ``'test'`` mode.
        """
        X = np.zeros((self.height,self.width,1),dtype='float32')
        y = np.zeros((self.output_shape,),dtype='float32')

        row = self.eeg_data.iloc[index]

        # offset of data
        sp_offset = 0
        if self.mode == 'train':
            # seconds -> spectrogram row index (assumes one spectrogram row
            # per 2 seconds — TODO confirm against the data description)
            sp_offset = int(row.spectrogram_label_offset_seconds )//2

        # EXTRACT UP TO `height` ROWS OF SPECTROGRAM
        img = self.specs[row.spectrogram_id][sp_offset:sp_offset+self.height,0: self.width]
        img = np.nan_to_num(img, nan=0.0)
        # Copy into the zero canvas by the slice's actual size, so a
        # spectrogram shorter than the window is zero-padded instead of
        # raising a broadcast error.
        X[:img.shape[0], :img.shape[1], 0] = img

        if self.mode!='test':
            # Plain float32 array instead of a pandas Series (which may have
            # object dtype when the row mixes column types).
            y = row[TARGET_COLUMNS].to_numpy(dtype='float32')

        return X,y

In [None]:
'''
class DataGenerator:
    'Generates data for Keras'
    def __init__(self, eeg_data, specs, mode='train', specs_shape = SPECTROGRAM_SHAPE, output_shape = OUTPUT_SHAPE): 

        self.eeg_data = eeg_data
        self.mode = mode
        self.specs = specs
        self.specs_shape = specs_shape
        self.height = self.specs_shape[0]
        self.width = self.specs_shape[1]
        self.output_shape = output_shape
        self.indexes = np.arange( len(self.eeg_data) )
        self.on_epoch_end()
        
    def __len__(self):
        return len(self.eeg_data)
    
    def __call__(self):
        for j,i in enumerate(self.indexes):
            yield self.__getitem__(i)

            if j == self.__len__()-1:
                self.on_epoch_end()

    def __getitem__(self, index):
        return self.__data_generation(index)

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        np.random.shuffle(self.indexes)
                        
    def __data_generation(self, index):
        'Generates data containing batch_size samples' 
        
        X = tf.zeros((self.height,self.width,4),dtype='float32')
        y = tf.zeros((self.output_shape,),dtype='float32')
        img = np.ones(self.specs_shape,dtype='float32')
        
        row = self.eeg_data.iloc[index]
        
        # offset of data
        sp_offset = 0
        if self.mode == 'train':
            sp_offset = int(row.spectrogram_label_offset_seconds )//2 

        # get spectrogram data
        for k in range(4):
            # EXTRACT 300 ROWS OF SPECTROGRAM
            img = self.specs[row.spectrogram_id][sp_offset:sp_offset+self.height,k*self.width:(k+1)*self.width]

            # NORMALIZATION PER IMAGE
            ep = 1e-6
            m = np.nanmean(img)
            s = np.nanstd(img)
            img = (img-m)/(s+ep)
            #img = tf.image.per_image_standardization(img)
            img = np.nan_to_num(img, nan=0.0)

            X[:,:,k] = img

                
            #X[j] = tf.image.per_image_standardization(X[j,:,:,:])
            if self.mode!='test':
                y = row[TARGET_COLUMNS]
            
        return X,y
        '''

In [None]:
import tensorflow as tf

In [None]:
data_gen = DataGenerator(train, spectrograms)
tf_ds =tf.data.Dataset.from_generator(data_gen, 
                                      output_types = (tf.float32, tf.float32)).shuffle(1000).batch(128).prefetch(tf.data.AUTOTUNE)

In [None]:
%%time
for (x,y) in tf_ds:
    print(x.shape)
    print(y.shape)
    break

In [None]:

img1 = x[5]
label1 = y[5]

fig, ax = plt.subplots(1, figsize=(22, 20))
print(label1)
plot_spectrogram(img1[:,:,0].numpy(), ax)
plt.show()

### Create train and validation datasets

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense, Multiply, Add, Conv2D, AveragePooling2D,Normalization,MaxPooling2D,Dropout,Flatten
from sklearn.model_selection import train_test_split

In [None]:
# Split by patient rather than by row: the train table holds many rows per
# patient, and a plain random row split would place the same patient in both
# sets, which inflates the validation score (data leakage).
from sklearn.model_selection import GroupShuffleSplit

patient_splitter = GroupShuffleSplit(n_splits=1, test_size=0.30, random_state=1234)
train_idx, val_idx = next(patient_splitter.split(train, groups=train['patient_id']))
train_data, val_data = train.iloc[train_idx], train.iloc[val_idx]
train_data.shape, val_data.shape

In [None]:
def preprocess_img(img, label, img_shape=(128,128)):
    """Resize an image to ``img_shape`` and standardise it; pass ``label`` through.

    Parameters
    ----------
    img : image tensor accepted by ``tf.image.resize``.
    label : returned unchanged.
    img_shape : ``(height, width)`` of the resized image.

    Returns
    -------
    ``(image, label)`` where ``image`` has been resized and then passed
    through ``tf.image.per_image_standardization``.
    """
    target_height, target_width = img_shape[0], img_shape[1]
    resized = tf.image.resize(img, [target_height, target_width])
    # normalization
    standardized = tf.image.per_image_standardization(resized)
    return standardized, label

In [None]:
train_ds_gen = DataGenerator(train_data, spectrograms)
val_ds_gen = DataGenerator(val_data, spectrograms)

In [None]:
train_ds = tf.data.Dataset.from_generator(train_ds_gen,output_types = (tf.float32, tf.float32), output_shapes =([300 , 400 , 1] , [6 , ]))
# The training call requests 250 steps x 120 epochs, far more batches than
# one pass over the generator provides. Without repeat() the dataset is
# exhausted after the first pass and training is interrupted early. repeat()
# restarts the generator (which reshuffles its indexes) whenever a pass ends.
train_ds = (
    train_ds.map(preprocess_img)
    .shuffle(1000)
    .repeat()
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Validation pipeline: generator -> resize/standardise -> shuffle -> batch -> prefetch.
val_ds = tf.data.Dataset.from_generator(
    val_ds_gen,
    output_types=(tf.float32, tf.float32),
    output_shapes=([300, 400, 1], [6]),
)
val_ds = (
    val_ds.map(preprocess_img)
    .shuffle(1000)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
def get_lenet_model(input_shape,num_labels):
    """Build a small LeNet-style CNN ending in a softmax over ``num_labels``.

    Parameters
    ----------
    input_shape
        Currently unused: the first layer is fixed to a ``(128, 128, 1)``
        input regardless of this argument.
    num_labels : int
        Number of units in the final softmax layer.

    Returns
    -------
    An uncompiled ``tf.keras.Sequential`` model.
    """
    layers = [
        # two conv + average-pooling stages
        Conv2D(filters=16, kernel_size=(3, 3), activation='relu', input_shape=(128,128,1)),
        AveragePooling2D(),
        Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
        AveragePooling2D(),
        # dense head with dropout after each hidden layer
        Flatten(),
        Dense(units=120, activation='relu'),
        Dropout(0.5),
        Dense(units=84, activation='relu'),
        Dropout(0.5),
        Dense(units=num_labels, activation='softmax', dtype='float32'),
    ]
    return tf.keras.Sequential(layers)

In [None]:
# USE MULTIPLE GPUS
# Pick a distribution strategy from the hardware that is actually present.
gpus = tf.config.list_physical_devices('GPU')
if len(gpus) > 1:
    # several GPUs: replicate the model on all of them
    strategy = tf.distribute.MirroredStrategy()
    print(f'Using {len(gpus)} GPUs')
elif len(gpus) == 1:
    strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
    print(f'Using {len(gpus)} GPU')
else:
    # No GPU: the original still pinned "/gpu:0", a device that does not
    # exist on a CPU-only machine.
    strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
    print('No GPU found, using CPU')

In [None]:
# Create and compile the model inside the strategy scope so that its
# variables are placed according to the selected strategy.
with strategy.scope():
    model = get_lenet_model(input_shape=SPECTROGRAM_SHAPE, num_labels=OUTPUT_SHAPE)
    # The targets are probability vectors, so the loss is the KL divergence.
    loss = tf.keras.losses.KLDivergence()
    opt = tf.keras.optimizers.Adam(learning_rate=1e-3)

    model.compile(optimizer=opt, loss=loss)
    model.summary()

In [None]:
history = model.fit(
    train_ds, verbose=1,
    validation_data = val_ds,
    steps_per_epoch = 250,
    validation_steps = 125, epochs=120)
#model.save('/kaggle/working/model.h5')

In [None]:
model.evaluate(val_ds)

In [None]:
#model.save('/kaggle/working/model.h5')

In [None]:
# read test_data
test = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/test.csv')
print(f'Test data shape = {test.shape}')
test.head()

In [None]:
test_sp_id = test.iloc[0].spectrogram_id
test_sp_id

In [None]:
test_sp = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/test_spectrograms/{test_sp_id}.parquet')
test_spectrogram = {test_sp_id: test_sp.iloc[:,1:].values}

In [None]:
test_spectrogram[test_sp_id].shape

In [None]:
# Mirror the training pipeline before resizing: crop to the training window
# and replace NaNs with 0, as DataGenerator does. Without this, a single NaN
# in the test spectrogram turns the whole standardised image, and therefore
# the prediction, into NaN.
test_window = np.nan_to_num(
    test_spectrogram[test_sp_id][:SPECTROGRAM_SHAPE[0], :SPECTROGRAM_SHAPE[1]],
    nan=0.0,
).astype('float32')
test_sp = tf.convert_to_tensor(test_window)
test_sp = tf.expand_dims(test_sp, axis = 2)
test_sp = preprocess_img(test_sp,None)
# (name kept as-is: the following cells refer to `tets_data`)
tets_data = test_sp[0].numpy()
tets_data.shape

In [None]:
plt.imshow(tets_data)

In [None]:
np.std(tets_data[:,:,0])

In [None]:
tets_data = np.expand_dims(tets_data,0)

In [None]:
#prediction
prediction = model.predict(tets_data)

In [None]:
prediction = tf.reshape(prediction, -1).numpy()
prediction

In [None]:
prediction.sum()

In [None]:
sub_df = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/sample_submission.csv')

In [None]:
# The original cell wrote the single prediction computed for the FIRST test
# row into every submission row. Predict each test row from its own
# spectrogram instead.
TEST_SP_PATH = '/kaggle/input/hms-harmful-brain-activity-classification/test_spectrograms/'
TEST_BATCH_SIZE = 256  # rows loaded and predicted at once, to bound memory use


def _load_test_image(spectrogram_id):
    """Return the preprocessed model input for one test spectrogram.

    Same steps as training: drop the first column, crop to SPECTROGRAM_SHAPE
    (zero-padding if shorter), replace NaNs with 0, then resize/standardise
    with ``preprocess_img``.
    """
    spec = pd.read_parquet(f'{TEST_SP_PATH}{spectrogram_id}.parquet').iloc[:, 1:].values
    crop = np.nan_to_num(spec[:SPECTROGRAM_SHAPE[0], :SPECTROGRAM_SHAPE[1]], nan=0.0)
    canvas = np.zeros((SPECTROGRAM_SHAPE[0], SPECTROGRAM_SHAPE[1], 1), dtype='float32')
    canvas[:crop.shape[0], :crop.shape[1], 0] = crop
    image, _ = preprocess_img(tf.convert_to_tensor(canvas), None)
    return image.numpy()


if len(sub_df) != len(test):
    raise ValueError(
        f'sample_submission has {len(sub_df)} rows but test.csv has {len(test)}'
    )

test_sp_ids = test.spectrogram_id.to_numpy()
batch_predictions = []
for start in range(0, len(test_sp_ids), TEST_BATCH_SIZE):
    batch_images = np.stack(
        [_load_test_image(sp_id) for sp_id in test_sp_ids[start:start + TEST_BATCH_SIZE]]
    )
    batch_predictions.append(model.predict(batch_images, verbose=0))
test_predictions = np.concatenate(batch_predictions, axis=0)

# NOTE(review): assumes the rows of sample_submission.csv are in the same
# order as the rows of test.csv — confirm before submitting.
sub_df[TARGET_COLUMNS] = test_predictions
sub_df

In [None]:
sub_df.to_csv("submission.csv", index=False)