In [None]:
import pandas as pd
import copy
from sklearn.model_selection import train_test_split
import numpy as np
import keras
import os
import concurrent.futures
import xgboost as xgb
from sklearn.metrics import accuracy_score
import lightgbm as lgb

In [None]:
# Load the train table, the test table and the sample submission in one pass.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/hms-harmful-brain-activity-classification/train.csv',
        '/kaggle/input/hms-harmful-brain-activity-classification/test.csv',
        '/kaggle/input/hms-harmful-brain-activity-classification/sample_submission.csv',
    )
)

In [None]:
# Read a single test EEG recording into `temp` (not referenced again in the cells shown here).
temp = pd.read_parquet(
    path='/kaggle/input/hms-harmful-brain-activity-classification/test_eegs/3911565283.parquet',
)

# Getting features with `describe` on EEGs

**`describe()` returns [count, mean, std, min, 25%, 50%, 75%, max] for each column; after dropping the `count` row, these statistics are flattened into a 1D feature vector for each EEG file.**

In [None]:
# NOTE: this cell is intentionally disabled. It is the train-set version of the
# feature extraction: it would fill `train_eegs` with one flattened describe()
# vector (first row dropped) per file in train_eegs/, keyed by file stem.
# A later cell loads `train_eegs` from a saved joblib file instead.
# %%time
# import os
# import concurrent.futures
# import pandas as pd

# folder_path = '/kaggle/input/hms-harmful-brain-activity-classification/train_eegs'

# def process_file(filename):
#     file_path = os.path.join(folder_path, filename)
#     df = pd.read_parquet(file_path)
#     df = df.describe().drop(df.describe().index[0])
#     return filename.split('.')[0], df.to_numpy().flatten()

# train_eegs = {}
# file_list = os.listdir(folder_path)

# with concurrent.futures.ProcessPoolExecutor() as executor:
#     futures = {executor.submit(process_file, filename): filename for filename in file_list}
#     for future in concurrent.futures.as_completed(futures):
#         filename = futures[future]
#         try:
#             result = future.result()
#             train_eegs[result[0]] = result[1]
#         except Exception as e:
#             print(f"Error processing file {filename}: {e}")

In [None]:
from joblib import dump, load
# The dump call is left commented out: `train_eegs` is read from a previously
# saved file rather than rebuilt in this notebook.
# dump(train_eegs,'train_eegs.joblib')
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load artifacts that come from a trusted source.
train_eegs = load('/kaggle/input/train-eegs/train_eegs.joblib')

In [None]:
# Keep the id, the six vote columns and the consensus label, then collapse
# rows that are exact duplicates across all of those columns.
x_train = train[
    [
        'eeg_id',
        'seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote',
        'expert_consensus',
    ]
].drop_duplicates()

In [None]:
# Look up the feature vector for every row of x_train; train_eegs is keyed by
# the first column (eeg_id) rendered as a string.
val = np.asarray([train_eegs[str(row[0])] for row in x_train.values])

In [None]:
# Wrap the feature matrix in a DataFrame with integer column labels 0..k-1,
# where k is the length of the first feature vector.
n_features = len(val[0])
n = pd.DataFrame(val, columns=range(n_features))

In [None]:
# Give x_train a fresh 0..N-1 index (discarding the old one) so that the rows
# of `n` line up with it when the two frames are joined side by side.
x_train = pd.concat([x_train.reset_index(drop=True), n], axis=1)

# Making x_test

In [None]:
# Single-column frames holding only eeg_id, taken from the test table and
# from the sample submission.
id_column = ['eeg_id']
x_test = test[id_column]
x_sub = sub[id_column]

In [None]:
%%time

folder_path = '/kaggle/input/hms-harmful-brain-activity-classification/test_eegs'

def process_file(filename):
    """Summarise one EEG parquet file as a flat feature vector.

    Parameters
    ----------
    filename : str
        File name (not a full path) located inside ``folder_path``.

    Returns
    -------
    tuple
        ``(stem, features)`` where ``stem`` is the part of ``filename`` before
        its first ``.`` and ``features`` is the 1-D NumPy array obtained by
        flattening ``DataFrame.describe()`` with its first row removed.
    """
    file_path = os.path.join(folder_path, filename)
    df = pd.read_parquet(file_path)
    # Compute the summary statistics once and reuse them for both the row
    # lookup and the result, instead of calling describe() twice per file.
    stats = df.describe()
    # Drop the first describe() row (`count` for numeric columns).
    stats = stats.drop(stats.index[0])
    return filename.split('.')[0], stats.to_numpy().flatten()

# Run process_file over every entry of folder_path in worker processes and
# collect the returned (key, features) pairs into test_eegs.
test_eegs = {}
file_list = os.listdir(folder_path)

with concurrent.futures.ProcessPoolExecutor() as executor:
    # Map each submitted future back to its file name for error reporting.
    futures = {}
    for filename in file_list:
        futures[executor.submit(process_file, filename)] = filename

    for future in concurrent.futures.as_completed(futures):
        filename = futures[future]
        try:
            eeg_key, eeg_features = future.result()
            test_eegs[eeg_key] = eeg_features
        except Exception as e:
            # Best effort: report the failing file and carry on with the rest.
            print(f"Error processing file {filename}: {e}")

In [None]:
# Look up the feature vector for every row of x_sub; test_eegs is keyed by
# the first column (eeg_id) rendered as a string.
val = np.asarray([test_eegs[str(row[0])] for row in x_sub.values])

In [None]:
# Wrap the feature matrix in a DataFrame with integer column labels 0..k-1,
# where k is the length of the first feature vector.
n_features = len(val[0])
n = pd.DataFrame(val, columns=range(n_features))

In [None]:
# Give x_sub a fresh 0..N-1 index (discarding the old one) so that the rows
# of `n` line up with it when the two frames are joined side by side.
x_sub = pd.concat([x_sub.reset_index(drop=True), n], axis=1)

# Data labeling and preprocessing

**Considering this a classification problem**

In [None]:
# Encode the consensus label as an integer class id. The ids follow the same
# order as the submission vote columns (seizure, lpd, gpd, lrda, grda, other).
label_to_id = {'Seizure': 0, 'LPD': 1, 'GPD': 2, 'LRDA': 3, 'GRDA': 4, 'Other': 5}
x_train['expert_consensus'] = x_train['expert_consensus'].replace(label_to_id)

In [None]:
# Feature-only frames: drop the id, vote and label columns from the training
# frame, and the id column from the submission frame.
non_feature_cols = [
    'eeg_id',
    'seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote',
    'expert_consensus',
]
lgb_train = x_train.drop(columns=non_feature_cols).copy()
lgb_test = x_sub.drop(columns=['eeg_id']).copy()

# LGBM Modeling

In [None]:
# LightGBM multiclass classifier over the six consensus classes.
params = {
    'objective': 'multiclass',
    'num_class': 6,
    'boosting_type': 'gbdt',
    'metric': 'multi_logloss',
    'device': 'gpu',
    'num_leaves': 121,
    'learning_rate': 0.018623105710769177,
    'feature_fraction': 0.5894871939636406,
    # NOTE(review): LightGBM only applies bagging_fraction when bagging_freq > 0.
    # bagging_freq is not set here, so this value presumably has no effect -- confirm.
    'bagging_fraction': 0.756777580360579,
    'max_depth': 8,
    # Silence training output through the model parameter. The `verbose`
    # keyword of fit() was removed in LightGBM 4.0 and raises TypeError there.
    'verbosity': -1,
}


lgb_model = lgb.LGBMClassifier(**params)
lgb_model.fit(lgb_train, x_train.expert_consensus)

# Hard class ids and the per-class probability matrix for the submission rows.
y_pred = lgb_model.predict(lgb_test)
y_pred_proba = lgb_model.predict_proba(lgb_test)

In [None]:
# XGBoost multiclass classifier trained on the same features and labels.
params = dict(
    objective='multi:softprob',
    num_class=6,
    booster='gbtree',
    eval_metric='mlogloss',
    max_depth=8,
    learning_rate=0.008406279027937572,
    subsample=0.7273986104941954,
    colsample_bytree=0.6818816981862805,
    min_child_weight=6,
    gamma=2.2671779654246492e-07,
)

xgb_model = xgb.XGBClassifier(**params)
xgb_model.fit(lgb_train, x_train.expert_consensus)

# Per-class probability matrix from XGBoost for the submission rows.
final_y_pred_proba = xgb_model.predict_proba(lgb_test)

In [None]:
# Weighted blend of the two probability matrices: 0.8 for the LightGBM
# probabilities already in y_pred_proba, 0.2 for the XGBoost ones.
# NOTE: y_pred_proba is overwritten, so re-running this cell on its own
# blends an already-blended matrix.
lgb_weight, xgb_weight = 0.8, 0.2
y_pred_proba = y_pred_proba * lgb_weight + final_y_pred_proba * xgb_weight

In [None]:
# Column k of y_pred_proba goes into the k-th vote column of the submission.
vote_columns = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
for class_id, column in enumerate(vote_columns):
    sub[column] = y_pred_proba[:, class_id]

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv('submission.csv', index=False)

***An upvote would be appreciated if you find this notebook helpful :')***