In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import numpy as np
from scipy.io import loadmat
import os
import dask.dataframe as dd
from dask.multiprocessing import get
import warnings
warnings.filterwarnings('ignore', category=Warning)

In [None]:
# Load the competition's train.csv into a DataFrame and show it as the cell output.
train_df = pd.read_csv('../input/hms-harmful-brain-activity-classification/train.csv')
train_df

In [None]:
# Report how many missing values each column of the training table contains.
print(train_df.isna().sum())

In [None]:
# Vote-count columns, one per activity class; later cells use them as the model targets.
label_cols = [
    'seizure_vote', 'lpd_vote', 'gpd_vote',
    'lrda_vote', 'grda_vote', 'other_vote',
]

# Sum each vote column over the whole training table and draw the totals as bars.
total_votes_per_class = train_df[label_cols].sum()
vote_axes = total_votes_per_class.plot.bar()
vote_axes.set_title('Distribution of Annotator Votes')
plt.show()

In [None]:
# Count how often each expert_consensus value occurs and draw the counts as bars.
consensus_counts = train_df['expert_consensus'].value_counts()
consensus_axes = consensus_counts.plot.bar()
consensus_axes.set_title('Distribution of Expert Consensus Labels')
plt.show()

In [None]:
# Same six vote columns, under the name the exploratory plots below use.
vote_columns = [
    'seizure_vote', 'lpd_vote', 'gpd_vote',
    'lrda_vote', 'grda_vote', 'other_vote',
]

# One box per vote column, showing the spread of per-row vote counts.
box_axes = sns.boxplot(data=train_df.loc[:, vote_columns])
box_axes.set_title('Boxplot of Annotator Votes')
plt.show()

In [None]:
# Pairwise comparison of votes to see if there are any patterns.
pair_grid = sns.pairplot(train_df[vote_columns])
# pairplot draws a grid of subplots, so plt.title() would only label the last
# subplot drawn. Title the figure itself and lift the text slightly above the
# top row (y > 1) so it does not overlap the panels.
pair_grid.figure.suptitle('Pairwise Distribution of Votes', y=1.02)
plt.show()

In [None]:
# Pairwise correlation between the vote columns, drawn as an annotated heatmap.
corr = train_df.loc[:, vote_columns].corr()
heatmap_axes = sns.heatmap(data=corr, cmap='coolwarm', annot=True)
heatmap_axes.set_title('Correlation Matrix of Annotator Votes')
plt.show()

In [None]:
# Total votes per class for every patient_id, drawn as one stacked bar per patient.
patient_group = train_df.groupby('patient_id')[vote_columns].agg('sum')

patient_axes = patient_group.plot.bar(stacked=True)
patient_axes.set_title('Sum of Votes per Patient')
plt.show()

In [None]:
# Share of each expert_consensus value, shown as a pie chart with percentage labels.
consensus_share = train_df['expert_consensus'].value_counts()
pie_axes = consensus_share.plot.pie(autopct='%1.1f%%')
pie_axes.set_title('Class Distribution in Expert Consensus')
plt.show()

In [None]:
# Load a single training EEG recording (parquet) as a sample and show it as the cell output.
eeg_data = pd.read_parquet('/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/1000913311.parquet')
eeg_data

In [None]:
# Standardise every column of the sample recording. The fitted scaler is kept
# in `scaler` because the test-inference cell reuses it.
scaler = StandardScaler()
scaler.fit(eeg_data)
eeg_data_scaled = pd.DataFrame(data=scaler.transform(eeg_data), columns=eeg_data.columns)

In [None]:
# Columns of the recording to draw, in top-to-bottom panel order.
features_eeg = [
    'Fp1', 'F3', 'C3', 'P3', 'F7', 'T3', 'T5', 'O1', 'Fz', 'Cz',
    'Pz', 'Fp2', 'F4', 'C4', 'P4', 'F8', 'T4', 'T6', 'O2', 'EKG',
]

# One stacked panel per column, each labelled at its left edge.
fig, channel_axes = plt.subplots(len(features_eeg), 1, figsize=(15, 10))
for panel, channel in zip(channel_axes, features_eeg):
    panel.plot(eeg_data[channel])
    panel.set_title(channel, loc='left')
fig.tight_layout()
plt.show()

In [None]:
def extract_eeg_segment(eeg_data, offset_seconds, duration, sampling_rate):
    """Return the rows of `eeg_data` covering one time window.

    Args:
        eeg_data: Object supporting positional `.iloc` slicing (e.g. a DataFrame),
            one row per sample.
        offset_seconds: Start of the window, in seconds from the first row.
        duration: Length of the window, in seconds.
        sampling_rate: Number of rows per second.

    Returns:
        The positional slice of `eeg_data` for the window. Seconds are converted
        to row counts with `int()`, so fractional sample positions are truncated.
        A window past the end of the data yields a shorter or empty slice.
    """
    first_row = int(offset_seconds * sampling_rate)
    window_rows = int(duration * sampling_rate)
    return eeg_data.iloc[slice(first_row, first_row + window_rows)]

sampling_rate = 200  # samples (rows) per second used to convert offsets to row positions
segment_duration = 1  # length, in seconds, of the window averaged into one feature row

TRAIN_EEG_DIR = '/kaggle/input/hms-harmful-brain-activity-classification/train_eegs'

# Extracting EEG segments and aligning them with labels.
# Each train_df row must be cut from the recording named by its own eeg_id.
# Slicing every row out of the single sample recording would give features that
# are unrelated to the labels (and NaN once the offset runs past that file).
# Rows are grouped by eeg_id so each parquet file is read and scaled only once.
# NOTE(review): assumes train_df has an `eeg_id` column matching the parquet
# file names in TRAIN_EEG_DIR — confirm against train.csv.
channel_cols = eeg_data_scaled.columns  # the columns `scaler` was fitted on
features_by_row = {}
for eeg_id, label_rows in train_df.groupby('eeg_id', sort=False):
    eeg_raw = pd.read_parquet(f'{TRAIN_EEG_DIR}/{eeg_id}.parquet')
    # Reuse the already-fitted scaler, as the test-inference cell does.
    eeg_scaled = pd.DataFrame(scaler.transform(eeg_raw[channel_cols]), columns=channel_cols)
    for row_label, offset_seconds in label_rows['eeg_label_offset_seconds'].items():
        eeg_segment = extract_eeg_segment(eeg_scaled, offset_seconds, segment_duration, sampling_rate)
        # One feature vector per labelled row: the per-column mean over the window.
        features_by_row[row_label] = eeg_segment.mean(axis=0)

# Restore train_df's row order so features_df lines up positionally with the labels.
aligned_eeg_data = [features_by_row[row_label] for row_label in train_df.index]

features_df = pd.DataFrame(aligned_eeg_data)

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GroupShuffleSplit

# Hold out 20% of the patients rather than 20% of the rows. A row-wise random
# split puts rows of the same patient in both sets, which leaks information
# and makes the validation score optimistic.
group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_pos, val_pos = next(group_splitter.split(features_df, groups=train_df['patient_id']))

# The splitter yields positional indices, so select with .iloc on both tables.
X_train, X_val = features_df.iloc[train_pos], features_df.iloc[val_pos]
y_train, y_val = train_df[label_cols].iloc[train_pos], train_df[label_cols].iloc[val_pos]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

# Fill missing feature values with the training-set column means. The imputer
# is fitted on the training split only and then applied to the validation split.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_val_imputed = imputer.transform(X_val)

# y_train holds vote counts. Fitting on the raw counts would make every
# per-label classifier multiclass over count values, so column 1 of its
# predict_proba (which later cells read) would not be "probability that the
# label received votes". Train on a 0/1 "received at least one vote" target.
y_train_binary = (y_train > 0).astype(int)

# One independent logistic regression per label column.
model = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train_imputed, y_train_binary)

In [None]:
# Class probabilities for the validation features. The next cell iterates over
# `predictions` and takes column 1 of each element.
predictions = model.predict_proba(X_val_imputed)

In [None]:
from scipy.stats import entropy
from sklearn.preprocessing import MultiLabelBinarizer

# Take column 1 of each per-label probability array and place the columns side
# by side, giving one row per sample and one column per label.
flat_predictions = np.hstack([pred[:, 1].reshape(-1, 1) for pred in predictions])

# Convert y_val to binary format: 1 where a label received any votes, else 0.
# A direct element-wise test keeps one column per label in label order.
# MultiLabelBinarizer would drop the column of any label that never occurs in
# the validation fold and silently misalign the result with flat_predictions.
actual_binary = (y_val.to_numpy() != 0).astype(int)

# Adjusted KL Divergence Calculation
def calculate_kl_divergence(predicted, actual):
    """Mean per-sample KL divergence KL(actual || predicted).

    Each row is one sample and each column one class. Both rows are normalised
    to sum to 1 (done by `scipy.stats.entropy`) and compared as probability
    distributions over the classes. The per-row divergences are then averaged.

    Args:
        predicted: 2-D array-like of predicted scores, shape (samples, classes).
        actual: 2-D array-like of non-negative target weights, same shape.

    Returns:
        The mean divergence as a float, or NaN when no row of `actual` has a
        positive sum (nothing can be scored).

    Raises:
        ValueError: If `predicted` and `actual` have different shapes.
    """
    predicted = np.asarray(predicted, dtype=float)
    actual = np.asarray(actual, dtype=float)
    if predicted.shape != actual.shape:
        raise ValueError(
            f"predicted and actual must have the same shape, got {predicted.shape} and {actual.shape}"
        )

    # Keep predicted values strictly inside (0, 1) to avoid a log(0) error.
    predicted = np.clip(predicted, 1e-15, 1 - 1e-15)

    # Rows whose targets are all zero cannot be normalised into a distribution.
    scorable = actual.sum(axis=1) > 0
    if not scorable.any():
        return float('nan')

    # axis=1: compare distributions across classes within each sample.
    per_sample_kl = entropy(actual[scorable], predicted[scorable], axis=1)
    return float(per_sample_kl.mean())

# Score the validation predictions against the binarised validation labels and report the result.
kl_divergence_score = calculate_kl_divergence(predicted=flat_predictions, actual=actual_binary)

print(f"KL Divergence on validation set: {kl_divergence_score}")

In [None]:
# Recordings that need a prediction.
test_df = pd.read_csv('../input/hms-harmful-brain-activity-classification/test.csv')

submission_data = []

for eeg_id in test_df['eeg_id'].unique():
    eeg_test_data = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/test_eegs/{eeg_id}.parquet')
    eeg_test_data_scaled = pd.DataFrame(scaler.transform(eeg_test_data[features_eeg]), columns=features_eeg)

    # Apply the imputer fitted on the training features. The model was trained
    # on imputed data and LogisticRegression rejects NaN inputs. A plain array
    # is passed so the fill is positional, matching how the model was fitted.
    eeg_test_features = imputer.transform(eeg_test_data_scaled.to_numpy())

    # One probability array per label; column 1 of each is stacked into
    # a (samples, labels) matrix.
    test_pred = model.predict_proba(eeg_test_features)
    test_flat_pred = np.hstack([pred[:, 1].reshape(-1, 1) for pred in test_pred])

    # Turn every sample's scores into a distribution over the labels.
    test_flat_pred /= np.sum(test_flat_pred, axis=1, keepdims=True)

    # Average the per-sample distributions into one prediction per recording.
    eeg_pred_aggregated = np.mean(test_flat_pred, axis=0)

    submission_data.append({'eeg_id': eeg_id, **dict(zip(label_cols, eeg_pred_aggregated))})

submission_df = pd.DataFrame(submission_data, columns=['eeg_id'] + label_cols)
# Renormalise only the label columns and keep the result. Applying the division
# to whole rows would also divide eeg_id, and an unassigned apply() is discarded.
submission_df[label_cols] = submission_df[label_cols].div(submission_df[label_cols].sum(axis=1), axis=0)
submission_df.to_csv('submission.csv', index=False)

In [None]:
# sampling_rate = 200

# aligned_eeg = []

# # Loop through each row in train_df
# for index, row in train_df.iterrows():
#     start_sample = int(row['eeg_label_offset_seconds'] * sampling_rate)
#     end_sample = start_sample + sampling_rate  # Adjust the window size as needed
    
#     # Aggregate the EEG data within this window
#     aggregated_data = eeg_data_scaled.iloc[start_sample:end_sample].mean()
#     aligned_eeg.append(aggregated_data)

# # Convert the list to DataFrame
# aligned_eeg_df = pd.DataFrame(aligned_eeg)

In [None]:
# print(aligned_eeg_df.isnull().sum())

In [None]:

# y = train_df[label_cols]

# X_train, X_test, y_train, y_test = train_test_split(aligned_eeg_df, y, test_size=0.2, random_state=42)

In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.multiclass import OneVsRestClassifier

# imputer = SimpleImputer(strategy='mean')
# X_train_imputed = imputer.fit_transform(X_train)
# X_test_imputed = imputer.transform(X_test)

# models = {}
# for label in label_cols:
#     clf = LogisticRegression()
#     clf.fit(X_train_imputed, y_train[label])
#     models[label] = clf

In [None]:
# df_test = pd.read_csv('../input/hms-harmful-brain-activity-classification/test.csv')

# for label in label_cols:
#     # Predict probabilities for each label
#     label_prob = models[label].predict_proba(df_test)[:, 1]
    
#     # Check if the number of predictions matches the number of eeg_id in submission
#     if len(label_prob) == len(submission['eeg_id']):
#         submission[label] = label_prob
#     else:
#         raise ValueError("Mismatch in the number of predictions and number of eeg_ids")

In [None]:
# submission_df = pd.DataFrame(prob_predictions, columns=label_cols)
# submission_df['eeg_id'] = train_df['eeg_id']  # Add 'eeg_id' column

# # Reorder columns to match the desired format
# submission_df = submission_df[['eeg_id'] + label_cols]
# submission_df
# # Calculate the sum of probabilities for each row and normalize them
# row_sums = submission_df[label_cols].sum(axis=1)
# submission_df[label_cols] = submission_df[label_cols].div(row_sums, axis=0)

# # Save the submission DataFrame to a CSV file
# submission_df.to_csv('submission.csv', index=False)

# # Display the modified submission DataFrame
# print(submission_df.head())