# UPDATE

V10: Added data preprocessing, which resolved the loss-convergence issue.

V12: Added L2 regularization and filtering.

In [None]:
import os,gc
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import glob
import torch
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')

# Load Train data

In [None]:
# Root folder of the competition data; train.csv sits directly inside it.
PATH = '/kaggle/input/hms-harmful-brain-activity-classification/'
df = pd.read_csv(f'{PATH}train.csv')

# The target columns are taken positionally: the last six columns of train.csv.
TARGETS = df.columns[-6:]

print('Train shape:', df.shape)
print('Targets', TARGETS.tolist())
df.head()

# Create Non-Overlapping EEG ID Train Data

In [None]:
# Collapse train.csv to one row per eeg_id, reusing a single groupby object.
by_eeg = df.groupby('eeg_id')

# First spectrogram_id and the smallest label offset seen for each eeg_id.
train = by_eeg[['spectrogram_id', 'spectrogram_label_offset_seconds']].agg(
    {'spectrogram_id': 'first', 'spectrogram_label_offset_seconds': 'min'})
train.columns = ['spectrogram_id', 'min']

# Largest label offset and first patient_id; both align on the eeg_id index.
train['max'] = by_eeg['spectrogram_label_offset_seconds'].max()
train['patient_id'] = by_eeg['patient_id'].first()

# Sum the target columns over all rows of an eeg_id, then scale every row so
# that its targets add up to 1.
y_data = by_eeg[TARGETS].sum().values
y_data = y_data / y_data.sum(axis=1, keepdims=True)
for col_name, col_values in zip(TARGETS, y_data.T):
    train[col_name] = col_values

# First expert_consensus value of each eeg_id is kept as a categorical label.
train['target'] = by_eeg['expert_consensus'].first()

train = train.reset_index()
print('Train non-overlapp eeg_id shape:', train.shape )

train.head(20)

In [None]:
import numpy as np

# Names of the six vote columns.
ycol = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
# Lookup from a consensus label to the corresponding vote column.
cd = {'Seizure': 'seizure_vote', 'GPD': 'gpd_vote', 'LRDA': 'lrda_vote', 'Other': 'other_vote', 'GRDA': 'grda_vote', 'LPD': 'lpd_vote'}

# Split `train` by position: first column is eeg_id, last column is the label,
# and the six columns just before the label hold the probabilities.
eeg_id_col = train.iloc[:, 0]
label_col = train.iloc[:, -1]
prob_cols = train.iloc[:, -7:-1].astype("float32")

# Divide every row by its own total so the probabilities sum to 1.
row_totals = prob_cols.sum(axis=1)
prob_cols_normalized = prob_cols.div(row_totals, axis=0)

# Reassemble id, normalized probabilities and label into one frame.
normalized_data = pd.concat([eeg_id_col, prob_cols_normalized, label_col], axis=1)

normalized_data.head(20)

In [None]:
# Write the per-eeg_id normalized table to the working directory (no index column).
normalized_data.to_csv("normalized_data.csv", index=False)

In [None]:
# Prefix of the training EEG files; "<eeg_id>.parquet" is appended to it.
EEG_PATH = '/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/'
# NOTE(review): this points at a Kaggle input dataset, not at the
# "normalized_data.csv" written to the working directory by the previous cell —
# presumably that file was uploaded separately; confirm the two are in sync.
train_path = '/kaggle/input/processed/normalized_data.csv'

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import glob
from scipy.signal import butter, sosfilt
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

import pandas as pd

class EEGDataset(Dataset):
    """Training dataset: one filtered, centre-cropped EEG window plus soft labels per CSV row.

    Args:
        csv_file: Path of a CSV with an ``eeg_id`` column and the columns
            listed in ``TARGET_COLS``.
        eeg_path: Prefix to which ``"<eeg_id>.parquet"`` is appended to locate
            each recording.
    """

    # Label columns read from the CSV for every item.
    TARGET_COLS = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
    # Number of rows (time steps) kept from the middle of each recording.
    N_SAMPLES = 10000

    def __init__(self, csv_file, eeg_path):
        self.csv = pd.read_csv(csv_file)
        self.eeg_path = eeg_path
        self.sos = self.butter_bandpass_filter_init()
        # Parquet columns (channels) that are loaded, in this order.
        self.FEATS = ['Fp1','T3','C3','O1','Fp2','C4','T4','O2']

    def butter_bandpass_filter_init(self):
        """Design the 5th-order 0.5-40 Hz Butterworth band-pass (fs = 200 Hz).

        Returns:
            The filter as an array of second-order sections.
        """
        lowcut = 0.5  # low cut-off frequency in Hz
        highcut = 40.0  # high cut-off frequency in Hz
        fs = 200.0  # sampling frequency in Hz
        order = 5  # filter order

        # butter() expects cut-offs as a fraction of the Nyquist frequency.
        nyq = 0.5 * fs
        low = lowcut / nyq
        high = highcut / nyq
        return butter(order, [low, high], analog=False, btype='band', output='sos')

    def butter_bandpass_filter(self, data):
        """Band-pass filter ``data`` along axis 0 (time), each column independently."""
        # sosfilt defaults to axis=-1; on a (time, channels) array that would run
        # the filter across the channels of each sample instead of over time.
        return sosfilt(self.sos, data, axis=0)

    @staticmethod
    def _fill_nan(data):
        """Replace NaNs by their column mean; columns that are entirely NaN become 0."""
        data = np.asarray(data, dtype=np.float64)
        nan_mask = np.isnan(data)
        if not nan_mask.any():
            return data
        valid_counts = (~nan_mask).sum(axis=0)
        valid_sums = np.where(nan_mask, 0.0, data).sum(axis=0)
        # Guard the division so an all-NaN column yields 0 instead of NaN. This
        # also keeps the column count fixed, whereas sklearn's SimpleImputer
        # drops such columns by default.
        col_means = np.divide(valid_sums, valid_counts,
                              out=np.zeros_like(valid_sums), where=valid_counts > 0)
        return np.where(nan_mask, col_means, data)

    @staticmethod
    def _center_crop(data, n_samples):
        """Return the middle ``n_samples`` rows, zero-padding recordings that are shorter."""
        mid = data.shape[0] // 2
        # Clamp at 0: a negative start would silently slice from the end.
        start = max(mid - n_samples // 2, 0)
        window = data[start:start + n_samples]
        missing = n_samples - window.shape[0]
        if missing > 0:
            before = missing // 2
            window = np.pad(window, ((before, missing - before), (0, 0)))
        return window

    def __len__(self):
        return len(self.csv)

    def __getitem__(self, idx):
        """Load item ``idx``.

        Returns:
            Tuple ``(eeg, labels)``: ``eeg`` is a float32 tensor of shape
            ``(len(FEATS), N_SAMPLES)`` and ``labels`` a float32 tensor holding
            the ``TARGET_COLS`` values of the row.
        """
        eeg_id = self.csv.loc[idx, 'eeg_id']
        eeg_file_path = f"{self.eeg_path}{eeg_id}.parquet"
        eeg_data = pd.read_parquet(eeg_file_path)[self.FEATS].values

        # NaNs must go before filtering: an IIR filter propagates a single NaN
        # to every later output sample.
        eeg_data = self._fill_nan(eeg_data)
        # Filter the whole recording first, then crop, so filter start-up
        # transients stay outside the window whenever the recording is longer.
        eeg_data = self.butter_bandpass_filter(eeg_data)
        eeg_data = self._center_crop(eeg_data, self.N_SAMPLES)

        # (time, channels) -> (channels, time), the layout Conv1d consumes.
        eeg_data = torch.tensor(eeg_data, dtype=torch.float32).transpose(0, 1)

        labels = torch.tensor(
            self.csv.loc[idx, self.TARGET_COLS].values.astype(np.float32),
            dtype=torch.float32,
        )
        return eeg_data, labels

# Training dataset built from the label CSV and the EEG parquet folder.
dataset = EEGDataset(csv_file=train_path, eeg_path=EEG_PATH)

# Shuffled mini-batches of 32 samples, loaded by 4 worker processes.
dataloader = DataLoader(dataset=dataset, batch_size=32, num_workers=4, shuffle=True)

In [None]:
# Sanity check: load the first sample and print the shapes of its input and label tensors.
X, y = dataset[0]
print(f"Sample {0 + 1}: X shape {X.shape}, y shape {y.shape}")

# CNN+LSTM Model

In [None]:
import torch.nn.functional as F

class CNNLSTM(nn.Module):
    """Two Conv1d stages, two bidirectional LSTMs, attention pooling and a linear head.

    Args:
        in_channels: Number of channels of the input signal.
        num_classes: Size of the output (one logit per class).
    """

    def __init__(self, in_channels=8, num_classes=6):
        super().__init__()
        lstm_hidden = 128
        lstm_out = 2 * lstm_hidden  # forward + backward hidden states

        # Convolutional stage 1: conv -> batch-norm -> ReLU -> dropout -> halve the length.
        self.conv1 = nn.Conv1d(in_channels, 32, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm1d(32)
        self.relu1 = nn.ReLU(inplace=True)
        self.dropout1 = nn.Dropout(p=0.5)
        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)

        # Convolutional stage 2: same layout with 64 filters and stronger dropout.
        self.conv2 = nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm1d(64)
        self.relu2 = nn.ReLU(inplace=True)
        self.dropout2 = nn.Dropout(p=0.75)
        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)

        # Recurrent stack over the convolutional feature sequence.
        self.lstm1 = nn.LSTM(input_size=64, hidden_size=lstm_hidden, num_layers=1,
                             batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(input_size=lstm_out, hidden_size=lstm_hidden, num_layers=1,
                             batch_first=True, bidirectional=True)

        # Scores every time step with a single scalar.
        self.attention = nn.Sequential(
            nn.Linear(lstm_out, 64),
            nn.Tanh(),
            nn.Linear(64, 1),
        )

        self.fc = nn.Linear(lstm_out, num_classes)

    def forward(self, x):
        """Map a ``(batch, in_channels, time)`` tensor to ``(batch, num_classes)`` logits."""
        feats = self.pool1(self.dropout1(self.relu1(self.bn1(self.conv1(x)))))
        feats = self.pool2(self.dropout2(self.relu2(self.bn2(self.conv2(feats)))))

        # Unpacking doubles as a check that the feature map is 3-D.
        _batch, _channels, _steps = feats.size()
        # (batch, channels, steps) -> (batch, steps, channels) for batch_first LSTMs.
        seq = feats.permute(0, 2, 1)

        seq, _ = self.lstm1(seq)
        seq, _ = self.lstm2(seq)

        # Softmax over the time axis turns the scores into weights; the weighted
        # sum collapses the sequence into one vector per sample.
        weights = torch.softmax(self.attention(seq), dim=1)
        pooled = (weights * seq).sum(dim=1)

        return self.fc(pooled)


In [None]:
# Data-dependent hyper-parameters: EEG channels in, classes out.
input_channels = 8
num_classes = 6

# Instantiate the CNN+LSTM network.
model = CNNLSTM(in_channels=input_channels, num_classes=num_classes)

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss and optimizer.
# NOTE(review): the changelog at the top of the notebook mentions L2
# regularization, but no weight_decay is passed to Adam here — confirm intent.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: one pass over `dataloader` per epoch, reporting the mean batch loss.
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {running_loss / len(dataloader)}")

# Predict test data

In [None]:
# CSV listing the eeg_ids to predict.
test_path = '/kaggle/input/hms-harmful-brain-activity-classification/test.csv'
# Prefix of the test EEG files; "<eeg_id>.parquet" is appended to it.
TEST_EEG_PATH = '/kaggle/input/hms-harmful-brain-activity-classification/test_eegs/'

class TestEEGDataset(Dataset):
    """Inference dataset: one filtered, centre-cropped EEG window per CSV row (no labels).

    Args:
        csv_file: Path of a CSV with an ``eeg_id`` column.
        eeg_path: Prefix to which ``"<eeg_id>.parquet"`` is appended to locate
            each recording.
    """

    # Number of rows (time steps) kept from the middle of each recording.
    N_SAMPLES = 10000

    def __init__(self, csv_file, eeg_path):
        self.csv = pd.read_csv(csv_file)
        self.eeg_path = eeg_path
        self.sos = self.butter_bandpass_filter_init()
        # Parquet columns (channels) that are loaded, in this order.
        self.FEATS = ['Fp1','T3','C3','O1','Fp2','C4','T4','O2']

    def __len__(self):
        return len(self.csv)

    def butter_bandpass_filter_init(self):
        """Design the 5th-order 0.5-40 Hz Butterworth band-pass (fs = 200 Hz).

        Returns:
            The filter as an array of second-order sections.
        """
        lowcut = 0.5  # low cut-off frequency in Hz
        # 40 Hz, not 45 Hz: the training dataset (EEGDataset) filters with a
        # 0.5-40 Hz band, and inference inputs must be preprocessed the same way.
        highcut = 40.0
        fs = 200.0  # sampling frequency in Hz
        order = 5  # filter order

        # butter() expects cut-offs as a fraction of the Nyquist frequency.
        nyq = 0.5 * fs
        low = lowcut / nyq
        high = highcut / nyq
        return butter(order, [low, high], analog=False, btype='band', output='sos')

    def butter_bandpass_filter(self, data):
        """Band-pass filter ``data`` along axis 0 (time), each column independently."""
        # sosfilt defaults to axis=-1; on a (time, channels) array that would run
        # the filter across the channels of each sample instead of over time.
        return sosfilt(self.sos, data, axis=0)

    @staticmethod
    def _fill_nan(data):
        """Replace NaNs by their column mean; columns that are entirely NaN become 0."""
        data = np.asarray(data, dtype=np.float64)
        nan_mask = np.isnan(data)
        if not nan_mask.any():
            return data
        valid_counts = (~nan_mask).sum(axis=0)
        valid_sums = np.where(nan_mask, 0.0, data).sum(axis=0)
        # Guard the division so an all-NaN column yields 0 instead of NaN.
        col_means = np.divide(valid_sums, valid_counts,
                              out=np.zeros_like(valid_sums), where=valid_counts > 0)
        return np.where(nan_mask, col_means, data)

    @staticmethod
    def _center_crop(data, n_samples):
        """Return the middle ``n_samples`` rows, zero-padding recordings that are shorter."""
        mid = data.shape[0] // 2
        # Clamp at 0: a negative start would silently slice from the end.
        start = max(mid - n_samples // 2, 0)
        window = data[start:start + n_samples]
        missing = n_samples - window.shape[0]
        if missing > 0:
            before = missing // 2
            window = np.pad(window, ((before, missing - before), (0, 0)))
        return window

    def __getitem__(self, idx):
        """Load item ``idx`` as a float32 tensor of shape ``(len(FEATS), N_SAMPLES)``."""
        eeg_id = self.csv.loc[idx, 'eeg_id']
        eeg_file_path = f"{self.eeg_path}{eeg_id}.parquet"
        eeg_data = pd.read_parquet(eeg_file_path)[self.FEATS].values

        # NaNs must go before filtering: an IIR filter propagates a single NaN
        # to every later output sample, which would turn the prediction into NaN.
        eeg_data = self._fill_nan(eeg_data)
        eeg_data = self.butter_bandpass_filter(eeg_data)
        eeg_data = self._center_crop(eeg_data, self.N_SAMPLES)

        # (time, channels) -> (channels, time), the layout Conv1d consumes.
        return torch.tensor(eeg_data, dtype=torch.float32).transpose(0, 1)

# Build the test dataset.
testdataset = TestEEGDataset(test_path, TEST_EEG_PATH)

# shuffle must stay False: the predictions are later paired row-by-row with the
# eeg_id column of test.csv, so a shuffled loader would attach probabilities to
# the wrong recordings.
test_dataloader = DataLoader(testdataset, batch_size=32, shuffle=False, num_workers=4)

# Switch the model to evaluation mode before predicting.
model.eval()

predictions = []
# No gradients are needed for inference.
with torch.no_grad():
    for inputs in test_dataloader:  # the test dataset yields inputs only, no labels
        inputs = inputs.to(device)
        outputs = model(inputs)
        # Turn the logits into class probabilities that sum to 1 per row.
        probabilities = torch.softmax(outputs, dim=1)
        predictions.append(probabilities.cpu().numpy())

# Stack the per-batch arrays into a single (n_rows, n_classes) array.
predictions = np.concatenate(predictions, axis=0)

print(predictions)

In [None]:
# Wrap the prediction array in a DataFrame with one named column per vote class.
columns = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
results_df = pd.DataFrame(predictions, columns=columns)

# Print the DataFrame.
print(results_df)

In [None]:
# Assemble the submission: eeg_id from test.csv followed by the predicted
# probabilities. This relies on the rows of results_df being in the same order
# as the rows of test.csv.
test_data = pd.read_csv(test_path)
sub = pd.DataFrame({'eeg_id': test_data['eeg_id'].to_numpy()})
sub[TARGETS] = results_df

sub.to_csv('submission.csv', index=False)
print('Submissionn shape',sub.shape)
sub.head()

In [None]:
# Sanity check: row sums over the last six columns of the submission
# (softmax outputs, so each should be close to 1).
sub.iloc[:,-6:].sum(axis=1)