In [4]:
import pandas as pd

train_df = pd.read_csv('exoTrain.csv')  # Replace with actual train file path
test_df = pd.read_csv('exoTest.csv')    # Replace with actual test file path

flux_columns = [col for col in train_df.columns if 'flux' in col]

X_train = train_df[flux_columns].values
y_train = train_df['LABEL'].values  # Replace 'target' with label column

X_test = test_df[flux_columns].values
y_test = test_df['LABEL'].values


In [9]:
X_train = train_df[flux_columns].values
X_test = test_df[flux_columns].values


In [16]:
# Assuming y_train and y_test contain labels 1 and 2:
y_train = y_train - 1
y_test = y_test - 1


In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape for CNN: (samples, channels=1, sequence_length)
X_train_reshaped = X_train_scaled.reshape(-1, 1, X_train.shape[1])
X_test_reshaped = X_test_scaled.reshape(-1, 1, X_test.shape[1])


In [18]:
import torch
from torch.utils.data import Dataset, DataLoader

class KeplerDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = KeplerDataset(X_train_reshaped, y_train)
test_dataset = KeplerDataset(X_test_reshaped, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


In [28]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight

# Assuming you have these variables ready from your previous steps:
# X_train: NumPy array of shape (num_train_samples, num_features)
# y_train: NumPy array of labels (0 and 1)
# X_test, y_test similarly for test set

# 1. Apply SMOTE on training data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
print(f"Class distribution after SMOTE: {np.bincount(y_train_res)}")

# 2. Scale (normalize) data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

# 3. Reshape for CNN input: (samples, channels=1, sequence_length)
X_train_reshaped = X_train_scaled.reshape(-1, 1, X_train_scaled.shape[1])
X_test_reshaped = X_test_scaled.reshape(-1, 1, X_test_scaled.shape[1])

# 4. Dataset and DataLoader
class KeplerDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = KeplerDataset(X_train_reshaped, y_train_res)
test_dataset = KeplerDataset(X_test_reshaped, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# 5. Define the 1D CNN model (from previous definition)
class Kepler1DCNN(nn.Module):
    def __init__(self, input_length, num_classes=2):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 16, kernel_size=5, padding=2)
        self.bn1 = nn.BatchNorm1d(16)
        self.pool = nn.MaxPool1d(2)
        self.conv2 = nn.Conv1d(16, 32, 5, padding=2)
        self.bn2 = nn.BatchNorm1d(32)
        self.conv3 = nn.Conv1d(32, 64, 3, padding=1)
        self.bn3 = nn.BatchNorm1d(64)
        pooled_length = input_length // 2
        self.fc1 = nn.Linear(64 * pooled_length, 128)
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(128, num_classes)
    def forward(self, x):
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = x.view(x.size(0), -1)
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

# 6. Instantiate model
input_length = X_train_reshaped.shape[2]
model = Kepler1DCNN(input_length=input_length)

# 7. Weighted Loss for balanced training
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_res), y=y_train_res)
weights = torch.tensor(class_weights, dtype=torch.float32)
criterion = nn.CrossEntropyLoss(weight=weights)

optimizer = optim.Adam(model.parameters(), lr=0.001)

# 8. Train
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}")

# 9. Evaluate
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_y.cpu().numpy())

print(classification_report(all_labels, all_preds, target_names=['Class 0', 'Class 1']))


Class distribution after SMOTE: [5050 5050]
Epoch 1/10, Loss: 1.3131
Epoch 2/10, Loss: 0.5956
Epoch 3/10, Loss: 0.5731
Epoch 4/10, Loss: 0.4894
Epoch 5/10, Loss: 0.3085
Epoch 6/10, Loss: 0.2146
Epoch 7/10, Loss: 0.1891
Epoch 8/10, Loss: 0.1747
Epoch 9/10, Loss: 0.2402
Epoch 10/10, Loss: 0.1144
              precision    recall  f1-score   support

     Class 0       0.99      0.98      0.99       565
     Class 1       0.09      0.20      0.12         5

    accuracy                           0.98       570
   macro avg       0.54      0.59      0.56       570
weighted avg       0.98      0.98      0.98       570



In [29]:
from sklearn.metrics import classification_report

# all_labels and all_preds from prediction step
print(classification_report(all_labels, all_preds, target_names=['Class 0', 'Class 1']))


              precision    recall  f1-score   support

     Class 0       0.99      0.98      0.99       565
     Class 1       0.09      0.20      0.12         5

    accuracy                           0.98       570
   macro avg       0.54      0.59      0.56       570
weighted avg       0.98      0.98      0.98       570

