In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import numpy as np
from lib.LCWavelet import *
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

## Arquitectura del modelo

In [26]:
class ShallueModel(nn.Module):
    def __init__(self, global_size=2001, local_size=201, num_classes=2):
        super(ShallueModel, self).__init__()
        self.global_size = global_size
        self.local_size = local_size
        
        self.conv_global_odd = nn.Sequential(
            nn.Conv1d(1, 16, 5),
            nn.ReLU(),
            nn.Conv1d(16, 16, 5),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=5, stride=2),
            nn.Conv1d(16, 32, 5),
            nn.ReLU(),
            nn.Conv1d(32, 32, 5),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=5, stride=2),
            nn.Conv1d(32, 64, 5),
            nn.ReLU(),
            nn.Conv1d(64, 64, 5),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=5, stride=2),
            nn.Conv1d(64, 128, 5),
            nn.ReLU(),
            nn.Conv1d(128, 128, 5),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=5, stride=2),
            nn.Conv1d(128, 256, 5),
            nn.ReLU(),
            nn.Conv1d(256, 256, 5),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=5, stride=2),
        )
        
        self.conv_global_even = nn.Sequential(
            nn.Conv1d(1, 16, 5),
            nn.ReLU(),
            nn.Conv1d(16, 16, 5),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=5, stride=2),
            nn.Conv1d(16, 32, 5),
            nn.ReLU(),
            nn.Conv1d(32, 32, 5),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=5, stride=2),
            nn.Conv1d(32, 64, 5),
            nn.ReLU(),
            nn.Conv1d(64, 64, 5),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=5, stride=2),
            nn.Conv1d(64, 128, 5),
            nn.ReLU(),
            nn.Conv1d(128, 128, 5),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=5, stride=2),
            nn.Conv1d(128, 256, 5),
            nn.ReLU(),
            nn.Conv1d(256, 256, 5),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=5, stride=2),
        )
        
        self.conv_local_odd = nn.Sequential(
            nn.Conv1d(1, 16, 5),
            nn.ReLU(),
            nn.Conv1d(16, 16, 5),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=7, stride=2),
            nn.Conv1d(16, 32, 5),
            nn.ReLU(),
            nn.Conv1d(32, 32, 5),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=7, stride=2),
        )
        
        self.conv_local_even = nn.Sequential(
            nn.Conv1d(1, 16, 5),
            nn.ReLU(),
            nn.Conv1d(16, 16, 5),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=7, stride=2),
            nn.Conv1d(16, 32, 5),
            nn.ReLU(),
            nn.Conv1d(32, 32, 5),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=7, stride=2),
        )
        
        # Calcular automáticamente el número de features resultantes de la concatenación
        with torch.no_grad():
            dummy_global = torch.zeros(1, 1, self.global_size)
            dummy_local = torch.zeros(1, 1, self.local_size)
            out_global_odd  = self.conv_global_odd(dummy_global)
            out_global_even = self.conv_global_even(dummy_global)
            out_local_odd   = self.conv_local_odd(dummy_local)
            out_local_even  = self.conv_local_even(dummy_local)
            
            # Flatten cada salida y sumar sus dimensiones
            num_features = out_global_odd.view(1, -1).size(1) + \
                           out_global_even.view(1, -1).size(1) + \
                           out_local_odd.view(1, -1).size(1) + \
                           out_local_even.view(1, -1).size(1)
            
        print("Número de features concatenados:", num_features)
        
        self.fc = nn.Sequential(
            nn.Linear(num_features, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, num_classes)
        )
        
    def forward(self, inputs):
        global_odd = self.conv_global_odd(inputs[0])
        global_even = self.conv_global_even(inputs[1])
        local_odd = self.conv_local_odd(inputs[2])
        local_even = self.conv_local_even(inputs[3])
        
        global_odd = global_odd.view(global_odd.size(0), -1)
        global_even = global_even.view(global_even.size(0), -1)
        local_odd = local_odd.view(local_odd.size(0), -1)
        local_even = local_even.view(local_even.size(0), -1)
        
        # Concatenación de todas las ramas
        x = torch.cat((global_odd, global_even, local_odd, local_even), dim=1)
        x = self.fc(x)
        return F.softmax(x, dim=1)
    

## Cargado de datos

In [3]:
def load_data(path, file_name):
    lc = LightCurveWaveletGlobalLocalCollection.from_pickle(path + file_name)
    try:
        getattr(lc, 'levels')
    except AttributeError:
        lc.levels = [1, 2, 3, 4]
    return lc
        
path='all_data/'
files = os.listdir(path)
kepler_files = [f for f in files if f.endswith('.pickle')]
light_curves = []

for file in tqdm(kepler_files, desc='Loading data'):
    light_curves.append(load_data(path, file))



Loading data: 100%|██████████| 9346/9346 [00:44<00:00, 208.12it/s]


### Separar entre los confirmados y candidatos

In [4]:
candidates = [lc for lc in light_curves if lc.headers['class'] == 'CANDIDATE']
print("Número de candidatos:", len(candidates))

confirmed = [lc for lc in light_curves if lc.headers['class'] == 'CONFIRMED' or lc.headers['class'] == 'FALSE POSITIVE']
print("Número de confirmados:", len(confirmed))

classes = [lc.headers['class'] for lc in confirmed]
classes = set(classes)
classes = {v: k for k, v in enumerate(classes)}
print("Clases:", classes)

Número de candidatos: 2046
Número de confirmados: 7300
Clases: {'FALSE POSITIVE': 0, 'CONFIRMED': 1}


In [5]:
global_odd = []
global_even = []
local_odd = []
local_even = []
labels = []

for lc in tqdm(confirmed, desc='Processing light curves'):
    global_odd.append(lc.pliegue_impar_global._light_curve.flux.value)
    global_even.append(lc.pliegue_par_global._light_curve.flux.value)
    local_odd.append(lc.pliegue_impar_local._light_curve.flux.value)
    local_even.append(lc.pliegue_par_local._light_curve.flux.value)
    # Convertir la clase a un número entero
    labels.append(classes[lc.headers['class']])
    
print("Número de datos:", len(global_odd))
print('Elementos de cada clase:', {k: labels.count(k) for k in set(labels)})


Processing light curves: 100%|██████████| 7300/7300 [00:00<00:00, 171024.91it/s]

Número de datos: 7300
Elementos de cada clase: {0: 4637, 1: 2663}





### Separar las muestras en train y test

In [6]:
items = []

for i in tqdm(range(len(global_odd)), desc='Creating items'):
    item = {
        'global_odd': global_odd[i],
        'global_even': global_even[i],
        'local_odd': local_odd[i],
        'local_even': local_even[i],
        'label': labels[i]
    }
    items.append(item)

train, test = train_test_split(items, test_size=0.3, random_state=42)
train_local_odd = torch.tensor([item['local_odd'] for item in train])
train_local_even = torch.tensor([item['local_even'] for item in train])
train_global_odd = torch.tensor([item['global_odd'] for item in train])
train_global_even = torch.tensor([item['global_even'] for item in train])
train_labels = torch.tensor([item['label'] for item in train])

test_local_odd = torch.tensor([item['local_odd'] for item in test])
test_local_even = torch.tensor([item['local_even'] for item in test])
test_global_odd = torch.tensor([item['global_odd'] for item in test])
test_global_even = torch.tensor([item['global_even'] for item in test])
test_labels = torch.tensor([item['label'] for item in test])

train_dataset = torch.utils.data.TensorDataset(train_global_odd, train_global_even, train_local_odd, train_local_even, train_labels)
test_dataset = torch.utils.data.TensorDataset(test_global_odd, test_global_even, test_local_odd, test_local_even, test_labels)

print("Tamaño del conjunto de entrenamiento:", len(train_dataset))
print("Tamaño del conjunto de prueba:", len(test_dataset))

Creating items: 100%|██████████| 7300/7300 [00:00<00:00, 1823067.53it/s]
  


Tamaño del conjunto de entrenamiento: 5110
Tamaño del conjunto de prueba: 2190


In [35]:
batch_size = 64
num_epochs = 10
learning_rate = 0.001
loss_fn = nn.CrossEntropyLoss()
model = ShallueModel(global_size=2001, local_size=201, num_classes=2)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Número de features concatenados: 28672


In [32]:
def train_fn(model, train_loader, optimizer, loss_fn):
    model.train()
    train_size = len(train_loader.dataset)
    n_batches = len(train_loader)
    total_loss = 0.0
    correct = 0
    for batch, data in enumerate(tqdm(train_loader, desc='Training')):
        global_odd, global_even, local_odd, local_even, labels = data
        # check if any tensor is empty
        if global_odd.numel() == 0 or global_even.numel() == 0 or local_odd.numel() == 0 or local_even.numel() == 0:
            continue
        # check if any tensor has nan
        if torch.isnan(global_odd).any() or torch.isnan(global_even).any() or torch.isnan(local_odd).any() or torch.isnan(local_even).any():
            continue
        
        
        optimizer.zero_grad()
        # Forward propagation
        outputs = model((global_odd.unsqueeze(1).float(), global_even.unsqueeze(1).float(), local_odd.unsqueeze(1).float(), local_even.unsqueeze(1).float()))
        
        # Compute loss and backpropagation
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        correct += (predicted == labels).sum().item()

    accuracy = correct / train_size
    train_loss = total_loss / n_batches
        
    return train_loss, accuracy


def val_fn(model, test_loader, loss_fn):
    model.eval()
    test_size = len(test_loader.dataset)
    n_batches = len(test_loader)
    total_loss = 0.0
    correct = 0
    all_labels = []
    all_predictions = []
    
    with torch.no_grad():
        for batch, data in enumerate(tqdm(test_loader, desc='Validation')):
            global_odd, global_even, local_odd, local_even, labels = data
            # check if any tensor is empty
            if global_odd.numel() == 0 or global_even.numel() == 0 or local_odd.numel() == 0 or local_even.numel() == 0:
                continue
            # check if any tensor has nan
            if torch.isnan(global_odd).any() or torch.isnan(global_even).any() or torch.isnan(local_odd).any() or torch.isnan(local_even).any():
                continue
            
            outputs = model((global_odd.unsqueeze(1).float(), global_even.unsqueeze(1).float(), local_odd.unsqueeze(1).float(), local_even.unsqueeze(1).float()))
            
            loss = loss_fn(outputs, labels)
            total_loss += loss.item()
            
            _, predicted = torch.max(outputs.data, 1)
            correct += (predicted == labels).sum().item()
            
            all_labels.extend(labels.numpy())
            all_predictions.extend(predicted.numpy())
    
    accuracy = correct / test_size
    val_loss = total_loss / n_batches
    
    f1 = f1_score(all_labels, all_predictions, average='weighted')
    precision = precision_score(all_labels, all_predictions, average='weighted')
    recall = recall_score(all_labels, all_predictions, average='weighted')
    
    return val_loss, accuracy, f1, precision, recall

In [36]:
for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}', '------'*20)
    train_loss, train_accuracy = train_fn(model, train_loader, optimizer, loss_fn)
    val_loss, val_accuracy, f1, precision, recall = val_fn(model, test_loader, loss_fn)
    
    print(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')
    print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')
    print(f'F1 Score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}')
print("Entrenamiento y validación completados.")

Epoch 1/10 ------------------------------------------------------------------------------------------------------------------------


Training: 100%|██████████| 80/80 [00:14<00:00,  5.48it/s]
Validation: 100%|██████████| 35/35 [00:01<00:00, 26.40it/s]


Train Loss: 0.5935, Train Accuracy: 0.6317
Validation Loss: 0.5614, Validation Accuracy: 0.6534
F1 Score: 0.6929, Precision: 0.6920, Recall: 0.6940
Epoch 2/10 ------------------------------------------------------------------------------------------------------------------------


Training: 100%|██████████| 80/80 [00:16<00:00,  4.93it/s]
Validation: 100%|██████████| 35/35 [00:01<00:00, 26.31it/s]


Train Loss: 0.5543, Train Accuracy: 0.6765
Validation Loss: 0.5426, Validation Accuracy: 0.6703
F1 Score: 0.7075, Precision: 0.7061, Recall: 0.7119
Epoch 3/10 ------------------------------------------------------------------------------------------------------------------------


Training: 100%|██████████| 80/80 [00:17<00:00,  4.58it/s]
Validation: 100%|██████████| 35/35 [00:01<00:00, 25.94it/s]


Train Loss: 0.5435, Train Accuracy: 0.6926
Validation Loss: 0.5383, Validation Accuracy: 0.6836
F1 Score: 0.7219, Precision: 0.7208, Recall: 0.7260
Epoch 4/10 ------------------------------------------------------------------------------------------------------------------------


Training: 100%|██████████| 80/80 [00:20<00:00,  3.97it/s]
Validation: 100%|██████████| 35/35 [00:01<00:00, 26.35it/s]


Train Loss: 0.5317, Train Accuracy: 0.7020
Validation Loss: 0.5308, Validation Accuracy: 0.6904
F1 Score: 0.7268, Precision: 0.7271, Recall: 0.7333
Epoch 5/10 ------------------------------------------------------------------------------------------------------------------------


Training: 100%|██████████| 80/80 [00:22<00:00,  3.55it/s]
Validation: 100%|██████████| 35/35 [00:01<00:00, 26.11it/s]


Train Loss: 0.5274, Train Accuracy: 0.7090
Validation Loss: 0.5302, Validation Accuracy: 0.6886
F1 Score: 0.7176, Precision: 0.7261, Recall: 0.7313
Epoch 6/10 ------------------------------------------------------------------------------------------------------------------------


Training: 100%|██████████| 80/80 [00:22<00:00,  3.55it/s]
Validation: 100%|██████████| 35/35 [00:01<00:00, 26.40it/s]


Train Loss: 0.5194, Train Accuracy: 0.7155
Validation Loss: 0.5233, Validation Accuracy: 0.7005
F1 Score: 0.7362, Precision: 0.7384, Recall: 0.7439
Epoch 7/10 ------------------------------------------------------------------------------------------------------------------------


Training: 100%|██████████| 80/80 [00:31<00:00,  2.56it/s]
Validation: 100%|██████████| 35/35 [00:01<00:00, 26.52it/s]


Train Loss: 0.5154, Train Accuracy: 0.7235
Validation Loss: 0.5273, Validation Accuracy: 0.6991
F1 Score: 0.7311, Precision: 0.7379, Recall: 0.7425
Epoch 8/10 ------------------------------------------------------------------------------------------------------------------------


Training: 100%|██████████| 80/80 [00:26<00:00,  3.05it/s]
Validation: 100%|██████████| 35/35 [00:01<00:00, 26.32it/s]


Train Loss: 0.5106, Train Accuracy: 0.7284
Validation Loss: 0.5208, Validation Accuracy: 0.7027
F1 Score: 0.7371, Precision: 0.7414, Recall: 0.7464
Epoch 9/10 ------------------------------------------------------------------------------------------------------------------------


Training: 100%|██████████| 80/80 [00:29<00:00,  2.74it/s]
Validation: 100%|██████████| 35/35 [00:01<00:00, 26.58it/s]


Train Loss: 0.5120, Train Accuracy: 0.7427
Validation Loss: 0.5070, Validation Accuracy: 0.7196
F1 Score: 0.7632, Precision: 0.7625, Recall: 0.7643
Epoch 10/10 ------------------------------------------------------------------------------------------------------------------------


Training: 100%|██████████| 80/80 [00:25<00:00,  3.15it/s]
Validation: 100%|██████████| 35/35 [00:01<00:00, 26.70it/s]

Train Loss: 0.4947, Train Accuracy: 0.7442
Validation Loss: 0.5158, Validation Accuracy: 0.7078
F1 Score: 0.7499, Precision: 0.7489, Recall: 0.7517
Entrenamiento y validación completados.



