# Importación de librerías

In [375]:
import numpy as np
import polars as pl
import torch
from torch.utils.data import Dataset
from torch.utils.data import random_split
import torch.nn.functional as F
import torch.nn as nn
from torchmetrics.regression import MeanSquaredError, MeanAbsoluteError

# Lectura de datos

In [376]:
solarFlareDataset = pl.read_csv("doc/flare.data1", 
    dtypes={
        "column_1": pl.Categorical,
        "column_2": pl.Categorical,
        "column_3": pl.Categorical}, 
    skip_rows=1, has_header= False, separator= " ", 
    new_columns=["modified Zurich class","largest spot size","spot distribution","activity","evolution", 
                "previous 24 hour flare activity", "historically-complex", "became complex on this pass", "area", 
                "area of largest spot", "common flares", "moderate flares", "severe flares"],
    null_values=['?']).drop_nulls()

solarFlareDataset

modified Zurich class,largest spot size,spot distribution,activity,evolution,previous 24 hour flare activity,historically-complex,became complex on this pass,area,area of largest spot,common flares,moderate flares,severe flares
cat,cat,cat,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""C""","""S""","""O""",1,2,1,1,2,1,2,0,0,0
"""D""","""S""","""O""",1,3,1,1,2,1,2,0,0,0
"""C""","""S""","""O""",1,3,1,1,2,1,1,0,0,0
"""D""","""S""","""O""",1,3,1,1,2,1,2,0,0,0
"""D""","""A""","""O""",1,3,1,1,2,1,2,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…
"""C""","""R""","""O""",1,2,1,2,2,1,2,0,0,0
"""D""","""R""","""O""",1,3,1,1,2,1,2,0,0,0
"""E""","""A""","""O""",1,3,1,1,2,1,2,0,0,0
"""C""","""R""","""O""",1,3,1,1,2,1,1,0,0,0


In [377]:
df_columnas_categoricas = solarFlareDataset.select([pl.col(pl.Categorical)])
print(f"Nombres columnas categóricas: \n\t{df_columnas_categoricas.columns}")

dummies = [[ {"nome": columna, 'valor': i}
        for i in df_columnas_categoricas.get_column(columna).cat.get_categories()
    ] for columna in df_columnas_categoricas.columns]
dummies_flat = [item for row in dummies for item in row]
display(dummies_flat)

novo_expr = [(pl.col(item['nome']) == item["valor"] ).alias(f'{item["nome"]}-{item["valor"]}') for item in dummies_flat]
print(novo_expr)

Nombres columnas categóricas: 
	['modified Zurich class', 'largest spot size', 'spot distribution']


[{'nome': 'modified Zurich class', 'valor': 'C'},
 {'nome': 'modified Zurich class', 'valor': 'D'},
 {'nome': 'modified Zurich class', 'valor': 'B'},
 {'nome': 'modified Zurich class', 'valor': 'F'},
 {'nome': 'modified Zurich class', 'valor': 'H'},
 {'nome': 'modified Zurich class', 'valor': 'E'},
 {'nome': 'largest spot size', 'valor': 'S'},
 {'nome': 'largest spot size', 'valor': 'A'},
 {'nome': 'largest spot size', 'valor': 'K'},
 {'nome': 'largest spot size', 'valor': 'R'},
 {'nome': 'largest spot size', 'valor': 'X'},
 {'nome': 'largest spot size', 'valor': 'H'},
 {'nome': 'spot distribution', 'valor': 'O'},
 {'nome': 'spot distribution', 'valor': 'I'},
 {'nome': 'spot distribution', 'valor': 'X'},
 {'nome': 'spot distribution', 'valor': 'C'}]

[<Expr ['[(col("modified Zurich class")…'] at 0x7FBCD8DF0940>, <Expr ['[(col("modified Zurich class")…'] at 0x7FBCD8C32D70>, <Expr ['[(col("modified Zurich class")…'] at 0x7FBCD8C32020>, <Expr ['[(col("modified Zurich class")…'] at 0x7FBCD861ABF0>, <Expr ['[(col("modified Zurich class")…'] at 0x7FBCD86188E0>, <Expr ['[(col("modified Zurich class")…'] at 0x7FBCD8618310>, <Expr ['[(col("largest spot size")) ==…'] at 0x7FBCD8619060>, <Expr ['[(col("largest spot size")) ==…'] at 0x7FBCD8618F10>, <Expr ['[(col("largest spot size")) ==…'] at 0x7FBCD861BFD0>, <Expr ['[(col("largest spot size")) ==…'] at 0x7FBCD8618B80>, <Expr ['[(col("largest spot size")) ==…'] at 0x7FBCD861A9E0>, <Expr ['[(col("largest spot size")) ==…'] at 0x7FBCD861A680>, <Expr ['[(col("spot distribution")) ==…'] at 0x7FBCD87ACF70>, <Expr ['[(col("spot distribution")) ==…'] at 0x7FBCD87AF520>, <Expr ['[(col("spot distribution")) ==…'] at 0x7FBCD87AE2F0>, <Expr ['[(col("spot distribution")) ==…'] at 0x7FBCD87AEDA0>]


In [378]:
grupo_numericos = solarFlareDataset.select([pl.col(pl.Int64)])
medias = torch.tensor(grupo_numericos.mean().to_numpy()).squeeze()
stds = torch.tensor(grupo_numericos.std().to_numpy()).squeeze()

# StandardScaler

In [379]:
class StandardScaler:

    def __init__(self, mean=None, std=None, epsilon=1e-7):
        """Standard Scaler.
        The class can be used to normalize PyTorch Tensors using native functions. The module does not expect the
        tensors to be of any specific shape; as long as the features are the last dimension in the tensor, the module
        will work fine.
        :param mean: The mean of the features. The property will be set after a call to fit.
        :param std: The standard deviation of the features. The property will be set after a call to fit.
        :param epsilon: Used to avoid a Division-By-Zero exception.
        """
        self.mean = mean
        self.std = std
        self.epsilon = epsilon

    def fit(self, values):
        dims = list(range(values.dim() - 1))
        self.mean = torch.mean(values, dim=dims)
        self.std = torch.std(values, dim=dims)
        

    def transform(self, values):
        return (values - self.mean) / (self.std + self.epsilon)

    def fit_transform(self, values):
        self.fit(values)
        return self.transform(values)

    def __call__(self, sample):
        values,saidas = sample
        return ((values - self.mean) / (self.std + self.epsilon), saidas)
        
    def __repr__(self):
        return f"mean: {self.mean}, std:{self.std}, epsilon:{self.epsilon}"

In [380]:
scaler = StandardScaler(medias, stds)
display(scaler)

mean: tensor([1.1393, 2.4861, 1.1920, 1.3684, 1.9474, 1.0279, 1.7554, 0.1331, 0.1362,
        0.0217], dtype=torch.float64), std:tensor([0.3468, 0.6020, 0.5900, 0.4831, 0.2236, 0.1648, 0.4305, 0.3990, 0.4794,
        0.1458], dtype=torch.float64), epsilon:1e-07

In [381]:
class SolarDataset(Dataset):
  def __init__(self, src_file, root_dir, transform=None, expr_dummies = None):
   self.transform = transform
   self.expr_dummies = expr_dummies
   self.dataSet = pl.scan_csv(src_file, dtypes={
           "column_1": pl.Categorical,
           "column_2": pl.Categorical,
           "column_3": pl.Categorical,
           "column_4": pl.Int64,
           "column_5": pl.Int64,
           "column_6": pl.Int64,
           "column_7": pl.Int64,
           "column_8": pl.Int64,
           "column_9": pl.Int64,
           "column_10": pl.Int64,
           "column_11": pl.Int64,
           "column_12": pl.Int64,
           "column_13": pl.Int64,
       },
       has_header=False,
       skip_rows=1,
       separator=' ',null_values=['?']).drop_nulls().rename({
           "column_1": "modified Zurich class",
           "column_2": "largest spot size",
           "column_3": "spot distribution",
           "column_4": "activity",
           "column_5": "evolution",
           "column_6": "previous 24 hour flare activity",
           "column_7": "historically-complex",
           "column_8": "became complex on this pass",
           "column_9": "area",
           "column_10": "area of largest spot",
           "column_11": "common flares",
           "column_12": "moderate flares",
           "column_13": "severe flares"
       }).with_row_index("id")


  def __len__(self):
   return self.dataSet.select(pl.len()).collect().item()
  def __getitem__(self, idx):
   if torch.is_tensor(idx):
     idx = idx.tolist()
   else:
     idx = [idx]
  
   seccion = self.dataSet.filter(pl.col("id").is_in(idx)).drop("id")
   datos = seccion.collect()

   datosNumericos = datos.select([pl.col(pl.Int64)])
   predsA =  self.transform.transform(torch.tensor(datosNumericos.to_numpy()).squeeze()) 

   # en predsC vou a ter as columnas feitas dummies
   predsC = datos.select([pl.col(pl.Categorical)]).with_columns(
    self.expr_dummies).drop(datos.select([pl.col(pl.Categorical)]).columns)
  
   tensorB = torch.tensor(predsC.to_numpy().astype(np.int32)).squeeze()

   entrada = torch.cat((predsA[:7], tensorB),dim=-1)
   
   sample = (entrada,predsA[7:] if predsA.dim() == 1 else predsA[:,7:].flatten())
   return sample


dataset = SolarDataset("doc/flare.data1",".",transform=scaler,expr_dummies=novo_expr)
print(dataset[0])


(tensor([-0.4017, -0.8074, -0.3253, -0.7626,  0.2353, -0.1690,  0.5681,  1.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000],
       dtype=torch.float64), tensor([-0.3336, -0.2841, -0.1486], dtype=torch.float64))


# División del dataset

In [382]:
lonxitudeDataset = len(dataset)

tamTrain =int(lonxitudeDataset*0.8)

tamVal = lonxitudeDataset - tamTrain
print(f"Tam dataset: {lonxitudeDataset} train: {tamTrain} tamVal: {tamVal}")

train_set, val_set = random_split(dataset,[tamTrain,tamVal])
train_ldr = torch.utils.data.DataLoader(train_set, batch_size=2,
    shuffle=True, drop_last=False)
validation_loader =torch.utils.data.DataLoader(val_set, batch_size=4, shuffle=False)    

Tam dataset: 323 train: 258 tamVal: 65


# Creación de la red neuronal

In [383]:
class Model(nn.Module):
    def __init__(self, input_dim):
        super(Model, self).__init__()
        self.layer1 = nn.Linear(input_dim, 50)
        self.layer2 = nn.Linear(50, 50)
        self.layer3 = nn.Linear(50, 3)
        
    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = self.layer3(x)
        return x

In [384]:
model     = Model(23)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn   = nn.MSELoss(reduction='sum')
compiled_model = torch.compile(model)

In [385]:
display(compiled_model)

OptimizedModule(
  (_orig_mod): Model(
    (layer1): Linear(in_features=23, out_features=50, bias=True)
    (layer2): Linear(in_features=50, out_features=50, bias=True)
    (layer3): Linear(in_features=50, out_features=3, bias=True)
  )
)

In [386]:
entradaProba,dest = next(iter(train_ldr))
display(entradaProba)

saida = model(entradaProba.to(torch.float32))
display(dest)
display(saida)

display(loss_fn(saida, dest))

tensor([[-0.4017,  0.8537, -0.3253,  1.3073,  0.2353, -0.1690,  0.5681,  1.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000],
        [-0.4017,  0.8537, -0.3253,  1.3073,  0.2353, -0.1690,  0.5681,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  1.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000]],
       dtype=torch.float64)

tensor([[-0.3336, -0.2841, -0.1486],
        [-0.3336,  1.8018, -0.1486]], dtype=torch.float64)

tensor([[-0.1555,  0.0457,  0.2225],
        [-0.1618,  0.0148,  0.1678]], grad_fn=<AddmmBackward0>)

tensor(3.6010, dtype=torch.float64, grad_fn=<MseLossBackward0>)

# Entrenamiento

In [387]:
def train_one_epoch(epoch_index, tb_writer):
    running_loss = 0.
    last_loss = 0.

    for i, data in enumerate(train_ldr):
        inputs, labels = data

        optimizer.zero_grad()

        outputs = compiled_model(inputs.to(torch.float32))

        loss = loss_fn(outputs.to(torch.float64), labels)
        loss.backward()

        optimizer.step()

        running_loss += loss.item()

        if i % 10 == 9:
            last_loss = running_loss / 10 
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            
            running_loss = 0.
    
    print("\n")
    return last_loss

In [388]:
epoch_number = 0

EPOCHS = 2

best_vloss = 1_000_000.
loss_list     = np.zeros((EPOCHS,))
accuracy_list = np.zeros((EPOCHS,))

mse_metric = MeanSquaredError()
mae_metric = MeanAbsoluteError()


for epoch in range(EPOCHS):
    print('\nEPOCH {}:'.format(epoch + 1))

    compiled_model.train(True)
    avg_loss = train_one_epoch(epoch, None)
    loss_list[epoch] = avg_loss

    compiled_model.train(False)

    running_vloss = 0.0

    for i, vdata in enumerate(validation_loader):
        vinputs, vlabels = vdata
        print(f"Entrada: {vinputs}")

        voutputs = compiled_model(vinputs.to(torch.float32))
        vloss = loss_fn(voutputs, vlabels)

        print(f"\nSalidas deseadas: {vlabels}")
        print(f"\nSalidas: {voutputs}")
        print("<---------------------------------------------------------->")

        mse = mse_metric(voutputs, vlabels)
        mae = mae_metric(voutputs, vlabels)

        running_vloss += vloss
    
    mse = mse_metric.compute()
    mae = mae_metric.compute()

    print("\nMSE: ", mse)
    print("\nMAE: ", mae)


EPOCH 1:
  batch 10 loss: 1.8664343052136758
  batch 20 loss: 1.707137810418876
  batch 30 loss: 4.128366704872823
  batch 40 loss: 14.07966108638612
  batch 50 loss: 2.7777279104413184
  batch 60 loss: 1.3521344952679564
  batch 70 loss: 9.361524570034797
  batch 80 loss: 15.503384550116905
  batch 90 loss: 5.231255487522427
  batch 100 loss: 8.124717189939268
  batch 110 loss: 2.132339250147712
  batch 120 loss: 1.8221802858711782


Entrada: tensor([[ 2.4817,  0.8537, -0.3253,  1.3073,  0.2353, -0.1690,  0.5681,  0.0000,
          1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000],
        [-0.4017, -2.4686, -0.3253,  1.3073,  0.2353, -0.1690, -1.7547,  1.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          1.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000],
        [-0.4017, -2.4686, -0.3253,  1.3073,  0.2353, -0.1690,  0.5681,  1.0000,
