# Required Imports

In [2]:
import numpy as np
import pickle
import sys
import random
import matplotlib
import matplotlib.pyplot as plt
import torch
from torch import nn
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning.metrics import functional as FM
import seaborn as sns
import gc
from sklearn.metrics import precision_recall_fscore_support as score
import sklearn as sk
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
sys.path.append('../DG/gan')
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
%matplotlib inline
import pickle
from sklearn.metrics import r2_score
# pl.utilities.seed.seed_everything(seed=2) # sets seed for pseudo-random number generators in: pytorch, numpy, python.random

# Real Data

In [2]:
# Each has three arrays: data_feature,data_attribute,data_gen_flag 
training_real = np.load('../data/web/data_train.npz')
test_real = np.load('../data/web/data_test.npz')

In [3]:
##############################################REAL#################################################
# Y here means the data that we are going to forecast it
real_train_X =  torch.from_numpy(training_real['data_feature'][:,:500]).float()
real_train_Y =  torch.from_numpy(training_real['data_feature'][:,500:]).float()

real_test_X =  torch.from_numpy(test_real['data_feature'][:,:500]).float()
real_test_Y =  torch.from_numpy(test_real['data_feature'][:,500:]).float()

# Generated Data

In [4]:
training_DG = np.load('../data_generated/web/generated_data_train.npz')
TST = np.load('WWT_generated_new.npz')
#########################################DG#################################################
DG_X =  torch.from_numpy(training_DG['data_feature'][:,:500]).float()
DG_Y =  torch.from_numpy(training_DG['data_feature'][:,500:]).float()

#########################################TST#################################################
TST_X =  torch.from_numpy(TST['X'][:,:500]).float()
TST_Y =  torch.from_numpy(TST['X'][:,500:]).float()

In [5]:
TST_X[:-1].shape

torch.Size([49999, 500, 1])

# Features & Attributes

In [3]:
with open('../data/web/data_feature_output.pkl', 'rb') as f:
    data_feature = pickle.load(f)    
with open('../data/web/data_attribute_output.pkl', 'rb') as f:
    data_attribute = pickle.load(f)

    
# data_feature is a list of 9 "output.Output" objects, where each object contains attrs -> (is_gen_flag, dim, normalization)
print("X Features")
for i,feature in enumerate(data_feature):
    print("Feature:",i+1," -- Normalization:",feature.normalization, " -- gen_flag:",feature.is_gen_flag, " -- Dim:",feature.dim)

print("\nY Features")
for i,feature in enumerate(data_attribute):
    print("Feature:",i+1," -- Normalization:",feature.normalization, " -- gen_flag:",feature.is_gen_flag, " -- Dim:",feature.dim)

X Features
Feature: 1  -- Normalization: Normalization.MINUSONE_ONE  -- gen_flag: False  -- Dim: 1

Y Features
Feature: 1  -- Normalization: None  -- gen_flag: False  -- Dim: 9
Feature: 2  -- Normalization: None  -- gen_flag: False  -- Dim: 3
Feature: 3  -- Normalization: None  -- gen_flag: False  -- Dim: 2


# Regressor Model - 1-MlP

In [13]:
class MLPModel1(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
                  nn.Flatten(start_dim=1),
                  nn.Linear(500,100),
                  nn.ReLU(),
                  nn.Linear(100,50),
                )
    def forward(self,x):
        return self.model(x)
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.mse_loss(y_hat, y.squeeze()) 
        r2score = r2_score(y_hat.cpu().detach().numpy() , y.squeeze().cpu() )
        self.log('r2_score',r2score, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return {'loss': loss,'r2_score':r2score}
    
    def test_step(self,batch,batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.mse_loss(y_hat, y.squeeze(2))
        r2score = r2_score(y_hat.cpu().detach().numpy() , y.squeeze(2).cpu() )
        self.log('r2_score',r2score, on_epoch=True, prog_bar=True, logger=True)
        return {'loss': loss,'r2_score':r2score}
    
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.001)

In [22]:
params = {'shuffle': True,'num_workers': 0,'batch_size':256}

if __name__ == '__main__':
#     dataset = torch.utils.data.TensorDataset(torch.FloatTensor(TST_X[:]).to(device),torch.FloatTensor(TST_Y[:]).to(device))
#     dataset = torch.utils.data.TensorDataset(torch.FloatTensor(real_train_X).to(device),torch.FloatTensor(real_train_Y).to(device))
    dataset = torch.utils.data.TensorDataset(torch.FloatTensor(DG_X).to(device),torch.FloatTensor(DG_Y).to(device))

#     dataset = torch.utils.data.TensorDataset(torch.cat((TST_all_X[:,:400].to(device),real_train_X[:percent,:400].to(device))),
#                                              torch.cat((TST_all_Y_labels.long().to(device),real_train_Y_labels[:percent].to(device))))

    train_dataloader  = torch.utils.data.DataLoader(dataset, **params)
    
    dataset =  torch.utils.data.TensorDataset(torch.FloatTensor(real_test_X).to(device),torch.FloatTensor(real_test_Y).to(device))
    test_dataloader  = torch.utils.data.DataLoader(dataset,batch_size=256)
    model = MLPModel1()

trainer = pl.Trainer(gpus=1,max_epochs=100,progress_bar_refresh_rate=1)
trainer.fit(model,train_dataloader)
trainer.test(test_dataloaders=test_dataloader) # No need to repass (model), It will by itself work from test_step
print("DONE")

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 55.1 K
-------------------------------------
55.1 K    Trainable params
0         Non-trainable params
55.1 K    Total params
0.221     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'r2_score': 0.8322919607162476}
--------------------------------------------------------------------------------
DONE


In [8]:
# R2 Score - 1-MLP Layer
# Real Data: 0.8915
# DG: 0.785
# TST: 0.711

# TST: 0.7754 using only 5000 samples
# TST: 0.7547 using only 28,000 samples
# TST: 0.716 using only 34,000 samples

# In TST, using 5 epochs is more than enough to get an r2 score of 0.787
# In TST, using 5 epochs and 43,000 samples -> 0.779


# MLP 5

In [13]:
class MLPModel5(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
                  nn.Flatten(start_dim=1),
                  nn.Linear(500,200),nn.ReLU(),
                  nn.Linear(200,200),nn.ReLU(),
                  nn.Linear(200,200),nn.ReLU(),
                  nn.Linear(200,200),nn.ReLU(),
                  nn.Linear(200,200),nn.ReLU(),
                  nn.Linear(200,50),
                )
    def forward(self,x):
        return self.model(x)
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.mse_loss(y_hat, y.squeeze()) 
        r2score = r2_score(y_hat.cpu().detach().numpy() , y.squeeze().cpu() )
        self.log('r2_score',r2score, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return {'loss': loss,'r2_score':r2score}
    
    def test_step(self,batch,batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.mse_loss(y_hat, y.squeeze(2))
        r2score = r2_score(y_hat.cpu().detach().numpy() , y.squeeze(2).cpu() )
        self.log('r2_score',r2score, on_epoch=True, prog_bar=True, logger=True)
        return {'loss': loss,'r2_score':r2score}
    
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.001)

In [32]:
params = {'shuffle': True,'num_workers': 0,'batch_size':64}

if __name__ == '__main__':
    dataset = torch.utils.data.TensorDataset(torch.FloatTensor(TST_X).to(device),torch.FloatTensor(TST_Y).to(device))
#     dataset = torch.utils.data.TensorDataset(torch.FloatTensor(real_train_X).to(device),torch.FloatTensor(real_train_Y).to(device))
#     dataset = torch.utils.data.TensorDataset(torch.FloatTensor(DG_X).to(device),torch.FloatTensor(DG_Y).to(device))

#     dataset = torch.utils.data.TensorDataset(torch.cat((TST_all_X[:,:400].to(device),real_train_X[:percent,:400].to(device))),
#                                              torch.cat((TST_all_Y_labels.long().to(device),real_train_Y_labels[:percent].to(device))))

    train_dataloader  = torch.utils.data.DataLoader(dataset, **params)
    
    dataset =  torch.utils.data.TensorDataset(torch.FloatTensor(real_test_X).to(device),torch.FloatTensor(real_test_Y).to(device))
    test_dataloader  = torch.utils.data.DataLoader(dataset,batch_size=16)
    model = MLPModel1()

trainer = pl.Trainer(gpus=1,max_epochs=100,progress_bar_refresh_rate=1)
trainer.fit(model,train_dataloader)
trainer.test(test_dataloaders=test_dataloader) # No need to repass (model), It will by itself work from test_step
print("DONE")

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 110 K 
-------------------------------------
110 K     Trainable params
0         Non-trainable params
110 K     Total params
0.441     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'r2_score': 0.7324246168136597}
--------------------------------------------------------------------------------
DONE


# Other ML Models

In [9]:
def other_models(train_X, train_Y,test_X,test_Y):
    lor = LinearRegression()
    lor.fit(train_X, train_Y)
    print('Linear Regression',lor.score(test_X,test_Y))
    
    kr = KernelRidge()
    kr.fit(train_X,train_Y)
    print('Kernel Ridge',kr.score(test_X,test_Y)) 
  

In [10]:
other_models(torch.flatten(TST_X,1),torch.flatten(TST_Y,1),torch.flatten(real_test_X,1),torch.flatten(real_test_Y,1))

Linear Regression -467.3841332060692


  overwrite_a=False)


Kernel Ridge 0.8117641893542101


In [16]:
other_models(torch.flatten(real_train_X,1),torch.flatten(real_train_Y,1),torch.flatten(real_test_X,1),torch.flatten(real_test_Y,1))

Linear Regression 0.9088504188664293


  overwrite_a=False)


Kernel Ridge 0.90875846680532


In [17]:
other_models(torch.flatten(DG_X,1),torch.flatten(DG_Y,1),torch.flatten(real_test_X,1),torch.flatten(real_test_Y,1))

Linear Regression 0.8132905114602174


  overwrite_a=False)


Kernel Ridge 0.8206123184299539


In [13]:
other_models(torch.flatten(torch.cat((real_train_X,TST_X)),1),torch.flatten(torch.cat((real_train_Y,TST_Y)),1),
             torch.flatten(real_test_X,1),torch.flatten(real_test_Y,1))

Linear Regression 0.9007136549766838


MemoryError: Unable to allocate 37.3 GiB for an array with shape (100000, 100000) and data type float32