In [3]:
import numpy as np
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [4]:
from data_preprocessing import load_X_y
import pandas as pd

weather_df, pollutant_df = load_X_y()

# Min-max scale every weather column: subtract the column minimum, then divide
# by the column range (max - min).
# NOTE(review): the min/max are taken over the whole frame, which also covers the
# rows later sliced off as validation/test data -- confirm this is intended.
weather_min = weather_df.min()
weather_span = weather_df.max() - weather_min
weather_df_normalized = weather_df.sub(weather_min).div(weather_span)

# Target series: the 'P2.5' column only; inputs: all normalized weather columns.
pollutant_tensor = torch.tensor(pollutant_df['P2.5'].values)
weather_tensor = torch.tensor(weather_df_normalized.values)

In [29]:
from torch.utils.data import Dataset

# creating a custom dataset in a sliding window manner
class WeatherPollutantDataset(Dataset):
    """Sliding-window dataset over aligned weather and pollutant series.

    Sample ``i`` pairs the weather rows ``i, i+1, ..., i+window-1`` (transposed
    so the second axis of ``weather`` comes first) with the pollutant value at
    row ``i+window``.

    Args:
        weather: 2-D tensor whose first axis is the time step; ``__getitem__``
            calls ``permute(1, 0)`` on each window, so exactly two dimensions
            are required.
        pollutant: tensor indexed by time step along its first axis; must have
            the same length as ``weather``.
        window: number of consecutive weather rows in each sample (>= 1).

    Raises:
        AssertionError: if ``weather`` and ``pollutant`` differ in length.
        ValueError: if ``window`` is smaller than 1.
    """

    def __init__(self, weather: torch.Tensor, pollutant: torch.Tensor, window:int):
        self.weather = weather
        self.pollutant = pollutant
        # assumes the data starts on the same day
        # assumes their length is the same
        assert len(self.weather) == len(self.pollutant), (
            "weather and pollutant must have the same length"
        )
        if window < 1:
            raise ValueError(f"window must be >= 1, got {window}")
        self.window = window

    def __getitem__(self, index: int):
        """
        Using weather input on day i,i+1,...,i+window-1 to predict pollutant output on i+window

        Negative indices count from the end, like a list. An index outside
        ``[-len(self), len(self))`` raises ``IndexError`` instead of silently
        returning an empty window paired with an unrelated target.
        """
        n_samples = len(self)
        if index < 0:
            index += n_samples
        if not 0 <= index < n_samples:
            raise IndexError(
                f"index out of range for dataset with {n_samples} samples"
            )
        target_row = index + self.window
        weather_input = self.weather[index:target_row].permute(1,0)
        pollutant_output = self.pollutant[target_row]
        return weather_input, pollutant_output

    def __len__(self):
        """Number of complete (window, target) pairs; 0 if the series is too short."""
        # Clamp at zero: a negative value would make the built-in len() raise.
        return max(len(self.weather) - self.window, 0)

In [30]:
# Contiguous, non-overlapping row ranges for the three splits, taken in
# order along the first axis of the tensors.
WINDOW_DAYS = 14
train_rows = slice(0, 5014)
val_rows = slice(5014, 6028)
test_rows = slice(6028, 7042)

train_set = WeatherPollutantDataset(
    weather=weather_tensor[train_rows],
    pollutant=pollutant_tensor[train_rows],
    window=WINDOW_DAYS,
)
val_set = WeatherPollutantDataset(
    weather=weather_tensor[val_rows],
    pollutant=pollutant_tensor[val_rows],
    window=WINDOW_DAYS,
)
test_set = WeatherPollutantDataset(
    weather=weather_tensor[test_rows],
    pollutant=pollutant_tensor[test_rows],
    window=WINDOW_DAYS,
)

In [37]:
from torch.utils.data import DataLoader

# A batch size equal to the dataset length makes the loader yield the whole
# training set as a single collated (inputs, targets) pair.
full_batch_size = len(train_set)
train_loader = DataLoader(train_set, batch_size=full_batch_size)
train_batches = iter(train_loader)
train_X, train_y = next(train_batches)

In [38]:
# Convert to NumPy only while the variables are still tensors, so re-running
# this cell does not fail on an already-converted array.
if isinstance(train_X, torch.Tensor):
    train_X = train_X.numpy()
if isinstance(train_y, torch.Tensor):
    train_y = train_y.numpy()

# Flatten all trailing axes of each sample into one feature vector. Deriving
# the width from shape[1:] makes this a no-op on an array that is already 2-D
# and keeps it valid when there are zero samples.
flat_width = int(np.prod(train_X.shape[1:]))
train_X = train_X.reshape(train_X.shape[0], flat_width)

print(train_X.shape)
print(train_y.shape)

(5000, 168)
(5000,)
