In [1]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import torch.nn as nn
import torch
import torch.autograd as autograd
import numpy as np
import torch.optim as optim

ModuleNotFoundError: No module named 'torch'

## Read and Reformat the data

In [None]:
# CSV that is loaded with pd.read_csv as the raw input table.
data_file = "../data/220720PM25diffsite.csv"
# Destination CSV for the feature matrix plus the final model's predictions.
output_file = "../result/nn_selected_features.csv"

In [None]:
# Load the raw table; the label and feature columns are selected from it in later cells.
df = pd.read_csv(data_file)

In [None]:
# Label column ("pm25") followed by the reduced set of predictors.
selected_features = [
    "pm25", "ENSOmonthly",
    "eNOx", "SO2emis", "PM25emis", "eVOC", "NH3emis",
    "TMAXbarstow", "AWNDLAX", "Mir850RH", "Rhontario",
    "dayofweekf", "dayofyear",
]
# Full set: the selected columns in the same order, followed by the extra predictors.
# Concatenation builds a new list, so the two lists stay independent objects.
all_features = selected_features + [
    "MirTemp500C", "MirWS850ms", "MirWD850", "MirHeight850", "MirWS500ms", "MirWD500", "Mir500RH",
    "SRmeanC", "AWNDbarstow", "TMAXLAX", "TMAXontario", "AWNDontario",
]

In [None]:
# Restrict the raw table to the label plus every candidate predictor.
df_all_features = df[all_features]

### Decide the features (for all features or selected features)

In [None]:
# Rows are dropped if ANY column of df_all_features is missing, so the same samples
# are used whichever feature set is chosen below.
dataset = df_all_features.dropna()
label_name = "pm25"
# Double brackets keep the label as a one-column DataFrame (2-D once converted to NumPy).
y_vector = dataset[[label_name]]
# change it for all features or selected features
# features_names = all_features.copy()
features_names = selected_features.copy()
features_names.remove(label_name)
# Later cells write into X_matrix (weekday encoding, prediction column). Taking an
# explicit copy makes it an independent frame, so those writes do not raise
# SettingWithCopyWarning and can never be confused with writes into `dataset`.
X_matrix = dataset[features_names].copy()

In [None]:
def dayofweekToNum(data_frame):
    """Return a copy of ``data_frame`` with "dayofweekf" encoded as integers 1-7.

    The three-letter weekday labels are mapped Mon=1 ... Sun=7.

    The encoded column is given an explicit int64 dtype. Writing the numbers
    back with ``data_frame.loc[:, "dayofweekf"] = ...`` keeps the column's
    ``object`` dtype on pandas >= 2.0, which in turn makes
    ``DataFrame.to_numpy()`` return an object array that cannot be converted
    to a tensor.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame containing a "dayofweekf" column of weekday labels. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index, "dayofweekf" as int64.

    Raises
    ------
    KeyError
        If the column is missing or contains a value that is not one of the
        seven known labels (including NaN).
    """
    day_mapping = {"Mon": 1, "Tue": 2, "Wed": 3, "Thu": 4, "Fri": 5, "Sat": 6, "Sun": 7}
    labels = data_frame["dayofweekf"]
    # Series.map would silently turn unknown labels into NaN; fail loudly instead,
    # as a plain dictionary lookup would.
    unknown = labels[~labels.isin(list(day_mapping))]
    if len(unknown) > 0:
        raise KeyError(unknown.iloc[0])
    encoded = labels.map(day_mapping).astype("int64")
    return data_frame.assign(dayofweekf=encoded)
# Show the feature frame, then rebind X_matrix to the result of the weekday encoding.
print(X_matrix)
X_matrix = dayofweekToNum(X_matrix)

In [None]:
# Predictor matrix as a NumPy array (rows = samples, columns = features).
features_data = X_matrix.to_numpy()
# The column count is passed to the network as the size of its input layer.
_, num_features = features_data.shape
# y_vector is a one-column DataFrame, so this array is 2-D with a single column.
label_data = y_vector.to_numpy()
# split the data for 10-fold cross validation
# (shuffled with a fixed random_state, so the fold assignment is repeatable)
kf = KFold(n_splits=10, shuffle=True, random_state=100)

## Neural Network

In [None]:
# DEFINE ACTIVATION FUNCTION
# ACTIVATION = torch.tanh

class NeuralNetwork(nn.Module):
    """Fully connected regression network producing one value per sample.

    Layout: input linear layer -> ``num_layers`` x (linear layer followed by
    the activation) -> output linear layer. The activation is not applied
    directly after the input layer nor after the output layer.

    Parameters
    ----------
    num_features : int
        Width of the input.
    num_layers : int
        Number of hidden (linear + activation) stages.
    num_hiddens : int
        Width of the input layer's output and of every hidden layer.
    ACTIVATION : callable
        Function applied to the output of each hidden linear layer.
    """

    def __init__(self, num_features, num_layers, num_hiddens, ACTIVATION):
        super().__init__()
        self.num_layers = num_layers
        self.fcs = []
        self.activate = ACTIVATION
        # Input projection: num_features -> num_hiddens.
        self.input_fc = nn.Linear(num_features, num_hiddens)
        # Hidden stack. Each layer is also stored as attribute fc0, fc1, ...:
        # that attribute assignment is what registers the layer (and its
        # parameters) with the module, the plain list is only used for iteration.
        for layer_index in range(num_layers):
            hidden_layer = nn.Linear(num_hiddens, num_hiddens)
            setattr(self, "fc%i" % layer_index, hidden_layer)
            self.fcs.append(hidden_layer)
        # Output projection: num_hiddens -> 1.
        self.output_layer = nn.Linear(num_hiddens, 1)

    def forward(self, x):
        """Run ``x`` through the input layer, the hidden stack and the output layer."""
        hidden_state = self.input_fc(x)
        for layer_index in range(self.num_layers):
            hidden_state = self.activate(self.fcs[layer_index](hidden_state))
        return self.output_layer(hidden_state)

In [None]:
# Mean-squared-error criterion, averaged over all elements.
loss_function = nn.MSELoss(reduction='mean')
def loss_func(model, features, true_values):
    """Return the mean squared error of ``model`` on ``features``.

    Parameters
    ----------
    model : torch.nn.Module
        Network to evaluate.
    features : torch.Tensor
        Input batch passed to the model.
    true_values : torch.Tensor
        Targets compared against the model output.

    Returns
    -------
    torch.Tensor
        Scalar loss tensor (still attached to the graph, so ``backward()`` works).
    """
    # Call the module itself rather than model.forward(): nn.Module.__call__ is the
    # supported entry point and is what runs any registered forward hooks.
    predict_values = model(features)
    return loss_function(predict_values, true_values)

## Hyperparameters
ReLU activation function; 3 hidden layers with 40 neurons each; 5000 training iterations (each on the full training set); Adam optimizer

In [None]:
# hyperparameters and net
# Activation handed to NeuralNetwork and applied after each hidden layer.
ACTIVATION = torch.relu
# Each tuple is (number of hidden layers, neurons per hidden layer).
# Selected features: 3 layers with 40 nodes; all features: 2 layers with 60 nodes.
net_parameters = [(3, 40)]
# Number of optimizer steps in every training run.
max_iter = 5000

In [None]:
# Fix the PyTorch RNG so that weight initialisation -- and therefore every metric
# computed below -- is the same on each "Restart & Run All". The value 100 matches
# the random_state already used for the KFold split.
torch.manual_seed(100)

# torch.as_tensor accepts both the NumPy arrays built earlier and tensors left over
# from a previous execution of this cell, so the cell can safely be re-run.
features_data = torch.as_tensor(features_data, dtype=torch.float32)
label_data = torch.as_tensor(label_data, dtype=torch.float32)


def _train_network(net_parameter, features, labels):
    """Build a fresh network and fit it with Adam on the whole of ``features``.

    ``net_parameter`` is a (hidden layers, neurons per layer) tuple. Returns
    ``(net, loss)``: the fitted network switched to eval mode and the loss
    tensor from the last iteration (``None`` when ``max_iter`` is 0).
    """
    net = NeuralNetwork(num_features, net_parameter[0], net_parameter[1], ACTIVATION)
    optimizer = optim.Adam(net.parameters(), lr=0.001, betas=(0.9, 0.999))
    loss = None
    for _ in range(max_iter):
        loss = loss_func(net, features, labels)
        optimizer.zero_grad()     # zeroes the gradient buffers of all parameters
        loss.backward()           # backprop
        optimizer.step()
    net.eval()
    return net, loss


def _predict(net, features):
    """Return the network output for ``features`` as a NumPy array (no gradients)."""
    with torch.no_grad():
        return net(features).numpy()


def _r2_and_rmse(true_values, predicted_values):
    """Return (R2, RMSE) of ``predicted_values`` against the tensor ``true_values``."""
    truth = true_values.numpy()
    r2 = r2_score(truth, predicted_values)
    # The `squared=False` keyword of mean_squared_error is deprecated/removed in
    # recent scikit-learn releases; taking the square root works on every version.
    rmse = np.sqrt(mean_squared_error(truth, predicted_values))
    return r2, rmse


data_rows = []             # per configuration: [layers, neurons, mean test R2, mean test RMSE]
training_data_rows = []    # same layout, computed on the training folds
final_r2 = 0
final_rmse = 0
final_mbe = 0
final_prediction = None
for net_parameter in net_parameters:
    r2_res = []
    rmse_res = []
    r2_res_training = []
    rmse_res_training = []
    for train_index, test_index in kf.split(features_data):
        X_train, X_test = features_data[train_index], features_data[test_index]
        y_train, y_test = label_data[train_index], label_data[test_index]

        # a new, independently initialised network is trained for every fold
        net, loss = _train_network(net_parameter, X_train, y_train)
        print(loss)

        # test data performance
        fold_r2, fold_rmse = _r2_and_rmse(y_test, _predict(net, X_test))
        r2_res.append(fold_r2)
        rmse_res.append(fold_rmse)
        # training data performance
        fold_r2, fold_rmse = _r2_and_rmse(y_train, _predict(net, X_train))
        r2_res_training.append(fold_r2)
        rmse_res_training.append(fold_rmse)

    # fold-averaged metrics: testing part, then training part
    data_rows.append(
        [net_parameter[0], net_parameter[1], np.mean(r2_res), np.mean(rmse_res)]
    )
    training_data_rows.append(
        [net_parameter[0], net_parameter[1], np.mean(r2_res_training), np.mean(rmse_res_training)]
    )

    # train by all data; the final_* names keep the values of the last configuration
    net, _ = _train_network(net_parameter, features_data, label_data)
    predict_res = _predict(net, features_data)
    final_r2, final_rmse = _r2_and_rmse(label_data, predict_res)
    # mean bias error: average of (prediction - observation)
    final_mbe = np.mean(predict_res - label_data.numpy())
    final_prediction = predict_res

## Network Structure

In [None]:
# Build a network with the first configuration purely to print its layer structure.
# NOTE(review): this rebinds `net`, so the model trained in the previous cell is no
# longer reachable under that name and this instance is untrained -- confirm that no
# later step expects `net` to be the fitted model.
net = NeuralNetwork(num_features, net_parameters[0][0], net_parameters[0][1], ACTIVATION)
print(net)

## Cross Validation Results

In [None]:
# for Table S4
# Report the fold-averaged metrics of the first configuration, training folds first.
for split_title, metric_rows in (("Training Data", training_data_rows), ("Testing Data", data_rows)):
    print(split_title)
    first_row = metric_rows[0]
    print("R2 = %f   RMSE = %f" % (first_row[2], first_row[3]))

## Final Model Performance

In [None]:
# for Table 1
# Metrics of the model that was trained on the complete data set and evaluated on that
# same data; MBE is the mean of (prediction - label).
print("R2 = %f   RMSE = %f    MBE = %f" %(final_r2, final_rmse, final_mbe))

## Generate Data for Annual Evaluation

In [None]:
# Attach the final model's predictions to the feature frame as a new column.
# NOTE(review): X_matrix holds the feature columns only (the label was removed from
# features_names), so the CSV contains features and predictions but not the observed
# pm25 values -- confirm this is what the downstream evaluation expects.
X_matrix['predict_values'] = final_prediction
# save to csv
X_matrix.to_csv(output_file, index=False)