In [1]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import torch.nn as nn
import torch
import torch.autograd as autograd
import numpy as np
import torch.optim as optim
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix
from string import ascii_uppercase

## Read and Reformat the data

In [2]:
data_file = "../data/220720PM25diffsite.csv"
output_file = "../result/nn_selected_features.csv"

In [3]:
df = pd.read_csv(data_file)

In [4]:
selected_features = ["pm25", "ENSOmonthly"
           ,"eNOx","SO2emis","PM25emis","eVOC","NH3emis"
           ,"TMAXbarstow","AWNDLAX","Mir850RH","Rhontario"
           ,"dayofweekf","dayofyear"]
all_features = ["pm25", "ENSOmonthly"
              ,"eNOx","SO2emis","PM25emis","eVOC","NH3emis"
              ,"TMAXbarstow","AWNDLAX","Mir850RH","Rhontario"
              ,"dayofweekf","dayofyear"
              ,"MirTemp500C","MirWS850ms","MirWD850","MirHeight850","MirWS500ms","MirWD500","Mir500RH"
              ,"SRmeanC","AWNDbarstow","TMAXLAX","TMAXontario","AWNDontario"]

In [5]:
df_all_features = df[all_features]

### Decide the features (for all features or selected features)

In [6]:
dataset = df_all_features.dropna()
label_name = "pm25"
y_vector = dataset[[label_name]]
# change it for all features or selected features
# features_names = all_features.copy()
features_names = selected_features.copy()
features_names.remove(label_name)
X_matrix = dataset[features_names]

In [7]:
def dayofweekToNum(data_frame):
    day_mapping = {"Mon": 1, "Tue": 2, "Wed": 3, "Thu": 4, "Fri": 5, "Sat": 6, "Sun": 7}
    dayofweekf = data_frame["dayofweekf"].to_numpy()
    res = []
    for i in range(0, len(dayofweekf)):
        res.append(day_mapping[dayofweekf[i]])
    data_frame.loc[:, ("dayofweekf")] = res
    return data_frame
print(X_matrix)
X_matrix = dayofweekToNum(X_matrix)

      ENSOmonthly      eNOx  SO2emis  PM25emis     eVOC  NH3emis  TMAXbarstow  \
3           24.78  1007.938   62.837    78.766  999.205   94.759         16.7   
4           24.78  1007.938   62.837    78.766  999.205   94.759         16.7   
6           24.78  1007.938   62.837    78.766  999.205   94.759         13.3   
8           24.78  1007.938   62.837    78.766  999.205   94.759         18.3   
9           24.78  1007.938   62.837    78.766  999.205   94.759         23.9   
...           ...       ...      ...       ...      ...      ...          ...   
7299        27.07   337.141   16.174    81.619  526.083   79.151          7.0   
7300        27.07   337.141   16.174    81.619  526.083   79.151          9.0   
7301        27.07   337.141   16.174    81.619  526.083   79.151         11.0   
7302        27.07   337.141   16.174    81.619  526.083   79.151          9.0   
7303        27.07   337.141   16.174    81.619  526.083   79.151          9.0   

      AWNDLAX   Mir850RH  R

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [8]:
# threshold = 12
# threshold = 35
# threshold = 20
threshold = 12
features_data = X_matrix.to_numpy()
_, num_features = features_data.shape
label_data = y_vector.to_numpy()
# split the data for 10-fold cross validation
kf = KFold(n_splits=10, shuffle=True, random_state=100)
classified_label = np.zeros(label_data.shape)
classified_label[label_data >= threshold] = 1
label_data = classified_label.ravel()

In [9]:
# DEFINE ACTIVATION FUNCTION
# ACTIVATION = torch.tanh

class NeuralNetwork(nn.Module):
    def __init__(self, num_features, num_layers, num_hiddens, ACTIVATION):
        super().__init__()
        self.num_layers = num_layers
        self.fcs = []
        self.activate = ACTIVATION
        # Define input layer
        self.input_fc = nn.Linear(num_features, num_hiddens)
        # Define hidden layers
        for i in range(0, num_layers):
            fc = nn.Linear(num_hiddens, num_hiddens)
            setattr(self, 'fc%i' % i, fc)
            self.fcs.append(fc)
        # Define output layers
        self.output_layer = nn.Linear(num_hiddens, 2)

    def forward(self, x):
        x = self.input_fc(x)
        for i in range(0, self.num_layers):
            x = self.fcs[i](x)
            x = self.activate(x)
        x = self.output_layer(x)
        return x

In [10]:
loss_function = nn.CrossEntropyLoss()
def loss_func(model, features, true_values):
    predict_values = model.forward(features)
    res = loss_function(predict_values, true_values)
    return res

## Hyperparameters
ReLU activation function, 3 hidden layers, 40 neuron for each hidden layers; 5000 training batch; Adam optimizer

In [11]:
# hyperparameters and net
ACTIVATION = torch.relu
# select feature 3 layers and 40 nodes; all feature 2 layer and 60 nodes
net_parameters = [(3, 40)]
max_iter = 5000

In [12]:
features_data = torch.from_numpy(features_data).float()
label_data = torch.from_numpy(label_data).long()
testing_data_rows = []
training_data_rows = []
final_accuracy = None
final_precision = None
final_f1 = None
final_pod = None
final_ftp = None
final_prediction = None
for net_parameter in net_parameters:
    accuracy_testing = []
    precision_testing = []
    f1_testing = []
    pod_testing = []
    ftp_testing = []
    
    accuracy_training = []
    precision_training = []
    f1_training = []
    pod_training = []
    ftp_training = []
    
    for train_index, test_index in kf.split(features_data):
        # build the net
        net = NeuralNetwork(num_features, net_parameter[0], net_parameter[1], ACTIVATION)
        optimizer = optim.Adam(net.parameters(), lr=0.001, betas=(0.9, 0.999))
        X_train, X_test = features_data[train_index], features_data[test_index]
        y_train, y_test = label_data[train_index], label_data[test_index]
        
        for i in range(max_iter):
            loss = loss_func(net, X_train, y_train)
            optimizer.zero_grad()     # zeroes the gradient buffers of all parameters
            loss.backward() #backprop
            optimizer.step()
        print(loss)
        # evaluate the model r2 using test data set, etc.
        net.eval()
        # test data performance
        predict_res = net(X_test)
        _, predict_res = torch.max(predict_res, 1)
        predict_res = predict_res.detach().numpy()
        accuracy_testing.append(accuracy_score(y_test, predict_res))
        precision_testing.append(precision_score(y_test, predict_res))
        f1_testing.append(f1_score(y_test, predict_res))
        pod_testing.append(recall_score(y_test, predict_res))
        ftp_testing.append(1 - recall_score(y_test, predict_res))
        
        # training data performance
        predict_res = net(X_train)
        _, predict_res = torch.max(predict_res, 1)
        predict_res = predict_res.detach().numpy()
        accuracy_training.append(accuracy_score(y_train, predict_res))
        precision_training.append(precision_score(y_train, predict_res))
        f1_training.append(f1_score(y_train, predict_res))
        pod_training.append(recall_score(y_train, predict_res))
        ftp_training.append(1 - recall_score(y_train, predict_res))
        
    # write down the performance for current hyperparameters
    accuracy_mean = np.mean(accuracy_testing)
    precision_mean = np.mean(precision_testing)
    f1_mean = np.mean(f1_testing)
    pod_mean = np.mean(pod_testing)
    ftp_mean = np.mean(ftp_testing)
    data_row = [net_parameter[0], net_parameter[1], accuracy_mean, precision_mean, f1_mean, pod_mean, ftp_mean]
    testing_data_rows.append(data_row)
    
    accuracy_mean = np.mean(accuracy_training)
    precision_mean = np.mean(precision_training)
    f1_mean = np.mean(f1_training)
    pod_mean = np.mean(pod_training)
    ftp_mean = np.mean(ftp_training)
    data_row = [net_parameter[0], net_parameter[1], accuracy_mean, precision_mean, f1_mean, pod_mean, ftp_mean]
    training_data_rows.append(data_row)
    
    # train by all data
    net = NeuralNetwork(num_features, net_parameter[0], net_parameter[1], ACTIVATION)
    optimizer = optim.Adam(net.parameters(), lr=0.001, betas=(0.9, 0.999))
    for i in range(max_iter):
        loss = loss_func(net, features_data, label_data)
        optimizer.zero_grad()     # zeroes the gradient buffers of all parameters
        loss.backward() #backprop
        optimizer.step()
    net.eval()
    # evaluate the model performance
    predict_res = net(features_data)
    _, predict_res = torch.max(predict_res, 1)
    predict_res = predict_res.detach().numpy()
    final_accuracy = accuracy_score(label_data, predict_res)
    final_precision = precision_score(label_data, predict_res)
    final_f1 = f1_score(label_data, predict_res)
    final_pod = recall_score(label_data, predict_res)
    final_ftp = 1 - recall_score(label_data, predict_res)
    
    final_prediction = predict_res

tensor(0.3551, grad_fn=<NllLossBackward0>)
tensor(0.3634, grad_fn=<NllLossBackward0>)
tensor(0.3741, grad_fn=<NllLossBackward0>)
tensor(0.4016, grad_fn=<NllLossBackward0>)
tensor(0.3917, grad_fn=<NllLossBackward0>)
tensor(0.3594, grad_fn=<NllLossBackward0>)
tensor(0.3747, grad_fn=<NllLossBackward0>)
tensor(0.3861, grad_fn=<NllLossBackward0>)
tensor(0.3987, grad_fn=<NllLossBackward0>)
tensor(0.3672, grad_fn=<NllLossBackward0>)


## Cross Validation Results

In [13]:
# for Table S4
print("Training Data")
print("accuracy = %f   precision = %f   f1 = %f   POD = %f   FTP = %f" 
      %(training_data_rows[0][2], training_data_rows[0][3], 
        training_data_rows[0][4], training_data_rows[0][5],
        training_data_rows[0][6]))
print("Testing Data")
print("accuracy = %f   precision = %f   f1 = %f   POD = %f   FTP = %f" 
      %(testing_data_rows[0][2], testing_data_rows[0][3], 
        testing_data_rows[0][4], testing_data_rows[0][5],
        testing_data_rows[0][6]))

Training Data
accuracy = 0.824703   precision = 0.843813   f1 = 0.857216   POD = 0.871854   FTP = 0.128146
Testing Data
accuracy = 0.796516   precision = 0.823056   f1 = 0.833173   POD = 0.844781   FTP = 0.155219


## Final Model Performance

In [14]:
# for Table 1
print("accuracy = %f   precision = %f   f1 = %f   POD = %f   FTP = %f"    
      %(final_accuracy, final_precision, final_f1, final_pod, final_ftp))

accuracy = 0.830924   precision = 0.867761   f1 = 0.858456   POD = 0.849348   FTP = 0.150652


## Confusion Matrix

In [15]:
confusion_matricies=confusion_matrix(label_data, final_prediction)
columns = ['class %s' %(i) for i in list(ascii_uppercase)[0:len(np.unique(label_data))]]
columns=['Non-exc','Exc']
# columns=['Non Exceedance', 'Exceedance']
df_cm = pd.DataFrame(confusion_matricies, index=columns, columns=columns)
df_cm

Unnamed: 0,Non-exc,Exc
Non-exc,1739,427
Exc,497,2802


In [16]:
net

NeuralNetwork(
  (input_fc): Linear(in_features=12, out_features=40, bias=True)
  (fc0): Linear(in_features=40, out_features=40, bias=True)
  (fc1): Linear(in_features=40, out_features=40, bias=True)
  (fc2): Linear(in_features=40, out_features=40, bias=True)
  (output_layer): Linear(in_features=40, out_features=2, bias=True)
)