In [1]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import numpy as np
from sklearn.datasets import make_friedman2
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from string import ascii_uppercase
from sklearn.linear_model import Perceptron

## Read and Reformat the data

In [2]:
# Input CSV; output_file is defined here but not referenced by any other line shown in this notebook.
data_file = "../data/220720PM25diffsite.csv"
output_file = "../result/gpr_all_features.csv"

df = pd.read_csv(data_file)

# Reduced column set: the label column "pm25" followed by the chosen predictors.
selected_features = [
    "pm25", "ENSOmonthly",
    "eNOx", "SO2emis", "PM25emis", "eVOC", "NH3emis",
    "TMAXbarstow", "AWNDLAX", "Mir850RH", "Rhontario",
    "dayofweekf", "dayofyear",
]
# Full column set: the reduced set (same order) extended with the remaining predictors.
all_features = selected_features + [
    "MirTemp500C", "MirWS850ms", "MirWD850", "MirHeight850", "MirWS500ms", "MirWD500", "Mir500RH",
    "SRmeanC", "AWNDbarstow", "TMAXLAX", "TMAXontario", "AWNDontario",
]

df_selected = df[selected_features]
df_all_features = df[all_features]

In [3]:
label_name = "pm25"

# Rows with a missing value in any of the all_features columns are dropped,
# regardless of which feature list is selected below.
dataset = df_all_features.dropna()

# change it for all features or selected features
# features_names = all_features.copy()
# Predictor names are the chosen feature list without the label column.
features_names = [name for name in selected_features if name != label_name]

X_matrix = dataset[features_names]
# Double brackets keep the label as a one-column DataFrame.
y_vector = dataset[[label_name]]

In [4]:
def dayofweekToNum(data_frame):
    """Return a copy of ``data_frame`` with ``dayofweekf`` encoded as integers.

    The three-letter weekday labels "Mon" .. "Sun" are mapped to 1 .. 7.
    The input frame is left untouched: the encoding is written to a copy, so
    calling this on a frame that was sliced out of another one no longer
    triggers pandas' SettingWithCopyWarning.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame containing a ``dayofweekf`` column of weekday labels.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data_frame`` whose ``dayofweekf`` column has dtype int64.

    Raises
    ------
    KeyError
        If ``dayofweekf`` is missing or holds a value other than the seven
        recognised labels (including missing values).
    """
    day_mapping = {"Mon": 1, "Tue": 2, "Wed": 3, "Thu": 4, "Fri": 5, "Sat": 6, "Sun": 7}
    weekday_labels = data_frame["dayofweekf"]

    # Fail loudly on unexpected labels instead of letting map() turn them into NaN.
    unknown = [label for label in weekday_labels.unique() if label not in day_mapping]
    if unknown:
        raise KeyError("unrecognised dayofweekf value(s): %r" % (unknown,))

    encoded = data_frame.copy()
    encoded["dayofweekf"] = weekday_labels.map(day_mapping).astype("int64")
    return encoded
# Replace the weekday labels in the feature frame with their numeric codes,
# then print the resulting frame for inspection.
X_matrix = dayofweekToNum(X_matrix)
print(X_matrix)

      ENSOmonthly      eNOx  SO2emis  PM25emis     eVOC  NH3emis  TMAXbarstow  \
3           24.78  1007.938   62.837    78.766  999.205   94.759         16.7   
4           24.78  1007.938   62.837    78.766  999.205   94.759         16.7   
6           24.78  1007.938   62.837    78.766  999.205   94.759         13.3   
8           24.78  1007.938   62.837    78.766  999.205   94.759         18.3   
9           24.78  1007.938   62.837    78.766  999.205   94.759         23.9   
...           ...       ...      ...       ...      ...      ...          ...   
7299        27.07   337.141   16.174    81.619  526.083   79.151          7.0   
7300        27.07   337.141   16.174    81.619  526.083   79.151          9.0   
7301        27.07   337.141   16.174    81.619  526.083   79.151         11.0   
7302        27.07   337.141   16.174    81.619  526.083   79.151          9.0   
7303        27.07   337.141   16.174    81.619  526.083   79.151          9.0   

      AWNDLAX   Mir850RH  R

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [5]:
# pm25 values at or above this cut-off are labelled 1, everything else 0.
# threshold = 12
threshold = 25

features_data = X_matrix.to_numpy()
num_features = features_data.shape[1]

# Binarise the one-column label array, then flatten it to 1-D for scikit-learn.
label_data = y_vector.to_numpy()
classified_label = np.where(label_data >= threshold, 1.0, 0.0)
label_data = classified_label.ravel()

# split the data for 10-fold cross validation (shuffled, fixed seed)
kf = KFold(n_splits=10, shuffle=True, random_state=100)

## Perceptron

In [6]:
# After tuning, using no penalty (penalty=None) performed best.
penalty = None
# Candidate alpha values iterated by the training loop; a single value is kept.
# NOTE(review): per the scikit-learn docs alpha only scales the regularization
# term, so it presumably has no effect while penalty is None -- confirm.
alphas = [0]

In [7]:
def _score_predictions(y_true, y_pred):
    """Score binary predictions as [accuracy, precision, f1, POD, FTP].

    POD is the recall of the positive class and FTP is computed as 1 - POD.
    ``zero_division=0`` yields the same value scikit-learn falls back to when a
    score is undefined (for example when no sample is predicted positive), but
    states that choice explicitly instead of emitting a warning on every call.
    """
    pod = recall_score(y_true, y_pred, zero_division=0)
    return [
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, zero_division=0),
        f1_score(y_true, y_pred, zero_division=0),
        pod,
        1 - pod,
    ]


def _build_perceptron(alpha):
    """Create an unfitted Perceptron with the notebook-wide settings and the given alpha."""
    return Perceptron(penalty=penalty,
                      alpha=alpha,
                      fit_intercept=True,
                      max_iter=5000,
                      random_state=0)


def _mean_per_metric(fold_scores):
    """Average each metric over the folds; returns one mean per metric, in order."""
    return [np.mean(metric_values) for metric_values in zip(*fold_scores)]


# One row per alpha: [alpha, accuracy, precision, f1, POD, FTP], averaged over folds.
testing_data_rows = []
training_data_rows = []
final_accuracy = None
final_precision = None
final_f1 = None
final_pod = None
final_ftp = None
final_prediction = None
for alpha in alphas:
    fold_scores_testing = []
    fold_scores_training = []

    for train_index, test_index in kf.split(features_data):
        X_train, X_test = features_data[train_index], features_data[test_index]
        y_train, y_test = label_data[train_index], label_data[test_index]
        model = _build_perceptron(alpha)
        model.fit(X_train, y_train)
        # evaluate the fold's model on the held-out part and on its own training part
        fold_scores_testing.append(_score_predictions(y_test, model.predict(X_test)))
        fold_scores_training.append(_score_predictions(y_train, model.predict(X_train)))

    # write down the performance for current hyperparameters
    testing_data_rows.append([alpha] + _mean_per_metric(fold_scores_testing))
    training_data_rows.append([alpha] + _mean_per_metric(fold_scores_training))

    # train by all data
    # NOTE(review): the final_* scores are in-sample (the model is evaluated on
    # the same rows it was fitted on), and with more than one alpha only the
    # last alpha's model and scores survive this loop.
    model = _build_perceptron(alpha)
    model.fit(features_data, label_data)
    final_prediction = model.predict(features_data)
    (final_accuracy, final_precision, final_f1,
     final_pod, final_ftp) = _score_predictions(label_data, final_prediction)

# clf = Perceptron(penalty=None,
    #                  alpha=0.0001,
    #                  l1_ratio=0.15,
    #                  fit_intercept=True,
    #                  max_iter=1000,
    #                  tol=0.001,
    #                  random_state=0)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# Print one summary line per alpha.
# NOTE(review): the loop is bounded by testing_data_rows but the values printed
# come from training_data_rows -- confirm which set this table is meant to show.
for row_index, _unused in enumerate(testing_data_rows):
    summary_row = training_data_rows[row_index]
    print("alpha = %f   accuracy = %f   precision = %f   f1 = %f   POD = %f   FTP = %f"
          % tuple(summary_row))

alpha = 0.000000   accuracy = 0.801808   precision = 0.506452   f1 = 0.275175   POD = 0.344801   FTP = 0.655199


## Cross Validation Results

In [9]:
# for Table S4
print("Training Data")
print("accuracy = %f   precision = %f   f1 = %f   POD = %f   FTP = %f" 
      %(training_data_rows[0][1], training_data_rows[0][2], 
        training_data_rows[0][3], training_data_rows[0][4],
        training_data_rows[0][5]))
print("Testing Data")
print("accuracy = %f   precision = %f   f1 = %f   POD = %f   FTP = %f" 
      %(testing_data_rows[0][1], testing_data_rows[0][2], 
        testing_data_rows[0][3], testing_data_rows[0][4],
        testing_data_rows[0][5]))

Training Data
accuracy = 0.801808   precision = 0.506452   f1 = 0.275175   POD = 0.344801   FTP = 0.655199
Testing Data
accuracy = 0.798914   precision = 0.546075   f1 = 0.268443   POD = 0.336249   FTP = 0.663751


In [10]:
# for Table 1
print("accuracy = %f   precision = %f   f1 = %f   POD = %f   FTP = %f"    
      %(final_accuracy, final_precision, final_f1, final_pod, final_ftp))

accuracy = 0.810064   precision = 0.600000   f1 = 0.005747   POD = 0.002887   FTP = 0.997113


## Generate Data for Annual Evaluation

In [11]:
# Confusion matrix of the final predictions against the thresholded labels:
# rows are the actual class, columns the predicted class.
# Row/column names in class order: 0 (below threshold) first, 1 (at/above threshold) second.
columns = ['Non-exc', 'Exc']
# columns=['Non Exceedance', 'Exceedance']
# labels=[0.0, 1.0] pins the matrix to 2x2 in that order, so it always matches
# the two hard-coded names even if one class is absent from the predictions.
confusion_matricies = confusion_matrix(label_data, final_prediction, labels=[0.0, 1.0])
df_cm = pd.DataFrame(confusion_matricies, index=columns, columns=columns)
df_cm
# ax=sn.heatmap(df_cm,cmap='Oranges',annot=True,fmt='g',cbar_kws={'label': 'Occurrences'})
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# plt.title('Confusion Matrix for ' + "Perceptron")

Unnamed: 0,Non-exc,Exc
Non-exc,4424,2
Exc,1036,3
