In [1]:
#Import basic python packages for data analysis and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.lines as mlines
import pylab as plot
import matplotlib
import random
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.axes_grid1 import make_axes_locatable
import math
import time
import os

import torch

## (A) Dataset prep

In [6]:
# merge all datasets to create master_df

root = os.path.dirname(os.path.dirname(os.getcwd()))

df_aq = pd.read_csv(root + "/data/cleaned/air_quality_NO2.csv", index_col=0)[['value','latitude', 'longitude']]
df_met = pd.read_csv(root + "/data/cleaned/nO2_met.csv", index_col=0)
df_fac = pd.read_csv(root + "/data/cleaned/no2_fac_data.csv", index_col=0)
# df_fac.drop(df_fac.columns[df_fac.columns.str.contains('_emsdist')], axis=1, inplace=True)
df_traffic = pd.read_csv(root + "/data/cleaned/intersection_final.csv", index_col=0)

df_m1 = df_aq.merge(df_met, on = ['latitude', 'longitude'], how = 'inner')
df_m2 = df_m1.merge(df_fac, on = ['latitude', 'longitude'], how = 'inner')
df_merged = df_m2.merge(df_traffic, on = ['latitude', 'longitude'], how = 'inner')
df_merged.drop(columns = ['latitude', 'longitude'], inplace=True)

X = df_merged.drop("value",1) 
y = df_merged["value"]


In [7]:
# function to scale and transform input data
# def get_data(X, y):
#     X = X.values
#     y = y.values
#     # scaling the data
#     feature_scaler = StandardScaler()
#     X = feature_scaler.fit_transform(X)
#     return X, y

# # acquiring transformed data
# X_arr, y_arr = get_data(X, y)

# # splitting into test and train
# X_train, X_test, y_train, y_test = train_test_split(X_arr, y_arr, test_size=0.3)
# cols = np.array(X.columns)

In [9]:
# splitting into test and train
X_train, X_test, y_train, y_test = train_test_split(np.array(X), np.array(y), test_size=0.3, random_state=30)
cols = np.array(X.columns)

## (B) Modeling

In [10]:
# dataset class for feeding in data
torch.manual_seed(237943)
class AirQualityDataset(torch.utils.data.Dataset):

  def __init__(self, X_arr, y_arr):
    self.x_data = torch.tensor(X_arr, \
      dtype=torch.float32)
    self.y_data = torch.tensor(y_arr, \
      dtype=torch.float32)

  def __len__(self):
    return len(self.x_data)

  def __getitem__(self, idx):
    preds = self.x_data[idx,:]  # or just [idx]
    conc = self.y_data[idx] 
    return (preds, conc)       # tuple of matrices

# prepping data for training
batch_size = 5
train_ds = AirQualityDataset(X_train, y_train)
test_ds = AirQualityDataset(X_test, y_test)
train_ldr = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_ldr = torch.utils.data.DataLoader(test_ds, batch_size=batch_size, shuffle=True)

In [11]:
# network architecture
class NeuralNetwork(torch.nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.linear_tanh_stack = torch.nn.Sequential(
            torch.nn.Linear(82, 60),
            torch.nn.Tanh(),
            torch.nn.Linear(60, 60),
            torch.nn.Tanh(),
            torch.nn.Linear(60, 40),
            torch.nn.Tanh(),
            torch.nn.Linear(40, 30),
            torch.nn.Tanh(),
#             torch.nn.Dropout(p=0.1),
            torch.nn.Linear(30, 20),
            torch.nn.Tanh(),
            torch.nn.Linear(20, 20),
            torch.nn.Tanh(),
            torch.nn.Linear(20,10),
            torch.nn.Tanh(),
            torch.nn.Linear(10, 1),
        )

    def forward(self, x):
        pred_conc = self.linear_tanh_stack(x)
        return pred_conc

In [12]:
# creating model instance
model = NeuralNetwork()
print(model)

for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")
    
# initialising hyperparameters
learning_rate = 1e-2
epochs = 30

# initializing the loss function
loss_fn = torch.nn.MSELoss()

# initializing the optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

NeuralNetwork(
  (linear_tanh_stack): Sequential(
    (0): Linear(in_features=82, out_features=60, bias=True)
    (1): Tanh()
    (2): Linear(in_features=60, out_features=60, bias=True)
    (3): Tanh()
    (4): Linear(in_features=60, out_features=40, bias=True)
    (5): Tanh()
    (6): Linear(in_features=40, out_features=30, bias=True)
    (7): Tanh()
    (8): Linear(in_features=30, out_features=20, bias=True)
    (9): Tanh()
    (10): Linear(in_features=20, out_features=20, bias=True)
    (11): Tanh()
    (12): Linear(in_features=20, out_features=10, bias=True)
    (13): Tanh()
    (14): Linear(in_features=10, out_features=1, bias=True)
  )
)
Layer: linear_tanh_stack.0.weight | Size: torch.Size([60, 82]) | Values : tensor([[-0.0805,  0.0095,  0.0651, -0.0526, -0.0743, -0.0443,  0.0905, -0.0364,
          0.0597, -0.0998,  0.0228,  0.0688,  0.0926, -0.0531, -0.0602,  0.0254,
          0.0146,  0.0328,  0.0015, -0.0469, -0.0938,  0.0495,  0.0333, -0.0580,
         -0.0489,  0.0238,  0.0

In [13]:
# defining train and test loops

def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss}  [{current}/{size}]")


def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss = 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()

    test_loss /= num_batches
    print(f"Test Error: Avg loss: {test_loss} \n")

In [14]:
# executing training and testing
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_ldr, model, loss_fn, optimizer)
    test_loop(test_ldr, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 0.13525862991809845  [0/7511]
loss: 1.3660488548339345e-05  [500/7511]


  return F.mse_loss(input, target, reduction=self.reduction)


loss: 2.6278855784767075e-06  [1000/7511]
loss: 1.5683312085457146e-05  [1500/7511]
loss: 9.571062946633901e-06  [2000/7511]
loss: 7.448089945683023e-06  [2500/7511]
loss: 3.920215021935292e-06  [3000/7511]
loss: 7.8261773523991e-06  [3500/7511]
loss: 1.3340118130145129e-05  [4000/7511]
loss: 9.444209354114719e-06  [4500/7511]
loss: 6.588407359231496e-06  [5000/7511]
loss: 5.463972456709598e-07  [5500/7511]
loss: 1.2945363323524361e-06  [6000/7511]
loss: 8.421853635809384e-06  [6500/7511]
loss: 1.2145474101998843e-06  [7000/7511]
loss: 6.746405460944516e-07  [7500/7511]


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Test Error: Avg loss: 7.883603984185755e-06 

Epoch 2
-------------------------------
loss: 0.00025659549282863736  [0/7511]
loss: 3.317557911941549e-06  [500/7511]
loss: 8.786929356574547e-06  [1000/7511]
loss: 1.0637850209604949e-05  [1500/7511]
loss: 8.748455002205446e-06  [2000/7511]
loss: 2.618554617583868e-06  [2500/7511]
loss: 7.790486961312126e-06  [3000/7511]
loss: 1.1090861335105728e-05  [3500/7511]
loss: 2.4572159418312367e-06  [4000/7511]
loss: 5.238321136857849e-06  [4500/7511]
loss: 1.3669018699147273e-06  [5000/7511]
loss: 1.1783767149609048e-05  [5500/7511]
loss: 1.684222070252872e-06  [6000/7511]
loss: 1.6944255548878573e-05  [6500/7511]
loss: 1.4617349734180607e-05  [7000/7511]
loss: 7.78377000187902e-07  [7500/7511]
Test Error: Avg loss: 5.75178847311531e-06 

Epoch 3
-------------------------------
loss: 4.9541004045750014e-06  [0/7511]
loss: 1.3472847513185116e-06  [500/7511]
loss: 2.063224201265257e-06  [1000/7511]
loss: 3.150746351820999e-06  [1500/7511]
loss: 6.

Test Error: Avg loss: 3.227148685586329e-06 

Epoch 13
-------------------------------
loss: 4.5425062467074895e-07  [0/7511]
loss: 6.114819370850455e-07  [500/7511]
loss: 3.589718744478887e-07  [1000/7511]
loss: 4.429116131632327e-07  [1500/7511]
loss: 2.8383028620737605e-06  [2000/7511]
loss: 4.248506684234599e-06  [2500/7511]
loss: 7.824251042620745e-07  [3000/7511]
loss: 1.4578512264051824e-06  [3500/7511]
loss: 2.452104581607273e-06  [4000/7511]
loss: 2.474247366990312e-06  [4500/7511]
loss: 1.1983579497609753e-06  [5000/7511]
loss: 8.731848311072099e-08  [5500/7511]
loss: 8.788616128185822e-07  [6000/7511]
loss: 3.69525139376492e-07  [6500/7511]
loss: 1.1111561093457567e-07  [7000/7511]
loss: 4.549098093775683e-07  [7500/7511]
Test Error: Avg loss: 3.138438196533762e-06 

Epoch 14
-------------------------------
loss: 8.584222541685449e-07  [0/7511]
loss: 3.777168728902325e-07  [500/7511]
loss: 9.241132374882e-07  [1000/7511]
loss: 7.537892088294029e-07  [1500/7511]
loss: 3.02647

Test Error: Avg loss: 2.3018707876996403e-06 

Epoch 24
-------------------------------
loss: 2.0069852268989052e-07  [0/7511]
loss: 8.352332088179537e-07  [500/7511]
loss: 1.4336268350234604e-07  [1000/7511]
loss: 1.9781614923886082e-07  [1500/7511]
loss: 1.3698008842766285e-06  [2000/7511]
loss: 3.5461960123939207e-06  [2500/7511]
loss: 8.727766385163704e-07  [3000/7511]
loss: 5.322415290720528e-07  [3500/7511]
loss: 5.139382324159669e-07  [4000/7511]
loss: 1.0305306119562374e-07  [4500/7511]
loss: 1.1403089672512579e-07  [5000/7511]
loss: 1.9670936524107674e-07  [5500/7511]
loss: 1.3925675546033744e-07  [6000/7511]
loss: 2.0155330560100992e-07  [6500/7511]
loss: 8.841123531055928e-07  [7000/7511]
loss: 1.6845323216330144e-06  [7500/7511]
Test Error: Avg loss: 2.265897747346356e-06 

Epoch 25
-------------------------------
loss: 4.318594903907069e-07  [0/7511]
loss: 2.2658720411072863e-07  [500/7511]
loss: 1.903474498021751e-07  [1000/7511]
loss: 1.3928477926583582e-07  [1500/7511]


## Appendix

In [None]:
# First NN model
# network architecture
# class NeuralNetwork(torch.nn.Module):
#     def __init__(self):
#         super(NeuralNetwork, self).__init__()
#         self.linear_relu_stack = torch.nn.Sequential(
#             torch.nn.Linear(82, 30),
#             torch.nn.ReLU(),
#             torch.nn.Linear(30, 30),
#             torch.nn.ReLU(),
#             torch.nn.Linear(30, 1),
#         )

#     def forward(self, x):
#         pred_conc = self.linear_relu_stack(x)
#         return pred_conc