In [57]:
#Import basic python packages for data analysis and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.lines as mlines
import pylab as plot
import matplotlib
import random
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.axes_grid1 import make_axes_locatable
import math
import time
import os

import torch

## Pytorch practice

In [58]:
data = [[1, 2],[3, 4]]
x_data = torch.tensor(data)
x_data

shape = (2,3,)
rand_tensor = torch.rand(shape)
rand_tensor

tensor = torch.ones(4, 4)
tensor.shape

t1 = torch.cat([tensor, tensor, tensor], dim=-2)

t = torch.ones(5)
t

n = t.numpy()
print(f"n: {n}")

t.add_(1)
print(f"t: {t}")
print(f"n: {n}")

n: [1. 1. 1. 1. 1.]
t: tensor([2., 2., 2., 2., 2.])
n: [2. 2. 2. 2. 2.]


In [59]:
t.sum()

tensor(10.)

## (A) Dataset prep

In [60]:
# merge all datasets to create master_df

root = os.path.dirname(os.path.dirname(os.getcwd()))

df_aq = pd.read_csv(root + "/data/cleaned/air_quality_NO2.csv", index_col=0)[['value','latitude', 'longitude']]
df_met = pd.read_csv(root + "/data/cleaned/nO2_met.csv", index_col=0)
df_fac = pd.read_csv(root + "/data/cleaned/no2_fac_data.csv", index_col=0)
# df_fac.drop(df_fac.columns[df_fac.columns.str.contains('_emsdist')], axis=1, inplace=True)
df_traffic = pd.read_csv(root + "/data/cleaned/intersection_final.csv", index_col=0)

df_m1 = df_aq.merge(df_met, on = ['latitude', 'longitude'], how = 'inner')
df_m2 = df_m1.merge(df_fac, on = ['latitude', 'longitude'], how = 'inner')
df_merged = df_m2.merge(df_traffic, on = ['latitude', 'longitude'], how = 'inner')
df_merged.drop(columns = ['latitude', 'longitude'], inplace=True)

X = df_merged.drop("value",1) 
y = df_merged["value"]

In [61]:
# function to scale and transform input data
# def get_data(X, y):
#     X = X.values
#     y = y.values
#     # scaling the data
#     feature_scaler = StandardScaler()
#     X = feature_scaler.fit_transform(X)
#     return X, y

# # acquiring transformed data
# X_arr, y_arr = get_data(X, y)

# # splitting into test and train
# X_train, X_test, y_train, y_test = train_test_split(X_arr, y_arr, test_size=0.3)
# cols = np.array(X.columns)

In [62]:
# splitting into test and train
X_train, X_test, y_train, y_test = train_test_split(np.array(X), np.array(y), test_size=0.3)
cols = np.array(X.columns)

## (B) Modeling

In [63]:
# dataset class for feeding in data
class AirQualityDataset(torch.utils.data.Dataset):

  def __init__(self, X_arr, y_arr):
    self.x_data = torch.tensor(X_arr, \
      dtype=torch.float32)
    self.y_data = torch.tensor(y_arr, \
      dtype=torch.float32)

  def __len__(self):
    return len(self.x_data)

  def __getitem__(self, idx):
    preds = self.x_data[idx,:]  # or just [idx]
    conc = self.y_data[idx] 
    return (preds, conc)       # tuple of matrices

# prepping data for training
batch_size = 64
train_ds = AirQualityDataset(X_train, y_train)
test_ds = AirQualityDataset(X_test, y_test)
train_ldr = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_ldr = torch.utils.data.DataLoader(test_ds, batch_size=batch_size, shuffle=True)

In [64]:
# network architecture
# class NeuralNetwork(torch.nn.Module):
#     def __init__(self):
#         super(NeuralNetwork, self).__init__()
#         self.linear_relu_stack = torch.nn.Sequential(
#             torch.nn.Linear(82, 30),
#             torch.nn.ReLU(),
#             torch.nn.Linear(30, 30),
#             torch.nn.ReLU(),
#             torch.nn.Linear(30, 1),
#         )

#     def forward(self, x):
#         pred_conc = self.linear_relu_stack(x)
#         return pred_conc

In [65]:
# network architecture
class NeuralNetwork(torch.nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.linear_tanh_stack = torch.nn.Sequential(
            torch.nn.Linear(82, 30),
            torch.nn.Tanh(),
            torch.nn.Linear(30, 30),
            torch.nn.Tanh(),
#             torch.nn.Dropout(p=0.1),
            torch.nn.Linear(30, 20),
            torch.nn.Tanh(),
            torch.nn.Linear(20, 20),
            torch.nn.Tanh(),
            torch.nn.Linear(20, 1),
        )

    def forward(self, x):
        pred_conc = self.linear_tanh_stack(x)
        return pred_conc

In [66]:
# creating model instance
model = NeuralNetwork()
print(model)

for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")
    
# initialising hyperparameters
learning_rate = 1e-2
epochs = 30

# initializing the loss function
loss_fn = torch.nn.MSELoss()

# initializing the optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

NeuralNetwork(
  (linear_tanh_stack): Sequential(
    (0): Linear(in_features=82, out_features=30, bias=True)
    (1): Tanh()
    (2): Linear(in_features=30, out_features=30, bias=True)
    (3): Tanh()
    (4): Linear(in_features=30, out_features=20, bias=True)
    (5): Tanh()
    (6): Linear(in_features=20, out_features=20, bias=True)
    (7): Tanh()
    (8): Linear(in_features=20, out_features=1, bias=True)
  )
)
Layer: linear_tanh_stack.0.weight | Size: torch.Size([30, 82]) | Values : tensor([[ 0.0335,  0.0573, -0.0334, -0.0609,  0.0700,  0.0830, -0.1000, -0.0420,
         -0.0081, -0.0940,  0.0851, -0.0881, -0.0015, -0.0726, -0.0904,  0.0521,
          0.0468, -0.0423,  0.0429,  0.0464,  0.0338,  0.0069, -0.0225,  0.0194,
          0.0895,  0.0083,  0.0848, -0.1052, -0.0272,  0.0152, -0.0951,  0.0038,
         -0.1002, -0.0529, -0.1088, -0.0146, -0.1061, -0.0795, -0.1037,  0.0682,
          0.0776, -0.0852,  0.0695,  0.0854,  0.0945,  0.1000,  0.0358, -0.0457,
         -0.0376,  0.

In [67]:
# defining train and test loops

def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss = 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()

    test_loss /= num_batches
    print(f"Test Error: Avg loss: {test_loss:>8f} \n")

In [68]:
# executing training and testing
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_ldr, model, loss_fn, optimizer)
    test_loop(test_ldr, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 0.006713  [    0/ 7511]
loss: 0.000070  [ 6400/ 7511]
Test Error: Avg loss: 0.000037 

Epoch 2
-------------------------------
loss: 0.000011  [    0/ 7511]
loss: 0.000015  [ 6400/ 7511]
Test Error: Avg loss: 0.000020 

Epoch 3
-------------------------------
loss: 0.000048  [    0/ 7511]
loss: 0.000012  [ 6400/ 7511]
Test Error: Avg loss: 0.000016 

Epoch 4
-------------------------------
loss: 0.000003  [    0/ 7511]
loss: 0.000021  [ 6400/ 7511]
Test Error: Avg loss: 0.000015 

Epoch 5
-------------------------------
loss: 0.000055  [    0/ 7511]
loss: 0.000008  [ 6400/ 7511]
Test Error: Avg loss: 0.000015 

Epoch 6
-------------------------------
loss: 0.000008  [    0/ 7511]
loss: 0.000001  [ 6400/ 7511]
Test Error: Avg loss: 0.000014 

Epoch 7
-------------------------------
loss: 0.000084  [    0/ 7511]
loss: 0.000001  [ 6400/ 7511]
Test Error: Avg loss: 0.000014 

Epoch 8
-------------------------------
loss: 0.000002  [    0/ 7511]