In [4]:
import numpy as np
import pandas as pd
from datetime import date
from datetime import datetime
%matplotlib inline

In this notebook, we will be looking at one location (fips == 8013) which is near Boulder, CO based on longitude and latitude. We use an LSTM model and XGBoost to predict drought classification.

### Import data

In [5]:
path = 'archive/'

In [6]:
train = pd.read_csv(path + 'train_timeseries.csv')

In [7]:
valid = pd.read_csv(path + 'validation_timeseries.csv')

In [8]:
test = pd.read_csv(path + 'test_timeseries.csv')

### Machine learning

In [36]:
import torch
import torch.nn as nn
import sklearn

### LSTM

In [9]:
# the location we will be looking at
fips = 8013

In [28]:
id1 = train[train['fips'] == fips]

In [227]:
# we have to get 14 day timeseries, so reformat the data
x1 = id1[4:-4]
y1 = x1['score']
x1 = x1.drop(columns = ['fips', 'date', 'score'])

In [228]:
# reshape to correct format
x1r = x1.to_numpy()
x1r = (x1r - np.mean(x1r)) / np.std(x1r)
x1r = x1r.reshape(443, 14, 18)

In [229]:
#get every 14th drought value
y1 = y1[13::14]
y1 = y1.to_numpy().reshape(443, 1)

In [230]:
# reformat everything as tensors
x1t = torch.tensor(x1r).float()
y1t = torch.tensor(y1).float()

In [334]:
# implementation from: https://www.jessicayung.com/lstms-for-time-series-in-pytorch/
class LSTM_nn(nn.Module):
    
    def __init__(self, input_dim = 18, hidden_dim = 18, batch_size = 443, output_dim = 1, num_layers = 1):
        super(LSTM_nn, self).__init__()
        # we have an LSTM -> linear
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first = True)
        self.linear = nn.Linear(hidden_dim, output_dim)
 
    def forward(self, input):
        # Forward pass
        _, (h1, _)  = self.lstm(input)
        y_pred = self.linear(h1)
        return y_pred[0]

In [335]:
model = LSTM_nn(input_dim = 18, hidden_dim = 20, output_dim = 1, num_layers = 5)

# use MSE loss to fit model. again, this is mostly taken from https://www.jessicayung.com/lstms-for-time-series-in-pytorch/
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.1) # learning rate is 0.1

epochs = 200

for epoch in range(epochs):
    # zero the gradient, then do forward pass
    model.zero_grad()
    y_pred = model(x1t)
    loss = loss_fn(y_pred, y1t)
    if (epoch % 20 == 0):
        print("step: ", epoch, "loss: ", loss.item())
    # then update parameters
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

step:  0 loss:  1.9517865180969238
step:  20 loss:  1.117758870124817
step:  40 loss:  1.1133390665054321
step:  60 loss:  1.1024142503738403
step:  80 loss:  1.0831773281097412
step:  100 loss:  1.0526504516601562
step:  120 loss:  1.0240780115127563
step:  140 loss:  0.998091995716095
step:  160 loss:  0.9991955757141113
step:  180 loss:  0.938646137714386


In [336]:
# get the f1 macro:
from sklearn.metrics import f1_score

# we'll treat this as classification by taking the floor of each drought score. any negative values output will count as 0
def relu(x):
    return np.maximum(0, x)

f1macro = sklearn.metrics.f1_score(relu(np.floor(y_pred.detach().numpy())), np.floor(y1t.detach().numpy()), average = 'macro')
print("f1 macro train: ", f1macro)

f1 macro train:  0.22952126199536066


In [337]:
acc = np.mean(relu(np.floor(y_pred.detach().numpy())) == np.floor(y1t.detach().numpy()))
print("accuracy: ", acc)

accuracy:  0.654627539503386


### Testing

In [47]:
id1_test = test[test['fips'] == fips]

In [312]:
x1_test = id1_test[1:-2]
y1_test = id1_test['score']
x1_test = x1_test.drop(columns = ['fips', 'date', 'score'])

In [313]:
# reshape testing X to sequence format
x1r_test = x1_test.to_numpy()
x1r_test = (x1r_test - np.mean(x1r_test)) / np.std(x1r_test)
x1r_test = x1r_test.reshape(52, 14, 18)

In [314]:
# get drought classification for every two weeks
y1_test = y1_test[13::14]
y1_test = y1_test.to_numpy().reshape(52, 1)

In [315]:
# convert to tensors
x1t_test = torch.tensor(x1r_test).float()
y1t_test = torch.tensor(y1_test).float()

In [338]:
y_pred_test = model(x1t_test)

In [339]:
f1macro = sklearn.metrics.f1_score(relu(np.floor(y_pred_test.detach().numpy())), np.floor(y1t_test.detach().numpy()), average = 'macro')
print("f1 macro test: ", f1macro)

f1 macro test:  0.23678861788617883


In [340]:
loss_fn(y_pred_test, y1t_test)

tensor(1.7159, grad_fn=<MseLossBackward>)

In [341]:
# accuracy
np.mean(relu(np.floor(y_pred_test.detach().numpy())) == np.floor(y1t_test.detach().numpy()))

0.6346153846153846

In [342]:
#deterministic model
np.mean(np.floor(y1t_test.detach().numpy()) == 0)

0.6538461538461539

### XGBoost

In [46]:
import xgboost

In [48]:
# get floor for classification, drop na values
id1['drought_level'] = np.floor(id1['score'])
id1.drop(columns = ['fips', 'date', 'score'], inplace = True)
id1.dropna(inplace = True)

id1_test['drought_level'] = np.floor(id1_test['score']) 
id1_test.drop(columns = ['fips', 'date', 'score'], inplace = True) 
id1_test.dropna(inplace = True)

In [33]:
# get the correct columns
X_train2 = id1.iloc[:,0:-1]
y_train2 = id1.iloc[:,-1]

X_test2 = id1_test.iloc[:,0:-1]
y_test2 = id1_test.iloc[:,-1]

In [42]:
xg_classifier = xgboost.XGBClassifier(n_estimators = 2)
xg_classifier.fit(X_train2, y_train2)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=2, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

In [55]:
y_preds = xg_classifier.predict(X_test2)
sklearn.metrics.f1_score(y_test2, y_preds, average = 'macro')

0.18264705882352944

In [56]:
# accuracy:
np.mean(y_test2 == y_preds)

0.6476190476190476

In [102]:
# Random stratifier:
dummy_classifier = sklearn.dummy.DummyClassifier(strategy = "stratified")
dummy_classifier.fit(X_train2, y_train2) # fit classifier on training data

# get predictions for testing data
ydummy_pred = dummy_classifier.predict(X_test2)
sklearn.metrics.f1_score(ydummy_pred, y_test2, average = 'macro')