In [2]:
import sys
sys.path.append("models/")

import itertools
import numpy as np
import pandas as pd
import pickle as pkl
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from setup import *
from dataloader import SurveyDataset
import mnl

%load_ext autoreload
%autoreload 2

# Load Model Embeddings

In [3]:
# Identification of the embedding run to load. These values are concatenated
# into the latent-space pickle file name and into the results CSV row below.
model_type = 'SAE'
zoomlevel = 'zoom13'
output_dim = 1
model_run_date = '22020901'
model_code = 'M1_D1'

# Not referenced by the cells visible in this notebook section.
load_model_name = 'Autoencoder'
load_model_file = 'sae'

# Variable name lists (not referenced by the cells visible in this section).
variable_names = [
    'active',
    'auto',
    'mas',
    'pt',
    'trpgen',
]

demo_variables = [
    'tot_population',
    'pct25_34yrs',
    'pct35_50yrs',
    'pctover65yrs',
    'pctwhite_alone',
    'pct_nonwhite',
    'pctblack_alone',
    'pct_col_grad',
    'avg_tt_to_work',
    'inc_per_capita',
]

In [4]:
# Location of the saved latent-space dump for the configured run.
embedding_path = (proj_dir + "latent_space/" + model_type + "_" + zoomlevel + "_"
                  + str(output_dim**2*2048) + "_" + model_run_date + ".pkl")

# The file holds three consecutive pickled objects, read in this order.
# NOTE(review): pickle executes arbitrary code on load -- only open files
# produced by a trusted run.
with open(embedding_path, "rb") as f:
    encoder_output, im, ct = (pkl.load(f) for _ in range(3))

In [5]:
# Aggregate Embeddings
# Average all encoder outputs that share a label in `ct`, producing one row per
# distinct label, ordered like the sorted `unique_ct` list.
unique_ct = sorted(set(ct))
ct = np.array(ct)
aggregate_embeddings = np.array(
    [np.mean(encoder_output[ct == tract], axis=0) for tract in unique_ct]
)

# Load Trip Data

In [6]:
# Number of choice alternatives handed to the model below.
n_alts = 4

# Trip records, one row per trip.
tp = pd.read_csv(data_dir + "trips.csv")

In [7]:
# Share of trips per value of the 'mode' column (counts divided by the total row count).
print(tp['mode'].value_counts()/len(tp))

2    0.713060
1    0.132001
4    0.111893
3    0.043046
Name: mode, dtype: float64


In [8]:
# Build "<state>_<county>_<tract>" identifiers for both trip ends.
end_1_parts = [tp[c].astype(str) for c in ('state_fips_1', 'county_fips_1', 'tract_fips_1')]
end_2_parts = [tp[c].astype(str) for c in ('state_fips_2', 'county_fips_2', 'tract_fips_2')]
tp['tract_1'] = end_1_parts[0] + '_' + end_1_parts[1] + '_' + end_1_parts[2]
tp['tract_2'] = end_2_parts[0] + '_' + end_2_parts[1] + '_' + end_2_parts[2]

# 0/1 indicators for departure hours strictly inside (6, 10) and (15, 19).
dep_hour = tp['dep_hour']
tp['morning'] = (dep_hour.gt(6) & dep_hour.lt(10)).astype(int)
tp['afternoon'] = (dep_hour.gt(15) & dep_hour.lt(19)).astype(int)

# Constant column of ones.
tp['const'] = 1

def normalize_features(df, cols):
    """Scale each listed column by its maximum, in place.

    Args:
        df: DataFrame whose columns are rescaled (it is modified in place).
        cols: iterable of column names to rescale.

    Returns:
        The same ``df`` object, with every column in ``cols`` divided by its
        own maximum. Columns whose maximum is 0 are left unchanged.
    """
    for c in cols:
        col_max = df[c].max()
        # Dividing an all-zero column by its max would turn it into NaN (0/0),
        # so such columns are skipped instead.
        if col_max == 0:
            continue
        df[c] = df[c]/col_max
    return df

In [9]:
unique_ct = np.array(unique_ct)

# Row index of every tract label in `aggregate_embeddings`. A dict lookup
# replaces two full boolean-mask scans of `unique_ct` per trip; the labels are
# distinct (they come from a set), so "found in the dict" is the same condition
# as "exactly one match".
ct_row = {tract: row for row, tract in enumerate(unique_ct.tolist())}

# For every trip whose two tracts both have an embedding, concatenate the
# tract_1 and tract_2 embeddings into one flat vector. `trip_filter` has one
# entry per trip and marks which trips were kept.
x_embed = []
trip_filter = []
for t1, t2 in zip(tp['tract_1'], tp['tract_2']):
    row_1 = ct_row.get(t1)
    row_2 = ct_row.get(t2)
    if row_1 is not None and row_2 is not None:
        x_embed.append(np.concatenate((aggregate_embeddings[row_1].ravel(),
                                       aggregate_embeddings[row_2].ravel())))
        trip_filter.append(True)
    else:
        trip_filter.append(False)
        

In [10]:
# Number of distinct labels that have an aggregated embedding.
len(unique_ct)

1337

In [11]:
trip_filter = np.array(trip_filter)
x_embed = np.array(x_embed)

# Trip-level columns used as model inputs, in the order they appear in `x`.
trip_columns = [
    'morning', 'afternoon', 'companion', 'distance',
    'from_home', 'to_home', 'purp_work', 'purp_school', 'purp_errand', 'purp_recreation',
    'ontime_important', '12_18yrs', '18_25yrs', '25_55yrs', '55+yrs', 'no_age',
    'disability', 'educ_col', 'educ_grad',
    'race_white', 'race_black', 'race_asian',
    'male', 'female',
    'emply_park', 'emply_transit', 'emply_veh', 'emply_wfh', 'emply_flex', 'emply_hours',
    'license', 'person_trips', 'person_transit', 'person_freq_transit',
    'hh_inc_0_30', 'hh_inc_30_60', 'hh_inc_60_100', 'hh_inc_100_150', 'hh_inc_150',
    'avg_pr_veh', 'home_own', 'home_house', 'home_condo',
]

# Keep only the trips flagged in `trip_filter`, then append the embedding
# columns after the trip-level columns.
x_trip = tp[trip_columns].to_numpy()[trip_filter]
x = np.concatenate([x_trip, x_embed], axis=1)

# Target: the 'mode' column shifted down by one, restricted to the kept trips.
y = (tp['mode'].astype(int).to_numpy() - 1)[trip_filter]

In [12]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Training data: float32 features, int64 labels, reshuffled mini-batches of 256.
x_train_tensor = torch.tensor(x_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
trainset = SurveyDataset(x_train_tensor, y_train_tensor)
trainloader = DataLoader(trainset, batch_size=256, shuffle=True)

# Test data: served as one single batch covering the whole test set, in order.
x_test_tensor = torch.tensor(x_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)
testset = SurveyDataset(x_test_tensor, y_test_tensor)
testloader = DataLoader(testset, batch_size=len(testset), shuffle=False)

In [14]:
# (rows kept, trip-level columns + embedding columns)
x.shape

(75248, 4139)

In [15]:
# Grid search over learning rate, weight decay and dropout for mnl.MNL2.
# For every combination a fresh model is trained for up to 100 epochs; every
# third epoch the train and test metrics are printed and the stopping criteria
# are checked. One summary row per combination is appended to
# out_dir + model_code + ".csv".
loss_fn = nn.CrossEntropyLoss(reduction='mean')

wd_list = [0.00005,0.0005]
lr_list = [0.005]
do_list = [0, 0.2, 0.5]

for (lr, wd, do) in itertools.product(lr_list, wd_list, do_list):

    model = mnl.MNL2(n_alts=n_alts, dim_embed=x_embed.shape[-1], dim_demo=x_trip.shape[-1], dropout=do)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    # Train losses from the two previous checkpoints (ref1 is the most recent).
    ref1 = 0
    ref2 = 0

    for epoch in range(100):
        # Evaluation below switches the model to eval mode, so switch back to
        # training mode at the start of every epoch.
        # (assumes mnl.MNL2 is a torch.nn.Module -- it already exposes .parameters())
        model.train()

        loss_ = 0
        correct = 0
        for x_batch, y_batch in trainloader:
            # Compute prediction and loss
            util = model(x_batch)
            loss = loss_fn(util, y_batch)
            # loss is a batch mean; weight by batch size to get a per-sample average later
            loss_ += loss.item() * len(x_batch)

            pred = torch.argmax(util, dim=1)
            correct += torch.sum(pred == y_batch).item()

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


        if epoch % 3 == 0:
            loss_ /= len(trainset)
            train_acc = correct/len(trainset)
            print(f"[epoch: {epoch:>3d}] Train loss: {loss_:.4f} accuracy: {train_acc:.3f}")

            # Evaluate with dropout disabled and without building autograd
            # graphs; otherwise the reported test metrics are computed on a
            # randomly thinned network whenever dropout > 0.
            correct = 0
            test_loss_ = 0
            model.eval()
            with torch.no_grad():
                for batch, (x_batch, y_batch) in enumerate(testloader):
                    util = model(x_batch)
                    loss = loss_fn(util, y_batch)
                    test_loss_ += loss.item()
                    pred = torch.argmax(util, dim=1)
                    correct += torch.sum(pred == y_batch).item()
            assert batch == 0 # there is only one batch in test
            test_acc = correct/len(testset)
            print(f"[epoch: {epoch:>3d}] Test loss: {test_loss_:.4f} accuracy: {test_acc:.3f}")

            if epoch > 15:
                # NOTE(review): the relative change is compared against ref*0.01
                # rather than a plain 0.01 -- confirm this scaling is intended.
                if (np.abs(loss_ - ref1)/ref1<ref1*0.01) & (np.abs(loss_ - ref2)/ref2<ref2*0.01):
                    print("Early stopping at epoch", epoch)
                    break
                # NOTE(review): this fires after a single increase in train loss
                # (ref1 was a local minimum), not after two consecutive
                # increases -- confirm this is the intended "diverging" test.
                if (ref1 < loss_) & (ref1 < ref2):
                    print("Diverging. stop.")
                    break
                if loss_ < best:
                    best = loss_
                    best_test = test_loss_
                    best_epoch = epoch
                    best_train_acc = train_acc
                    best_test_acc = test_acc
            else:
                # During the first checkpoints (epoch <= 15) the latest result
                # always replaces the stored "best" values.
                best = loss_
                best_test = test_loss_
                best_epoch = epoch
                best_train_acc = train_acc
                best_test_acc = test_acc

            ref2 = ref1
            ref1 = loss_

    # Append one result row: run date, model type, zoom level, model name,
    # lr, weight decay, dropout, best epoch, train loss, test loss,
    # train accuracy, test accuracy.
    with open(out_dir+model_code+".csv", "a") as f:
        f.write("%s,%s,%s,%s,%.4f,%.5f,%.1f,%d,%.4f,%.4f,%.4f,%.4f\n" % \
            (model_run_date, model_type, zoomlevel, "MNL2", lr, wd, do, 
             best_epoch, best, best_test, best_train_acc, best_test_acc))
        

[epoch:   0] Train loss: 0.4810 accuracy: 0.831
[epoch:   0] Test loss: 0.3669 accuracy: 0.876
[epoch:   3] Train loss: 0.3494 accuracy: 0.879
[epoch:   3] Test loss: 0.3405 accuracy: 0.884
[epoch:   6] Train loss: 0.3362 accuracy: 0.883
[epoch:   6] Test loss: 0.3310 accuracy: 0.887
[epoch:   9] Train loss: 0.3287 accuracy: 0.885
[epoch:   9] Test loss: 0.3192 accuracy: 0.890
[epoch:  12] Train loss: 0.3306 accuracy: 0.883
[epoch:  12] Test loss: 0.3419 accuracy: 0.884
[epoch:  15] Train loss: 0.3263 accuracy: 0.886
[epoch:  15] Test loss: 0.3193 accuracy: 0.889
[epoch:  18] Train loss: 0.3245 accuracy: 0.886
[epoch:  18] Test loss: 0.3318 accuracy: 0.886
[epoch:  21] Train loss: 0.3198 accuracy: 0.888
[epoch:  21] Test loss: 0.3394 accuracy: 0.883
[epoch:  24] Train loss: 0.3153 accuracy: 0.889
[epoch:  24] Test loss: 0.3150 accuracy: 0.891
[epoch:  27] Train loss: 0.3223 accuracy: 0.887
[epoch:  27] Test loss: 0.3301 accuracy: 0.885
Diverging. stop.
[epoch:   0] Train loss: 0.4715 a