In [1]:
import sys
sys.path.append("models/")

%load_ext autoreload
%autoreload 2
from collections import OrderedDict
import os
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import pickle as pkl
import numpy as np

import itertools
import glob

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn import linear_model
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_validate, GroupKFold
import statsmodels.api as sm


from dataloader import SurveyDataset, load_aggregate_travel_behavior, load_demo_v1
from util_model import load_model
import mnl
from setup import out_dir, data_dir, image_dir, model_dir, proj_dir


In [2]:
data_version = '1571'

model_type = 'SSD'
sampling = 's'

zoomlevel = 'zoom15'
output_dim = 3
model_run_date = '2208'

v2 = 1

variable_names = ['active','auto','mas','pt', 'trpgen']

demo_variables = ['tot_population','pct25_34yrs','pct35_50yrs','pctover65yrs',
         'pctwhite_alone','pct_nonwhite','pctblack_alone',
         'pct_col_grad','avg_tt_to_work','inc_per_capita']


# Load Model Embeddings

In [3]:
with open(proj_dir+"latent_space/SSD_"+zoomlevel+"_"+str(output_dim**2*2048)+"_"+str(v2)+"_"+
                       str(model_run_date)+".pkl", "rb") as f:
    encoder_output = pkl.load(f)
    im = pkl.load(f)
    ct = pkl.load(f)

In [9]:
# Aggregate Embeddings
unique_ct = list(set(ct))
unique_ct.sort()
ct = np.array(ct)
aggregate_embeddings = []
for i in unique_ct:
    aggregate_embeddings.append(np.mean(encoder_output[ct == i], axis=0))
aggregate_embeddings = np.array(aggregate_embeddings)

x = aggregate_embeddings[:,[0,1,2,3,4,6,7,8,9]]

# Load Trip Behavior

In [5]:
file = "origin_trip_behavior.csv"
df_pivot = load_aggregate_travel_behavior(file, data_version)

y = df_pivot[variable_names].to_numpy()[:,:4]

groups = df_pivot['train_test']
group_split = GroupKFold(n_splits=5)

In [10]:
x.shape

(1571, 9)

# 1. Linear Regression

### 1.1 Auto Share

In [11]:
y_index = 1

In [12]:
lr = linear_model.LinearRegression()
cross_results = cross_validate(lr, x, y[:,y_index], cv=group_split, groups=groups, scoring='r2', return_train_score=True)
print("Train score: %.4f \t Cross val score: %.4f \t " % 
          (cross_results['train_score'].mean(), cross_results['test_score'].mean()))

Train score: 0.5946 	 Cross val score: 0.5873 	 


In [13]:
# Lasso
for a in (1e-4)*np.array([0.1,1,2,3,4,5,6,7,8,10,20,50]):
    lasso = linear_model.Lasso(alpha=a)
    cross_results = cross_validate(lasso, x, y[:,y_index], cv=group_split, groups=groups, scoring='r2', return_train_score=True, return_estimator=True)
    nz = 0
    for m in cross_results['estimator']:
        nz += sum(m.coef_ != 0)
    nz /= 5
    
    print("Parameter: %.2e \t Train score: %.4f \t Cross val score: %.4f \t Nonzero coef: %d" % 
          (a, cross_results['train_score'].mean(), cross_results['test_score'].mean(), nz))

Parameter: 1.00e-05 	 Train score: 0.5945 	 Cross val score: 0.5871 	 Nonzero coef: 9
Parameter: 1.00e-04 	 Train score: 0.5834 	 Cross val score: 0.5765 	 Nonzero coef: 7
Parameter: 2.00e-04 	 Train score: 0.5530 	 Cross val score: 0.5455 	 Nonzero coef: 6
Parameter: 3.00e-04 	 Train score: 0.5477 	 Cross val score: 0.5407 	 Nonzero coef: 4
Parameter: 4.00e-04 	 Train score: 0.5436 	 Cross val score: 0.5367 	 Nonzero coef: 4
Parameter: 5.00e-04 	 Train score: 0.5390 	 Cross val score: 0.5321 	 Nonzero coef: 3
Parameter: 6.00e-04 	 Train score: 0.5373 	 Cross val score: 0.5311 	 Nonzero coef: 3
Parameter: 7.00e-04 	 Train score: 0.5369 	 Cross val score: 0.5307 	 Nonzero coef: 3
Parameter: 8.00e-04 	 Train score: 0.5364 	 Cross val score: 0.5302 	 Nonzero coef: 3
Parameter: 1.00e-03 	 Train score: 0.5352 	 Cross val score: 0.5291 	 Nonzero coef: 3
Parameter: 2.00e-03 	 Train score: 0.5252 	 Cross val score: 0.5190 	 Nonzero coef: 3
Parameter: 5.00e-03 	 Train score: 0.4633 	 Cross val 

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


### 1.2 PT

In [14]:
y_index = 3

In [15]:
lr = linear_model.LinearRegression()
cross_results = cross_validate(lr, x, y[:,y_index], cv=group_split, groups=groups, scoring='r2', return_train_score=True)
print("Train score: %.4f \t Cross val score: %.4f \t " % 
          (cross_results['train_score'].mean(), cross_results['test_score'].mean()))

Train score: 0.4718 	 Cross val score: 0.4612 	 


In [16]:
# Lasso
for a in (1e-4)*np.array([0.1,1,2,3,4,5,6,7,8,10,20,50]):
    lasso = linear_model.Lasso(alpha=a)
    cross_results = cross_validate(lasso, x, y[:,y_index], cv=group_split, groups=groups, scoring='r2', return_train_score=True, return_estimator=True)
    nz = 0
    for m in cross_results['estimator']:
        nz += sum(m.coef_ != 0)
    nz /= 5
    
    print("Parameter: %.2e \t Train score: %.4f \t Cross val score: %.4f \t Nonzero coef: %d" % 
          (a, cross_results['train_score'].mean(), cross_results['test_score'].mean(), nz))

Parameter: 1.00e-05 	 Train score: 0.4711 	 Cross val score: 0.4610 	 Nonzero coef: 8
Parameter: 1.00e-04 	 Train score: 0.4427 	 Cross val score: 0.4382 	 Nonzero coef: 4
Parameter: 2.00e-04 	 Train score: 0.4389 	 Cross val score: 0.4358 	 Nonzero coef: 3
Parameter: 3.00e-04 	 Train score: 0.4378 	 Cross val score: 0.4349 	 Nonzero coef: 4
Parameter: 4.00e-04 	 Train score: 0.4364 	 Cross val score: 0.4337 	 Nonzero coef: 4
Parameter: 5.00e-04 	 Train score: 0.4346 	 Cross val score: 0.4320 	 Nonzero coef: 4
Parameter: 6.00e-04 	 Train score: 0.4323 	 Cross val score: 0.4299 	 Nonzero coef: 4
Parameter: 7.00e-04 	 Train score: 0.4296 	 Cross val score: 0.4273 	 Nonzero coef: 4
Parameter: 8.00e-04 	 Train score: 0.4265 	 Cross val score: 0.4244 	 Nonzero coef: 4
Parameter: 1.00e-03 	 Train score: 0.4191 	 Cross val score: 0.4171 	 Nonzero coef: 4
Parameter: 2.00e-03 	 Train score: 0.3798 	 Cross val score: 0.3789 	 Nonzero coef: 2
Parameter: 5.00e-03 	 Train score: 0.1608 	 Cross val 

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


### 1.3 Active

In [17]:
y_index = 0

In [18]:
lr = linear_model.LinearRegression()
cross_results = cross_validate(lr, x, y[:,y_index], cv=group_split, groups=groups, scoring='r2', return_train_score=True)
print("Train score: %.4f \t Cross val score: %.4f \t " % 
          (cross_results['train_score'].mean(), cross_results['test_score'].mean()))

Train score: 0.4793 	 Cross val score: 0.4682 	 


In [19]:
# Lasso
for a in (1e-4)*np.array([0.1,1,2,3,4,5,6,7,8,10,20,50]):
    lasso = linear_model.Lasso(alpha=a)
    cross_results = cross_validate(lasso, x, y[:,y_index], cv=group_split, groups=groups, scoring='r2', return_train_score=True, return_estimator=True)
    nz = 0
    for m in cross_results['estimator']:
        nz += sum(m.coef_ != 0)
    nz /= 5
    
    print("Parameter: %.2e \t Train score: %.4f \t Cross val score: %.4f \t Nonzero coef: %d" % 
          (a, cross_results['train_score'].mean(), cross_results['test_score'].mean(), nz))

Parameter: 1.00e-05 	 Train score: 0.4791 	 Cross val score: 0.4681 	 Nonzero coef: 8
Parameter: 1.00e-04 	 Train score: 0.4572 	 Cross val score: 0.4470 	 Nonzero coef: 7
Parameter: 2.00e-04 	 Train score: 0.4401 	 Cross val score: 0.4318 	 Nonzero coef: 5
Parameter: 3.00e-04 	 Train score: 0.4320 	 Cross val score: 0.4241 	 Nonzero coef: 4
Parameter: 4.00e-04 	 Train score: 0.4291 	 Cross val score: 0.4219 	 Nonzero coef: 3
Parameter: 5.00e-04 	 Train score: 0.4285 	 Cross val score: 0.4213 	 Nonzero coef: 3
Parameter: 6.00e-04 	 Train score: 0.4277 	 Cross val score: 0.4206 	 Nonzero coef: 3
Parameter: 7.00e-04 	 Train score: 0.4269 	 Cross val score: 0.4198 	 Nonzero coef: 3
Parameter: 8.00e-04 	 Train score: 0.4259 	 Cross val score: 0.4188 	 Nonzero coef: 3
Parameter: 1.00e-03 	 Train score: 0.4235 	 Cross val score: 0.4165 	 Nonzero coef: 3
Parameter: 2.00e-03 	 Train score: 0.4033 	 Cross val score: 0.3966 	 Nonzero coef: 3
Parameter: 5.00e-03 	 Train score: 0.3185 	 Cross val 

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


# 2. MNL for Mode Share

In [23]:
from util_aggregate_models import mnl_torch

# dataloader and model definition

lr_list = [0.01, 0.1]
wd_list = [0.1, 0.01, 0.001, 0.0001]

results = {}
for i in range(5):
    
    train_filter = groups != i
    test_filter = groups == i
    x_train = x[train_filter]
    y_train = y[train_filter]
    x_test = x[test_filter]
    y_test = y[test_filter]
    sst_train = np.sum(np.power(y_train - np.mean(y_train, axis=0), 2), axis=0)
    sst_test = np.sum(np.power(y_test - np.mean(y_test, axis=0), 2), axis=0)
    
    trainset = SurveyDataset(torch.tensor(x_train,  dtype=torch.float), torch.tensor(y_train, dtype=torch.float))
    trainloader = DataLoader(trainset, batch_size=len(trainset), shuffle=False)

    testset = SurveyDataset(torch.tensor(x_test, dtype=torch.float), torch.tensor(y_test, dtype=torch.float))
    testloader = DataLoader(testset, batch_size=len(testset), shuffle=False)

    ret_dict = mnl_torch(trainloader, testloader, x_train.shape[-1], sst_train, sst_test, lr_list=lr_list, wd_list=wd_list)
    
    results[i] = ret_dict
    

[lr: 1.00e-02, wd: 1.00e-01]
Early stopping at epoch 865
[epoch: 860] Train KL loss: 0.149 Train R2 score: 0.408 0.519 -0.007 0.413 
[epoch: 860] Test KL loss: 0.153 Test R2 score: 0.411 0.529 0.010 0.400 

[lr: 1.00e-02, wd: 1.00e-02]
Early stopping at epoch 750
[epoch: 745] Train KL loss: 0.150 Train R2 score: 0.396 0.514 -0.008 0.416 
[epoch: 745] Test KL loss: 0.154 Test R2 score: 0.398 0.523 0.009 0.401 

[lr: 1.00e-02, wd: 1.00e-03]
Early stopping at epoch 695
[epoch: 690] Train KL loss: 0.151 Train R2 score: 0.396 0.508 -0.006 0.429 
[epoch: 690] Test KL loss: 0.153 Test R2 score: 0.411 0.519 0.009 0.418 

[lr: 1.00e-02, wd: 1.00e-04]
Early stopping at epoch 565
[epoch: 560] Train KL loss: 0.150 Train R2 score: 0.394 0.509 -0.007 0.429 
[epoch: 560] Test KL loss: 0.154 Test R2 score: 0.400 0.519 0.006 0.418 

[lr: 1.00e-01, wd: 1.00e-01]
Early stopping at epoch 250
[epoch: 245] Train KL loss: 0.146 Train R2 score: 0.417 0.532 -0.008 0.445 
[epoch: 245] Test KL loss: 0.150 Test R

In [24]:
df = []

for (lr, wd) in itertools.product(lr_list, wd_list):

    new = []
    
    for i in range(5):
        new2 = []
        for k,v in results[i][(lr,wd)].items():
            new2.append(results[i][(lr,wd)][k]) 
        new.append(new2)
        
    new = np.array(new) 
    
    df.append([lr] + [wd] + list(np.mean(new, axis=0)))


pd.DataFrame(np.array(df), columns = ['learning_rate','weight_decay','train_kl_loss','test_kl_loss','train_r2_auto','train_r2_active','train_r2_pt',
                                     'test_r2_auto','test_r2_active','test_r2_pt']).sort_values(by='test_kl_loss')

Unnamed: 0,learning_rate,weight_decay,train_kl_loss,test_kl_loss,train_r2_auto,train_r2_active,train_r2_pt,test_r2_auto,test_r2_active,test_r2_pt
6,0.1,0.001,0.14477,0.146086,0.542693,0.433732,0.442078,0.535272,0.42401,0.436634
7,0.1,0.0001,0.145304,0.14646,0.539212,0.426965,0.444917,0.533079,0.418992,0.439982
5,0.1,0.01,0.145766,0.146904,0.537077,0.42516,0.443117,0.529898,0.416205,0.436896
4,0.1,0.1,0.147639,0.148762,0.526823,0.412921,0.438133,0.520142,0.405058,0.433686
1,0.01,0.01,0.150929,0.151837,0.513846,0.398519,0.416795,0.507366,0.391418,0.409801
2,0.01,0.001,0.150918,0.151948,0.510972,0.398926,0.420583,0.505334,0.391266,0.419709
0,0.01,0.1,0.151083,0.152168,0.510579,0.399992,0.420442,0.504485,0.391264,0.416231
3,0.01,0.0001,0.151116,0.152208,0.51075,0.397183,0.419251,0.504901,0.388673,0.415584
