# MergedChoiceTable feature testing

Sam Maurer, August 2018

In [1]:
import sys
print(sys.version)

3.6.4 |Anaconda custom (x86_64)| (default, Jan 16 2018, 12:04:33) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]


In [2]:
import numpy as np
import pandas as pd
import random

In [3]:
import choicemodels

  from pandas.core import datetools


## Performance comparison

`random.choices`: replacement, optional weights  
`random.sample`: no replacement  
`np.random.choice`: optional replacement, optional weights

For each one, draw 100 samples of 10 alternatives from a universe of 100,000

In [39]:
# Benchmark universe: 100,000 candidate values, each paired with a random weight.
n = 10 ** 5
vals = np.random.rand(n)
weights = np.random.rand(n)
# Normalise the weights so they can be used as probabilities that sum to 1.
scaled_weights = weights / weights.sum(axis=0)

In [29]:
%%timeit 3
    for i in range(100):
        random.choices(vals, k=10)

1000 loops, best of 3: 302 µs per loop


In [33]:
%%timeit 3
    for i in range(100):
        random.choices(vals, weights, k=10)

1 loop, best of 3: 727 ms per loop


In [32]:
%%timeit 3
    for i in range(100):
        random.sample(vals.tolist(), k=10)

10 loops, best of 3: 153 ms per loop


In [36]:
%%timeit 3
    for i in range(100):
        np.random.choice(vals, replace=True, size=10)

1000 loops, best of 3: 701 µs per loop


In [42]:
%%timeit 3
    for i in range(100):
        np.random.choice(vals, replace=False, size=10)

10 loops, best of 3: 136 ms per loop


In [43]:
%%timeit 3
    for i in range(100):
        np.random.choice(vals, replace=True, p=scaled_weights, size=10)

10 loops, best of 3: 70.2 ms per loop


In [44]:
%%timeit 3
    for i in range(100):
        np.random.choice(vals, replace=False, p=scaled_weights, size=10)

10 loops, best of 3: 78.8 ms per loop


Here are the winners, with times roughly scaled relative to the fastest option:

```
1 ms    replacement, core python  
200 ms  replacement with weights, numpy

400 ms  no replacement, numpy
240 ms  no replacement with weights, numpy
```

In [45]:
# How costly is this at a realistic scale? Rebuild the inputs with 5 million values.

n = 5 * 10 ** 6
vals = np.random.rand(n)
weights = np.random.rand(n)
# Normalise the weights so they can be used as probabilities that sum to 1.
scaled_weights = weights / weights.sum(axis=0)

In [46]:
%%timeit 3
    for i in range(100):
        np.random.choice(vals, replace=False, p=scaled_weights, size=100)

1 loop, best of 3: 5.39 s per loop


So drawing 100k samples of 100 without replacement from a universe of 5 million, with weights, would take about 90 minutes on a fast iMac.

## ChoiceModels testing

In [4]:
# Ten observations with a single random attribute, `a`.
obs = pd.DataFrame({'a': np.random.rand(10)})

In [5]:
obs.head(3)

Unnamed: 0,a
0,0.466642
1,0.496494
2,0.255071


In [6]:
# Five alternatives, each with a random attribute `b` and a random `weight` column.
alts = pd.DataFrame(
    data=np.random.rand(5, 2),
    columns=['b', 'weight'],
)

In [7]:
alts.head(3)

Unnamed: 0,b,weight
0,0.715194,0.767812
1,0.385832,0.051211
2,0.563915,0.468283


In [8]:
choicemodels.tools.MCT(obs, alts, sample_size=3).to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,weight
obs_id,alt_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,0.466642,0.385832,0.051211
0,3,0.466642,0.515843,0.701565
0,4,0.466642,0.370468,0.163068
1,2,0.496494,0.563915,0.468283
1,2,0.496494,0.563915,0.468283
1,4,0.496494,0.370468,0.163068
2,0,0.255071,0.715194,0.767812
2,2,0.255071,0.563915,0.468283
2,2,0.255071,0.563915,0.468283
3,0,0.117001,0.715194,0.767812


In [9]:
df = choicemodels.tools.MCT(obs, alts, sample_size=3, weights='weight').to_frame()
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,weight
obs_id,alt_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,0.466642,0.715194,0.767812
0,0,0.466642,0.715194,0.767812
0,3,0.466642,0.515843,0.701565
1,3,0.496494,0.515843,0.701565
1,3,0.496494,0.515843,0.701565
1,0,0.496494,0.715194,0.767812
2,2,0.255071,0.563915,0.468283
2,2,0.255071,0.563915,0.468283
2,0,0.255071,0.715194,0.767812
3,0,0.117001,0.715194,0.767812


In [9]:
# sample_size (6) exceeds the number of alternatives (5), so sampling without
# replacement is expected to fail. Catch the error and show its message so a
# top-to-bottom run of the notebook is not interrupted by the exception.
try:
    choicemodels.tools.MCT(obs, alts, sample_size=6, replace=False).to_frame()
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: Cannot sample without replacement with sample_size 6 and n_alts 5

In [11]:
isinstance("hello", str)

True

In [55]:
df[df.index.get_level_values('obs_id').isin([0])]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,weight
obs_id,alt_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,0.526994,0.686536,0.290465
0,2,0.526994,0.22475,0.711542
0,1,0.526994,0.49873,0.348613


In [58]:
df[df.index.get_level_values('obs_id').isin([0])].weight

obs_id  alt_id
0       0         0.290465
        2         0.711542
        1         0.348613
Name: weight, dtype: float64

In [60]:
df.weight/df.weight.sum()

obs_id  alt_id
0       0         0.017042
        2         0.041747
        1         0.020454
1       4         0.045093
        4         0.045093
        2         0.041747
2       4         0.045093
        2         0.041747
        4         0.045093
3       2         0.041747
        0         0.017042
        2         0.041747
4       2         0.041747
        2         0.041747
        2         0.041747
5       3         0.003128
        4         0.045093
        1         0.020454
6       4         0.045093
        3         0.003128
        1         0.020454
7       1         0.020454
        4         0.045093
        1         0.020454
8       4         0.045093
        2         0.041747
        4         0.045093
9       2         0.041747
        0         0.017042
        0         0.017042
Name: weight, dtype: float64

In [9]:
np.repeat([1,2,3], 4).tolist()

[1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3]

In [1]:
if True:
    a = "yes"
print(a)

yes


In [14]:
np.tile(np.append([1], np.repeat(0, 2)), 3)

array([1, 0, 0, 1, 0, 0, 1, 0, 0])

In [4]:
[1] + [2]

[1, 2]

In [12]:
print(np.append([1], np.repeat(0, 2)))

[1 0 0]


In [3]:
df = pd.DataFrame(np.random.randn(10000,4), columns=list('ABCD'))

In [5]:
df.head(3)

Unnamed: 0,A,B,C,D
0,0.049604,-0.107076,-1.040623,-0.546538
1,1.140552,1.434264,-0.110053,-1.238815
2,1.884469,0.543573,1.562645,-0.358955


In [8]:
len(df.A.sample(50000, replace=True))

50000

In [62]:
def a():
    """Placeholder function that does nothing and returns None."""
    return None

type(a)

function

In [65]:
callable(a)

True

And re: our discussion of sampling weights -- here’s the notebook I mentioned, where I estimate an MNL model that includes both (a) custom logic for sampling of alternatives (in this case it does an availability calculation on the fly, but weights can be applied similarly) and (b) attributes that vary based on the interaction of choosers and alternatives (in this case a Euclidean distance lookup). I do this by building the long-format estimation table manually -- not much code, but a pain to get right.

The first thing I’ll do to make this more structured is to have ChoiceModels accept either matrices or generator functions for availability, weights, and interaction terms. (Generator functions can be more efficient because you don’t have to compute the whole NxM matrix, but matrices are better if you want to re-use the weights.)

But for use in UrbanSim, neither matrices nor generator functions are particularly convenient, because they’re hard to store. We could store weights as big lookup tables and generator functions as Orca injectables, but I think we should do some experimenting first.

So here’s my proposal: First, I’ll update the ChoiceModels MergedChoiceTable API to take these inputs. Then, I’ll create a backdoor in the Large MNL template so we can pass in a custom MergedChoiceTable when we’re working with the class object directly, like in a notebook. This won’t work for saving models and re-running them later, but it will let us experiment pretty easily.

https://gist.github.com/smmaurer/c3b4f2f7c4d612a4520de119f9f497cf



1. reimplement the interaction dataset
2. check accuracy
3. check speed
4. implement weights