# US Unemployment Example

Example adapted from:

* Kurihara K. (2004). Classification of geospatial lattice data and their graphical representation. *Classification, Clustering, and Data Mining Applications*, (Edited by D.Banks et al.) Springer, 251–258.

The following connections were bracketed in the above article (these states are geographically disconnected but are treated as connected in this analysis).

* AK---WA
* CA---HI

In [1]:
# Optional notebook magics, left disabled here (module auto-reload and inline matplotlib).
# %load_ext autoreload
# %autoreload 2
# %matplotlib inline
import warnings
# Install a blanket 'ignore' filter so Python warnings do not appear in later cell output.
warnings.filterwarnings('ignore')

In [2]:
import echelon
# Show the installed echelon version so the outputs below can be tied to a specific release.
echelon.__version__

'1.0.3'

In [3]:
from IPython.display import Markdown, display

import numpy as np
import pandas as pd

# One row per state: (postal code, state name, unemployment rate,
# space-separated postal codes of the states treated as adjacent).
# The AK-WA and CA-HI links are listed on both sides even though those
# states do not share a border.
state_rows = [
    ('AL', 'Alabama', 51, 'FL GA MS TN'),
    ('AK', 'Alaska', 79, 'WA'),
    ('AZ', 'Arizona', 46, 'CA CO NM NV UT'),
    ('AR', 'Arkansas', 53, 'LA MS MO OK TN TX'),
    ('CA', 'California', 63, 'AZ NV OR HI'),
    ('CO', 'Colorado', 33, 'AZ KS NE NM OK UT WY'),
    ('CT', 'Connecticut', 51, 'MA NY RI'),
    ('DE', 'Delaware', 40, 'MD NJ PA'),
    ('FL', 'Florida', 48, 'AL GA'),
    ('GA', 'Georgia', 45, 'AL FL NC SC TN'),
    ('HI', 'Hawaii', 64, 'CA'),
    ('ID', 'Idaho', 53, 'MT NV OR UT WA WY'),
    ('IL', 'Illinois', 47, 'IA IN KY MO WI'),
    ('IN', 'Indiana', 35, 'IL KY MI OH'),
    ('IA', 'Iowa', 33, 'IL MN MO NE SD WI'),
    ('KS', 'Kansas', 38, 'CO MO NE OK'),
    ('KY', 'Kentucky', 54, 'IL IN MO OH TN VA WV'),
    ('LA', 'Louisiana', 61, 'AR MS TX'),
    ('ME', 'Maine', 54, 'NH'),
    ('MD', 'Maryland', 51, 'DE PA VA WV'),
    ('MA', 'Massachusetts', 40, 'CT NH NY RI VT'),
    ('MI', 'Michigan', 42, 'IN OH WI'),
    ('MN', 'Minnesota', 33, 'IA ND SD WI'),
    ('MS', 'Mississippi', 57, 'AL AR LA TN'),
    ('MO', 'Missouri', 42, 'AR IA IL KS KY NE OK TN'),
    ('MT', 'Montana', 54, 'ID ND SD WY'),
    ('NE', 'Nebraska', 26, 'CO IA KS MO SD WY'),
    ('NV', 'Nevada', 41, 'AZ CA ID OR UT'),
    ('NH', 'New Hampshire', 31, 'MA ME VT'),
    ('NJ', 'New Jersey', 51, 'DE NY PA'),
    ('NM', 'New Mexico', 62, 'AZ CO OK TX UT'),
    ('NY', 'New York', 64, 'CT MA NJ PA VT'),
    ('NC', 'North Carolina', 36, 'GA SC TN VA'),
    ('ND', 'North Dakota', 25, 'MN MT SD'),
    ('OH', 'Ohio', 46, 'IN KY MI PA WV'),
    ('OK', 'Oklahoma', 41, 'AR CO KS MO NM TX'),
    ('OR', 'Oregon', 58, 'CA ID NV WA'),
    ('PA', 'Pennsylvania', 52, 'DE MD NJ NY OH WV'),
    ('RI', 'Rhode Island', 53, 'CT MA'),
    ('SC', 'South Carolina', 45, 'GA NC'),
    ('SD', 'South Dakota', 31, 'IA MN MT ND NE WY'),
    ('TN', 'Tennessee', 54, 'AL AR GA KY MO MS NC VA'),
    ('TX', 'Texas', 54, 'AR LA NM OK'),
    ('UT', 'Utah', 31, 'AZ CO ID NM NV WY'),
    ('VT', 'Vermont', 40, 'MA NH NY'),
    ('VA', 'Virginia', 40, 'KY MD NC TN WV'),
    ('WA', 'Washington', 48, 'ID OR AK'),
    ('WV', 'West Virginia', 69, 'KY MD OH PA VA'),
    ('WI', 'Wisconsin', 37, 'IA IL MI MN'),
    ('WY', 'Wyoming', 51, 'CO ID MT NE SD UT'),
]

# Expand each neighbour string into a list so 'adjacent_codes' holds one
# list of postal codes per state.
df = pd.DataFrame(
    [
        (code, name, rate, neighbours.split())
        for code, name, rate, neighbours in state_rows
    ],
    columns=['code', 'name', 'unemployment rate', 'adjacent_codes'],
)
df

Unnamed: 0,code,name,unemployment rate,adjacent_codes
0,AL,Alabama,51,"[FL, GA, MS, TN]"
1,AK,Alaska,79,[WA]
2,AZ,Arizona,46,"[CA, CO, NM, NV, UT]"
3,AR,Arkansas,53,"[LA, MS, MO, OK, TN, TX]"
4,CA,California,63,"[AZ, NV, OR, HI]"
5,CO,Colorado,33,"[AZ, KS, NE, NM, OK, UT, WY]"
6,CT,Connecticut,51,"[MA, NY, RI]"
7,DE,Delaware,40,"[MD, NJ, PA]"
8,FL,Florida,48,"[AL, GA]"
9,GA,Georgia,45,"[AL, FL, NC, SC, TN]"


## Echelon construction

In [4]:
from echelon.api import DataFrameEchelonAnalysis
# Build the analysis object, then run it on the state table.
analyzer = DataFrameEchelonAnalysis()

# Positional arguments after the frame are column names of `df`:
# the value column, the state-code column and the neighbour-list column.
result = analyzer(
    df,
    'unemployment rate',
    'code',
    'adjacent_codes',
)

# Leave the result as the last expression so the notebook shows its repr.
result

Result_EchelonAnalysis(peak_echelons=[['AK'], ['WV'], ['HI', 'CA', 'OR'], ['NY'], ['NM'], ['LA', 'MS'], ['ME'], ['MT'], ['RI']], foundation_echelons=[['KY', 'TN', 'TX', 'AR'], ['ID', 'WY'], ['PA'], ['AL', 'NJ', 'MD', 'CT', 'FL', 'IL'], ['WA'], ['AZ', 'OH', 'GA', 'SC', 'MI', 'MO', 'OK', 'NV', 'VT', 'VA', 'MA', 'DE', 'KS', 'WI', 'NC', 'IN', 'IA', 'MN', 'CO'], ['NH', 'SD', 'UT', 'NE', 'ND']], hierarchy_tree=Node('/15'), oracle=<echelon.oracle.DataFrameEchelonOracle object at 0x7fb49022b240>)

In [5]:
# Render each family of echelons as a heading followed by a table with one
# echelon per row; shorter rows are padded with empty strings.
for heading, echelons in (
    ('### Peak Echelons', result.peak_echelons),
    ('### Foundation Echelons', result.foundation_echelons),
):
    display(Markdown(heading))
    display(pd.DataFrame(echelons).fillna(''))

### Peak Echelons

Unnamed: 0,0,1,2
0,AK,,
1,WV,,
2,HI,CA,OR
3,NY,,
4,NM,,
5,LA,MS,
6,ME,,
7,MT,,
8,RI,,


### Foundation Echelons

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,Unnamed: 20
0,KY,TN,TX,AR,,,,,,,,,,,,,,,,
1,ID,WY,,,,,,,,,,,,,,,,,,
2,PA,,,,,,,,,,,,,,,,,,,
3,AL,NJ,MD,CT,FL,IL,,,,,,,,,,,,,,
4,WA,,,,,,,,,,,,,,,,,,,
5,AZ,OH,GA,SC,MO,MI,OK,NV,VT,VA,MA,DE,KS,WI,NC,IN,IA,MN,CO,MN
6,NH,UT,SD,NE,ND,,,,,,,,,,,,,,,


## Echelon dendrogram

In [6]:
# An empty plot_config_dict is passed explicitly (presumably selecting the
# library defaults; confirm against the echelon documentation).
dendrogram_text = analyzer.dendrogram(result, plot_config_dict={})
# print() rather than rich display so the tree's line breaks are kept as-is.
print(dendrogram_text)

E16(NH,SD,UT): [ND, NE, SD, UT, NH]
 (max: 31)
├── E15(AZ,OH): [CO, MN, IA, IN, NC, WI, KS, DE, MA, VA, VT, NV, OK, MO, MI, SC, GA, OH, AZ]
│    (max: 46)
│   ├── E14(WA): [WA]
│   │    (max: 48)
│   │   ├── E11(ID): [WY, ID]
│   │   │    (max: 53)
│   │   │   ├── E8(MT): [MT]
│   │   │   │    (max: 54)
│   │   │   └── E3(HI): [OR, CA, HI]
│   │   │        (max: 64)
│   │   └── E1(AK): [AK]
│   │        (max: 79)
│   └── E13(AL,CT,MD,NJ): [IL, FL, CT, MD, NJ, AL]
│        (max: 51)
│       ├── E12(PA): [PA]
│       │    (max: 52)
│       │   ├── E10(KY,TN,TX): [AR, TX, TN, KY]
│       │   │    (max: 54)
│       │   │   ├── E6(LA): [MS, LA]
│       │   │   │    (max: 61)
│       │   │   ├── E5(NM): [NM]
│       │   │   │    (max: 62)
│       │   │   └── E2(WV): [WV]
│       │   │        (max: 69)
│       │   └── E4(NY): [NY]
│       │        (max: 64)
│       └── E9(RI): [RI]
│            (max: 53)
└── E7(ME): [ME]
     (max: 54)


## Echelon Clusters

In [7]:
# Do not elide columns: the cluster table below is wide.
pd.set_option('display.max_columns', None)

_df = analyzer.cluster(result)

# Label every cluster after its first representative, e.g. "AK Zone".
_df['representatives'] = _df['representatives'].map(lambda reps: f'{reps[0]!s} Zone')

# One row per cluster, one column per member position; short rows are padded
# with empty strings and the rows are indexed by the zone label.
zone_labels = _df['representatives'].rename('Zone')
zone_members = pd.DataFrame(_df['indices'].to_list()).fillna('')
zone_members.set_index(zone_labels)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,Unnamed: 30_level_0
Zone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
AK Zone,AK,WA,,,,,,,,,,,,,,,,,,,,,,,,,,,,
WV Zone,WV,KY,TN,AR,PA,AL,NJ,MD,FL,IL,OH,GA,SC,MO,MI,OK,VA,DE,KS,WI,NC,IN,IA,MN,CO,MN,UT,SD,NE,ND
HI Zone,HI,CA,OR,ID,WY,WA,AZ,NV,CO,UT,SD,NE,ND,,,,,,,,,,,,,,,,,
NY Zone,NY,PA,NJ,MD,CT,OH,MI,VT,VA,MA,DE,WI,NC,IN,IA,MN,NH,SD,NE,ND,,,,,,,,,,
NM Zone,NM,TX,AR,AZ,MO,OK,NV,KS,IA,MN,CO,MN,UT,SD,NE,ND,,,,,,,,,,,,,,
LA Zone,LA,MS,TN,TX,AR,AL,FL,GA,SC,MO,OK,VA,KS,NC,IA,MN,CO,MN,UT,SD,NE,ND,,,,,,,,
ME Zone,ME,NH,,,,,,,,,,,,,,,,,,,,,,,,,,,,
MT Zone,MT,ID,WY,WA,NV,CO,UT,SD,NE,ND,,,,,,,,,,,,,,,,,,,,
RI Zone,RI,CT,MA,VT,NH,,,,,,,,,,,,,,,,,,,,,,,,,


## Hotspot detection

In [8]:
from IPython.display import Markdown, display
# Poisson scoring: hotspots() is called with the echelon result alone.
display(Markdown('### Poisson score'))
poisson_hotspots = analyzer.hotspots(result)
display(poisson_hotspots)

# Binomial scoring: hotspots() additionally receives the frame together with
# the names of its code, total and unemployment-rate columns.
display(Markdown('### Binomial score'))
# Every state gets the same total of 100.0 (stored as float64).
df['total'] = np.full(len(df), 100.0)
binomial_data = (df, 'code', 'total', 'unemployment rate')
display(analyzer.hotspots(result, binomial_data, score='binomial'))

### Poisson score

Unnamed: 0,spot,score,c(Z),log_lambda
24,"[PA, KY, TN, TX, AR, LA, MS, NM, WV, NY, RI, A...",2.24745,932,2.24745
23,"[PA, KY, TN, TX, AR, LA, MS, NM, WV, NY, RI, A...",2.237845,885,2.237845
22,"[PA, KY, TN, TX, AR, LA, MS, NM, WV, NY, RI, A...",2.226046,837,2.226046
21,"[PA, KY, TN, TX, AR, LA, MS, NM, WV, NY, RI, A...",2.211152,786,2.211152
20,"[PA, KY, TN, TX, AR, LA, MS, NM, WV, NY, RI, A...",2.193617,735,2.193617
19,"[PA, KY, TN, TX, AR, LA, MS, NM, WV, NY, RI, AL]",2.173171,684,2.173171
18,"[KY, TN, TX, AR, LA, MS, NM, WV, NY, PA]",2.120911,580,2.120911
25,"[ID, WY, MT, HI, CA, OR, AK, WA]",2.045829,470,2.045829
13,"[LA, MS, NM, WV, KY, TN, TX, AR]",2.040994,464,2.040994
12,"[LA, MS, NM, WV, KY, TN, TX]",1.994161,411,1.994161


### Binomial score

Unnamed: 0,spot,score,c(Z),log_lambda
22,"[PA, KY, TN, TX, AR, LA, MS, NM, WV, NY, RI, A...",16.844985,837,16.844985
23,"[PA, KY, TN, TX, AR, LA, MS, NM, WV, NY, RI, A...",16.558891,885,16.558891
21,"[PA, KY, TN, TX, AR, LA, MS, NM, WV, NY, RI, A...",16.446466,786,16.446466
20,"[PA, KY, TN, TX, AR, LA, MS, NM, WV, NY, RI, A...",16.117422,735,16.117422
24,"[PA, KY, TN, TX, AR, LA, MS, NM, WV, NY, RI, A...",16.106899,932,16.106899
19,"[PA, KY, TN, TX, AR, LA, MS, NM, WV, NY, RI, AL]",15.863038,684,15.863038
18,"[KY, TN, TX, AR, LA, MS, NM, WV, NY, PA]",15.07384,580,15.07384
25,"[ID, WY, MT, HI, CA, OR, AK, WA]",13.006068,470,13.006068
13,"[LA, MS, NM, WV, KY, TN, TX, AR]",11.432044,464,11.432044
12,"[LA, MS, NM, WV, KY, TN, TX]",11.02472,411,11.02472
