<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports-and-Constants" data-toc-modified-id="Imports-and-Constants-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports and Constants</a></span></li><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load Data</a></span><ul class="toc-item"><li><span><a href="#Load-labels,-locs,-years" data-toc-modified-id="Load-labels,-locs,-years-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Load labels, locs, years</a></span></li><li><span><a href="#Load-LSMS-DataFrame" data-toc-modified-id="Load-LSMS-DataFrame-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Load LSMS DataFrame</a></span></li><li><span><a href="#Load-delta-pairs" data-toc-modified-id="Load-delta-pairs-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Load delta pairs</a></span></li><li><span><a href="#Create-country-labels" data-toc-modified-id="Create-country-labels-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>Create country labels</a></span></li><li><span><a href="#Load-folds" data-toc-modified-id="Load-folds-2.5"><span class="toc-item-num">2.5&nbsp;&nbsp;</span>Load folds</a></span></li><li><span><a href="#For-Analysis" data-toc-modified-id="For-Analysis-2.6"><span class="toc-item-num">2.6&nbsp;&nbsp;</span>For Analysis</a></span></li></ul></li><li><span><a href="#Vanilla-DHS-Models---OOC" data-toc-modified-id="Vanilla-DHS-Models---OOC-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Vanilla DHS Models - OOC</a></span><ul class="toc-item"><li><span><a href="#Changes-over-time" data-toc-modified-id="Changes-over-time-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Changes over time</a></span></li></ul></li><li><span><a href="#Models-Fine-Tuned-on-LSMS---OOC" data-toc-modified-id="Models-Fine-Tuned-on-LSMS---OOC-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Models Fine-Tuned on LSMS - OOC</a></span><ul class="toc-item"><li><span><a 
href="#Changes-over-time" data-toc-modified-id="Changes-over-time-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Changes over time</a></span></li></ul></li><li><span><a href="#TODO:-Vanilla-DHS-Models---Incountry" data-toc-modified-id="TODO:-Vanilla-DHS-Models---Incountry-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>TODO: Vanilla DHS Models - Incountry</a></span><ul class="toc-item"><li><span><a href="#Changes-over-time" data-toc-modified-id="Changes-over-time-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Changes over time</a></span></li></ul></li><li><span><a href="#TODO:-Models-Fine-Tuned-on-LSMS---Incountry" data-toc-modified-id="TODO:-Models-Fine-Tuned-on-LSMS---Incountry-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>TODO: Models Fine-Tuned on LSMS - Incountry</a></span><ul class="toc-item"><li><span><a href="#Changes-over-time" data-toc-modified-id="Changes-over-time-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>Changes over time</a></span></li></ul></li><li><span><a href="#Models-Trained-on-LSMS---Incountry" data-toc-modified-id="Models-Trained-on-LSMS---Incountry-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Models Trained on LSMS - Incountry</a></span><ul class="toc-item"><li><span><a href="#Changes-over-time" data-toc-modified-id="Changes-over-time-7.1"><span class="toc-item-num">7.1&nbsp;&nbsp;</span>Changes over time</a></span></li></ul></li></ul></div>

# Imports and Constants

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from collections import defaultdict
import os
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sys.path.append('../')
from batchers import dataset_constants
from utils.analysis import calc_score, evaluate, evaluate_df
from utils.general import colordisplay, load_npz
from utils.plot import symmetric_heatmap

In [None]:
# Root directories (relative to this notebook) for model checkpoints and logs
CKPT_ROOT_DIR = '../ckpts/'
LOGS_ROOT_DIR = '../logs/'

# Input band combinations; MODEL_BANDS additionally lists the two "*_concat" models
BANDS = ['ms', 'msnl', 'nl', 'rgb', 'rgbnl']
MODEL_BANDS = BANDS + ['msnl_concat', 'rgbnl_concat']

# Names of the cross-validation folds and of the dataset splits
FOLDS = list('ABCDE')
SPLITS = ['train', 'val', 'test']

# Load Data

## Load labels, locs, years

In [None]:
# Per-example arrays for the LSMS dataset. They are used side by side as columns
# of the prediction DataFrames below, so they are expected to have equal length.
npz = load_npz('../data/lsms_image_hists.npz')
labels, locs, years = (npz[key] for key in ('labels', 'locs', 'years'))
num_examples = len(labels)

## Load LSMS DataFrame

In [None]:
# file that Anne sent to me
LSMS_CSV_PATH = '../data/lsms_labels_agg.csv'

# These columns are all stored as float32.
# NOTE(review): presumably so that (lat, lon) pairs compare equal to the rows of
# `locs` when used as dictionary keys further down - confirm the dtype of `locs`.
FLOAT32_COLS = ['lat', 'lon', 'index', 'year']
lsms_df = pd.read_csv(LSMS_CSV_PATH, float_precision='high').astype(
    {col: np.float32 for col in FLOAT32_COLS})

with pd.option_context('display.max_rows', 8):
    print(lsms_df.dtypes)
    display(lsms_df)

## Load delta pairs

In [None]:
# Each row pairs two examples through its 'index1' and 'index2' columns; the
# "Changes over time" cells subtract the 'index1' row from the 'index2' row.
DELTA_PAIRS_CSV_PATH = '../data/lsms_deltas_pairs.csv'
delta_pairs_df = pd.read_csv(DELTA_PAIRS_CSV_PATH, index_col=False)

with pd.option_context('display.max_rows', 4):
    display(delta_pairs_df)

## Create country labels

In [None]:
# Two-letter country codes (the values of lsms_df['country']) => full country names
CID_TO_C = dict(
    et='ethiopia',
    mw='malawi',
    ng='nigeria',
    tz='tanzania',
    ug='uganda',
)
LSMS_COUNTRIES = dataset_constants.LSMS_COUNTRIES

# Survey years per country; these are the years compared in the
# "Changes over time" sections of this notebook.
country_to_years = dict(
    ethiopia=[2011, 2015],
    malawi=[2010, 2016],
    nigeria=[2010, 2015],
    tanzania=[2008, 2012],
    uganda=[2005, 2009, 2013],
)

# Names of the LSMS surveys, one per (country, year)
CYs = dataset_constants.SURVEY_NAMES['LSMS']

In [None]:
# create mapping: (lat, lon) => full country name
# Every location takes the country code of its first row in lsms_df;
# CID_TO_C then expands that code to the full country name.
country_code_by_loc = lsms_df.groupby(['lat', 'lon']).apply(
    lambda loc_rows: loc_rows['country'].iloc[0])
loc_to_country = country_code_by_loc.map(CID_TO_C).to_dict()

In [None]:
# country_indices: country name => indices of that country's examples
#   (collected as Python lists first, converted to np.arrays in the second loop)
# country_labels: for every example, the position of its country in LSMS_COUNTRIES
country_indices = defaultdict(list)
country_labels = np.zeros(num_examples, dtype=np.int32)

for i, loc in enumerate(locs):
    country = loc_to_country[tuple(loc)]
    country_indices[country].append(i)

for i, country in enumerate(LSMS_COUNTRIES):
    # Force an integer dtype: np.asarray([]) is float64, which NumPy refuses to use
    # as an index, so a country without any examples would otherwise crash here.
    indices = np.asarray(country_indices[country], dtype=np.int64)
    country_indices[country] = indices
    country_labels[indices] = i

In [None]:
# Examples are laid out survey by survey, in the order of CYs, so each survey
# owns one contiguous range of indices whose length is that survey's size.
cy_sizes = [dataset_constants.SIZES['LSMS'][cy] for cy in CYs]
NUM_LSMS = np.sum(cy_sizes)

# Splitting at every cumulative size (including the total) yields one extra,
# empty chunk at the end; zip() below simply ignores it.
cy_indices = np.split(np.arange(NUM_LSMS), np.cumsum(cy_sizes))
CY_INDICES = dict(zip(CYs, cy_indices))
CY_TO_INDEX = {cy: position for position, cy in enumerate(CYs)}

## Load folds

```
folds = {
    'A': { 'test': array([7,   10,   11, ...]),
           'val': array([2,    8,   17, ...]),
           'train': array([0,    1,    3, ...])},
    'B': { 'test': array([   2,    8,   17, ...]),
           ... },
    ...
}
```

In [None]:
# NOTE: unpickling can execute arbitrary code - only load fold files from a trusted source.
FOLDS_PKL_PATH = '../data/lsms_incountry_folds.pkl'
with open(FOLDS_PKL_PATH, 'rb') as folds_file:
    folds = pickle.load(folds_file)

## For Analysis

In [None]:
def calc_diff(df, y1, y2, diff_cols, kind):
    '''Computes the change in `diff_cols` from year `y1` to year `y2`.

    For each year, only the first row of `df` with that year is used.

    Args
    - df: pd.DataFrame, has a 'year' column and all columns in `diff_cols`;
        must contain at least one row for each of `y1` and `y2`
    - y1, y2: years to compare (matched against df['year'] with ==)
    - diff_cols: list of str, names of the columns to difference
    - kind: str, 'raw' for (y2 - y1), 'relative' for (y2 - y1) / |y1|

    Returns: pd.Series indexed by `diff_cols`

    Raises
    - IndexError: if `df` has no row for `y1` or `y2`
    - ValueError: if `kind` is neither 'raw' nor 'relative'
    '''
    in_year1 = df['year'] == y1
    in_year2 = df['year'] == y2
    first = df.loc[in_year1, diff_cols].iloc[0]
    second = df.loc[in_year2, diff_cols].iloc[0]

    if kind not in ('raw', 'relative'):
        raise ValueError('Unknown `kind` argument')

    change = second - first
    return change if kind == 'raw' else change / np.abs(first)

def diff_scatter(diff_df, xlabel=None, title=None):
    '''Draws one predicted-vs-true scatter panel for every entry of MODEL_BANDS.

    Each panel plots diff_df['wealthpooled'] (x-axis) against one model's column
    (y-axis) with a seaborn regression fit. The legend shows the r2 computed by
    `calc_score`, and a black y = x reference line is drawn from -2 to 3.

    Args
    - diff_df: pd.DataFrame, contains columns ['wealthpooled'] + MODEL_BANDS
    - xlabel: str, x-axis label for every panel; not overridden if None
    - title: str, figure title; omitted if None
    '''
    num_panels = len(MODEL_BANDS)
    fig, axs = plt.subplots(
        nrows=1, ncols=num_panels, sharey=True, figsize=[num_panels*2.5, 3])

    for ax, model_bands in zip(axs, MODEL_BANDS):
        true_diff = diff_df['wealthpooled']
        pred_diff = diff_df[model_bands]
        r2 = calc_score(labels=true_diff, preds=pred_diff, metric='r2')
        sns.regplot(x=true_diff, y=pred_diff, ax=ax,
                    label=f'r2: {r2:.2f}', scatter_kws={'s': 3})
        ax.grid(True)
        # identity line: points on it are predicted perfectly
        ax.plot([-2, 3], [-2, 3], color='black')
        ax.set_aspect('equal')
        ax.set_ylabel(model_bands)
        ax.legend()
        if xlabel is not None:
            ax.set_xlabel(xlabel)

    if title is not None:
        fig.suptitle(title, y=1.0)
    fig.tight_layout()

def analyze_changes(preds_df, country, y1, y2, model=None):
    '''Compares predicted and true changes in 'wealthpooled' between two years.

    Keeps only the (lat, lon) locations of `country` that have rows for both `y1`
    and `y2`, computes the (y2 - y1) difference of the label and of every model's
    prediction at each location, and then shows
    1) a bar chart with the percentage of locations where each model's change has
       the same sign as the label's change, plus a 'majority' bar giving the
       percentage of locations that share the most common label sign,
    2) a heatmap of the correlations between the differences,
    3) scatter plots of predicted vs. true differences (see `diff_scatter`).

    Args
    - preds_df: pd.DataFrame, columns ['country', 'lat', 'lon', 'year', 'wealthpooled'] + MODEL_BANDS
    - country: str, a value of preds_df['country']
    - y1, y2: ints, years
    - model: str, optional tag appended to the plot titles, e.g. 'OOC' or 'incountry'
    '''
    grouped = (
        preds_df
        .groupby('country')
        .get_group(country)
        .groupby(['lat', 'lon'])
        .filter(lambda df: (y1 in df['year'].values) and (y2 in df['year'].values))
        .groupby(['lat', 'lon']))

    # diff_raw: pd.DataFrame
    # - columns: ['wealthpooled'] + MODEL_BANDS,
    # - rows: each row corresponds to a (lat, lon)
    # - values: the difference in the label / model's prediction, y2 - y1
    diff_cols = ['wealthpooled'] + MODEL_BANDS
    diff_raw = grouped.apply(lambda df: calc_diff(df, y1, y2, diff_cols, kind='raw'))

    title = f'{country} ({y2}-{y1})'
    if model is not None:
        title += f' {model}'

    # sign (+1, 0, -1) of every change, and how often each label sign occurs
    signs_df = np.sign(diff_raw)
    label_signs = signs_df['wealthpooled']
    _, unique_counts = np.unique(label_signs, return_counts=True)
    num_locs = len(label_signs)

    # percentage match in sign; scaled by 100 so that the values agree with the
    # '%' in the axis label (they were previously plotted as fractions in [0, 1])
    perc_series = 100 * signs_df[MODEL_BANDS].apply(
            lambda col: np.sum(label_signs == col),
            axis=0) / num_locs
    # baseline: always predicting the most common label sign
    perc_series['majority'] = 100 * np.max(unique_counts) / num_locs
    ax = perc_series.plot.barh(width=0.9, grid=True, title=title, figsize=[3, 2])
    ax.set_xlabel('% match in sign')
    plt.show()

    # correlational heatmaps
    xlabels = ['wealthpooled'] + MODEL_BANDS
    symmetric_heatmap(diff_raw.corr().values, labels=xlabels, title=title, format_spec='{:.2f}')

    diff_scatter(diff_raw, title=title + ' raw', xlabel='y2 - y1')

# Vanilla DHS Models - OOC

In [None]:
# load saved preds
# preds = {
#     'ms': np.array, shape [N],
#     'msnl': ...,
#     ...
# }
# `check` passes the reference locs/labels to load_npz alongside the file path.
dhs_preds_dir = os.path.join(LOGS_ROOT_DIR, 'lsms_from_dhs')
preds = {}
for band_name in MODEL_BANDS:
    npz_path = os.path.join(dhs_preds_dir, f'ooc_{band_name}_preds.npz')
    preds[band_name] = load_npz(
        path=npz_path, check=dict(locs=locs, labels=labels))['preds']

In [None]:
# One row per LSMS example: location, survey metadata, true label,
# and the prediction of every model in MODEL_BANDS.
meta_cols = ['lat', 'lon', 'country_year', 'country', 'year', 'wealthpooled']
preds_df = pd.DataFrame(columns=meta_cols + MODEL_BANDS)
preds_df['lat'] = locs[:, 0]
preds_df['lon'] = locs[:, 1]

# Survey names have the form '<country>_<year>'; split on the last underscore
for cy, cy_indices in CY_INDICES.items():
    preds_df.loc[cy_indices, 'country_year'] = cy
    country_name, survey_year = cy.rsplit('_', maxsplit=1)
    preds_df.loc[cy_indices, 'country'] = country_name
    preds_df.loc[cy_indices, 'year'] = int(survey_year)

preds_df['wealthpooled'] = labels
for model_name in MODEL_BANDS:
    preds_df[model_name] = preds[model_name]

# Optional CSV export (BASE_CSV_DIR is not defined in this notebook):
# csv_path = os.path.join(BASE_CSV_DIR, 'lsms_preds_ooc.csv')
# preds_df.to_csv(csv_path, index=False)

with pd.option_context('display.max_rows', 8):
    print(preds_df.dtypes)
    display(preds_df)

In [None]:
# Scores of every model on every survey, indexed by (country_year, model)
ooc_df = pd.DataFrame(
    columns=['country_year', 'model', 'r2', 'R2', 'mse', 'rank'], dtype=np.float64
).set_index(['country_year', 'model'])

for cy in CYs:
    cy_indices = CY_INDICES[cy]
    for model_name in MODEL_BANDS:
        # evaluate() fills the four metric columns of this (survey, model) row
        ooc_df.loc[(cy, model_name), :] = evaluate(
            preds=preds[model_name][cy_indices], labels=labels[cy_indices])

with pd.option_context('display.max_rows', 5):
    display(ooc_df)

In [None]:
display(ooc_df['r2'].unstack(level='model'))
display(ooc_df['R2'].unstack(level='model'))

## Changes over time

In [None]:
# Change between the two examples of every delta pair ('index2' row minus 'index1'
# row), for the true label and for each model's prediction.
idx1 = delta_pairs_df['index1']
idx2 = delta_pairs_df['index2']
bands = MODEL_BANDS + ['wealthpooled']

# reset_index makes the subtraction align by pair position rather than by example index
later_df = preds_df.loc[idx2, bands].reset_index(drop=True)
earlier_df = preds_df.loc[idx1, bands].reset_index(drop=True)
delta_preds_df = (later_df - earlier_df).rename(columns={'wealthpooled': 'label'})

display(evaluate_df(delta_preds_df, cols=MODEL_BANDS))

In [None]:
analyze_changes(preds_df, 'ethiopia', y1=2011, y2=2015, model='OOC')

In [None]:
analyze_changes(preds_df, 'malawi', y1=2010, y2=2016, model='OOC')

In [None]:
analyze_changes(preds_df, 'nigeria', y1=2010, y2=2015, model='OOC')

In [None]:
analyze_changes(preds_df, 'tanzania', y1=2008, y2=2012, model='OOC')

In [None]:
analyze_changes(preds_df, 'uganda', y1=2005, y2=2009, model='OOC')

In [None]:
analyze_changes(preds_df, 'uganda', y1=2005, y2=2013, model='OOC')

In [None]:
analyze_changes(preds_df, 'uganda', y1=2009, y2=2013, model='OOC')

# Models Fine-Tuned on LSMS - OOC

In [None]:
# load saved preds
# preds = {
#     'ms': np.array, shape [N],
#     'msnl': ...,
#     ...
# }
# `check` passes the reference locs/labels to load_npz alongside the file path.
finetune_preds_dir = os.path.join(LOGS_ROOT_DIR, 'lsms_from_dhs_finetune')
preds = {}
for band_name in MODEL_BANDS:
    npz_path = os.path.join(finetune_preds_dir, f'ooc_{band_name}_preds.npz')
    preds[band_name] = load_npz(
        path=npz_path, check=dict(locs=locs, labels=labels))['preds']

In [None]:
# One row per LSMS example: location, survey metadata, true label,
# and the prediction of every model in MODEL_BANDS.
meta_cols = ['lat', 'lon', 'country_year', 'country', 'year', 'wealthpooled']
preds_df = pd.DataFrame(columns=meta_cols + MODEL_BANDS)
preds_df['lat'] = locs[:, 0]
preds_df['lon'] = locs[:, 1]

# Survey names have the form '<country>_<year>'; split on the last underscore
for cy, cy_indices in CY_INDICES.items():
    preds_df.loc[cy_indices, 'country_year'] = cy
    country_name, survey_year = cy.rsplit('_', maxsplit=1)
    preds_df.loc[cy_indices, 'country'] = country_name
    preds_df.loc[cy_indices, 'year'] = int(survey_year)

preds_df['wealthpooled'] = labels
for model_name in MODEL_BANDS:
    preds_df[model_name] = preds[model_name]

# Optional CSV export (BASE_CSV_DIR is not defined in this notebook):
# csv_path = os.path.join(BASE_CSV_DIR, 'lsms_preds_ooc.csv')
# preds_df.to_csv(csv_path, index=False)

with pd.option_context('display.max_rows', 8):
    print(preds_df.dtypes)
    display(preds_df)

In [None]:
# Scores of every model on every survey, indexed by (country_year, model)
ooc_df = pd.DataFrame(
    columns=['country_year', 'model', 'r2', 'R2', 'mse', 'rank'], dtype=np.float64
).set_index(['country_year', 'model'])

for cy in CYs:
    cy_indices = CY_INDICES[cy]
    for model_name in MODEL_BANDS:
        # evaluate() fills the four metric columns of this (survey, model) row
        ooc_df.loc[(cy, model_name), :] = evaluate(
            preds=preds[model_name][cy_indices], labels=labels[cy_indices])

with pd.option_context('display.max_rows', 8):
    display(ooc_df)

In [None]:
r2s_df = ooc_df['r2'].unstack(level='model')
display(r2s_df)

R2s_df = ooc_df['R2'].unstack(level='model')
display(R2s_df)

## Changes over time

In [None]:
# Change between the two examples of every delta pair ('index2' row minus 'index1'
# row), for the true label and for each model's prediction.
idx1 = delta_pairs_df['index1']
idx2 = delta_pairs_df['index2']
bands = MODEL_BANDS + ['wealthpooled']

# reset_index makes the subtraction align by pair position rather than by example index
later_df = preds_df.loc[idx2, bands].reset_index(drop=True)
earlier_df = preds_df.loc[idx1, bands].reset_index(drop=True)
delta_preds_df = (later_df - earlier_df).rename(columns={'wealthpooled': 'label'})

display(evaluate_df(delta_preds_df, cols=MODEL_BANDS))

In [None]:
analyze_changes(preds_df, 'ethiopia', y1=2011, y2=2015, model='OOC')

In [None]:
analyze_changes(preds_df, 'malawi', y1=2010, y2=2016, model='OOC')

In [None]:
analyze_changes(preds_df, 'nigeria', y1=2010, y2=2015, model='OOC')

In [None]:
analyze_changes(preds_df, 'tanzania', y1=2008, y2=2012, model='OOC')

In [None]:
analyze_changes(preds_df, 'uganda', y1=2005, y2=2009, model='OOC')

In [None]:
analyze_changes(preds_df, 'uganda', y1=2005, y2=2013, model='OOC')

In [None]:
analyze_changes(preds_df, 'uganda', y1=2009, y2=2013, model='OOC')

# TODO: Vanilla DHS Models - Incountry
Run each of the 5 incountry models, take the mean prediction

TODO: this will need to be re-run after re-training the non-overlap DHS incountry models

In [None]:
# load labels and locs
# npz_path = os.path.join(LOGS_ROOT_DIR, 'lsms', 'incountry_ms_preds.npz')
# _ = load_npz(npz_path, check_labels=labels, check_locs=locs)

In [None]:
# load saved preds
# preds = {
#     'ms': np.array, shape [M, N] where M = number of folds (ie. 1 row per fold)
#     'msnl': ...,
#     ...
# }
# preds = {}
# for bands in MODEL_BANDS:
#     npz = load_npz(
#         path=os.path.join(LOGS_ROOT_DIR, 'lsms', f'incountry_{bands}_preds.npz'),
#         check_locs=locs, check_labels=labels
#     )
#     preds[bands] = np.stack([npz[f'{bands}_{fold}'] for fold in FOLDS], axis=0)

In [None]:
# mean_preds_df = pd.DataFrame(
#     columns=['lat', 'lon', 'country_year', 'wealthpooled'] + MODEL_BANDS)
# mean_preds_df['lat'] = locs[:, 0]
# mean_preds_df['lon'] = locs[:, 1]

# for cy, cy_indices in CY_INDICES.items():
#     mean_preds_df.loc[cy_indices, 'country_year'] = cy
#     c, y = cy.rsplit('_', maxsplit=1)
#     mean_preds_df.loc[cy_indices, 'country'] = c
#     mean_preds_df.loc[cy_indices, 'year'] = int(y)

# mean_preds_df['wealthpooled'] = labels

# for model_name in MODEL_BANDS:
#     mean_preds_df[model_name] = np.mean(preds[model_name], axis=0)

# csv_path = os.path.join(BASE_CSV_DIR, 'lsms_mean_preds_incountry.csv')
# # mean_preds_df.to_csv(csv_path, index=False)


# median_preds_df = mean_preds_df.copy(deep=True)
# for model_name in MODEL_BANDS:
#     median_preds_df[model_name] = np.median(preds[model_name], axis=0)

# csv_path = os.path.join(BASE_CSV_DIR, 'lsms_median_preds_incountry.csv')
# # median_preds_df.to_csv(csv_path, index=False)

In [None]:
# incountry_df = pd.DataFrame(columns=['country_year', 'model', 'fold', 'metric', 'score'], dtype=np.float64)
# incountry_df.set_index(['country_year', 'model', 'fold', 'metric'], inplace=True)

# for cy in CYs:
#     cy_indices = CY_INDICES[cy]
#     for model_name in MODEL_BANDS:
#         for metric in ['r2', 'R2', 'mse', 'rank']:
#             for j, fold in enumerate(FOLDS):
#                 incountry_df.loc[(cy, model_name, fold, metric)] = calc_score(
#                     preds=preds[model_name][j][cy_indices], labels=labels[cy_indices], metric=metric)
#             incountry_df.loc[(cy, model_name, 'mean', metric)] = calc_score(
#                 preds=np.mean(preds[model_name], axis=0)[cy_indices], labels=labels[cy_indices], metric=metric)
#             incountry_df.loc[(cy, model_name, 'median', metric)] = calc_score(
#                 preds=np.median(preds[model_name], axis=0)[cy_indices], labels=labels[cy_indices], metric=metric)

In [None]:
# with pd.option_context('display.max_rows', 14):
#     display(incountry_df.unstack('metric'))

In [None]:
# display(
#     incountry_df.xs(['r2', 'mean'], level=['metric', 'fold']).unstack(level='model')
# )
# display(
#     incountry_df.xs(['r2', 'median'], level=['metric', 'fold']).unstack(level='model')
# )

In [None]:
# display(
#     incountry_df.xs(['R2', 'mean'], level=['metric', 'fold']).unstack(level='model')
# )
# display(
#     incountry_df.xs(['R2', 'median'], level=['metric', 'fold']).unstack(level='model')
# )

## Changes over time

In [None]:
# analyze_changes(mean_preds_df, 'ethiopia', y1=2011, y2=2015, model='incountry')

In [None]:
# analyze_changes(mean_preds_df, 'malawi', y1=2010, y2=2016, model='incountry')

In [None]:
# analyze_changes(mean_preds_df, 'nigeria', y1=2010, y2=2015, model='incountry')

In [None]:
# analyze_changes(mean_preds_df, 'tanzania', y1=2008, y2=2012, model='incountry')

In [None]:
# analyze_changes(mean_preds_df, 'uganda', y1=2005, y2=2009, model='incountry')

In [None]:
# analyze_changes(mean_preds_df, 'uganda', y1=2005, y2=2013, model='incountry')

In [None]:
# analyze_changes(mean_preds_df, 'uganda', y1=2009, y2=2013, model='incountry')

# TODO: Models Fine-Tuned on LSMS - Incountry
Run each of the 5 incountry models, take the mean prediction

TODO: this will need to be re-run after re-training the non-overlap DHS incountry models

In [None]:
# # load labels and locs
# npz_path = os.path.join(LOGS_ROOT_DIR, 'lsms_ridge', 'incountry_ms_preds.npz')
# _ = load_npz(npz_path, check_labels=labels, check_locs=locs)

In [None]:
# # load saved preds
# # preds = {
# #     'ms': np.array, shape [M, N] where M = number of folds (ie. 1 row per fold)
# #     'msnl': ...,
# #     ...
# # }
# preds = {}
# for bands in MODEL_BANDS:
#     npz = load_npz(
#         path=os.path.join(LOGS_ROOT_DIR, 'lsms_ridge', f'incountry_{bands}_preds.npz'),
#         check_locs=locs, check_labels=labels
#     )
#     preds[bands] = np.stack([npz[f'{bands}_{fold}'] for fold in FOLDS], axis=0)

In [None]:
# mean_preds_df = pd.DataFrame(
#     columns=['lat', 'lon', 'country_year', 'wealthpooled'] + MODEL_BANDS)
# mean_preds_df['lat'] = locs[:, 0]
# mean_preds_df['lon'] = locs[:, 1]

# for cy, cy_indices in CY_INDICES.items():
#     mean_preds_df.loc[cy_indices, 'country_year'] = cy
#     c, y = cy.rsplit('_', maxsplit=1)
#     mean_preds_df.loc[cy_indices, 'country'] = c
#     mean_preds_df.loc[cy_indices, 'year'] = int(y)

# mean_preds_df['wealthpooled'] = labels

# for model_name in MODEL_BANDS:
#     mean_preds_df[model_name] = np.mean(preds[model_name], axis=0)

# csv_path = os.path.join(BASE_CSV_DIR, 'lsms_mean_preds_incountry.csv')
# # mean_preds_df.to_csv(csv_path, index=False)


# median_preds_df = mean_preds_df.copy(deep=True)
# for model_name in MODEL_BANDS:
#     median_preds_df[model_name] = np.median(preds[model_name], axis=0)

# csv_path = os.path.join(BASE_CSV_DIR, 'lsms_median_preds_incountry.csv')
# # median_preds_df.to_csv(csv_path, index=False)

In [None]:
# incountry_df = pd.DataFrame(columns=['country_year', 'model', 'fold', 'metric', 'score'], dtype=np.float64)
# incountry_df.set_index(['country_year', 'model', 'fold', 'metric'], inplace=True)

# for cy in CYs:
#     cy_indices = CY_INDICES[cy]
#     for model_name in MODEL_BANDS:
#         for metric in ['r2', 'R2', 'mse', 'rank']:
#             for j, fold in enumerate(FOLDS):
#                 incountry_df.loc[(cy, model_name, fold, metric)] = calc_score(
#                     preds=preds[model_name][j][cy_indices], labels=labels[cy_indices], metric=metric)
#             incountry_df.loc[(cy, model_name, 'mean', metric)] = calc_score(
#                 preds=np.mean(preds[model_name], axis=0)[cy_indices], labels=labels[cy_indices], metric=metric)
#             incountry_df.loc[(cy, model_name, 'median', metric)] = calc_score(
#                 preds=np.median(preds[model_name], axis=0)[cy_indices], labels=labels[cy_indices], metric=metric)

In [None]:
# with pd.option_context('display.max_rows', 14):
#     display(incountry_df.unstack('metric'))

In [None]:
# display(
#     incountry_df.xs(['r2', 'mean'], level=['metric', 'fold']).unstack(level='model')
# )
# display(
#     incountry_df.xs(['r2', 'median'], level=['metric', 'fold']).unstack(level='model')
# )

In [None]:
# display(
#     incountry_df.xs(['R2', 'mean'], level=['metric', 'fold']).unstack(level='model')
# )
# display(
#     incountry_df.xs(['R2', 'median'], level=['metric', 'fold']).unstack(level='model')
# )

## Changes over time

In [None]:
# analyze_changes(mean_preds_df, 'ethiopia', y1=2011, y2=2015, model='incountry')

In [None]:
# analyze_changes(mean_preds_df, 'malawi', y1=2010, y2=2016, model='incountry')

In [None]:
# analyze_changes(mean_preds_df, 'nigeria', y1=2010, y2=2015, model='incountry')

In [None]:
# analyze_changes(mean_preds_df, 'tanzania', y1=2008, y2=2012, model='incountry')

In [None]:
# analyze_changes(mean_preds_df, 'uganda', y1=2005, y2=2009, model='incountry')

In [None]:
# analyze_changes(mean_preds_df, 'uganda', y1=2005, y2=2013, model='incountry')

In [None]:
# analyze_changes(mean_preds_df, 'uganda', y1=2009, y2=2013, model='incountry')

# Models Trained on LSMS - Incountry
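Load the saved `test_preds.npz` predictions of the incountry models for each band combination (one prediction per example; no averaging over folds is done in this notebook).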

In [None]:
# load saved preds
# preds = {
#     'ms': np.array, shape [num_examples]
#     ...
# }
resnet_logs_dir = os.path.join(LOGS_ROOT_DIR, 'lsms_incountry_resnet')
preds = {
    bands: load_npz(
        path=os.path.join(resnet_logs_dir, bands, 'test_preds.npz'),
        check=dict(locs=locs, labels=labels))['test_preds']
    for bands in MODEL_BANDS
}

In [None]:
# One row per LSMS example: location, survey metadata, true label,
# and the prediction of every model in MODEL_BANDS.
meta_cols = ['lat', 'lon', 'country', 'country_year', 'year', 'wealthpooled']
preds_df = pd.DataFrame(columns=meta_cols + MODEL_BANDS)
preds_df['lat'] = locs[:, 0]
preds_df['lon'] = locs[:, 1]

# Here country and year come from the location lookup and the npz file,
# rather than from parsing the survey name.
for country, member_indices in country_indices.items():
    preds_df.loc[member_indices, 'country'] = country

for cy, survey_indices in CY_INDICES.items():
    preds_df.loc[survey_indices, 'country_year'] = cy

preds_df['wealthpooled'] = labels
preds_df['year'] = years

for model_name in MODEL_BANDS:
    preds_df[model_name] = preds[model_name]

# Optional CSV export (BASE_CSV_DIR is not defined in this notebook):
# csv_path = os.path.join(BASE_CSV_DIR, 'preds_incountry.csv')
# preds_df.to_csv(csv_path, index=False)

In [None]:
with pd.option_context('display.max_rows', 8):
    display(preds_df)

In [None]:
# Scores over all examples pooled together, best r2 first.
# evaluate_df is given the true values under the column name 'label'.
labeled_preds_df = preds_df.rename(columns={'wealthpooled': 'label'})
overall_df = evaluate_df(labeled_preds_df, cols=MODEL_BANDS).sort_values(
    by='r2', ascending=False)
colordisplay(overall_df)

In [None]:
# Scores of every model on every survey, indexed by (country_year, model)
incountry_df = pd.DataFrame(
    columns=['country_year', 'model', 'r2', 'R2', 'mse', 'rank'], dtype=np.float64
).set_index(['country_year', 'model'])

for cy, cy_indices in CY_INDICES.items():
    for model_name in MODEL_BANDS:
        # evaluate() fills the four metric columns of this (survey, model) row
        incountry_df.loc[(cy, model_name), :] = evaluate(
            preds=preds[model_name][cy_indices], labels=labels[cy_indices])

In [None]:
with pd.option_context('display.max_rows', 14):
    display(incountry_df)

In [None]:
display(incountry_df['r2'].unstack(level='model'))
display(incountry_df['R2'].unstack(level='model'))

## Changes over time

In [None]:
# Change between the two examples of every delta pair ('index2' row minus 'index1'
# row), for the true label and for each model's prediction.
idx1 = delta_pairs_df['index1']
idx2 = delta_pairs_df['index2']
bands = MODEL_BANDS + ['wealthpooled']

# reset_index makes the subtraction align by pair position rather than by example index
later_df = preds_df.loc[idx2, bands].reset_index(drop=True)
earlier_df = preds_df.loc[idx1, bands].reset_index(drop=True)
delta_preds_df = (later_df - earlier_df).rename(columns={'wealthpooled': 'label'})

display(evaluate_df(delta_preds_df, cols=MODEL_BANDS))

In [None]:
analyze_changes(preds_df, 'ethiopia', y1=2011, y2=2015, model='incountry')

In [None]:
analyze_changes(preds_df, 'malawi', y1=2010, y2=2016, model='incountry')

In [None]:
analyze_changes(preds_df, 'nigeria', y1=2010, y2=2015, model='incountry')

In [None]:
analyze_changes(preds_df, 'tanzania', y1=2008, y2=2012, model='incountry')

In [None]:
analyze_changes(preds_df, 'uganda', y1=2005, y2=2009, model='incountry')

In [None]:
analyze_changes(preds_df, 'uganda', y1=2005, y2=2013, model='incountry')

In [None]:
analyze_changes(preds_df, 'uganda', y1=2009, y2=2013, model='incountry')