# Remap Local Model Analysis

This notebook analyzes metrics for the local linear model (i.e., a model trained only on the project under analysis) that we are using as of 2024-10-26. The results here may determine whether we want to pursue a global model across all validate projects.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd

from pysal.explore import esda
from pysal.lib import weights

# Show up to 100 rows/columns before pandas truncates a displayed frame
for _display_opt in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_opt, 100)

In [None]:
# Project passed to analyze_project() for the single-project analysis;
# uncomment one of the alternative IDs below to analyze a different project.
project_id = "-O2_J_0_ChfONXhYeRoU"
#project_id = "-NEaR6DbJAbkpYJ_BDCH"  # 70k task project
#project_id = "-O7hFcC2pKTnTh01SGds"

In [None]:
from mapswipe.workflows.project_remap import analyze_project
analysis_results = analyze_project(project_id)

In [None]:
list(analysis_results.keys())

In [None]:
df_agg_moran_w = analysis_results["df_agg_moran_w"]

In [None]:
df_agg_moran_w.head()

In [None]:
def get_top_pct_ids(df, sort_col, id_col, pct):
    """Return the IDs of the rows in the top ``pct`` fraction of ``sort_col``.

    Args:
        df: DataFrame containing ``sort_col`` and ``id_col``.
        sort_col: Column to rank rows by; larger values rank higher.
        id_col: Column whose values identify each row.
        pct: Fraction of rows to keep, e.g. ``0.05`` for the top 5%.

    Returns:
        Set of ``id_col`` values for the ``int(len(df) * pct)`` highest-ranked
        rows. The count is truncated, so it can be zero for small inputs.
    """
    n_top = int(len(df) * pct)
    # Sort NaNs first: with the default (NaNs last) rows that have no score
    # would be the first ones picked up by tail() as the "top" rows.
    ranked = df.sort_values(sort_col, na_position="first")
    return set(ranked.tail(n_top)[id_col].values)

def jaccard(s1, s2):
    """Return the Jaccard similarity ``|s1 & s2| / |s1 | s2|`` of two sets.

    Args:
        s1: First set.
        s2: Second set.

    Returns:
        A float in ``[0, 1]``. Two empty sets are treated as identical and
        yield ``1.0`` instead of raising ``ZeroDivisionError``.
    """
    union = s1 | s2
    if not union:
        # Both sets are empty (e.g. the requested top-N% slice rounded down to
        # zero rows); identical sets have similarity 1.
        return 1.0
    return len(s1 & s2) / len(union)

def jaccard_matrix(df, measure_cols, id_col, pct):
    """Build the symmetric matrix of pairwise Jaccard similarities between
    the top-``pct`` ID sets of each measure column.

    Args:
        df: DataFrame containing ``id_col`` and every column in ``measure_cols``.
        measure_cols: Columns to rank by; row/column ``k`` of the result
            corresponds to ``measure_cols[k]``.
        id_col: Column whose values identify each row.
        pct: Fraction of rows taken from the top of each ranking.

    Returns:
        A ``len(measure_cols) x len(measure_cols)`` float array with ones on
        the diagonal.
    """
    # Rank each measure once up front instead of re-sorting the frame twice
    # for every (i, j) pair.
    top_ids = [get_top_pct_ids(df, col, id_col, pct) for col in measure_cols]
    # A set always has similarity 1 with itself, so only the off-diagonal
    # entries need to be computed.
    jmat = np.eye(len(top_ids))
    for i, ids_i in enumerate(top_ids):
        for j in range(i):
            jmat[i, j] = jmat[j, i] = jaccard(ids_i, top_ids[j])
    return jmat

def jaccard_df(df, measure_cols, id_col, pct):
    """Return the pairwise Jaccard matrix from ``jaccard_matrix`` as a
    DataFrame whose rows and columns are both labelled with ``measure_cols``."""
    return pd.DataFrame(
        jaccard_matrix(df, measure_cols, id_col, pct),
        index=measure_cols,
        columns=measure_cols,
    )

# Project Metrics

In [None]:
df_agg_moran_w["remap_score_uw"].describe()

In [None]:
jaccard_df(df_agg_moran_w, ["0_share", "remap_score_uw", "remap_score", "adjusted_remap_score"], "task_id", 0.05)

In [None]:
jaccard_df(df_agg_moran_w, ["0_share", "remap_score_uw", "remap_score", "adjusted_remap_score"], "task_id", 0.1)

In [None]:
jaccard_df(df_agg_moran_w, ["0_share", "remap_score_uw", "remap_score", "adjusted_remap_score"], "task_id", 0.2)

# Generate 0_share table for multiple projects

This calculates the average `0_share` of the top 5%, 10%, and 20% of tasks when ranked by each of the metrics we build during the workflow. This will (hopefully) demonstrate how each step adds value to the process.

In [None]:
# Projects included in the cross-project table generated below.
project_list = [
    "-O2_J_0_ChfONXhYeRoU",
    "-O7cI0yciePCYL8pXeu1",
    "-O6MPjXrwBGIbLm1pWro",
    "-O7hFcC2pKTnTh01SGds",
    "-NEaR6DbJAbkpYJ_BDCH",
]

In [None]:
from mapswipe.workflows.project_remap import analyze_project

def gen_ranked_zero_share_table(project_ids, metric_cols, id_col, zero_share_col="0_share", pcts=(0.05, 0.1, 0.2)):
    """Tabulate the mean zero-share of the top-ranked tasks for several projects.

    For every project, metric and percentage, the tasks are ranked by the
    metric, the top fraction is selected, and the mean of ``zero_share_col``
    over those tasks is recorded.

    Args:
        project_ids: Project IDs to run ``analyze_project`` on.
        metric_cols: Columns of the ``"df_agg_moran_w"`` result to rank tasks by.
        id_col: Column identifying each task.
        zero_share_col: Column that is averaged over the selected tasks.
        pcts: Fractions of top-ranked tasks to evaluate.

    Returns:
        DataFrame indexed by ("Project ID", "# Tasks", "Top N%") with one
        "Avg % No Responses Across Tasks" column per metric.
    """
    value_label = "Avg % No Responses Across Tasks"
    rows = []
    for project_id in project_ids:
        print(f"Generating {project_id}")
        df_agg_moran_w = analyze_project(project_id)["df_agg_moran_w"]
        for metric_col in metric_cols:
            for pct in pcts:
                ids = get_top_pct_ids(df_agg_moran_w, metric_col, id_col, pct)
                in_top = df_agg_moran_w[id_col].isin(ids)
                # Average the zero-share column, not the ranking metric itself:
                # the table compares how well each metric surfaces tasks with
                # a high zero-share.
                mean_0_share = df_agg_moran_w.loc[in_top, zero_share_col].mean()
                rows.append({
                    "Project ID": project_id,
                    "Top N%": f"{int(round(pct * 100))}%",
                    "# Tasks": len(ids),
                    "Metric": metric_col,
                    value_label: mean_0_share,
                })
    # Build the long-form frame once from all records, then pivot the metrics
    # into columns.
    df = pd.DataFrame(rows)
    df = df.pivot(columns=["Metric"], index=["Project ID", "# Tasks", "Top N%"], values=[value_label])
    # TODO reorder Metric columns to reflect the actual progression of the workflow
    return df

In [None]:
df_projs = gen_ranked_zero_share_table(project_list, ["remap_score_uw", "remap_score", "adjusted_remap_score"], "task_id")

In [None]:
df_projs

In [None]:
df_projs.index

In [None]:
# Display order of the projects in the final table (smallest to largest by
# task count when this was written).
row_project_order = [
    '-O7cI0yciePCYL8pXeu1',
    '-O7hFcC2pKTnTh01SGds',
    '-O2_J_0_ChfONXhYeRoU',
    '-O6MPjXrwBGIbLm1pWro',
    # '-NEaR6DbJAbkpYJ_BDCH',
]
# Take the (Project ID, # Tasks, Top N%) keys from df_projs itself instead of
# hard-coding the task counts: a stale count would match no existing row and
# reindex() would silently insert an all-NaN row for it.
row_index = pd.MultiIndex.from_tuples(
    sorted(
        (key for key in df_projs.index if key[0] in row_project_order),
        # Order by project, then numerically by percentage (5% < 10% < 20%)
        key=lambda key: (row_project_order.index(key[0]), int(key[2].rstrip("%"))),
    ),
    names=df_projs.index.names
)
row_index

In [None]:
sorted(df_projs.index)

In [None]:
list(df_projs.columns)

In [None]:
# Metric columns in the order they should appear in the final table; every
# column shares the same top-level value label.
col_index = pd.MultiIndex.from_product(
    [
        ['Avg % No Responses Across Tasks'],
        ['remap_score_uw', 'remap_score', 'adjusted_remap_score'],
    ],
    names=df_projs.columns.names
)
col_index

In [None]:
# Reorder rows and columns of a copy of the table; labels missing from
# row_index/col_index are dropped by reindex().
df_projs_edit = (
    df_projs.copy()
    .reindex(index=row_index)
    .reindex(columns=col_index)
)

In [None]:
df_projs_edit

In [None]:
# "Project ID" is an index level of the pivoted table, not a column, so the
# filter has to be built from the index (df_projs_edit["Project ID"] raises KeyError).
df_projs_edit[df_projs_edit.index.get_level_values("Project ID").isin(["-O7cI0yciePCYL8pXeu1", "-O7hFcC2pKTnTh01SGds", "-O2_J_0_ChfONXhYeRoU", "-O6MPjXrwBGIbLm1pWro"])]

In [None]:
# NOTE(review): df_projs is returned by gen_ranked_zero_share_table(), which
# already applies this exact pivot, so "Metric", "Project ID", "# Tasks" and
# "Top N%" are no longer plain columns here and this call is expected to fail.
# Looks like a leftover from before the pivot moved into the function — confirm and remove.
df_projs.pivot(columns=["Metric"], index=["Project ID", "# Tasks", "Top N%"], values=["Avg % No Responses Across Tasks"])