# Metrics Across Validate Projects
**TBD add details**

# Setup

In [None]:
import pandas as pd
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [None]:
# TBD

# Metrics for all projects

## Calculate the metrics

There are 74 projects scoped for this analysis out of 88 total validate projects. The remaining 14 were excluded:
* 6 not finished yet (inactive or active)
* 1 whose geoms were squares instead of building footprints - https://download.geoservice.dlr.de/WSF2019/
* 7 whose data couldn't be downloaded

In [None]:
from mapswipe.data import read_scoped_projects_list, get_project_data

# Load the scoped project list and keep the project ids as a plain list for
# the per-project lookups in later cells.
df_projects = read_scoped_projects_list()
validate_projects = df_projects["project_id"].tolist()

# Cell output: number of projects per (project_type, status) combination.
df_projects.value_counts(subset=["project_type", "status"]).sort_index()

In [None]:
import diskcache
from mapswipe.data import CACHE_PATH, CACHE_SIZE

# Read the per-project data dictionary from the on-disk cache. The entry must
# already exist under the "all_proj_data" key; a missing key raises KeyError.
with diskcache.Cache(directory=CACHE_PATH, size_limit=CACHE_SIZE) as project_cache:
    all_proj_data = project_cache["all_proj_data"]

In [None]:
list(all_proj_data.keys())[50]

In [None]:
project_id = "-MxuKEABaIRO1bvsDGpM"
data = all_proj_data[project_id]

In [None]:
df_projects[df_projects["project_id"] == project_id]

In [None]:
def has_offset(row):
    """Return True if any task in the row's project has a positive "3_count".

    Args:
        row: a row of ``df_projects``; only ``row["project_id"]`` is read.

    Returns:
        bool: False when the project has no aggregated table (``agg`` is None)
        or when every "3_count" value is missing or zero.

    Note:
        "3_count" is presumably the per-task number of "offset" answers;
        confirm against the aggregation code.
    """
    agg = all_proj_data[row["project_id"]]["agg"]
    if agg is None:
        # Some projects have no aggregated table (the task-level cell skips
        # them too), so treat them as "no offset answers".
        return False
    # Compare explicitly instead of using built-in any(): NaN is truthy in
    # Python, so a missing count would otherwise be reported as an offset.
    return bool(agg["3_count"].fillna(0).gt(0).any())

df_projects["has_offset"] = df_projects.apply(has_offset, axis=1)
df_projects["has_offset"].value_counts()

`custom_options` always shows offset as an option in the project, even for projects where nobody has used it.

In [None]:
df_projects[["has_offset", "custom_options"]].value_counts()

## User Stats

In [None]:
df_full_all = pd.concat([all_proj_data[project_id]["full"] for project_id in validate_projects])

### Who is the validate userbase?

In [None]:
# One row per user: number of distinct projects touched, first/last activity
# (floored to the minute) and the inclusive number of days between the two.
df_user_proj = (
    df_full_all.drop_duplicates()
    .groupby("user_id")
    .agg(
        involved_project_count=("project_id", "nunique"),
        first_seen=("timestamp", "min"),
        last_seen=("timestamp", "max"),
    )
    .assign(
        first_seen=lambda users: pd.to_datetime(users["first_seen"], format="mixed").dt.floor("min"),
        last_seen=lambda users: pd.to_datetime(users["last_seen"], format="mixed").dt.floor("min"),
        # +1 so that a user whose first and last activity fall within the same
        # 24 hours gets a tenure of one day rather than zero.
        tenure_days=lambda users: (users["last_seen"] - users["first_seen"]).apply(lambda span: span.days + 1),
    )
)

Almost 27,000 users have contributed to validate projects

In [None]:
df_user_proj[["involved_project_count", "tenure_days"]].describe()

In [None]:
df_user_proj.head()

The userbase follows a power law distribution - a small number of very involved power users, and a large number of users who worked on a single project for one day

In [None]:
df_user_proj.reset_index().sort_values("involved_project_count", ascending=False).plot(x="user_id", y="involved_project_count").get_xaxis().set_visible(False)

In [None]:
df_user_proj.reset_index().sort_values("tenure_days", ascending=False).plot(x="user_id", y="tenure_days").get_xaxis().set_visible(False)

### Power users

These are your most engaged users.

**Policy question - should these users' contributions have more weight than less involved users?**

In [None]:
df_user_proj[df_user_proj["involved_project_count"] > 20]

### Project-level user involvement

In [None]:
# One row per project: number of distinct contributors, first/last recorded
# timestamp (left as loaded) and the inclusive duration in days.
df_proj_user = (
    df_full_all.drop_duplicates()
    .groupby("project_id")
    .agg(
        user_count=("user_id", "nunique"),
        first_seen=("timestamp", "min"),
        last_seen=("timestamp", "max"),
    )
    .assign(
        # +1 so that a project active within a single 24-hour span counts as one day.
        duration_days=lambda projects: (
            pd.to_datetime(projects["last_seen"], format="mixed")
            - pd.to_datetime(projects["first_seen"], format="mixed")
        ).apply(lambda span: span.days + 1)
    )
)

In [None]:
df_proj_user.describe()

In [None]:
df_proj_user.plot.scatter(x="user_count", y="duration_days")

## Task-Level Stats

In [None]:
def agg_all(project_id, df):
    """Return a copy of a project's aggregated task table tagged with its id.

    Args:
        project_id: value written to the ``project_id`` column of every row.
        df: the project's aggregated task table (``all_proj_data[...]["agg"]``).

    Returns:
        A new frame with a ``project_id`` column. ``DataFrame.assign`` is used
        instead of item assignment so the cached table held in
        ``all_proj_data`` is not modified as a side effect.
    """
    return df.assign(project_id=project_id)

# Stack the aggregated tables of every scoped project that has one.
df_agg_all = pd.concat([
    agg_all(project_id, all_proj_data[project_id]["agg"])
    for project_id in validate_projects
    if all_proj_data[project_id]["agg"] is not None
])
df_agg_all = df_agg_all.drop("idx", axis=1).set_index(["project_id", "task_id"])

# Per-task contributor count and first/last timestamp, taken from the raw results.
df_full_user = df_full_all.groupby(["project_id", "task_id"]).agg(
    user_count=("user_id", "nunique"),
    first_seen=("timestamp", "min"),
    last_seen=("timestamp", "max"),
)
# Index-on-index join on (project_id, task_id); tasks without raw results keep NaN.
df_agg_all = df_agg_all.join(df_full_user).reset_index()
del df_full_user


import h3

# H3 resolution used to bucket tasks into hexagons.
H3_RESOLUTION = 8
# Minimum number of distinct tasks in a hexagon for it to count as urban.
# TODO fine-tune this
URBAN_MIN_BUILDINGS = 150

# h3-py 4.x renamed geo_to_h3 to latlng_to_cell (same lat, lng, resolution
# arguments). Resolve the function once, outside the try block in to_h3, so a
# missing API fails loudly instead of silently turning every hex into None.
_h3_index = getattr(h3, "latlng_to_cell", None) or h3.geo_to_h3

def to_h3(row, resolution):
    """Return the H3 cell id of a point inside the row's geometry.

    Args:
        row: a row with a ``geometry`` attribute exposing
            ``representative_point()``.
        resolution: H3 resolution passed to the indexer.

    Returns:
        The H3 cell id, or None when the point's coordinates cannot be read
        or indexed.
    """
    rp = row.geometry.representative_point()
    try:
        return _h3_index(rp.y, rp.x, resolution)
    except Exception:
        # Best-effort: a task that cannot be indexed gets no hexagon.
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        return None

df_agg_all["h3_hex"] = df_agg_all.apply(to_h3, axis=1, resolution=H3_RESOLUTION)
df_h3_count = (
    df_agg_all[["h3_hex", "task_id"]]
    .groupby("h3_hex")
    .nunique()
    .rename({"task_id": "h3_building_count"}, axis=1)
    .reset_index()
)
# Inner merge: groupby drops null keys, so tasks whose h3_hex is None have no
# match in df_h3_count and are removed from df_agg_all here.
df_agg_all = df_agg_all.merge(df_h3_count, on="h3_hex")

df_agg_all["is_urban"] = (df_agg_all["h3_building_count"] >= URBAN_MIN_BUILDINGS)

In [None]:
df_h3_count.describe()

### Task Basics

In [None]:
len(df_agg_all)

This is the distribution of the number of users who have seen each individual task

In [None]:
with pd.option_context("display.float_format", lambda x: "%.3f" % x):
    print(df_agg_all["user_count"].describe())

In [None]:
df_agg_all["is_urban"].describe()

In [None]:
with pd.option_context("display.float_format", lambda x: "%.3f" % x):
    print(df_agg_all[["correct_score", "nearby_building_count", "building_area_m2", "user_count"]].describe())

These measures aren't very useful

### Correlation across metrics

In [None]:
from scipy.stats import pearsonr, spearmanr
print(pearsonr(df_agg_all["correct_score"], df_agg_all["nearby_building_count"]))
print(spearmanr(df_agg_all["correct_score"], df_agg_all["nearby_building_count"]))

In [None]:
from scipy.stats import pearsonr, spearmanr

def calc_corr_df(df_input, corr_cols, target_col):
    """Correlate each column in ``corr_cols`` with ``target_col``, split by ``is_urban``.

    Args:
        df_input: frame containing a boolean ``is_urban`` column, every column
            in ``corr_cols`` and ``target_col``.
        corr_cols: names of the columns to correlate against the target.
        target_col: name of the target column.

    Returns:
        DataFrame with one row per (is_urban, column) pair, non-urban rows
        first, holding the Pearson and Spearman statistic and p-value.
        Rows where either value is missing are dropped pairwise before the
        correlation is computed. When fewer than two complete pairs remain
        (for example, the input has no urban rows) the four statistics are NaN
        instead of raising.
    """
    nan = float("nan")
    corr_rows = []

    for is_urban in (False, True):
        df = df_input[df_input["is_urban"] == is_urban]
        for c in corr_cols:
            # Pairwise deletion: a single NaN would otherwise make the
            # statistics NaN for the whole subgroup.
            valid = df[c].notna() & df[target_col].notna()
            x = df.loc[valid, c]
            y = df.loc[valid, target_col]
            if len(x) >= 2:
                p = pearsonr(x, y)
                s = spearmanr(x, y)
                stats = (p.statistic, p.pvalue, s.statistic, s.pvalue)
            else:
                # pearsonr requires at least two observations.
                stats = (nan, nan, nan, nan)
            corr_rows.append({
                "col1_name": c,
                "col2_name": target_col,
                "is_urban": is_urban,
                "pearson_stat": stats[0],
                "pearson_pval": stats[1],
                "spearman_stat": stats[2],
                "spearman_pval": stats[3],
            })
    return pd.DataFrame(corr_rows)

In [None]:
from scipy.stats import pearsonr, spearmanr

# NOTE: this cell repeats the calc_corr_df definition from the previous cell.
# Being the later one, it is the definition in effect; consider deleting one
# of the two cells.
def calc_corr_df(df_input, corr_cols, target_col):
    """Correlate each column in ``corr_cols`` with ``target_col``, split by ``is_urban``.

    Args:
        df_input: frame containing a boolean ``is_urban`` column, every column
            in ``corr_cols`` and ``target_col``.
        corr_cols: names of the columns to correlate against the target.
        target_col: name of the target column.

    Returns:
        DataFrame with one row per (is_urban, column) pair, non-urban rows
        first, holding the Pearson and Spearman statistic and p-value.
        Rows where either value is missing are dropped pairwise before the
        correlation is computed. When fewer than two complete pairs remain
        (for example, the input has no urban rows) the four statistics are NaN
        instead of raising.
    """
    nan = float("nan")
    corr_rows = []

    for is_urban in (False, True):
        df = df_input[df_input["is_urban"] == is_urban]
        for c in corr_cols:
            # Pairwise deletion: a single NaN would otherwise make the
            # statistics NaN for the whole subgroup.
            valid = df[c].notna() & df[target_col].notna()
            x = df.loc[valid, c]
            y = df.loc[valid, target_col]
            if len(x) >= 2:
                p = pearsonr(x, y)
                s = spearmanr(x, y)
                stats = (p.statistic, p.pvalue, s.statistic, s.pvalue)
            else:
                # pearsonr requires at least two observations.
                stats = (nan, nan, nan, nan)
            corr_rows.append({
                "col1_name": c,
                "col2_name": target_col,
                "is_urban": is_urban,
                "pearson_stat": stats[0],
                "pearson_pval": stats[1],
                "spearman_stat": stats[2],
                "spearman_pval": stats[3],
            })
    return pd.DataFrame(corr_rows)

Looks like there's some mild negative correlation between the correct_score and the number of users involved in the task. The more users, the less certainty about the correctness of the footprint. Interestingly, we don't see the same correlation between user_count and agreement.

In [None]:
df_corr_score = calc_corr_df(df_agg_all, ["nearby_building_count", "h3_building_count", "building_area_m2", "user_count"], "correct_score")
df_corr_score

In [None]:
# Correlate the task metrics with "agreement", using only tasks that have an
# agreement value.
df_corr_agree = calc_corr_df(
    df_agg_all[df_agg_all["agreement"].notna()],
    ["nearby_building_count", "h3_building_count", "building_area_m2", "user_count"],
    "agreement",
)
df_corr_agree

There's maybe a slight correlation between adding users and the number of "not sure" responses, which somewhat reinforces my suspicion that the slate of answers doesn't capture enough nuance for a clearer signal and users are picking "not sure" in those situations.

In [None]:
# Correlate the task metrics with "2_share", using only tasks that have a
# "2_share" value.
df_corr_unsure = calc_corr_df(
    df_agg_all[df_agg_all["2_share"].notna()],
    ["nearby_building_count", "h3_building_count", "building_area_m2", "user_count"],
    "2_share",
)
df_corr_unsure

Which projects contain tasks with 500+ nearby buildings?

In [None]:
import folium
import branca.colormap as cm

def create_task_map(gdf, center_pt=None, color_col="1_share", zoom_start=8):
    """Build a folium map of task geometries coloured by one numeric column.

    Args:
        gdf: GeoDataFrame of tasks; must contain ``color_col``. A ``lastEdit``
            column, if present, is dropped before serialising (presumably
            because it is not JSON-serialisable; confirm).
        center_pt: point with ``.x`` (longitude) and ``.y`` (latitude) used as
            the map centre. Defaults to the centroid of the dissolved
            geometries, computed in the estimated UTM CRS and converted back
            to EPSG:4326.
        color_col: column whose values drive the fill colour; the colour scale
            spans the column's min to max.
        zoom_start: initial zoom level passed to ``folium.Map``.

    Returns:
        folium.Map with the GeoJSON layer and the colour legend added.
    """
    # errors="ignore" so frames without a 'lastEdit' column can be mapped too.
    geojson_data = gdf.drop(columns='lastEdit', errors='ignore').to_json()

    if center_pt is None:
        # dissolve() yields a single row; take the point itself so .x/.y are
        # plain floats rather than one-element Series.
        center_pt = (
            gdf.to_crs(gdf.estimate_utm_crs())
            .dissolve()
            .centroid
            .to_crs(4326)
            .iloc[0]
        )
    # Named task_map rather than map to avoid shadowing the builtin.
    task_map = folium.Map(location=[center_pt.y, center_pt.x], zoom_start=zoom_start)
    # Render through the parent figure with a fixed notebook display size.
    task_map._repr_html_ = lambda: task_map._parent._repr_html_(
        include_link=False, width='75%', height='400px'
    )

    colormap = cm.linear.YlOrRd_09.scale(gdf[color_col].min(), gdf[color_col].max())
    # Fill used for features whose colour value is null in the GeoJSON.
    missing_fill = '#808080'

    def style_function(feature):
        value = feature['properties'][color_col]
        return {
            # The colormap cannot compare None against its bounds, so features
            # without a value get a neutral grey instead of raising.
            'fillColor': colormap(value) if value is not None else missing_fill,
            'color': 'black',
            'weight': 0.5,
            'fillOpacity': 0.8
        }

    folium.GeoJson(
        geojson_data,
        style_function=style_function,
        name="geojson"
    ).add_to(task_map)

    colormap.add_to(task_map)

    return task_map

In [None]:
df_agg_all[df_agg_all["nearby_building_count"] >= 500]["project_id"].drop_duplicates().head()

In [None]:
df_projects[df_projects["project_id"].isin(df_agg_all[df_agg_all["nearby_building_count"] >= 500]["project_id"].drop_duplicates().head())]

Which projects have large buildings?

In [None]:
df_agg_all[df_agg_all["building_area_m2"] > 10000.0][["project_id", "building_area_m2"]].groupby("project_id").sum().sort_values("building_area_m2")

In [None]:
project_id = "-NEaR6DbJAbkpYJ_BDCH"
#create_task_map(all_proj_data[project_id]["agg"])
create_task_map(df_agg_all[df_agg_all["project_id"] == project_id].replace({"is_urban": {True: 1.0, False: 0.0}}), color_col="is_urban")

In [None]:
df_agg_all[df_agg_all["project_id"] == project_id]["h3_hex"].value_counts()