In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
# Allow up to 100 rows and 100 columns before pandas truncates a displayed frame.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

# Load project data

In [None]:
from mapswipe.data import get_project_data, read_scoped_projects_list, CACHE_PATH, CACHE_SIZE
# Read the scoped project list and report how many projects it holds.
df_projects = read_scoped_projects_list()
print(len(df_projects))

# Ids of every project whose project_type is 2; later cells iterate over this list.
validate_projects = list(
    df_projects.loc[df_projects["project_type"] == 2, "project_id"]
)

In [None]:
# Number of projects for each (project_type, status) pair, ordered by that pair.
df_projects.value_counts(subset=["project_type", "status"]).sort_index()

## Inspect project summaries

In [None]:
# Summary statistics restricted to projects with project_type == 2.
df_projects.loc[df_projects["project_type"] == 2].describe()

In [None]:
# How many projects remain once project -MRL3frZWPOCR94ehFnp is excluded.
int((~df_projects["project_id"].isin(["-MRL3frZWPOCR94ehFnp"])).sum())

In [None]:
# project_details of the last type-2 project after an ascending sort by area_sqkm.
(
    df_projects[df_projects["project_type"] == 2]
    .sort_values("area_sqkm")["project_details"]
    .iat[-1]
)

In [None]:
# Last five type-2 projects after an ascending sort by area_sqkm.
df_projects[df_projects["project_type"] == 2].sort_values("area_sqkm").iloc[-5:]

In [None]:
# Summary row(s) for project -MRL3frZWPOCR94ehFnp, the same id excluded from the count earlier in this section.
df_projects[df_projects["project_id"] == "-MRL3frZWPOCR94ehFnp"]

# Load an individual project
This uses the disk cache, so make sure `mapswipe.data.CACHE_PATH` is set correctly.

In [None]:
# Load one project's data by id; the result is indexed by key in the cells below (data["agg"]).
data = get_project_data("-NQz3e_OZzUvi22pL0Ul")

In [None]:
# Last rows of the project's "agg" table.
data["agg"].tail()

In [None]:
# Summary statistics for the project's "agg" table.
data["agg"].describe()

## Seed the cache

In [None]:
# Load every project in validate_projects once; the return value is discarded,
# the point is only to trigger the load for each id.
for project_id in validate_projects:
    try:
        get_project_data(project_id)
    except Exception:
        # Name the failing project, then re-raise so the original traceback is kept.
        # Catching Exception rather than using a bare except keeps a manual kernel
        # interrupt (KeyboardInterrupt) from being reported as a load error.
        print(f"Error getting {project_id}")
        raise

# Spatial Correlation

In [None]:
from functools import partial
from splot.esda import plot_moran
from mapswipe.project_stats import calc_moran_for_knn, safe_calc_moran, calc_moran_for_dist, calc_moran_for_dist_debug, calc_moran_for_dist_weighted

## Debugging tools

In [None]:
# Project used by the single-project debugging cells that follow.
# Note: this rebinds both project_id and data, which earlier cells also assign.
project_id = "-NQz3e_OZzUvi22pL0Ul"

data = get_project_data(project_id)

In [None]:
# First rows of the debug project's "agg" table.
data["agg"].head()

In [None]:
# Summary statistics for the debug project's "agg" table.
data["agg"].describe()

In [None]:
# Bind the column name and k values, then run the calculation for the debug project.
# NOTE(review): the name says "agree" but col_name is "correct_score" — confirm which column is intended.
agree_k_func = partial(
    calc_moran_for_knn,
    col_name="correct_score",
    k_vals=(1, 2, 3, 5, 10, 15),
)
df_moran_agree = safe_calc_moran(project_id, agree_k_func)

Scree plot for a single project

In [None]:
# Bind the column name and distance values, then run the calculation for the debug project.
correct_dist_func = partial(
    calc_moran_for_dist,
    col_name="correct_score",
    dist_vals=(5.0, 10.0, 20.0, 25.0, 50.0, 75.0, 100.0, 200.0, 300.0),
)
df_moran_dist_correct = safe_calc_moran(project_id, correct_dist_func)

In [None]:
import matplotlib.pyplot as plt

# moran_i against dist for the single debug project. Axis labels and a title
# are set so the figure can be read on its own; the trailing semicolon keeps
# the notebook from echoing the return value of the last call.
plt.plot(df_moran_dist_correct["dist"], df_moran_dist_correct["moran_i"], "ro-", linewidth=2)
plt.xlabel("dist")
plt.ylabel("moran_i")
plt.title("correct_score: moran_i by dist");

## agreement / KNN

In [None]:
import pandas as pd
import multiprocessing as mp
from functools import partial

# calc_moran_for_knn bound to the "agreement" column and the k values below.
agree_k_func = partial(
    calc_moran_for_knn,
    col_name="agreement",
    k_vals=(1, 2, 3, 5, 10, 15),
)

# One (project_id, calculator) argument pair per project, spread over 8 worker processes.
map_args = [(p, agree_k_func) for p in validate_projects]
with mp.Pool(processes=8) as pool:
    moran_tables = pool.starmap(safe_calc_moran, map_args)

# Drop None results before stacking the per-project tables into one frame.
moran_tables = [t for t in moran_tables if t is not None]
df_moran_agreement = pd.concat(moran_tables)

In [None]:
# Distribution of moran_i over the concatenated per-project results, one box per k.
df_moran_agreement.boxplot(column="moran_i", by="k")

## correct_score / KNN

In [None]:
import pandas as pd
import multiprocessing as mp
from functools import partial

# calc_moran_for_knn bound to the "correct_score" column and the k values below.
correct_k_func = partial(
    calc_moran_for_knn,
    col_name="correct_score",
    k_vals=(1, 2, 3, 5, 10, 15),
)

# One (project_id, calculator) argument pair per project, spread over 8 worker processes.
map_args = [(p, correct_k_func) for p in validate_projects]
with mp.Pool(processes=8) as pool:
    moran_tables = pool.starmap(safe_calc_moran, map_args)

# Drop None results before stacking the per-project tables into one frame.
moran_tables = [t for t in moran_tables if t is not None]
df_moran_correct_score = pd.concat(moran_tables)

In [None]:
# Distribution of moran_i over the concatenated per-project results, one box per k.
df_moran_correct_score.boxplot(column="moran_i", by="k")

## correct_score / DistanceBand
Calculating Moran's I by distance is very memory-intensive, so keep the parallelism low. This takes a long time for 80+ projects.

In [None]:
import pandas as pd
import multiprocessing as mp
from functools import partial

# calc_moran_for_dist bound to the "correct_score" column and the distance values below.
correct_dist_func = partial(
    calc_moran_for_dist,
    col_name="correct_score",
    dist_vals=(5.0, 10.0, 20.0, 25.0, 50.0, 75.0, 100.0, 200.0, 300.0),
)

# A single worker process: projects are handled one at a time.
map_args = [(p, correct_dist_func) for p in validate_projects]
with mp.Pool(processes=1) as pool:
    moran_tables = pool.starmap(safe_calc_moran, map_args)

# Drop None results before stacking the per-project tables into one frame.
moran_tables = [t for t in moran_tables if t is not None]
df_moran_correct_dist = pd.concat(moran_tables)

In [None]:
# One box per dist value; the y-axis upper limit is pinned to 1.0, so larger moran_i values fall outside the view.
df_moran_correct_dist.boxplot(column="moran_i", by="dist").set_ylim(top=1.0)

In [None]:
# First rows whose moran_i is at least 2.0.
df_moran_correct_dist[df_moran_correct_dist["moran_i"] >= 2.0].head()

In [None]:
# All distance-band rows for project -NC5JLsuWA0V6nbNfbRo, which is loaded again in the Visual Inspection section.
df_moran_correct_dist[df_moran_correct_dist["project_id"] == "-NC5JLsuWA0V6nbNfbRo"]

In [None]:
import pandas as pd
import multiprocessing as mp
from functools import partial

# calc_moran_for_dist_weighted bound to the "correct_score" column and the distance values below.
correct_dist_w_func = partial(
    calc_moran_for_dist_weighted,
    col_name="correct_score",
    dist_vals=(5.0, 10.0, 20.0, 25.0, 50.0, 75.0, 100.0, 200.0, 300.0),
)

# A single worker process: projects are handled one at a time.
map_args = [(p, correct_dist_w_func) for p in validate_projects]
with mp.Pool(processes=1) as pool:
    moran_tables = pool.starmap(safe_calc_moran, map_args)

# Drop None results before stacking the per-project tables into one frame.
moran_tables = [t for t in moran_tables if t is not None]
df_moran_correct_dist_w = pd.concat(moran_tables)

In [None]:
# One box per dist value; the y-axis upper limit is pinned to 1.0, so larger moran_i values fall outside the view.
df_moran_correct_dist_w.boxplot(column="moran_i", by="dist").set_ylim(top=1.0)

# Population Statistics

# Visual Inspection

In [None]:
import folium
import branca.colormap as cm

def create_task_map(gdf, center_pt=None):
    """Draw the geometries in ``gdf`` on a folium map, filled by their ``1_share`` value.

    Parameters
    ----------
    gdf : GeoDataFrame
        Geometries to draw. Must have a ``1_share`` column. A ``lastEdit``
        column, if present, is dropped before the frame is serialised to
        GeoJSON (presumably because its values do not serialise — TODO confirm).
    center_pt : optional
        Object with scalar ``.x`` and ``.y`` attributes (e.g. a shapely Point),
        used as the initial map location as ``[y, x]``. When omitted, the
        centroid of the dissolved geometries is used, computed in the
        estimated UTM CRS and converted to EPSG:4326.

    Returns
    -------
    folium.Map
        The map with the GeoJSON layer and the colour scale added.
    """
    # errors="ignore" lets frames without a lastEdit column be mapped as well.
    geojson_data = gdf.drop(columns="lastEdit", errors="ignore").to_json()

    if center_pt is None:
        projected = gdf.to_crs(gdf.estimate_utm_crs())
        # .centroid yields a one-row GeoSeries; take its single Point so that
        # .x / .y below are plain scalars rather than Series.
        center_pt = projected.dissolve().centroid.to_crs(4326).iloc[0]

    # Named task_map so the builtin map() is not shadowed.
    task_map = folium.Map(location=[center_pt.y, center_pt.x], zoom_start=8)
    # Replace the notebook HTML repr with a call to the parent figure's repr,
    # passing these display options through.
    task_map._repr_html_ = lambda: task_map._parent._repr_html_(
        include_link=False, width='75%', height='400px'
    )

    # Colour scale stretched over the observed range of 1_share.
    colormap = cm.linear.YlOrRd_09.scale(gdf["1_share"].min(), gdf["1_share"].max())

    def style_function(feature):
        """Return the folium style dict for one GeoJSON feature."""
        return {
            'fillColor': colormap(feature['properties']['1_share']),
            'color': 'black',
            'weight': 0.5,
            'fillOpacity': 0.8
        }

    folium.GeoJson(
        geojson_data,
        style_function=style_function,
        name="geojson"
    ).add_to(task_map)

    colormap.add_to(task_map)

    return task_map

## Investigate a correct_score / DistanceBand outlier

In [None]:
# Load the outlier project, run the debug variant of the distance calculation on its
# "agg" table for correct_score with 25.0 as the third argument (presumably the
# distance threshold — confirm against calc_moran_for_dist_debug), and plot the
# result with splot's plot_moran.
correct_dist_data = get_project_data("-NC5JLsuWA0V6nbNfbRo")
correct_dist_moran = calc_moran_for_dist_debug(correct_dist_data["agg"], "correct_score", 25.0)
plot_moran(correct_dist_moran)

In [None]:
# Map the outlier project's tasks; no center_pt is passed, so the map is centred on the data's centroid.
create_task_map(correct_dist_data["agg"])

In [None]:
# First 20 rows with moran_i >= 0.8.
# NOTE(review): this filters the KNN table (df_moran_correct_score) although the section
# is about the DistanceBand outlier — confirm df_moran_correct_dist was not intended.
df_moran_correct_score[df_moran_correct_score["moran_i"] >= 0.8].head(20)

In [None]:
# Summary statistics for the outlier project's "agg" table.
correct_dist_data["agg"].describe()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import pearsonr, spearmanr

# Correlate correct_score with nearby_building_count three ways, printing each result in turn:
# numpy's correlation matrix, then scipy's Pearson and Spearman tests.
for corr_func in (np.corrcoef, pearsonr, spearmanr):
    print(corr_func(correct_dist_data["agg"]["correct_score"], correct_dist_data["agg"]["nearby_building_count"]))

## Investigate a weird outlier
This isn't a typical project; it's based on some AI-generated data:
https://download.geoservice.dlr.de/WSF2019/

In [None]:
# Load the data for project -MRL3frZWPOCR94ehFnp.
weird_data = get_project_data("-MRL3frZWPOCR94ehFnp")

In [None]:
# UTM CRS estimated for this project's geometries.
weird_data["agg"].estimate_utm_crs()

In [None]:
# Centroid of all geometries dissolved into one, computed in the estimated UTM CRS and converted back to EPSG:4326.
weird_data["agg"].dissolve().to_crs(weird_data["agg"].estimate_utm_crs()).centroid.to_crs(4326)

In [None]:
import shapely
# Taken from the project summary df
# Point takes (x, y); create_task_map passes [y, x] to folium as the map location.
weird_center = shapely.Point(-5.100499450000003, 7.735347999999997)

In [None]:
# Map only the first 1000 rows, centred on the fixed point above rather than a computed centroid.
create_task_map(weird_data["agg"].head(1000), weird_center)