# Project Analysis Prototype Notebook

This notebook is a testbed for approaches to the project analysis workflow.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

from pysal.explore import esda
from pysal.lib import weights

# Raise pandas' rendering limits so frames of up to 100 rows / 100 columns are
# displayed without truncation.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import seaborn as sns
# Notebook-wide seaborn theme: white style, top/right spines hidden, thicker
# lines, larger markers, and a custom eight-colour palette.
sns.set(
    style='white',
    font_scale=1.0,
    rc={
        "axes.spines.top": False,
        "axes.spines.right": False,
        "lines.linewidth": 2.5,
        'lines.markersize': 10,
    },
    color_codes=False,
    palette=sns.color_palette(
        [
            '#27a3aa',
            '#f76d23',
            '#70d6e3',
            '#ffbb31',
            '#b1c96d',
            '#cce18a',
            '#1c4c5d',
            '#787642',
        ]
    ),
)

In [4]:
from mapswipe.workflows.project_remap import get_user_metrics, get_project_agg_weighted
from mapswipe.data_access import get_project_data  # todo replace with live call + augmentation

# Load the user metrics table; it is passed to get_project_agg_weighted() below.
df_user_metrics = get_user_metrics()

This project has a good mix of attributes:
* Many buildings grouped in varying densities
* Large and small buildings

In [5]:
# Load the data for a single project; the returned mapping holds the "full"
# and "agg" frames used throughout the rest of the notebook.
project_id = "-NEaR6DbJAbkpYJ_BDCH"
proj_data = get_project_data(project_id)
df_full = proj_data["full"]
df_agg = proj_data["agg"]
# Tag every aggregate row with its project id. Note this assigns in place on
# the same object that is stored in proj_data["agg"].
df_agg["project_id"] = project_id

In [None]:
df_full.head()

In [None]:
df_user_metrics.head()

In [None]:
df_agg.head()

In [6]:
# Build the weighted aggregate from the aggregate frame, the full frame and the
# user metrics. NOTE(review): the `*_uw` columns in the output below look like
# the original unweighted values kept alongside the weighted ones -- confirm.
df_agg_w = get_project_agg_weighted(df_agg, df_full, df_user_metrics)

In [None]:
# Number of tasks where answer 0 outweighs answer 1 in the `*_uw` shares but
# answer 1 outweighs answer 0 in the un-suffixed shares.
len(
    df_agg_w.loc[
        (df_agg_w["0_share_uw"] > df_agg_w["1_share_uw"])
        & (df_agg_w["1_share"] > df_agg_w["0_share"])
    ]
)

In [7]:
df_agg_w.head()

Unnamed: 0,project_id,task_id,idx,0_count_uw,1_count_uw,2_count_uw,3_count_uw,0_share_uw,1_share_uw,2_share_uw,3_share_uw,total_count_uw,lastEdit,osm_username,geometry,agreement,year,modal_answer,yes_building,incorrect_score_uw,nearby_building_count,building_area_m2,0_count,1_count,2_count,3_count,total_count,0_share,1_share,2_share,3_share,incorrect_score
0,-NEaR6DbJAbkpYJ_BDCH,t1,0.0,10.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,10.0,2016-10-11 15:54:06+00:00,,"MULTIPOLYGON (((33.8928 -13.90473, 33.89281 -1...",1.0,2016,0_count,False,1.0,45,8.191202,14.0,0.0,0.0,0.0,14.0,1.0,0.0,0.0,0.0,1.0
1,-NEaR6DbJAbkpYJ_BDCH,t10,1.0,6.0,4.0,0.0,0.0,0.6,0.4,0.0,0.0,10.0,2020-12-22 16:15:51+00:00,,"MULTIPOLYGON (((33.77507 -13.88669, 33.77507 -...",0.466667,2020,0_count,False,0.6,54,162.421008,10.0,4.0,0.0,0.0,14.0,0.714286,0.285714,0.0,0.0,0.714286
2,-NEaR6DbJAbkpYJ_BDCH,t11,2.0,7.0,2.0,1.0,0.0,0.7,0.2,0.1,0.0,10.0,2016-10-07 15:43:31+00:00,,"MULTIPOLYGON (((33.77577 -13.88667, 33.77577 -...",0.488889,2016,0_count,False,0.8,85,40.182259,11.0,2.0,1.0,0.0,14.0,0.785714,0.142857,0.071429,0.0,0.857143
3,-NEaR6DbJAbkpYJ_BDCH,t12,3.0,7.0,3.0,0.0,0.0,0.7,0.3,0.0,0.0,10.0,2016-10-07 16:58:08+00:00,,"MULTIPOLYGON (((33.77767 -13.88665, 33.77768 -...",0.533333,2016,0_count,False,0.7,96,98.738852,11.0,3.0,0.0,0.0,14.0,0.785714,0.214286,0.0,0.0,0.785714
4,-NEaR6DbJAbkpYJ_BDCH,t13,4.0,7.0,3.0,0.0,0.0,0.7,0.3,0.0,0.0,10.0,2020-12-23 15:43:35+00:00,,"MULTIPOLYGON (((33.77851 -13.88673, 33.7785 -1...",0.533333,2020,0_count,False,0.7,86,73.472307,11.0,3.0,0.0,0.0,14.0,0.785714,0.214286,0.0,0.0,0.785714


# Local Moran's I (LISA)

In [None]:
def moran_sig_quads(ser_tasks, lisa, p_threshold=0.05):
    """Return each observation's local Moran quadrant, zeroed where not significant.

    Parameters
    ----------
    ser_tasks : pandas.Series or sequence
        Labels used as the index of the result; one entry per observation in
        ``lisa``, in the same order.
    lisa : object
        Any object exposing array-like ``p_sim`` (pseudo p-values) and ``q``
        (quadrant codes) attributes. This notebook passes the result of
        ``esda.moran.Moran_Local``.
    p_threshold : float, default 0.05
        An observation keeps its quadrant code only when ``p_sim`` is strictly
        below this value.

    Returns
    -------
    pandas.Series
        Quadrant code per observation, with 0 for observations that are not
        significant, indexed by ``ser_tasks``.
    """
    significant = np.asarray(lisa.p_sim) < p_threshold
    # Keep the quadrant code where significant, otherwise use 0.
    spots = np.where(significant, np.asarray(lisa.q), 0)
    return pd.Series(spots, index=ser_tasks)

def calc_moran_local_for_dist(gdf_agg, col_name, dist_vals, seed=None):
    """Compute significant local Moran quadrants for several distance bands.

    Parameters
    ----------
    gdf_agg : geopandas.GeoDataFrame
        Must have a CRS set and contain a ``task_id`` column and ``col_name``.
    col_name : str
        Column whose local spatial autocorrelation is analysed.
    dist_vals : iterable of float
        Distance-band thresholds. The frame is reprojected to its estimated
        UTM CRS first, so these are in metres.
    seed : int, optional
        Forwarded to ``esda.moran.Moran_Local`` so the permutation-based
        pseudo p-values (and hence the significant quadrants) are reproducible
        between runs. When omitted, the call is made exactly as before.

    Returns
    -------
    pandas.DataFrame
        One ``moran_quad_{int(dist)}m`` column per threshold, indexed by
        ``task_id``; values are the quadrant codes from ``moran_sig_quads``.
    """
    task_ids = gdf_agg["task_id"]
    # Project to UTM so the distance thresholds are applied in metres.
    gdf_utm = gdf_agg.to_crs(gdf_agg.estimate_utm_crs())
    # Only pass `seed` through when requested, keeping the default call unchanged.
    moran_kwargs = {} if seed is None else {"seed": seed}

    moran_vals = {}
    for dist in dist_vals:
        w = weights.DistanceBand.from_dataframe(gdf_utm, threshold=dist)
        w.transform = "R"  # row-standardise the weights
        lisa = esda.moran.Moran_Local(gdf_utm[col_name], w, **moran_kwargs)
        moran_vals[f"moran_quad_{int(dist)}m"] = moran_sig_quads(task_ids, lisa)
    return pd.DataFrame(data=moran_vals, index=task_ids)

In [None]:
# Significant local Moran quadrants of "incorrect_score" on df_agg, for 150 m
# and 350 m distance bands.
df_moran_local = calc_moran_local_for_dist(df_agg, "incorrect_score", [150.0, 350.0])

In [None]:
# Tally how many tasks carry each quadrant code, one table per distance band.
for c in filter(lambda name: name.startswith("moran_quad_"), df_moran_local.columns):
    print("\n", df_moran_local[c].value_counts())

In [None]:
df_moran_local.head(20)

In [None]:
# Same local Moran run as above, but on the weighted aggregate (df_agg_w).
df_moran_local_w = calc_moran_local_for_dist(df_agg_w, "incorrect_score", [150.0, 350.0])

In [None]:
# Tally how many tasks carry each quadrant code in the weighted run, one table
# per distance band.
for c in filter(lambda name: name.startswith("moran_quad_"), df_moran_local_w.columns):
    print("\n", df_moran_local_w[c].value_counts())

# Visualization

In [None]:
import folium
from scipy import stats
import h3
from shapely.geometry import Polygon
import geopandas as gpd
from folium.features import GeoJsonTooltip
from typing import Iterable
import branca.colormap as cm

# LISA colors, keyed by quadrant label. The trailing comments give the integer
# quadrant code that selects each colour via ``lisa_colormap``.
lc = dict(
    ns="#5c5c5c",  # code 0
    HH="#d7191c",  # code 1
    LH="#abd9e9",  # code 2
    LL="#2c7bb6",  # code 3
    HL="#fdae61",  # code 4
)
# Position i holds the fill colour for quadrant code i.
lisa_colormap = [lc[label] for label in ("ns", "HH", "LH", "LL", "HL")]


def create_moran_quad_map(gdf, ser_quads, center_pt=None, head_ct=None):
    """Draw task geometries on a folium map, filled by local Moran quadrant.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Must contain a ``task_id`` column. A ``lastEdit`` column, if present,
        is dropped before serialising to GeoJSON (presumably because its
        timestamps are not JSON-serialisable -- confirm).
    ser_quads : pandas.Series
        Quadrant codes indexed by task id; each code indexes ``lisa_colormap``.
        Tasks without a usable code are drawn in the colour for code 0.
    center_pt : object with ``x`` and ``y`` attributes, optional
        Map centre, with ``y`` used as latitude and ``x`` as longitude. When
        omitted, the centroid of the dissolved geometries is used (computed in
        the estimated UTM CRS and converted to EPSG:4326).
    head_ct : int, optional
        When truthy, only the first ``head_ct`` rows are drawn.

    Returns
    -------
    folium.Map
    """
    color_col = "local_quadrant"
    gdf = gdf.set_index("task_id")
    # Assignment aligns on the task_id index; unmatched tasks become NaN.
    gdf[color_col] = ser_quads

    if head_ct:
        gdf = gdf.head(head_ct).copy()

    gdf = gdf.reset_index()
    geojson_data = gdf.drop(columns=["lastEdit"], errors="ignore").to_json()

    if center_pt is None:
        # dissolve() leaves a single row; take that one point rather than
        # handing folium a one-element GeoSeries.
        center_pt = (
            gdf.to_crs(gdf.estimate_utm_crs())
            .dissolve()
            .centroid.to_crs(4326)
            .iloc[0]
        )
    # Named `fmap` to avoid shadowing the built-in `map`.
    fmap = folium.Map(location=[center_pt.y, center_pt.x], zoom_start=8)
    # Override the notebook repr so the map renders in a fixed-size frame.
    fmap._repr_html_ = lambda: fmap._parent._repr_html_(
        include_link=False, width='75%', height='400px'
    )

    def style_function(feature):
        raw_code = feature['properties'][color_col]
        try:
            quad = int(raw_code)
        except (TypeError, ValueError):
            # Missing values arrive as null/NaN; draw them as code 0.
            quad = 0
        if not 0 <= quad < len(lisa_colormap):
            quad = 0
        return {
            'fillColor': lisa_colormap[quad],
            'color': 'black',
            'weight': 0.25,
            'fillOpacity': 0.8
        }

    folium.GeoJson(
        geojson_data,
        style_function=style_function,
        name="geojson"
    ).add_to(fmap)

    return fmap


def create_moran_quad_hex_map(gdf_agg, mode_col, h3_resolution):
    """Map the most common value of ``mode_col`` per H3 hexagon.

    Parameters
    ----------
    gdf_agg : geopandas.GeoDataFrame
        Must contain ``task_id`` and ``mode_col``. Centroid ``y`` / ``x`` are
        passed to h3 as latitude / longitude, so the frame is assumed to be in
        a geographic CRS (the hexagons are tagged EPSG:4326) -- confirm.
    mode_col : str
        Column of integer-like quadrant codes; the per-hexagon mode is mapped.
    h3_resolution : int
        H3 resolution used to bin the task centroids.

    Returns
    -------
    folium.Map
        Map with a choropleth of the per-hexagon mode and a transparent GeoJson
        overlay carrying the tooltip (hexagon id, distinct task count, mode).
    """
    gdf = gdf_agg.copy(deep=True)
    if gdf.crs is not None:
        # Compute centroids in a projected CRS and convert back, as
        # model_ols_fe does, instead of taking them in geographic coordinates.
        gdf["geometry"] = gdf.to_crs(gdf.estimate_utm_crs()).centroid.to_crs(gdf.crs)
    else:
        gdf["geometry"] = gdf.centroid

    # Assign each centroid to its H3 cell (h3 expects lat, lng).
    gdf['hexagon'] = [
        h3.geo_to_h3(pt.y, pt.x, h3_resolution) for pt in gdf.geometry
    ]

    def _mode(s):
        # Series.mode() returns the modes sorted ascending, so ties resolve to
        # the smallest value. It is empty when the group is all-NaN; use 0 then.
        modes = s.mode()
        return modes.iloc[0] if len(modes) else 0

    hexagon_gdf = gdf.groupby('hexagon').agg({mode_col: _mode, "task_id": "nunique"}).reset_index()
    hexagon_gdf[mode_col] = hexagon_gdf[mode_col].astype(int)

    def hexagon_to_geometry(hexagon):
        vertices = h3.h3_to_geo_boundary(hexagon, geo_json=True)
        return Polygon(vertices)

    hexagon_gdf['geometry'] = hexagon_gdf['hexagon'].apply(hexagon_to_geometry)

    hexagon_gdf = gpd.GeoDataFrame(hexagon_gdf, geometry='geometry').set_crs(4326)

    # Create the map, centred on the mean centroid.
    m = folium.Map(location=[gdf.geometry.y.mean(), gdf.geometry.x.mean()], zoom_start=8)

    hexagon_geojson = hexagon_gdf.to_json()

    tooltip = GeoJsonTooltip(
        fields=['hexagon', 'task_id', mode_col],
        aliases=['Hexagon ID:', 'Building Count:', mode_col],  # These are the names that will appear in the tooltip
        localize=True,
        sticky=False,
        labels=True,
        style="""
            background-color: #F0EFEF;
            border: 2px solid black;
            border-radius: 3px;
            box-shadow: 3px;
        """,
        max_width=800,
    )

    def style_function(feature):
        fillval = feature['properties'][mode_col]
        fillval = int(fillval)
        return {
            'fillColor': lisa_colormap[fillval],
            'color': 'black',
            'weight': 0.25,
            'fillOpacity': 0.0
        }

    # NOTE(review): the choropleth shades the categorical quadrant codes with
    # the sequential "YlOrRd" ramp, and the overlay below has fillOpacity 0.0,
    # so the LISA colours chosen in style_function are never visible -- confirm
    # this is intended.
    folium.Choropleth(
        geo_data=hexagon_geojson,
        name='choropleth',
        data=hexagon_gdf,
        columns=['hexagon', mode_col],
        key_on='feature.properties.hexagon',
        fill_color="YlOrRd",
        fill_opacity=0.7,
        line_opacity=0.2,
        legend_name='dominant local Moran quadrant'
    ).add_to(m)

    # Transparent overlay: outlines the hexagons and carries the tooltip.
    folium.GeoJson(
        hexagon_geojson,
        style_function=style_function,
        tooltip=tooltip
    ).add_to(m)

    # Override the notebook repr so the map renders in a fixed-size frame.
    m._repr_html_ = lambda: m._parent._repr_html_(
        include_link=False, width='75%', height='400px'
    )
    return m

In [None]:
# Colour each task by its significant local Moran quadrant at the 150 m band.
# The quadrants come from df_moran_local, which was computed from df_agg and is
# indexed by task_id.
create_moran_quad_map(df_agg, df_moran_local["moran_quad_150m"])

In [None]:
# Attach the quadrant columns from the weighted run to the weighted aggregate,
# matching rows on task_id.
df_agg_moran_w = df_agg_w.set_index("task_id").join(df_moran_local_w, how="inner").reset_index()
# Row counts of both inputs and of the joined frame, to spot rows lost or
# duplicated by the inner join.
len(df_agg_w), len(df_moran_local_w), len(df_agg_moran_w)

In [None]:
# Hex-binned map of the most common 150 m quadrant code per H3 resolution-10 cell.
create_moran_quad_hex_map(df_agg_moran_w, mode_col="moran_quad_150m", h3_resolution=10)

# Regression Models

## Fixed Effect Regime

In [22]:
import h3
import math
from pysal.lib import weights
from pysal.model import spreg
    

def model_ols_fe(gdf_agg_w, y_col, feature_cols, fe_h3_resolution):
    """Fit an OLS regimes model with one intercept per H3 hexbin.

    Each row's geometry is replaced by its centroid, the centroid is binned
    into an H3 cell at ``fe_h3_resolution``, and that cell id is used as the
    regime in ``spreg.OLS_Regimes``. Only the constant varies by regime; the
    feature coefficients and the error variance are shared.

    Parameters
    ----------
    gdf_agg_w : geopandas.GeoDataFrame
        Source frame containing ``geometry``, ``y_col`` and ``feature_cols``.
    y_col : str
        Dependent variable column.
    feature_cols : list of str
        Independent variable columns.
    fe_h3_resolution : int
        H3 resolution of the fixed-effect hexbins.

    Returns
    -------
    tuple
        ``(model, gdf)`` where ``gdf`` is the working copy with centroid
        geometries and the ``fe_hexbin`` column, returned for debugging.
    """
    selected = feature_cols + ["geometry", y_col]
    gdf = gdf_agg_w[selected].copy()

    # Centroids are computed in the estimated UTM CRS, then converted back to
    # the source CRS so that y/x can be handed to h3 as lat/lng.
    utm_crs = gdf.estimate_utm_crs()
    gdf["geometry"] = gdf.to_crs(utm_crs).centroid.to_crs(gdf_agg_w.crs)

    gdf["fe_hexbin"] = gdf.apply(
        lambda row: h3.geo_to_h3(row.geometry.y, row.geometry.x, fe_h3_resolution),
        axis=1,
    )

    # spreg spatial fixed effect implementation.
    # TODO adding w when fe_hexbin is basically the same might be a mistake,
    # so no weights matrix is passed here.
    n_features = len(feature_cols)
    model = spreg.OLS_Regimes(
        y=gdf[[y_col]].values,
        x=gdf[feature_cols].values,
        # Hexbin id acts as the neighbourhood / regime membership.
        regimes=gdf["fe_hexbin"].tolist(),
        # The constant term varies by regime ...
        constant_regi="many",
        # ... while every feature coefficient is held constant across regimes.
        cols2regi=[False] * n_features,
        # A single sigma is estimated rather than one per regime.
        regime_err_sep=False,
        name_y=y_col,
        name_x=feature_cols,
    )

    return model, gdf

In [23]:
# Fixed-effects fit of "incorrect_score" using H3 resolution-8 hexbins as regimes.
m1, m1_dbg_gdf = model_ols_fe(df_agg_w, "incorrect_score", ["year", "building_area_m2", "nearby_building_count"], 8)

In [24]:
print(m1.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES - REGIMES
---------------------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :incorrect_score                Number of Observations:       70839
Mean dependent var  :      0.8619                Number of Variables   :         245
S.D. dependent var  :      0.1922                Degrees of Freedom    :       70594
R-squared           :      0.1962
Adjusted R-squared  :      0.1935
Sum squared residual:     2103.23                F-statistic           :     70.6349
Sigma-square        :       0.030                Prob(F-statistic)     :           0
S.E. of regression  :       0.173                Log likelihood        :   24051.989
Sigma-square ML     :       0.030                Akaike info criterion :  -47613.977
S.E of regression ML:      0.1723                Schwarz criterion     :  -45367.777

-------------------------------------

In [40]:
import h3
import math
from pysal.lib import weights
from pysal.model import spreg
    

def model_ols_fe2(gdf_agg_w, y_col, feature_cols, fe_h3_resolution, w_dist_m):
    """Fit the H3 fixed-effects OLS regimes model with a distance-band ``w``.

    Same specification as ``model_ols_fe`` (one intercept per H3 hexbin,
    shared feature coefficients and a single sigma), but additionally builds a
    row-standardised, non-binary distance-band weights matrix on the centroids
    and passes it to ``spreg.OLS_Regimes``.

    Parameters
    ----------
    gdf_agg_w : geopandas.GeoDataFrame
        Source frame containing ``geometry``, ``y_col`` and ``feature_cols``.
    y_col : str
        Dependent variable column.
    feature_cols : list of str
        Independent variable columns.
    fe_h3_resolution : int
        H3 resolution of the fixed-effect hexbins.
    w_dist_m : float
        Distance-band threshold; applied in the estimated UTM CRS, i.e. metres.

    Returns
    -------
    tuple
        ``(model, gdf)`` where ``gdf`` is the working copy with centroid
        geometries and the ``fe_hexbin`` column, returned for debugging.
    """
    selected = feature_cols + ["geometry", y_col]
    gdf = gdf_agg_w[selected].copy()

    # Centroids are computed in the estimated UTM CRS, then converted back to
    # the source CRS so that y/x can be handed to h3 as lat/lng.
    gdf["geometry"] = gdf.to_crs(gdf.estimate_utm_crs()).centroid.to_crs(gdf_agg_w.crs)

    gdf["fe_hexbin"] = gdf.apply(
        lambda row: h3.geo_to_h3(row.geometry.y, row.geometry.x, fe_h3_resolution),
        axis=1,
    )

    # Weights are built on the centroids reprojected to UTM so the threshold
    # is applied in metres.
    gdf_utm = gdf.to_crs(gdf.estimate_utm_crs())
    w = weights.DistanceBand.from_dataframe(gdf_utm, threshold=w_dist_m, binary=False)
    w.transform = "R"

    # spreg spatial fixed effect implementation.
    n_features = len(feature_cols)
    model = spreg.OLS_Regimes(
        y=gdf[[y_col]].values,
        x=gdf[feature_cols].values,
        # Hexbin id acts as the neighbourhood / regime membership.
        regimes=gdf["fe_hexbin"].tolist(),
        # TODO adding w when fe_hexbin is basically the same might be a mistake
        w=w,
        # The constant term varies by regime ...
        constant_regi="many",
        # ... while every feature coefficient is held constant across regimes.
        cols2regi=[False] * n_features,
        # A single sigma is estimated rather than one per regime.
        regime_err_sep=False,
        name_y=y_col,
        name_x=feature_cols,
    )

    return model, gdf

In [82]:
# Fixed-effects fit with H3 resolution-7 hexbins as regimes, plus a 50 m
# distance-band weights matrix.
m1_1, m1_1_dbg_gdf = model_ols_fe2(
    df_agg_w, 
    "incorrect_score", 
    ["year", "building_area_m2", "nearby_building_count"], 
    7,
    w_dist_m=50.0,
)

 There are 1662 disconnected components.
 There are 565 islands with ids: 30, 200, 203, 206, 217, 304, 308, 424, 425, 428, 626, 1040, 1066, 1113, 1114, 1128, 1347, 1349, 1610, 1651, 1656, 2789, 3451, 3457, 3486, 3487, 3526, 3543, 3545, 4083, 4883, 4927, 4962, 4976, 4977, 5084, 5218, 5720, 5723, 5724, 5725, 5726, 5727, 5813, 5888, 5889, 5896, 5906, 5912, 5974, 5986, 5995, 6831, 7075, 7265, 7270, 7534, 7621, 7629, 7855, 7919, 7929, 7937, 8323, 8392, 8394, 8426, 8433, 9172, 9211, 9213, 9326, 9473, 9871, 9873, 9883, 9884, 9895, 9910, 10511, 10586, 10712, 10714, 10743, 10808, 10878, 11768, 12294, 12296, 12309, 12393, 12472, 12857, 12914, 12933, 12945, 13159, 13193, 13223, 13236, 13462, 13484, 13485, 13551, 13567, 13568, 14028, 14038, 14162, 14165, 14345, 14351, 14707, 14900, 14940, 15044, 15054, 15055, 15138, 15167, 15169, 15187, 15222, 15258, 15341, 15399, 15410, 15530, 15871, 15919, 15959, 15979, 15986, 16010, 16014, 16043, 16271, 16430, 16454, 16468, 16469, 16470, 16502, 16605, 17336, 17



  ci_result = sqrt(max_eigval / min_eigval)


In [83]:
print(m1_1.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES - REGIMES
---------------------------------------------------
Data set            :     unknown
Weights matrix      :     unknown
Dependent Variable  :incorrect_score                Number of Observations:       70839
Mean dependent var  :      0.8619                Number of Variables   :          51
S.D. dependent var  :      0.1922                Degrees of Freedom    :       70788
R-squared           :      0.1168
Adjusted R-squared  :      0.1162
Sum squared residual:     2311.01                F-statistic           :    187.2777
Sigma-square        :       0.033                Prob(F-statistic)     :           0
S.E. of regression  :       0.181                Log likelihood        :   20715.056
Sigma-square ML     :       0.033                Akaike info criterion :  -41328.113
S.E of regression ML:      0.1806                Schwarz criterion     :  -40860.536

-------------------------------------

## Spatial lag of dependent variable
Reference: https://geographicdata.science/book/notebooks/11_regression.html#spatial-lag

In [8]:
import h3
import math
from pysal.lib import weights
from pysal.model import spreg
    

def model_ols_depvar(gdf_agg_w, y_col, feature_cols, k=20):
    """Fit a spatial-lag model (``spreg.GM_Lag``) with k-nearest-neighbour weights.

    Parameters
    ----------
    gdf_agg_w : geopandas.GeoDataFrame
        Source frame containing ``geometry``, ``y_col`` and ``feature_cols``.
    y_col : str
        Dependent variable column.
    feature_cols : list of str
        Independent variable columns.
    k : int, default 20
        Number of nearest neighbours used to build the weights matrix.

    Returns
    -------
    spreg.GM_Lag
        The fitted model; print ``model.summary`` for the report.
    """
    gdf = gdf_agg_w[feature_cols + ["geometry", y_col]]

    # NOTE(review): neighbours are searched in the frame's own CRS (no UTM
    # reprojection, unlike model_ols_depvar2) and the weights are left
    # untransformed, i.e. not row-standardised -- confirm both are intended.
    w = weights.KNN.from_dataframe(gdf, k=k)

    model = spreg.GM_Lag(
        # Dependent variable
        y=gdf[[y_col]].values,
        # Independent variables
        x=gdf[feature_cols].values,
        w=w,
        # Dependent variable name
        name_y=y_col,
        # Independent variables names
        name_x=feature_cols,
    )

    return model

In [9]:
# Spatial-lag fit of "incorrect_score" on the weighted aggregate, using the
# k-nearest-neighbour weights built inside model_ols_depvar.
m2 = model_ols_depvar(df_agg_w, "incorrect_score", ["year", "building_area_m2", "nearby_building_count"])

 There are 50 disconnected components.
  W.__init__(self, neighbors, id_order=ids, **kwargs)


In [10]:
print(m2.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: SPATIAL TWO STAGE LEAST SQUARES
--------------------------------------------------
Data set            :     unknown
Weights matrix      :     unknown
Dependent Variable  :incorrect_score                Number of Observations:       70839
Mean dependent var  :      0.8619                Number of Variables   :           5
S.D. dependent var  :      0.1922                Degrees of Freedom    :       70834
Pseudo R-squared    :      0.2501
Spatial Pseudo R-squared:  0.0436

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     z-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT       -27.03531         0.76814       -35.19559         0.00000
                year         0.01367         0.00039        35.28318         0.00000
    building_area_m2        -0.00006   

In [19]:
import h3
import math
from pysal.lib import weights
from pysal.model import spreg
    

def model_ols_depvar2(gdf_agg_w, y_col, feature_cols, w_dist_m=50.0):
    """Fit a spatial-lag model (``spreg.GM_Lag``) with distance-band weights.

    Parameters
    ----------
    gdf_agg_w : geopandas.GeoDataFrame
        Source frame containing ``geometry``, ``y_col`` and ``feature_cols``;
        must have a CRS set so a UTM CRS can be estimated.
    y_col : str
        Dependent variable column.
    feature_cols : list of str
        Independent variable columns.
    w_dist_m : float, default 50.0
        Distance-band threshold. The frame is reprojected to its estimated UTM
        CRS first, so this is in metres.

    Returns
    -------
    spreg.GM_Lag
        The fitted model; print ``model.summary`` for the report.
    """
    gdf = gdf_agg_w[feature_cols + ["geometry", y_col]]
    # Project to UTM so the threshold is applied in metres.
    gdf = gdf.to_crs(gdf.estimate_utm_crs())

    # Non-binary distance-band weights, row-standardised.
    w = weights.DistanceBand.from_dataframe(gdf, threshold=w_dist_m, binary=False)
    w.transform = "R"

    model = spreg.GM_Lag(
        # Dependent variable
        y=gdf[[y_col]].values,
        # Independent variables
        x=gdf[feature_cols].values,
        w=w,
        # Dependent variable name
        name_y=y_col,
        # Independent variables names
        name_x=feature_cols,
    )

    return model

In [20]:
# Spatial-lag fit of "incorrect_score" using the distance-band weights variant.
m2_1 = model_ols_depvar2(df_agg_w, "incorrect_score", ["year", "building_area_m2", "nearby_building_count"])

 There are 1662 disconnected components.
 There are 565 islands with ids: 30, 200, 203, 206, 217, 304, 308, 424, 425, 428, 626, 1040, 1066, 1113, 1114, 1128, 1347, 1349, 1610, 1651, 1656, 2789, 3451, 3457, 3486, 3487, 3526, 3543, 3545, 4083, 4883, 4927, 4962, 4976, 4977, 5084, 5218, 5720, 5723, 5724, 5725, 5726, 5727, 5813, 5888, 5889, 5896, 5906, 5912, 5974, 5986, 5995, 6831, 7075, 7265, 7270, 7534, 7621, 7629, 7855, 7919, 7929, 7937, 8323, 8392, 8394, 8426, 8433, 9172, 9211, 9213, 9326, 9473, 9871, 9873, 9883, 9884, 9895, 9910, 10511, 10586, 10712, 10714, 10743, 10808, 10878, 11768, 12294, 12296, 12309, 12393, 12472, 12857, 12914, 12933, 12945, 13159, 13193, 13223, 13236, 13462, 13484, 13485, 13551, 13567, 13568, 14028, 14038, 14162, 14165, 14345, 14351, 14707, 14900, 14940, 15044, 15054, 15055, 15138, 15167, 15169, 15187, 15222, 15258, 15341, 15399, 15410, 15530, 15871, 15919, 15959, 15979, 15986, 16010, 16014, 16043, 16271, 16430, 16454, 16468, 16469, 16470, 16502, 16605, 17336, 17



In [21]:
print(m2_1.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: SPATIAL TWO STAGE LEAST SQUARES
--------------------------------------------------
Data set            :     unknown
Weights matrix      :     unknown
Dependent Variable  :incorrect_score                Number of Observations:       70839
Mean dependent var  :      0.8619                Number of Variables   :           5
S.D. dependent var  :      0.1922                Degrees of Freedom    :       70834
Pseudo R-squared    :      0.0333
Spatial Pseudo R-squared:  0.0437

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     z-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT       -31.95291         0.72068       -44.33735         0.00000
                year         0.01626         0.00036        45.45357         0.00000
    building_area_m2        -0.00006   

## Model with endogenous features - KNN weights

In [11]:
import h3
import math
from pysal.lib import weights
from pysal.model import spreg
    

def model_ols_endo1(gdf_agg_w, y_col, exog_cols, endo_col, instrument_cols, k=10):
    """Fit ``spreg.GM_Lag`` with endogenous regressors and KNN weights.

    Parameters
    ----------
    gdf_agg_w : geopandas.GeoDataFrame
        Source frame containing ``geometry`` and every column named below.
    y_col : str
        Dependent variable column.
    exog_cols : list of str
        Exogenous regressor columns.
    endo_col : str or list of str
        Endogenous regressor column(s). A single name is accepted as before; a
        list is also accepted, matching ``model_ols_endo2``.
    instrument_cols : list of str
        Instrument columns for the endogenous regressor(s).
    k : int, default 10
        Number of nearest neighbours used to build the weights matrix.

    Returns
    -------
    spreg.GM_Lag
        The fitted model; print ``model.summary`` for the report.
    """
    # Normalise to a list so one or several endogenous columns work alike.
    endo_cols = [endo_col] if isinstance(endo_col, str) else list(endo_col)
    gdf = gdf_agg_w[exog_cols + instrument_cols + ["geometry"] + endo_cols + [y_col]]

    # NOTE(review): neighbours are searched in the frame's own CRS and the
    # weights are left untransformed -- confirm both are intended.
    w = weights.KNN.from_dataframe(gdf, k=k)

    model = spreg.GM_Lag(
        y=gdf[[y_col]].values,
        x=gdf[exog_cols].values,
        yend=gdf[endo_cols].values,
        q=gdf[instrument_cols].values,
        w=w,
        name_y=y_col,
        name_x=exog_cols,
        name_yend=endo_cols,
        name_q=instrument_cols,
    )

    return model

In [12]:
# Spatial-lag fit treating "agreement" as endogenous, with "total_count_uw" as
# its instrument.
m3 = model_ols_endo1(
    df_agg_w, 
    "incorrect_score", 
    ["year", "building_area_m2", "nearby_building_count"],
    "agreement",
    ["total_count_uw"],
)

 There are 155 disconnected components.
  W.__init__(self, neighbors, id_order=ids, **kwargs)


In [13]:
print(m3.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: SPATIAL TWO STAGE LEAST SQUARES
--------------------------------------------------
Data set            :     unknown
Weights matrix      :     unknown
Dependent Variable  :incorrect_score                Number of Observations:       70839
Mean dependent var  :      0.8619                Number of Variables   :           6
S.D. dependent var  :      0.1922                Degrees of Freedom    :       70833
Pseudo R-squared    :      0.4992
Spatial Pseudo R-squared:  0.4514

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     z-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT        -1.71777         0.87055        -1.97319         0.04847
                year         0.00096         0.00044         2.19660         0.02805
    building_area_m2        -0.00004   

## Model with endogenous features - DistanceBand weights

In [84]:
import h3
import math
from pysal.lib import weights
from pysal.model import spreg
    

def model_ols_endo2(gdf_agg_w, y_col, exog_cols, endo_cols, instrument_cols, w_dist_m=150.0):
    """Fit ``spreg.GM_Lag`` with endogenous regressors and distance-band weights.

    Parameters
    ----------
    gdf_agg_w : geopandas.GeoDataFrame
        Source frame containing ``geometry`` and every column named below;
        must have a CRS set so a UTM CRS can be estimated.
    y_col : str
        Dependent variable column.
    exog_cols : list of str
        Exogenous regressor columns.
    endo_cols : list of str
        Endogenous regressor columns.
    instrument_cols : list of str
        Instrument columns for the endogenous regressors.
    w_dist_m : float, default 150.0
        Distance-band threshold. The frame is reprojected to its estimated UTM
        CRS first, so this is in metres.

    Returns
    -------
    spreg.GM_Lag
        The fitted model; print ``model.summary`` for the report.
    """
    gdf = gdf_agg_w[exog_cols + instrument_cols + endo_cols + ["geometry", y_col]]
    # Project to UTM so the threshold is applied in metres.
    gdf = gdf.to_crs(gdf.estimate_utm_crs())

    # Non-binary distance-band weights, row-standardised.
    w = weights.DistanceBand.from_dataframe(gdf, threshold=w_dist_m, binary=False)
    w.transform = "R"

    model = spreg.GM_Lag(
        y=gdf[[y_col]].values,
        x=gdf[exog_cols].values,
        yend=gdf[endo_cols].values,
        q=gdf[instrument_cols].values,
        w=w,
        name_y=y_col,
        name_x=exog_cols,
        name_yend=endo_cols,
        name_q=instrument_cols,
    )

    return model

In [85]:
# Spatial-lag fit with distance-band weights, treating "agreement" as
# endogenous and "total_count_uw" as its instrument.
m4 = model_ols_endo2(
    df_agg_w, 
    y_col="incorrect_score", 
    exog_cols=["year", "building_area_m2", "nearby_building_count"],
    endo_cols=["agreement"],
    instrument_cols=["total_count_uw"],
)

 There are 317 disconnected components.
 There are 51 islands with ids: 5813, 5906, 5912, 12472, 12914, 13462, 13485, 15167, 15169, 15187, 15530, 16010, 16430, 16468, 17373, 17374, 17626, 17627, 17628, 18245, 26462, 31828, 33745, 33814, 33949, 34065, 35579, 35580, 35714, 35726, 35727, 35786, 37536, 39699, 39702, 40950, 41331, 41562, 44701, 46399, 46400, 52216, 52312, 52385, 53301, 57824, 63374, 64673, 66189, 68205, 69629.
  w = W(neighbors, weights, ids, **kwargs)
 There are 317 disconnected components.
 There are 51 islands with ids: 5813, 5906, 5912, 12472, 12914, 13462, 13485, 15167, 15169, 15187, 15530, 16010, 16430, 16468, 17373, 17374, 17626, 17627, 17628, 18245, 26462, 31828, 33745, 33814, 33949, 34065, 35579, 35580, 35714, 35726, 35727, 35786, 37536, 39699, 39702, 40950, 41331, 41562, 44701, 46399, 46400, 52216, 52312, 52385, 53301, 57824, 63374, 64673, 66189, 68205, 69629.
  W.__init__(




In [86]:
print(m4.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: SPATIAL TWO STAGE LEAST SQUARES
--------------------------------------------------
Data set            :     unknown
Weights matrix      :     unknown
Dependent Variable  :incorrect_score                Number of Observations:       70839
Mean dependent var  :      0.8619                Number of Variables   :           6
S.D. dependent var  :      0.1922                Degrees of Freedom    :       70833
Pseudo R-squared    :      0.4555
Spatial Pseudo R-squared:  0.4444

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     z-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT        -2.77041         0.89943        -3.08020         0.00207
                year         0.00153         0.00045         3.39444         0.00069
    building_area_m2        -0.00004   