# Requirements + setup

In [163]:
# !pip install /wheels/*

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.ops import unary_union
import folium
from shapely.geometry import mapping
import json
from pyspark.sql import SparkSession
import unidecode
from addressing.utils import libpostal
from fuzzywuzzy import fuzz
import nltk
from nltk.corpus import stopwords
from addressing.automatic_matching import automatic_matching
from addressing.automatic_matching.rooftop.rooftop import haversine_distance
import re
import sys, os
import sqlalchemy
from datetime import date

county = 'miamidade'
data_path = "data/"
Updated_geometries_pickle = os.path.join(data_path,"APT_realigned_usa_ufl_miamidade.pkl")



In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/tandon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
DB = {
'host' : "10.137.173.84",
'port' : '5432',
'database' :  "STAN",
'user' : "strategicadmin",
'password' :  "TBmG4Yj3DdwOI+Aq"
}

class ReadAndWrite2PostgresDB:

    def __init__(self, engine):
        self.engine = engine

    def read_from_db(self, query, retry_num=3):

        for _ in range(retry_num):
            df = None
            try:
                df = pd.read_sql(query, self.engine)
                return df

            except Exception as e:
                print(e)

        return df

    def write_to_db(self, df, schema, table_name, retry_num=3):
        for _ in range(retry_num):
            try:
                
                df.to_sql(
                    table_name,
                    con=self.engine,
                    if_exists='append',
                    schema=schema,
                    index = False)
                print("Table stored!")
                return 1

            except Exception as e:
                print(e)

        return 0

In [4]:
engine = sqlalchemy.create_engine(f'postgresql+psycopg2://{DB["user"]}:{DB["password"]}@{DB["host"]}:{DB["port"]}/{DB["database"]}',echo = False)
raw2p = ReadAndWrite2PostgresDB(engine)
# change this to false if you need to run matching on the copy of MNR - TT map
delta = True

In [5]:
counties = ['harris', 'santaclara', 'miamidade', 'arapahoe', 'duval']
date = date.today()

Stopwords

In [6]:
countries_stopwords = {
  'br': stopwords.words('portuguese') + ['rua', 'avenida'], 
  'ca': stopwords.words('french') + stopwords.words('english') +  ['road', 'street', 'st.', 'st', 'rue', 'chemin', 'avenue'],
  'es': stopwords.words('spanish') + ['calle', 'avenida', 'callejón', 'paseo'],
  'fr': stopwords.words('french') + ['rue', 'chemin', 'avenue'],
  'gb': stopwords.words('english') + ['street', 'road', 'avenue', 'st.', 'st', 'drive'],
  'it': stopwords.words('italian') + ['via', 'viale', 'strada'],
  'mx': stopwords.words('spanish') + ['calle', 'avenida', 'callejón', 'paseo'],
  'us': stopwords.words('english') + ['street', 'road', 'avenue', 'st.', 'st', 'drive'],
  'be': stopwords.words('french') + ['rue', 'chemin', 'avenue'],
  'za': stopwords.words('english') + ['street', 'road', 'avenue', 'st.', 'st', 'drive']
}

countries_stopwords = {k:'|'.join(['\\b' + word + '\\b' for word in v]) for k, v in countries_stopwords.items()}

## Reading the data

### Sample addresses

In [7]:
sample_query = f"""SELECT * FROM "STAN_169".sample where county = '{county}'"""
sample_df = raw2p.read_from_db(query = sample_query)
sample_geom = gpd.GeoSeries.from_wkt(sample_df.geometry)
sample_gdf = gpd.GeoDataFrame(sample_df.drop('geometry', axis = 1),geometry = sample_geom,crs = 'EPSG:4326')

print("Sample Size :",sample_df.shape)
sample_gdf.head()

Sample Size : (2049, 29)


Unnamed: 0,country,searched_query_unidecode_sample,libpostal_query,libpostal_response,libpostal_house,libpostal_category,libpostal_near,libpostal_house_number,libpostal_road,libpostal_unit,...,libpostal_state_district,libpostal_state,libpostal_country_region,libpostal_country,libpostal_world_region,lat_sample,lon_sample,county,sample_id,geometry
0,USA,"12535 SW 60th Ct, Miami, FL 33156, USA","{""query"": ""12535 SW 60th Ct, Miami, FL 33156, ...","{'country': 'usa', 'city': 'miami', 'road': 's...",,,,12535,sw 60th ct,,...,,fl,,usa,,25.654567,-80.290398,miamidade,1555,"POLYGON ((-80.28141 25.65457, -80.28146 25.653..."
1,USA,"1150 8th St, Miami Beach, FL 33139, USA","{""query"": ""1150 8th St, Miami Beach, FL 33139,...","{'country': 'usa', 'city': 'miami beach', 'roa...",,,,1150,8th st,,...,,fl,,usa,,25.778059,-80.140367,miamidade,1556,"POLYGON ((-80.13138 25.77806, -80.13143 25.777..."
2,USA,"26608 SW 128th Ct, Homestead, FL 33032, USA","{""query"": ""26608 SW 128th Ct, Homestead, FL 33...","{'country': 'usa', 'city': 'homestead', 'road'...",,,,26608,sw 128th ct,,...,,fl,,usa,,25.520111,-80.399783,miamidade,1557,"POLYGON ((-80.39080 25.52011, -80.39084 25.519..."
3,USA,"15 W 9th St, Hialeah, FL 33010, USA","{""query"": ""15 W 9th St, Hialeah, FL 33010, USA""}","{'country': 'usa', 'city': 'hialeah', 'road': ...",,,,15,w 9th st,,...,,fl,,usa,,25.830181,-80.282095,miamidade,1558,"POLYGON ((-80.27311 25.83018, -80.27316 25.829..."
4,USA,"1795 W Okeechobee Rd, Hialeah, FL 33010, USA","{""query"": ""1795 W Okeechobee Rd, Hialeah, FL 3...","{'country': 'usa', 'city': 'hialeah', 'road': ...",,,,1795,w okeechobee rd,,...,,fl,,usa,,25.837865,-80.300429,miamidade,1559,"POLYGON ((-80.29145 25.83787, -80.29149 25.837..."


### Source

In [8]:
source_query = f"""SELECT * FROM "STAN_169".source_v0 where county = '{county}'"""
source_df = raw2p.read_from_db(query = source_query)
source_geom = gpd.GeoSeries.from_wkt(source_df.geometry)
source_gdf = gpd.GeoDataFrame(source_df.drop('geometry', axis = 1),
                                geometry = source_geom,
                                crs = 'EPSG:4326')
source_gdf.head()

Unnamed: 0,feat_id,postal_code,hsn,state,city,street_name,country_code,prefix,suffix,pre_dir,post_dir,x,y,county,geometry
0,c2510eea-e274-4cae-90eb-658e06637a45,33133,3625,FL,Miami,S Douglas Rd,USA,,Rd,S,,-80.252516,25.724369,miamidade,POINT (-80.25252 25.72437)
1,00005546-3500-2800-0000-000000531757,33186,8997,FL,Miami,SW 143rd Ct,USA,,Ct,SW,,-80.427754,25.683152,miamidade,POINT (-80.42775 25.68315)
2,ac17b1a1-9207-4e0e-a659-745e60b914a8,33180,20185,FL,Miami,E Country Club Dr,USA,,Dr,E,,-80.125493,25.963175,miamidade,POINT (-80.12549 25.96317)
3,00005546-3500-2800-0000-00000051cae8,33016,2764,FL,Hialeah,W 71st Pl,USA,,Pl,W,,-80.338867,25.887113,miamidade,POINT (-80.33887 25.88711)
4,00005546-3500-2800-0000-000000519a0b,33031,20851,FL,Homestead,SW 238th St,USA,,St,SW,,-80.529745,25.54482,miamidade,POINT (-80.52974 25.54482)


### Ingesting Deltas

Here goes the table of new changes to make to coordinates:

In [10]:
delta_table_read = pd.read_pickle(Updated_geometries_pickle)
print("Reading : ",os.path.basename(Updated_geometries_pickle))
print("data shape",delta_table_read.shape)
delta_table_read.head()

Reading :  APT_realigned_usa_ufl_miamidade.pkl
data shape (458514, 20)


Unnamed: 0,updated_geometries,feat_id,iso_script,iso_lang_c,postal_cod,house_numb,state_prov,locality,street_nam,country_co,prefix,suffix,predir,postdir,sn_body,APT_to_Centroid_distance,APT_lat,APT_long,updated_lat,updated_lon
0,POINT (-80.26611 25.97028),00005546-3500-2800-0000-00000078eb03,Latn,ENG,33055,21369,FL,Opa Locka,NW 39th Ave,USA,,Ave,NW,,39th,6.8,25.970261,-80.266174,25.97028,-80.266109
1,POINT (-80.26670 25.97014),00005546-3500-2800-0000-00000078eb01,Latn,ENG,33055,21343,FL,Opa Locka,NW 39th Ave,USA,,Ave,NW,,39th,4.4,25.970134,-80.266746,25.970136,-80.266701
2,POINT (-80.26666 25.97014),00005546-3500-2800-0000-00000078eb00,Latn,ENG,33055,21341,FL,Opa Locka,NW 39th Ave,USA,,Ave,NW,,39th,0.4,25.970139,-80.266658,25.970141,-80.266662
3,POINT (-80.26661 25.97013),00005546-3500-2800-0000-00000078eaff,Latn,ENG,33055,21339,FL,Opa Locka,NW 39th Ave,USA,,Ave,NW,,39th,6.7,25.970192,-80.266593,25.970133,-80.26661
5,POINT (-80.26728 25.97004),00005546-3500-2800-0000-00000057a884,Latn,ENG,33055,21475,FL,Opa Locka,NW 40th Circle Ct,USA,,Ct,NW,,40th Circle,3.4,25.97001,-80.267262,25.970037,-80.267279


In [11]:
delta_table_read = delta_table_read[['feat_id', 'updated_geometries']]
delta_table_read['datetime_version'] = pd.Timestamp.now(tz = 'utc')
delta_table_read.head()

Unnamed: 0,feat_id,updated_geometries,datetime_version
0,00005546-3500-2800-0000-00000078eb03,POINT (-80.26611 25.97028),2022-08-17 05:58:33.382616+00:00
1,00005546-3500-2800-0000-00000078eb01,POINT (-80.26670 25.97014),2022-08-17 05:58:33.382616+00:00
2,00005546-3500-2800-0000-00000078eb00,POINT (-80.26666 25.97014),2022-08-17 05:58:33.382616+00:00
3,00005546-3500-2800-0000-00000078eaff,POINT (-80.26661 25.97013),2022-08-17 05:58:33.382616+00:00
5,00005546-3500-2800-0000-00000057a884,POINT (-80.26728 25.97004),2022-08-17 05:58:33.382616+00:00


In [12]:
delta_table_write = delta_table_read[['feat_id', 'updated_geometries', 'datetime_version']]
delta_table_write['updated_geometries'] = delta_table_write['updated_geometries'].astype(str)

In [13]:
raw2p.write_to_db(df = delta_table_write, schema = 'STAN_169', table_name = 'delta_table')

Table stored!


1

### Reading Deltas

In [14]:
if delta:  
  delta_query = f"""
  SELECT * FROM "STAN_169".delta_table where county = '{county}'
  """
  delta_df = raw2p.read_from_db(query = delta_query)
  delta_geom = gpd.GeoSeries.from_wkt(delta_df.updated_geometries)
  delta_gdf = gpd.GeoDataFrame(delta_df.drop('updated_geometries', axis = 1),geometry = delta_geom,crs = 'EPSG:4326')

In [16]:
print("Delta Geometries Shape :",delta_gdf.shape)
delta_gdf.head()

Delta Geometries Shape : (456827, 4)


Unnamed: 0,feat_id,datetime_version,county,geometry
0,00005546-3500-2800-0000-00000078eb03,2022-08-09 09:31:45.757679+00:00,miamidade,POINT (-80.26611 25.97028)
1,00005546-3500-2800-0000-00000078eb01,2022-08-09 09:31:45.757679+00:00,miamidade,POINT (-80.26670 25.97014)
2,00005546-3500-2800-0000-00000078eb00,2022-08-09 09:31:45.757679+00:00,miamidade,POINT (-80.26666 25.97014)
3,00005546-3500-2800-0000-00000078eaff,2022-08-09 09:31:45.757679+00:00,miamidade,POINT (-80.26661 25.97013)
4,00005546-3500-2800-0000-00000057a884,2022-08-09 09:31:45.757679+00:00,miamidade,POINT (-80.26728 25.97004)


### Replacing Delta Changes

In [18]:
def replace_geometries(source_gdf, delta_gdf):
    '''
    Takes a 'source' geodataframe - copy of MNR database, a 'delta' geodataframe and replaces, for every APT (key: feat_id) in source dataframe, the 
    coordinates in the sorce with the coordinates in the new 

            Parameters:
                    source_gdf (gpd.GeoDataFrame): geodataframe containing MNR coordinates for every APT (feat_id)
                    delta_gdf (gpd.GeoDataFrame): geodataframe containing NEW coordinates for some APTs (feat_id)

            Returns:
                    source_gdf_new (gpd.GeoDataFrame): geodataframe containing MNR information for APT but with new coordinates
    '''
    # 
    delta_gdf_grouped = delta_gdf[delta_gdf.groupby('feat_id').datetime_version.transform('max') == delta_gdf.datetime_version]

    source_gdf_new = source_gdf.merge(delta_gdf_grouped[["feat_id", "geometry"]], on="feat_id", how="left")

    source_gdf_new.loc[~source_gdf_new.geometry_y.isna(), "geometry_x"] = source_gdf_new.loc[~source_gdf_new.geometry_y.isna(), "geometry_y"]

    source_gdf_new = source_gdf_new.drop(["geometry_y"], axis=1).rename({"geometry_x": "geometry"}, axis = 1)

    source_gdf_new = gpd.GeoDataFrame(source_gdf_new.drop('geometry', axis = 1), 
                                    geometry = source_gdf_new.geometry, crs = 'EPSG:4326')

    return source_gdf_new

In [19]:
if delta:
    source_delta = replace_geometries(source_gdf, delta_gdf)
    source_delta_gdf = gpd.GeoDataFrame(source_delta.drop('geometry', axis = 1), geometry = source_delta.geometry, 
                           crs = 'EPSG:4326')
    source_delta_gdf.x = source_delta_gdf.geometry.apply(lambda p: p.x)
    source_delta_gdf.y = source_delta_gdf.geometry.apply(lambda p: p.y)

### Joining Sample and Source - Spatial Join

In [20]:
if delta:
  joined_sample = source_delta_gdf.sjoin(sample_gdf, how='inner', predicate='intersects')
else:
  joined_sample = source_gdf.sjoin(sample_gdf, how = 'inner', predicate = 'intersects')

In [183]:
print("Joined sample shape",joined_sample.shape)
joined_sample.head()

Unnamed: 0,feat_id,postal_code,hsn,state,city,street_name,country_code,prefix,suffix,pre_dir,...,libpostal_island,libpostal_state_district,libpostal_state,libpostal_country_region,libpostal_country,libpostal_world_region,lat_sample,lon_sample,county_right,sample_id
0,c2510eea-e274-4cae-90eb-658e06637a45,33133,3625,FL,Miami,S Douglas Rd,USA,,Rd,S,...,,,fl,,usa,,25.720843,-80.245008,miamidade,423ext
113,de31c8fb-8bde-4d30-ad32-e22d795eb833,33133,2977,FL,Miami,McFarlane Rd,USA,,Rd,,...,,,fl,,usa,,25.720843,-80.245008,miamidade,423ext
259,aa302bc3-ed30-46f1-8c5d-d7a6610933c8,33133,2889,FL,Miami,McFarlane Rd,USA,,Rd,,...,,,fl,,usa,,25.720843,-80.245008,miamidade,423ext
1196,00005546-3500-2800-0000-00000054a67b,33133,3610,FL,Miami,William Ave,USA,,Ave,,...,,,fl,,usa,,25.720843,-80.245008,miamidade,423ext
2061,00005546-3500-2800-0000-00000054a652,33133,3573,FL,Miami,Hibiscus St,USA,,St,,...,,,fl,,usa,,25.720843,-80.245008,miamidade,423ext


In [21]:
if delta: 
  del source_delta_gdf
  del delta_gdf
del source_gdf
del sample_gdf

In [22]:
joined_sample.rename({
    'hsn': 'hsnum',
    'street_name': 'st_name',
    'postal_code': 'zip_code'
},
axis = 1, 
inplace = True)

# Parsing Joined Sample

In [186]:
def parse_joined_sample(spatial_joined_df: pd.DataFrame) -> pd.DataFrame:
    '''Function inversely parses the addresses to create a searched query format so that the addresses in the source 
    can be compared to the addresses in the sample.

    :param spatial_joined_df: DataFrame that contains the addresses from the source that are within the polygon of 
    the sample generated. It must contain the columns: ['hsn', 'unit_type', 'unit_num', 'pre_dir', 'prefix', 'suffix'
    'post_dir', 'city', 'state', 'zip_code']
    :type spatial_joined_df: pd.DataFrame
    :return: The same dataframe with a column that contains the full addresses inversely parsed.
    :rtype: pd.DataFrame
    '''

    df = spatial_joined_df.copy()

    dict_of_columns = {
        'hsnum': ' ', 'pre_dir': ' ', 'st_name': ' ', 'suffix': ', ', 'city': ' ', 'state': ' ', 'zip_code': ', ', 'country': ''
    }
    df['pre_dir'].fillna('', inplace=True)
    df['prefix'].fillna('', inplace=True)
    df['suffix'].fillna('', inplace=True)
    df['post_dir'].fillna('', inplace=True)

    for column in dict_of_columns.keys():

        df[column + '_modified'] = df[column].astype(str) + dict_of_columns[column]

    list_of_modified_columns = [col for col in df.columns if '_modified' in col]

    df['searched_query'] = df[list_of_modified_columns].sum(axis=1)

    df['street_name'] = df['pre_dir'] + ' ' + df['prefix'] + ' ' + df['st_name'] + ' ' + df['suffix'] + ' ' + df['post_dir']
    df['name'] = '' #df['state']

    df = df.rename(columns={
        'hsnum': 'hsn', 'searched_query': 'address', 'zip_code': 'postal_code', 'city': 'place_name', 
        'y': 'lat', 'x': 'lon'
    })

    return df

In [187]:
parsed_df = parse_joined_sample(joined_sample)
parsed_df.head()

Unnamed: 0,feat_id,postal_code,hsn,state,place_name,st_name,country_code,prefix,suffix,pre_dir,...,pre_dir_modified,st_name_modified,suffix_modified,city_modified,state_modified,zip_code_modified,country_modified,address,street_name,name
0,c2510eea-e274-4cae-90eb-658e06637a45,33133,3625,FL,Miami,S Douglas Rd,USA,,Rd,S,...,S,S Douglas Rd,"Rd,",Miami,FL,33133,USA,"3625 S S Douglas Rd Rd, Miami FL 33133, USA",S S Douglas Rd Rd,
113,de31c8fb-8bde-4d30-ad32-e22d795eb833,33133,2977,FL,Miami,McFarlane Rd,USA,,Rd,,...,,McFarlane Rd,"Rd,",Miami,FL,33133,USA,"2977 McFarlane Rd Rd, Miami FL 33133, USA",McFarlane Rd Rd,
259,aa302bc3-ed30-46f1-8c5d-d7a6610933c8,33133,2889,FL,Miami,McFarlane Rd,USA,,Rd,,...,,McFarlane Rd,"Rd,",Miami,FL,33133,USA,"2889 McFarlane Rd Rd, Miami FL 33133, USA",McFarlane Rd Rd,
1196,00005546-3500-2800-0000-00000054a67b,33133,3610,FL,Miami,William Ave,USA,,Ave,,...,,William Ave,"Ave,",Miami,FL,33133,USA,"3610 William Ave Ave, Miami FL 33133, USA",William Ave Ave,
2061,00005546-3500-2800-0000-00000054a652,33133,3573,FL,Miami,Hibiscus St,USA,,St,,...,,Hibiscus St,"St,",Miami,FL,33133,USA,"3573 Hibiscus St St, Miami FL 33133, USA",Hibiscus St St,


In [188]:
del joined_sample

# Matching Adresses

In [189]:
def apt_similarity_filter(
    #country:str,
    df:pd.DataFrame,
    #sample_df:pd.DataFrame,
    stopwords_pattern: str = '') -> pd.DataFrame:
    """Performs matching after making call in a given radius

    :param country: country to call in MNR
    :type country: str
    :param df: DataFrame containing the sample addresses (must have coordinates)
    :type df: pd.DataFrame
    :param sample_df: DataFrame containing libpostal components for sample (df) addresses
    :type sample_df: pd.DataFrame
    :param radius: radius of the buffer
    :type radius: float
    :param inner_radius: radius in meters of a smaller buffer. When bigger than zero, we are essentially getting the point in a disk, defaults to 0
    :type inner_radius: int or float, optional
    :param stopwords_pattern: regex pattern to remove stopwords, if needed. Optional, defaults to None
    :type stopwords_pattern: str
    :return: DataFrame with the APTs that matched
    :rtype: pd.DataFrame
    """
    apts_df = df.copy()

    # Fill NAs
    apts_df[['address', 'street_name', 'hsn', 'postal_code',
                    'place_name', 'name']] = apts_df[['address', 'street_name', 'hsn',
                                                                            'postal_code', 'place_name', 'name']].fillna('')

    # Drop duplicates
    #apts_df = apts_df.drop_duplicates(['searched_query', 'address']).reset_index(drop=True)
    
    
    # Create extra columns for stopwords, optional unidecode 
    cols_stopwords = ['address', 'street_name', 'place_name']
    for col in cols_stopwords:
        col_create = col + '_no_stopwords'
        apts_df[col_create] =  apts_df[col].str.replace(stopwords_pattern, '', case=False, regex=True)
        
    for col in cols_stopwords:
        col_create = col + '_no_stopwords_unidecode'
        apts_df[col_create] =  apts_df[col+'_no_stopwords'].apply(lambda x: unidecode.unidecode(x))
        
    
    # Merge to APTs
    #apts_df = apts_df.merge(sample_df.drop(columns=['country', 'searched_query_unidecode_sample']),
    #                                      how='left', 
    #                                      on=['searched_query'])
    apts_df['libpostal_road_no_stopwords'] = apts_df.libpostal_road.str.replace(stopwords_pattern, '', case=False, regex=True)


    # House number similarity: filter obvious non matches
    apts_df['hsn_similarity'] = list(map(fuzz.token_set_ratio, apts_df.libpostal_house_number, apts_df.hsn))
    apts_df['re_pattern'] = '\\b' + apts_df.hsn.astype(str) + '\\b'
    #apts_df['hsn_in_query'] = apts_df.apply(lambda x: bool(re.search(x.re_pattern, x.searched_query_unidecode_sample)), axis=1)
    #apts_df['hsn_similarity'] = np.where((apts_df.hsn_in_query), 100, apts_df.hsn_similarity)

    dropped_df = apts_df.loc[apts_df.hsn_similarity <= 60].reset_index(drop=True)
    apts_df = apts_df.loc[apts_df.hsn_similarity > 60].reset_index(drop=True)

    # Postal code similarity
    apts_df['postcode_similarity'] = list(map(fuzz.WRatio, 
                                                     apts_df.libpostal_postcode, 
                                                     apts_df.postal_code.fillna('').astype(str)))
    apts_df['postcode_similarity'] = np.where(apts_df.libpostal_postcode=='', np.nan,
                                                     np.where(apts_df.postal_code=='', 50, apts_df.postcode_similarity))

    
    # Road similarity
    apts_df['road_similarity'] = list(map(fuzz.token_set_ratio, 
                                                 apts_df.libpostal_road_no_stopwords, 
                                                 apts_df.street_name_no_stopwords))
    apts_df['road_similarity_unidecode'] = list(map(fuzz.token_set_ratio, 
                                                           apts_df.libpostal_road_no_stopwords, 
                                                           apts_df.street_name_no_stopwords_unidecode)) 
    apts_df['road_similarity'] = apts_df[['road_similarity', 'road_similarity_unidecode']].max(axis=1)
    
    # Locality similarity
    apts_df['searched_query_tokens'] = (apts_df.libpostal_road.astype(str) + ' ' + 
                                               apts_df.libpostal_house_number.astype(str) + ' ' + 
                                               apts_df.libpostal_postcode.astype(str))
    
    apts_df['provider_tokens'] = (apts_df.street_name.astype(str) + ' ' + 
                                         apts_df.hsn.astype(str) + ' ' + apts_df.postal_code.astype(str))
    apts_df['aux_searched_query'] = apts_df.apply(lambda x: automatic_matching.replace_tokens(x.searched_query_unidecode_sample, x.searched_query_tokens), axis=1)
    apts_df['aux_provider_address'] = apts_df.apply(lambda x: automatic_matching.replace_tokens(x.address, x.provider_tokens), axis=1)
    apts_df['aux_provider_address'] = apts_df.aux_provider_address.fillna('').apply(lambda x: unidecode.unidecode(x))
    apts_df['locality_wratio'] = apts_df.apply(lambda x: fuzz.WRatio(str(x.aux_searched_query).lower(), str(x.aux_provider_address).lower()), axis=1)
    apts_df['locality_city_state_ratio'] = apts_df.apply(lambda x: fuzz.WRatio(str(x.libpostal_city) + ' ' + str(x.libpostal_state),
                                                                                            str(x.place_name) + ' ' + str(x.name)), axis=1)
    apts_df['locality_similarity'] = apts_df[['locality_wratio', 'locality_city_state_ratio']].mean(axis=1)

    apts_df['mnr_query_distance'] = apts_df.apply(lambda x: haversine_distance(x.lat, x.lon,
                                                                                               x.lat_sample, x.lon_sample)
                                                                  if not np.isnan(x.lat) else 1e7
                                                                  , axis=1)

    # Compute mean similarity
    apts_df['mean_similarity'] = (apts_df[['locality_similarity', 'hsn_similarity', 
                                                         'postcode_similarity', 'road_similarity']].mean(axis=1)
                                        * np.where(apts_df.hsn_similarity >= 70 , 1, 0)
                                        * np.where(apts_df.road_similarity >= 60 , 1, 0)  
                                        * np.where(apts_df.mnr_query_distance > 1000, 0, 1)
                                        )


    apts_df_matching = (
        apts_df.sort_values(by='mnr_query_distance')
        .loc[apts_df.groupby(['searched_query_unidecode_sample'])
        .mean_similarity.idxmax()]
        .reset_index(drop=True)
    )


    # Compute matching
    apts_df_matching['match'] = pd.NaT
    #apts_df_matching.loc[apts_df_matching['hsn_similarity'] < 60, 'match'] = pd.NaT
    #apts_df_matching['match'] = np.where(apts_df_matching['hsn_similarity'] > 60, apts_df_matching['match'], 0)
    apts_df_matching['match'] = np.where(apts_df_matching.mean_similarity >= 70, 1, pd.NaT) #90 so far best

    address_matches = apts_df_matching['searched_query_unidecode_sample']

    non_matches = dropped_df[~dropped_df['searched_query_unidecode_sample'].isin(address_matches)]
    addresses_to_add = non_matches['searched_query_unidecode_sample'].unique()

    addresses_df = pd.DataFrame(
        {'searched_query_unidecode_sample': addresses_to_add, 'match': [pd.NaT] * len(addresses_to_add)}
    )

    cols_to_add = [col for col in apts_df_matching if col not in addresses_df.columns]

    addresses_df.loc[:, cols_to_add] = ''
    addresses_df_reordered = addresses_df[apts_df_matching.columns]

    apts_final = pd.concat([apts_df_matching, addresses_df_reordered])

    return apts_final #apts_df_matching, dropped_df

In [190]:
similarity_df = apt_similarity_filter(
    df = parsed_df, 
    stopwords_pattern = countries_stopwords.get('us'))

In [191]:
del parsed_df

In [192]:
similarity_df['match'] = similarity_df['match'].fillna(0)

match_proportion = np.mean(similarity_df['match'])
clean_proportion = round(match_proportion * 100, 2)
print(f'The proportion of matches is: {clean_proportion}%')

The proportion of matches is: 99.36%


In [193]:
similarity_df.head()

Unnamed: 0,feat_id,postal_code,hsn,state,place_name,st_name,country_code,prefix,suffix,pre_dir,...,searched_query_tokens,provider_tokens,aux_searched_query,aux_provider_address,locality_wratio,locality_city_state_ratio,locality_similarity,mnr_query_distance,mean_similarity,match
0,00005546-3500-2800-0000-0000005593e0,33132,1,FL,Miami,NE 3rd Ave,USA,,Ave,NE,...,se 3rd ave 1 33131,NE NE 3rd Ave Ave 1 33132,miami fl 333 usa,miami fl 3332 usa,97,73,85.0,1.0,88.75,1
1,00005546-3500-2800-0000-00000055d337,33138,100,FL,Miami,NE 96th St,USA,,St,NE,...,ne 96th st 100 33138,NE NE 96th St St 100 33138,miami shores fl usa,miami fl usa,86,56,71.0,6.2,92.75,1
2,00005546-3500-2800-0000-00000055a48e,33127,1000,FL,Miami,NW 41st St,USA,,St,NW,...,nw 41st st 1000 33127,NW NW 41st St St 1000 33127,miami fl usa,miami fl usa,100,86,93.0,17.6,98.25,1
3,00005546-3500-2800-0000-0000007a73eb,33169,1000,FL,Miami,Park Centre Blvd,USA,,Blvd,,...,park centre blvd 1000 33169,Park Centre Blvd Blvd 1000 33169,miami fl usa,miami fl usa,100,86,93.0,36.1,98.25,1
4,00005546-3500-2800-0000-00000058a665,33174,1000,FL,Miami,SW 96th Ave,USA,,Ave,SW,...,sw 96th ave 1000 33174,SW SW 96th Ave Ave 1000 33174,miami fl usa,miami fl usa,100,86,93.0,1.4,98.25,1


In [194]:
match_df = similarity_df[['feat_id', 'match', 'sample_id']]
match_df['county'] = county
match_df['datetime_run'] = pd.Timestamp.now(tz = 'utc')
match_df.rename({'match': 'asf'}, axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  match_df['county'] = county
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  match_df['datetime_run'] = pd.Timestamp.now(tz = 'utc')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [195]:
match_df.head()

Unnamed: 0,feat_id,asf,sample_id,county,datetime_run
0,00005546-3500-2800-0000-0000005593e0,1,1905,miamidade,2022-08-16 10:29:52.645148+00:00
1,00005546-3500-2800-0000-00000055d337,1,1241ext,miamidade,2022-08-16 10:29:52.645148+00:00
2,00005546-3500-2800-0000-00000055a48e,1,607ext,miamidade,2022-08-16 10:29:52.645148+00:00
3,00005546-3500-2800-0000-0000007a73eb,1,432ext,miamidade,2022-08-16 10:29:52.645148+00:00
4,00005546-3500-2800-0000-00000058a665,1,333ext,miamidade,2022-08-16 10:29:52.645148+00:00


#### Bootstrapping

In [196]:
def bootstrap_resample(df, agg_fun, times=1000, seed=0):
    reboot = []
    
    for t in range(times):
        df_boot = df.sample(frac = 1, replace=True, random_state = t+seed)
        reboot.append(agg_fun(df_boot))
        
    return reboot


def percentile_bootstrap(df, agg_fun, conf=0.9, times=1000, seed=0):
    """Generic Percentile Bootstrap
    This function returns a percentile bootstrap confidence interval for a statistic.
    Args:
        df (pandas.DataFrame): DataFrame with the observed random vectors. Each row represents an observation an each column is a random variable.
        agg_fun (function): Aggregation function. This function should receive as input a pandas.DataFrame (resamples) and return a 
        number with the computed statistic.
        conf (float, optional): Confidence level of the returned interval. Defaults to 0.9.
        times (int, optional): Bootstrap resamples. Defaults to 1000.
        seed (int, optional): Random seed. Defaults to 0.
    Returns:
        numpy.array: Percentile Boostrap CI [lower, upper]
    """    
    reboot = bootstrap_resample(df, agg_fun, times, seed)
    return np.quantile(reboot, [(1-conf)/2, (1-conf)/2+conf])

In [197]:
[lower_distance, upper_distance] = percentile_bootstrap(similarity_df['match'], np.mean)

In [198]:
results_sum = pd.DataFrame(
    data=[[lower_distance, match_proportion, upper_distance, '%', 'ASF', date, county]], 
    columns=['lower_bound', 'calculated_metric', 'upper_bound', 'units', 'metric', 'version', 'county'], index = None)
results_sum

Unnamed: 0,lower_bound,calculated_metric,upper_bound,units,metric,version,county
0,0.990695,0.993634,0.996572,%,ASF,2022-08-16,miamidade


## Positional Accuracy

### 90th percentile

We consider the 90th percentile of the distance of matches as a metric for the Positional Accuracy. The distance we obtain below is be the distance for which 90% of the data is lower. The interesting thing about this metric is that it's expressed in terms of distance.

In [199]:
matches_df = similarity_df[similarity_df['match'] == 1]

matches_df['mnr_query_distance'] = matches_df['mnr_query_distance'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matches_df['mnr_query_distance'] = matches_df['mnr_query_distance'].astype(float)


In [200]:
positional_accuracy_distance = round(np.quantile(matches_df['mnr_query_distance'], 0.9), 2)
print(f'Positional Accuracy (90th percentile distance) is: {positional_accuracy_distance}m')

Positional Accuracy (90th percentile distance) is: 29.34m


#### Bootstrapping

In [201]:
[lower_percentile90, upper_percentile90] = percentile_bootstrap(
    matches_df['mnr_query_distance'], lambda x: np.quantile(x, 0.9)
)

In [202]:
new_result = pd.DataFrame(
    data=[[lower_percentile90, positional_accuracy_distance, upper_percentile90, 'meters', '90p', date, county]], 
    columns=['lower_bound', 'calculated_metric', 'upper_bound', 'units', 'metric', 'version', 'county'], index = None)
results_sum = pd.concat([results_sum, new_result])
results_sum

Unnamed: 0,lower_bound,calculated_metric,upper_bound,units,metric,version,county
0,0.990695,0.993634,0.996572,%,ASF,2022-08-16,miamidade
0,24.3,29.34,35.02,meters,90p,2022-08-16,miamidade


### % of matches below 50m

In [203]:
proportion_50m_matches = (matches_df['mnr_query_distance'] <= 50).mean()
nice_num_50m = round(proportion_50m_matches * 100, 1)
print(f'The calculated percentage of matches within 50 meters is {nice_num_50m}%')

The calculated percentage of matches within 50 meters is 92.7%


In [204]:
[lower_50m_pa, upper_50m_pa] = percentile_bootstrap(
    matches_df['mnr_query_distance'] <= 50, np.mean
)

In [205]:
new_result = pd.DataFrame(
     data=[[lower_50m_pa, proportion_50m_matches, upper_50m_pa, '%', 'APA', date, county]], 
    columns=['lower_bound', 'calculated_metric', 'upper_bound', 'units', 'metric', 'version', 'county'])
results_sum = pd.concat([results_sum, new_result])
results_sum

Unnamed: 0,lower_bound,calculated_metric,upper_bound,units,metric,version,county
0,0.990695,0.993634,0.996572,%,ASF,2022-08-16,miamidade
0,24.3,29.34,35.02,meters,90p,2022-08-16,miamidade
0,0.917201,0.927058,0.936422,%,APA,2022-08-16,miamidade


# Join Matches Table

In [206]:
matches_df[['sample_id', 'mnr_query_distance']]

Unnamed: 0,sample_id,mnr_query_distance
0,1905,1.0
1,1241ext,6.2
2,607ext,17.6
3,432ext,36.1
4,333ext,1.4
...,...,...
2035,1949,0.1
2036,634ext,96.6
2037,695ext,26.2
2038,1755,2.0


In [207]:
match_df = match_df.merge(matches_df[['sample_id', 'mnr_query_distance']], on = ['sample_id'], how = 'left')
match_df['apa'] = match_df.mnr_query_distance.apply(lambda x: 1 if x < 50 else 0)
match_df.drop('mnr_query_distance', axis = 1, inplace = True)

# Store Results @ psql

### Storing Results

In [208]:
results_sum

Unnamed: 0,lower_bound,calculated_metric,upper_bound,units,metric,version,county
0,0.990695,0.993634,0.996572,%,ASF,2022-08-16,miamidade
0,24.3,29.34,35.02,meters,90p,2022-08-16,miamidade
0,0.917201,0.927058,0.936422,%,APA,2022-08-16,miamidade


In [209]:
raw2p.write_to_db(results_sum, table_name = 'results', schema = 'STAN_169')

Table stored!


1

### Storing Matches

In [210]:
match_df['version'] = date

In [211]:
match_df.head()

Unnamed: 0,feat_id,asf,sample_id,county,datetime_run,apa,version
0,00005546-3500-2800-0000-0000005593e0,1,1905,miamidade,2022-08-16 10:29:52.645148+00:00,1,2022-08-16
1,00005546-3500-2800-0000-00000055d337,1,1241ext,miamidade,2022-08-16 10:29:52.645148+00:00,1,2022-08-16
2,00005546-3500-2800-0000-00000055a48e,1,607ext,miamidade,2022-08-16 10:29:52.645148+00:00,1,2022-08-16
3,00005546-3500-2800-0000-0000007a73eb,1,432ext,miamidade,2022-08-16 10:29:52.645148+00:00,1,2022-08-16
4,00005546-3500-2800-0000-00000058a665,1,333ext,miamidade,2022-08-16 10:29:52.645148+00:00,1,2022-08-16


In [212]:
raw2p.write_to_db(match_df, table_name = 'matches_table', schema = 'STAN_169')

Table stored!


1