In [None]:
import re
import time
from urllib.parse import quote

import geopandas as gpd
import numpy as np
import pandas as pd
import requests as req
from shapely import wkt

### Load and Adjust Data

In [None]:
# Load the pre-cleaned records of both regions; the first CSV column is used as the index.
df_fairfax = pd.read_csv("data/df_fairfax_cleaned.csv" ,index_col=[0])
df_connecticut = pd.read_csv("data/df_connecticut_cleaned.csv", index_col=[0])

In [None]:
# Keep one row per distinct (city, addr, county) combination so every address is handled only once.
df_fairfax_addr = df_fairfax[["city", "addr", "county"]].drop_duplicates()
df_connecticut_addr = df_connecticut[["city", "addr", "county"]].drop_duplicates()

In [None]:
# Tag each address set with its state; the state name later becomes part of the geocoding query.
df_fairfax_addr["state"] = "Virginia"
df_connecticut_addr["state"] = "Connecticut"

In [None]:
# Stack both address sets. The original row labels are kept, so the index may contain duplicates.
df_combined = pd.concat([df_fairfax_addr, df_connecticut_addr])

In [None]:
# Normalise the casing of street and city names.
# Note: title-casing also capitalises a letter that follows a digit or an
# apostrophe (e.g. "1st" -> "1St"), which the later abbreviation rules will see.
df_combined["addr"] = df_combined.addr.str.title()
df_combined["city"] = df_combined.city.str.title()

### Use API

In [None]:
# Build one percent-encoded "<addr> <city> <state>" query per address.
# quote() encodes spaces as %20 (as before) but also escapes spaces inside city
# names and reserved characters such as "&" or "#" in the address, which would
# otherwise cut the query string short. Missing addresses stay NaN.
locations = (
    df_combined.addr + " " + df_combined.city + " " + df_combined.state
).map(quote, na_action="ignore")

In [None]:
locations.iloc[2]

In [None]:
# Trial request for the third location only, to inspect the response before looping over all addresses.
request_string = f"https://nominatim.openstreetmap.org/search?q={locations.iloc[2]}&format=json&addressdetails=1&limit=1&polygon_svg=1"

In [None]:
print(request_string)

In [None]:
responds = req.get(request_string)

In [None]:
# Geocode every location with Nominatim. Addresses that cannot be resolved get
# NaN coordinates so that lat_list / long_list stay aligned with `locations`.
# NOTE(review): the public Nominatim service asks clients to identify themselves
# (User-Agent / Referer) and to stay at or below one request per second; the
# sleep below covers the rate, but no identifying header is sent — confirm.
lat_list = []
long_list = []
for i, loc in enumerate(locations, start=1):
    request_string = f"https://nominatim.openstreetmap.org/search?q={loc}&format=json&addressdetails=1&limit=1&polygon_svg=1"
    responds = req.get(request_string)

    # Parse the body only for a successful response, and only once. The former
    # `(status == 200) & len(json) != 0` did not short-circuit, so .json() was
    # also called on error responses (which may not contain JSON at all).
    results = responds.json() if responds.status_code == 200 else []

    if results:
        lat_list.append(results[0]["lat"])
        long_list.append(results[0]["lon"])
    else:
        lat_list.append(np.nan)
        long_list.append(np.nan)

    print(i)  # progress counter
    time.sleep(1)  # throttle to one request per second
    

In [None]:
lat_list

In [None]:
lat_list

In [None]:
long_list

In [None]:
len(lat_list)

In [None]:
len(long_list)

In [None]:
# Replace the (possibly duplicated) row labels left over from the concat with a clean 0..n-1 index.
df_combined = df_combined.reset_index(drop=True)

In [None]:
# Keep exactly as many rows as were geocoded (instead of a hard-coded row count),
# and take a copy so the latitude/longitude assignments below write into an
# independent frame rather than into a slice of df_combined.
df_geocoded_addr = df_combined.iloc[:len(lat_list)].copy()

In [None]:
df_geocoded_addr.loc[:,"latitude"] = lat_list

In [None]:
df_geocoded_addr.loc[:,"longitude"] = long_list

In [None]:
df_geocoded_addr

In [None]:
df_geocoded_addr.to_csv("df_geocoded_addr.csv")

### Address Matching

##### Save geojson as csv

In [None]:
# Names of the per-county GeoJSON exports (file stem == county name).
counties = [
            "new_haven",
            "new_london",
            "middlesex",
            "litchfield",
            "hartford",
            "fairfield",
            "tolland",
            "windham",
            "fairfax1",
            ]

# Convert each county GeoJSON into a slim CSV that holds only the address
# columns and the geometry.
# NOTE(review): the input path is an absolute path on an external volume, so
# this cell only runs on a machine where that volume is mounted.
for c in counties:
    df = gpd.read_file(f"/Volumes/Seagate/bavillion/geocoding3/{c}.geojson")
    # Restrict to the address tags plus the geometry.
    df = df[['addr:city',
        'addr:country',
        'addr:housename',
        'addr:housenumber',
        'addr:postcode',
        'addr:state',
        'addr:street', "geometry"]]
    # Features without a street cannot be matched by address.
    df = df.dropna(subset=["addr:street"])
    # Use the same column names as the address frame so both can be merged later.
    df = df.rename(columns={"addr:housenumber": "housenr", "addr:city": "city", "addr:street":"addr"})
    # Written to the current working directory as df_geo_<county>.csv.
    df[["city", "addr", "housenr", 'addr:postcode', "geometry"]].to_csv(f"df_geo_{c}.csv")


In [None]:
# Rows without a street address cannot be cleaned or matched; drop them.
df_combined = df_combined.dropna(subset=["addr"])

- Rt change to CT (Route) e.g. Rt-171 to Rt (CT-171)
- Dr change to Drive e.g. Hemlock Dr to Hemlock Drive
- Change st to Street
- Ext to Extension
- La to Lane
- Tpke to Turnpike
- Cir to Circle


- So -> South
- No -> North

- Resv -> Reservoir
- " Rd " -> " Road "
- " St " -> " Street "
- "Ln" -> "Lane"

- Only split when the first part contains a number, e.g. 23A Street -> 23 Street; Street Drive -> Street Drive
- Remove numbers at the end of the street name
- Remove single letters before or after the street name

- Boston Pike (not found)


- Mt -> Mount



##### Cleaning Address Column

In [None]:
def split_digits(street):
    """Drop the first space-separated token of ``street`` if it contains a digit.

    The first token is treated as a house number (e.g. ``"23A"``) and removed;
    addresses whose first token has no digit are returned unchanged.

    Args:
        street: Raw address string.

    Returns:
        The text after the first space when the first token contains a digit
        (an empty string if there is nothing after it), otherwise ``street``.
    """
    first_token, _, remainder = street.partition(' ')
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    if re.search(r'\d', first_token):
        return remainder
    return street

In [None]:
def match(housenumber):
    """Return the leading run of digits of ``housenumber``.

    Returns ``None`` when the string does not start with a digit.
    """
    leading_digits = re.match(r'^\d+', housenumber)
    return leading_digits.group() if leading_digits else None

In [None]:
def remove_end_digits(street):
    """Strip number-like tokens that are followed by whitespace or the string end.

    A token is a plain number, a number with trailing letters ("12A"), a signed
    number or a number range ("5-7"). The whitespace character after the token
    is removed with it. Streets containing "Route" are left untouched because
    their number is part of the name.
    """
    if "Route" not in street:
        street = re.sub(r'(?:\d+[+-]\d*|[-+]\d+|\d+[A-Za-z]*)(\s|$)', '', street)
    return street


In [None]:
def remove_start_digits(street):
    """Strip number-like tokens that are preceded by whitespace or the string start.

    Accepts the same token shapes as ``remove_end_digits`` (plain, lettered,
    signed or ranged numbers); the whitespace character in front of the token is
    removed with it. Streets containing "Route" are returned unchanged.
    """
    if "Route" not in street:
        street = re.sub(r'(\s|^)(?:\d+[+-]\d*|[-+]\d+|\d+[A-Za-z]*)', '', street)
    return street

In [None]:
def put_the_to_front(street):
    """Move a standalone "The" to the front of the street name.

    "Knoll, The" and "Knoll (The)" both become "The Knoll". Names without a
    standalone "The" are returned unchanged.
    """
    article = r"\bThe\b"
    if re.search(article, street) is None:
        return street

    # Remove the article, an emptied "()" pair, and surrounding blanks/commas.
    remainder = re.sub(article, "", street).replace("()", "").strip().strip(",").strip()
    return "The " + remainder

In [None]:
put_the_to_front("Knoll, The")

In [None]:
def ad_split(street):
    """Reduce a compound address ("A/B", "A & B", "A\\B") to a single part.

    The first part is returned unless it is empty or purely numeric (ignoring
    surrounding blanks, "-" and "."), in which case the second part is returned.
    Parts are returned as they were split, i.e. not stripped. Strings without a
    separator are returned unchanged.
    """
    head, *rest = re.split(r"/|&|\\", street)
    if not rest:
        return street

    head_core = head.strip().strip("-").strip(".")
    if head_core == "" or head_core.isdigit():
        return rest[0]
    return head

In [None]:
def remove_letters(street):
    """Remove a stray single letter at the start or end of a street name.

    Street names in which a single "B" is part of the name are special-cased:
    "... B Lane" only loses a trailing letter, and "Avenue B" / "Aaron B" /
    "Ave B" only lose a leading letter (replaced by a blank). A letter that is
    directly followed by an apostrophe (as in "O'Neill") is never removed.
    """
    if re.search(r"\bB Lane", street):
        return re.sub(r"\b(?!')[a-zA-Z]$", "", street.strip())

    keeps_trailing_b = any(
        re.search(pattern, street)
        for pattern in (r"Avenue B(\s|$)", r"Aaron B(\s|$)", r"Ave B(\s|$)")
    )
    if keeps_trailing_b:
        return re.sub(r"^[a-zA-Z]\b(?!')", " ", street.strip())

    return re.sub(r"(^)\b(?!')[a-zA-Z]\b(?!')|\b(?!')[a-zA-Z]\b(?!')($)", '', street)

In [None]:
def remove_after_paran(street):
    """Cut ``street`` off at the first opening parenthesis (the "(" included)."""
    open_paren_to_end = re.compile(r'\(.*?$')
    return open_paren_to_end.sub("", street)

In [None]:
def abbrv(df_addr):
    """Expand street-type abbreviations in a Series of address strings.

    The substitutions are applied one after another, in the order listed, to
    every element; later rules therefore see the output of earlier ones. Most
    replacements end with a blank, so results may carry trailing whitespace.

    Args:
        df_addr: pandas Series of strings.

    Returns:
        A new Series with the abbreviations expanded.
    """
    # (regex pattern, replacement) — the order is significant.
    rules = (
        (r"St Joseph", "Saint Joseph"),
        (r"St($|\b)", "Street "),
        (r'\bUnit(\b|$)\.*', ''),
        (r"Ct($|\b)", "Court "),
        (r"Crt($|\b)", "Court "),
        (r"Cor($|\b)", "Corner "),
        (r"Ctr($|\b)", "Center "),
        (r"Av($|\s|\b)", "Avenue "),
        (r"Ave($|\s|\b)", "Avenue "),
        (r"Apts($|\b)", "Apartments "),
        (r"Al($|\b)", "Alley "),
        (r"Tr($|\b)", "Terrace "),
        (r"Terr($|\b)", "Terrace "),
        (r"Te($|\b)", "Terrace "),
        (r"Trce($|\b)", "Trace"),
        (r"Thse($|\b)", "Townhouse "),
        (r"Pl($|\b)", "Place "),
        (r"Wy($|\b)", "Way "),
        (r"Wa($|\b)", "Way "),
        (r"Ter($|\b)", "Terrace "),
        (r"Blvd($|\b)", "Boulevard "),
        (r"Bl($|\b)", "Boulevard "),
        (r"Bv($|\b)", "Boulevard "),
        (r"Blv($|\b)", "Boulevard "),
        (r"Bch($|\b)", "Beach "),
        (r"Cmn($|\b)", "Common "),
        (r"Cmns($|\b)", "Commons "),
        (r"Hol($|\b)", "Hollow "),
        (r"Hlw($|\b)", "Hollow "),
        (r"Sq($|\b)", "Square "),
        (r"Trl($|\b)", "Trail "),
        (r"Tl($|\b)", "Trail "),
        (r"Hwy($|\b)", "Highway "),
        (r"Hi($|\b)", "Highway "),
        (r"Hgwy($|\b)", "Highway "),
        (r"Lndg($|\b)", "Landing "),
        (r"Pt($|\b)", "Point "),
        (r"Po($|\b)", "Point "),
        (r"Pro($|\b)", "Professional "),
        (r"Brk($|\b)", "Brook "),
        (r"Rdg($|\b)", "Ridge "),
        (r"So($|\b)", "South "),
        (r"Tp($|\b)", "Turnpike "),
        (r"Tpk($|\b)", "Turnpike "),
        (r"Tpke($|\b)", "Turnpike "),
        (r"Tnpk($|\b)", "Turnpike "),
        (r"Bp($|\b)", "Bridgeport "),
        (r"Br($|\b)", "Bridge "),
        (r"Pkwy($|\b)", "Parkway "),
        (r"Pky($|\b)", "Parkway "),
        (r"Vw($|\b)", "View "),
        (r"Cv($|\b)", "Cove "),
        (r"Ave($|\b)", "Avenue "),
        (r"Dr($|\b)", "Drive "),
        (r"Cr($|\b)", "Circle "),
        (r"Ci($|\b)", "Circle "),
        (r"Cir($|\b)", "Circle "),
        (r"Cl($|\b)", "Close "),
        (r"Cswy($|\b)", "Causeway "),
        (r"Cres($|\b)", "Crescent "),
        (r"Se($|\b)", "Southeast "),
        (r"Sw($|\b)", "Southwest "),
        (r"Nw($|\b)", "Northwest "),
        (r"Ne($|\b)", "Northeast "),
        (r"No($|\b)", "North "),
        (r"Plz($|\b)", "Plaza "),
        (r"Ptwy($|\b)", "Pathway "),
        (r"Pth($|\b)", "Path "),
        (r"Grv($|\b)", "Grove "),
        (r"Gr($|\b)", "Grove "),
        (r"Drs($|\b)", "Drive "),
        (r"Hgts($|\b)", "Heights "),
        (r"Hghts($|\b)", "Heights "),
        (r"Hts($|\b)", "Heights "),
        (r"Ht($|\b)", "Heights "),
        (r"Hl($|\b)", "Hill "),
        (r"Hls($|\b)", "Hills "),
        (r"Pk($|\b)", "Park "),
        (r"Rdg($|\b)", "Ridge "),
        (r"Ex($|\b)", "Extension "),
        (r"Ext($|\b)", "Extension "),
        (r"Rte($|\b)", "Route "),
        (r"Rd($|\b)", "Road "),
        (r"Rvr($|\b)", "River "),
        (r"La($|\b)", "Lane "),
        (r"Ln($|\b)", "Lane "),
        (r"Ldg($|\b)", "Lodge "),
        (r"Mtn($|\b)", "Mountain "),
        (r"Mt($|\b)", "Mountain "),
        (r"Ind($|\b)", "Industrial "),
        (r"Lk($|\b)", "Lake "),
        (r"Vlg($|\b)", "Village "),
        (r"Mnr($|\b)", "Manor "),
        (r"Knls($|\b)", "Knolls "),
        (r"Knl($|\b)", "Knoll "),
        (r"Pswy($|\b)", "Passway "),
        (r"Qtr($|\b)", "Quarter "),
        (r"Mdw($|\b)", "Meadow "),
        (r"Vly($|\b)", "Valley "),
        (r"Kn($|\b)", "Knoll "),
        (r"Grn($|\b)", "Green "),
        (r"Is($|\b)", "Island "),
    )

    expanded = df_addr
    for pattern, replacement in rules:
        # Bind the current rule through default arguments so each pass uses its own pair.
        expanded = expanded.apply(
            lambda text, pattern=pattern, replacement=replacement: re.sub(pattern, replacement, text)
        )
    return expanded

In [None]:
def clean(df):
    """Derive a house number and a normalised street name from ``addr``.

    Rows without an address are dropped. Two columns are added:

    * ``housenr``  – leading digits of the first address token, or NaN/None
      when the address does not start with a number;
    * ``addr_mod`` – the street name with the house number, unit/lot markers,
      stray letters and digits removed and abbreviations expanded, so that it
      can be merged against the street names of the geo data.

    The substitutions are hand-tuned and order-dependent: every step operates
    on the output of the previous one.

    Args:
        df: DataFrame with a string column ``addr``.

    Returns:
        A new DataFrame (the input is not modified) with ``housenr`` and
        ``addr_mod`` added and a fresh 0..n-1 index.
    """
    # Copy so the column assignments below never write into the caller's frame.
    df = df.dropna(subset="addr").copy()

    # First token (split on whitespace or "/") -> leading digits. Missing
    # addresses were dropped above, so no NaN handling is needed here (the old
    # `x == np.nan` test could never be true anyway).
    df["housenr"] = df["addr"].apply(lambda x: re.split(r"\s|/", x)[0]).apply(match)

    df["addr_mod"] = (df["addr"].apply(split_digits)
                      .str.strip("-")
                      # Expand only the *trailing* abbreviation. str.replace() used to
                      # rewrite every occurrence ("Lake La" -> "Laneke Lane").
                      .apply(lambda x: re.sub(r"La(\s*)$", r"Lane\1", x))
                      .apply(lambda x: re.sub(r"Ext(\s*)$", r"Extension\1", x))
                      .apply(lambda x: re.sub(r"\-[A-Z]$", "",x))
                      .apply(lambda x: re.sub(r"\s[A-Z]\-[A-Z]\s", "",x))
                      .str.replace("Rt ", "Route ")
                      .str.replace('No ', 'North ')
                      .str.replace(' No ', ' North ')
                      .str.replace('Resv ', 'Reservoir ')
                      .str.replace(' Resv ', ' Reservoir ')
                      .str.replace(' Rd ', ' Road ')
                      .str.replace(' La ', ' Lane ')
                      .str.replace('Talcott Forest Road East', 'Talcott Forest Road') # TODO(review): data-specific special case
                      .apply(lambda x: re.sub(r'\bLn(\s|$)', "Lane ", x))
                      .str.replace(' Tpke ', ' Turnpike ')
                      # Standalone compass letters.
                      .apply(lambda x: re.sub(r"(^|\s)N($|\s)", " North ", x))
                      .apply(lambda x: re.sub(r"(^|\s)W($|\s)", " West ", x))
                      .apply(lambda x: re.sub(r"(^|\s)S($|\s)", " South  ", x))
                      .apply(lambda x: re.sub(r"(^|\s)E($|\s)", " East ", x))
                      .apply(lambda x: re.sub(r"(^|\s)N W($|\s)", " Northwest ", x))
                      .apply(put_the_to_front)
                      # Restore apostrophes that are missing in the raw data.
                      .apply(lambda x: re.sub("O Neill", "O'Neill", x))
                      .apply(lambda x: re.sub(r"O Brien", "O'Brien", x))
                      .apply(lambda x: re.sub(r"O Clock", "O'Clock", x))
                      .apply(lambda x: re.sub(r"O Rocks", "O'Rocks", x))
                      .apply(lambda x: re.sub(r"Wells Place Place", "Wells Place", x))
                      .apply(lambda x: re.sub(r"Alexander D ", "Alexander Drive ", x))
                      .apply(remove_letters)
                      .apply(lambda x: re.sub(r'\(.*?\)', "", x)) # remove ("any string")
                      .apply(remove_end_digits)
                      .apply(ad_split)
                      .apply(lambda x: re.sub(r"#\S*", "", x))
                      .str.strip()
                      .str.strip(",")
                      .str.strip(".")
                      .str.strip()
    )

    addr_mod = (abbrv(df["addr_mod"])
                .str.replace("#North", "North")
                .str.replace("#South", "South")
                .str.replace("#West", "West")
                .str.replace("#East", "East")
                .str.replace("#Wy", "Way")
                .apply(remove_letters)
                .str.replace("- Union", "Union")
                .str.replace("- Extension", "Extension")
                .str.replace("-Extension", "Extension")
                .str.replace("Nrwh Wstly", "Norwich Westerly")
                .str.replace("Avenue-Extension", "Avenue Extension")
                .str.replace("  ", " ")
                .str.strip("_")
                .str.strip()
                .apply(lambda x: re.sub(r"^[\d\s.]*", "", x))
    )

    # Trailing unit / lot / building markers found in the data, applied in
    # this order. All are removed except "Rea", which is completed to "Rear",
    # and "Street Land", which is shortened to "Street".
    trailing_rules = (
        (r"Rear$", ""),
        (r"Re$", ""),
        (r"Rea$", "Rear"),
        (r"Gar$", ""),
        (r"Lt$", ""),
        (r"Adj$", ""),
        (r"Aka$", ""),
        (r"Ch$", ""),
        (r"Ch2$", ""),
        (r"Ctg$", ""),
        (r"Ul$", ""),
        (r"Cb$", ""),
        (r"Ogba$", ""),
        (r"Gnh$", ""),
        (r"Bpbc$", ""),
        (r"Lowr$", ""),
        (r"Lp$", ""),
        (r"Om$", ""),
        (r"Ply$", ""),
        (r"Preq$", ""),
        (r"Lz$", ""),
        (r"Rz$", ""),
        (r"Abcd$", ""),
        (r"Eb$", ""),
        (r"Prim$", ""),
        (r"Osg$", ""),
        (r"Dwl$", ""),
        (r"Iii$", ""),
        (r"Ab$", ""),
        (r"Lot$", ""),
        (r"Rr$", ""),
        (r"Bldg$", ""),
        (r"Beu$", ""),
        (r"Na$", ""),
        (r"Ph$", ""),
        (r"Un1$", ""),
        (r"Una$", ""),
        (r"Kc$", ""),
        (r"Bh$", ""),
        (r"Mr$", ""),
        (r"Prvt$", ""),
        (r"Gnb$", ""),
        (r"Bsmt$", ""),
        (r"Unit$", ""),
        (r"Aa$", ""),
        (r"Ru$", ""),
        (r"Street Land$", "Street"),
    )
    for pattern, replacement in trailing_rules:
        addr_mod = addr_mod.apply(
            lambda x, pattern=pattern, replacement=replacement: re.sub(pattern, replacement, x)
        )

    df["addr_mod"] = (addr_mod
                      .apply(remove_end_digits)
                      .apply(remove_start_digits)
                      .apply(lambda x: re.sub(r"^\d+", "", x))
                      .apply(lambda x: re.sub(r"Oneill", "O'neill", x))
                      .apply(lambda x: re.sub(r"\([a-zA-Z\d]*\)", "",x))
                      .apply(lambda x: re.sub(r"^Off\b", " ",x))
                      .apply(lambda x: re.sub(r"^Rdwy", "",x))
                      .apply(lambda x: re.sub(r"^Lot\b", " ",x))
                      .apply(lambda x: re.sub(r"^Lot A", "",x))
                      .apply(remove_after_paran)
                      .str.replace("Fish Game", "Fish + Game")
                      # abbrv() turned every "St" into "Street"; restore saint names.
                      .str.replace("Street Andrews", "St Andrews")
                      .str.replace("Street John", "St John")
                      .str.replace("Street Lawrence", "St Lawrence")
                      .str.replace("Street Paul", "St Paul")
                      .str.replace("Street Thomas", "St Thomas")
                      .str.replace("Street Stephens", "St Stephens")
                      # Was restored to "St Stephens" (copy-paste slip).
                      .str.replace("Street Mathias", "St Mathias")
                      .str.replace("Street Andrew", "St Andrew")
                      .str.replace("Street James", "St James")
                      .str.strip()
                      .str.strip(",")
                      .str.strip("-")
                      .apply(remove_letters)
                      .str.strip("-")
                      .str.strip(",")
                      .str.strip("-")
                      .str.strip(".")
                      .str.strip("+")
                      .str.strip()
                      )

    # Keep only purely numeric house numbers.
    df["housenr"] = np.where(df.housenr.str.isdigit(), df.housenr, np.nan)
    # drop=True as a real boolean (the string "True" only worked because it is truthy).
    return df.reset_index(drop=True)

In [None]:
def match_adress_to_location(df, df_geo, county="all"):
    """Attach a geometry from ``df_geo`` to the addresses in ``df``.

    Three passes are combined:

    1. exact merge on street (``addr_mod`` vs. ``addr``), house number and city;
    2. rows still unmatched that have a house number: merge on street and city
       only and keep, per (addr_x, city) group, the candidate whose numeric
       house number is closest;
    3. rows still unmatched without a house number: merge on street and city
       and pick one candidate per (addr_x, city) group at random.

    Args:
        df: Address frame; the columns addr, addr_mod, housenr, city, county
            and state are used.
        df_geo: Geo frame; the columns addr, housenr, city and geometry are used.
        county: Value of ``df.county`` to restrict to, or "all" for no filter.

    Returns:
        DataFrame with the columns addr, city, county, state and geometry,
        de-duplicated on (addr, city).

    NOTE(review): pass 3 uses ``sample(1)`` without a seed, so the selected
    geometry can differ between runs.
    """
    if county!="all":
        df = df[df.county == county]
    print(df.shape)
    # Pass 1: exact match. Both frames have an "addr" column that is not a shared
    # key, so the merge result carries addr_x (from df) and addr_y (from df_geo).
    df_merge = df.merge(df_geo, how="left", left_on=["addr_mod", "housenr", "city"], right_on=["addr", "housenr", "city"])
    df_missings = df_merge[df_merge.geometry.isnull()]
    df_found = df_merge.dropna(subset=["geometry"]) 
    # (same selection as two lines above, repeated)
    df_missings = df_merge[df_merge.geometry.isnull()]
    # Split the unmatched rows by whether a house number is available.
    df_missing_housnr = df_missings[df_missings["housenr"].isnull()]
    df_missings = df_missings.dropna(subset="housenr")
    # Pass 2: match on street + city only; housenr/geometry get _x (address side)
    # and _y (geo side) suffixes.
    df_missing_merge = df_missings.merge(df_geo, how="left", left_on=["addr_mod", "city"], right_on=["addr", "city"])

    # Reduce both house numbers to plain numbers so their distance can be computed.
    # NOTE(review): the class [a-zA-z] also covers the ASCII characters between
    # "Z" and "a" ([ \ ] ^ _ `); probably [a-zA-Z] was intended — confirm.
    df_missing_merge["housenr_x"] = df_missing_merge["housenr_x"].dropna().apply(lambda x: re.sub(r'[a-zA-z]', '', x)).astype("float64")
    # Geo-side house numbers can be ranges or lists; keep only the first component.
    df_missing_merge["housenr_y"] = (df_missing_merge["housenr_y"].dropna().apply(lambda x: x.split('-')[0])
                                     .apply(lambda x: x.split("+")[0])
                                     .apply(lambda x: x.split("&")[0])
                                     .apply(lambda x: x.split(";")[0])
                                     .apply(lambda x: x.split(" ")[0])
                                     .apply(lambda x: x.split(",")[0])
                                     .apply(lambda x: x.split("/")[0])
                                     .str.strip("#")
                                     .str.strip("Ë")
                                     .str.replace(")", "")
                                     .apply(lambda x: re.sub(r'[a-zA-z]', '', x)))
    # Empty strings cannot be converted to float; treat them as missing.
    df_missing_merge["housenr_y"] = np.where(df_missing_merge["housenr_y"] == "", np.nan, df_missing_merge["housenr_y"]).astype("float64")
    df_missing_merge["diff"] = np.abs(df_missing_merge["housenr_x"] - df_missing_merge["housenr_y"])
    # Per address, keep the candidate with the smallest house-number distance.
    df_found_2 = df_missing_merge.loc[df_missing_merge.dropna(subset="geometry_y").groupby(by=["addr_x", "city"])["diff"].idxmin().dropna()]
    # Pass 3: addresses without a house number — any geometry on that street in that city.
    df_missing_merge_h = df_missing_housnr.merge(df_geo, how="left", left_on=["addr_mod", "city"], right_on=["addr", "city"])
    df_missing_merge_h = df_missing_merge_h.dropna(subset="geometry_y")
    df_found_3 = df_missing_merge_h.dropna(subset="geometry_y").groupby(by=["addr_x", "city"]).sample(1)
    selected_columns = ["addr", "city", "county", "state", "geometry"]

    
    # Bring the three partial results to common column names: "addr" is the
    # original address (addr_x) and "geometry" the matched geo-side geometry.
    df_found = df_found.rename(columns={"geometry_y": "geometry", "addr_x":"addr"})
    df_found_2 = df_found_2.drop(columns="addr").rename(columns={"geometry_y": "geometry", "addr_x":"addr"})
    df_found_3 = df_found_3.drop(columns="addr").rename(columns={"geometry_y": "geometry", "addr_x":"addr"})

    df_clean = pd.concat([df_found, df_found_2, df_found_3])
    return df_clean[selected_columns].reset_index(drop=True).drop_duplicates(subset=["addr", "city"])

In [None]:
# clean
df_cleaned = clean(df_combined)

#### Match

##### Fairfax

In [None]:
df_fairfax = df_cleaned[df_cleaned.county == "Fairfax"]

In [None]:
df_geo_fairfax = pd.read_csv("df_geo_fairfax1_2.csv", index_col=[0])

In [None]:
# Earlier approach (kept for reference): map ZIP codes to cities via a lookup table.
# zipcodes = pd.read_csv("ZIP_Codes.csv")
# zipcodes = zipcodes[["ZIPCODE", "ZIPCITY"]]
# Polygons used below to assign a city (ZIPCITY) and ZIP code to each geometry.
# NOTE(review): index_col is a pandas.read_csv option; gpd.read_file only forwards
# unknown keyword arguments to its I/O engine — confirm it is accepted/needed here.
df_fairfax_cities = gpd.read_file("fairfax_cities.geojson", index_col=[0])

In [None]:
# Build a street-level fallback table from the highway export: named road
# geometries, each tagged with the city/ZIP polygon it intersects.
# NOTE(review): absolute path on an external volume — only runs where it is mounted.
df_highway_fairfax = pd.read_csv("/Volumes/Seagate/Bavillion/highway/fairfax_highway.csv")
# Only roads that have both a name and a geometry are usable.
df_highway_fairfax = df_highway_fairfax[["geometry", "tiger:county", "name"]].dropna(subset="name")
df_highway_fairfax = df_highway_fairfax.dropna(subset="geometry")
# The CSV stores geometries as WKT text; parse them into shapely objects.
df_highway_fairfax["geometry"] = df_highway_fairfax["geometry"].apply(wkt.loads)
df_highway_fairfax = gpd.GeoDataFrame(df_highway_fairfax)
df_highway_fairfax.crs = "EPSG:4326"
# Spatial join: a road that intersects several polygons yields one row per polygon.
df_highway_fairfax = df_highway_fairfax.sjoin(df_fairfax_cities[["ZIPCODE", "ZIPCITY", "geometry"]], how="left", predicate="intersects")
# Rename to the column names used by the address-point geo tables.
df_highway_fairfax = df_highway_fairfax.drop(columns=["tiger:county", "index_right"]).rename(columns={"ZIPCODE":"addr:postcode", 
                                "ZIPCITY":"city", 
                                "name": "addr"})
df_highway_fairfax["city"] = df_highway_fairfax["city"].str.title()
# Roads carry no house number.
df_highway_fairfax["housenr"] = np.nan
df_highway_fairfax

Add city to geo

In [None]:
df_geo_fairfax.geometry

In [None]:
def _split(postcode):
    if isinstance(postcode, float) and np.isnan(postcode):
        return postcode
    return postcode.split("-")[0]


In [None]:
# Earlier approach (kept for reference): derive the city from the postcode via the ZIP lookup table.
#df_geo_fairfax = df_geo_fairfax.dropna(subset="addr:postcode")
#df_geo_fairfax["addr:postcode"] = df_geo_fairfax["addr:postcode"].dropna().apply(lambda x: x.split("-")[0])

# df_geo_fairfax["addr:postcode"] = df_geo_fairfax["addr:postcode"].apply(_split)
# df_geo_fairfax["addr:postcode"] = df_geo_fairfax["addr:postcode"].astype("float64")
# df_geo_fairfax = df_geo_fairfax.merge(zipcodes, how="left", left_on="addr:postcode", right_on="ZIPCODE")
# df_geo_fairfax = df_geo_fairfax.drop(columns="city").rename(columns={"ZIPCITY": "city"})

# Current approach: assign the city spatially.
# The CSV stores geometries as WKT text; parse them and build a GeoDataFrame.
# NOTE: this cell is not re-runnable as is — a second run would pass already
# parsed geometries to wkt.loads.
df_geo_fairfax["geometry"] = df_geo_fairfax["geometry"].apply(wkt.loads)
df_geo_fairfax = gpd.GeoDataFrame(df_geo_fairfax)
df_geo_fairfax.crs = "EPSG:4326" 

df_fairfax_cities.crs = "EPSG:4326" 

# Replace the tagged city/postcode with those of the polygon the geometry intersects.
df_geo_fairfax = df_geo_fairfax.sjoin(df_fairfax_cities[["ZIPCODE", "ZIPCITY", "geometry"]], how="left", predicate="intersects")
df_geo_fairfax = df_geo_fairfax.drop(columns=["city", "addr:postcode"]).rename(columns={"ZIPCODE": "postcode", "ZIPCITY":"city"})

df_geo_fairfax["city"] = df_geo_fairfax["city"].str.title()
# Number of geometries that did not fall into any city polygon.
print(df_geo_fairfax.city.isna().sum())

# Fallback for geometries without a city: if a cleaned street name occurs in
# exactly one city in the address data, use that city.
grouped_addr = df_fairfax.groupby(by=["addr_mod", "city"]).size().to_frame()
grouped_addr.columns = ["size"]
grouped_addr = grouped_addr.reset_index()
# One row per (street, city) pair, so value_counts gives the number of cities per street.
addr_count = grouped_addr.addr_mod.value_counts().to_frame().reset_index()
# NOTE(review): relies on the value_counts column being named "count" (pandas >= 2.0) — confirm.
unique_addr = addr_count[addr_count["count"] == 1].addr_mod.values
unique_addr_city = df_fairfax[df_fairfax.addr_mod.isin(unique_addr)][["addr_mod", "city"]].drop_duplicates()
df_geo_fairfax = df_geo_fairfax.merge(unique_addr_city, how="left", left_on="addr", right_on="addr_mod", suffixes=["_geo", "_add"])
# Prefer the spatially assigned city; fall back to the city from the address data.
df_geo_fairfax["city"] = np.where(df_geo_fairfax.city_geo.isna(), df_geo_fairfax.city_add, df_geo_fairfax.city_geo)
df_geo_fairfax = df_geo_fairfax.drop(columns=["city_geo", "city_add", "addr_mod"])

In [None]:
# First matching pass for Fairfax against the prepared geodata.
# NOTE(review): the full df_cleaned is passed together with county="Fairfax";
# presumably the helper restricts to that county itself — confirm.
match_fairfax = match_adress_to_location(df_cleaned, df_geo_fairfax, county="Fairfax")

In [None]:
# Second matching pass for Fairfax: geodata plus the highway table as fallback.
# Highway rows get the constant house number "0" (replacing the NaN set earlier).
df_highway_fairfax["housenr"] = "0"
df_geo_fairfax_comb = pd.concat([df_geo_fairfax, df_highway_fairfax])

# Retry only the Fairfax rows whose addr is absent from the first-pass result.
missing_fairfax = df_fairfax.loc[~df_fairfax["addr"].isin(match_fairfax["addr"])]
match_fairfax_2 = match_adress_to_location(
    missing_fairfax,
    df_geo_fairfax_comb,
    county="Fairfax",
)

In [None]:
# Stack first-pass and second-pass Fairfax matches; row indices are kept as-is
# here and only reset after all counties are combined.
final_match_fairfax = pd.concat([match_fairfax, match_fairfax_2])

In [None]:
# Distinct addr_mod values of Fairfax rows whose addr appears in neither matching pass.
df_fairfax[~df_fairfax.addr.isin(final_match_fairfax.addr)].addr_mod.drop_duplicates()

##### Windham

In [None]:
# Connecticut ZIP-code polygons, keyed by "zip" with leading zeros stripped.
connecticut_zip = (
    gpd.read_file("ct_connecticut_zip_codes_geo.min.json")[["ZCTA5CE10", "geometry"]]
    .rename(columns={"ZCTA5CE10": "zip"})
)
connecticut_zip["zip"] = connecticut_zip["zip"].str.lstrip("0")

# ZIP -> city lookup; rows without a city are dropped.
# NOTE(review): the merge below needs this "zip" column to compare equal to the
# stripped strings above — confirm how the CSV column is typed.
connecticut_city = pd.read_csv("ct_zipcode_city.csv")[["zip", "City"]].dropna(subset="City")

# Left join keeps every polygon, including those without a city entry.
connecticut_zip = connecticut_zip.merge(connecticut_city, how="left", on="zip")
connecticut_zip["City"] = connecticut_zip["City"].str.title()

In [None]:
# --- Windham, first pass: match the cleaned addresses against the county geodata.
df_windham = df_cleaned[df_cleaned.county == "Windham"]
df_geo_windham = pd.read_csv("df_geo_windham_2.csv", index_col=[0])
match_windham = match_adress_to_location(df_cleaned, df_geo_windham, county="Windham")

# --- Highway fallback: keep rows with a name and a geometry, parse the WKT text,
# and attach the city of every Connecticut ZIP polygon a highway intersects.
df_highway_windham = pd.read_csv("/Volumes/Seagate/Bavillion/highway/windham_highway.csv")
df_highway_windham = df_highway_windham[["geometry", "tiger:county", "name"]].dropna(subset="name")
df_highway_windham = df_highway_windham.dropna(subset="geometry")
df_highway_windham["geometry"] = df_highway_windham["geometry"].apply(wkt.loads)
df_highway_windham = gpd.GeoDataFrame(df_highway_windham)
df_highway_windham.crs = "EPSG:4326"
df_highway_windham = df_highway_windham.sjoin(connecticut_zip[["City", "geometry"]], how="left", predicate="intersects")
df_highway_windham = df_highway_windham.drop(columns=["tiger:county"]).rename(columns={"name": "addr", "City":"city"})
df_highway_windham["city"] = df_highway_windham["city"].str.title()
# Highway rows carry no house number; a constant "0" is used instead.
df_highway_windham["housenr"] = "0"

# --- Second pass: retry the still-unmatched rows against geodata + highways.
df_geo_windham_comb = pd.concat([df_geo_windham, df_highway_windham])
# Fix: this result used to be bound to the misspelled name `missing_winham`, so the
# call below silently received an earlier column-subset, de-duplicated table.
# Like every other county, the second pass now gets the full unmatched rows.
missing_windham = df_windham[~df_windham.addr.isin(match_windham.addr)]
match_windham_2 = match_adress_to_location(missing_windham, df_geo_windham_comb)
final_match_windham = pd.concat([match_windham, match_windham_2])

##### Tolland

In [None]:
# Tolland, first pass: match the cleaned addresses against the county geodata.
df_tolland = df_cleaned.loc[df_cleaned["county"] == "Tolland"]
df_geo_tolland = pd.read_csv("df_geo_tolland_2.csv", index_col=[0])
match_tolland = match_adress_to_location(df_cleaned, df_geo_tolland, county="Tolland")

# One row per addr_mod among the Tolland rows the first pass did not match.
# NOTE: missing_tolland is rebound (without this column subset) in the next cell.
missing_tolland = (
    df_tolland
    .loc[~df_tolland["addr"].isin(match_tolland["addr"]), ["addr_mod", "addr", "city", "county"]]
    .drop_duplicates(subset="addr_mod")
)

In [None]:
# Tolland, highway fallback: keep rows with a name and a geometry and parse the WKT text.
df_highway_tolland = (
    pd.read_csv("/Volumes/Seagate/Bavillion/highway/tolland_highway.csv")[["geometry", "tiger:county", "name"]]
    .dropna(subset=["name", "geometry"])
    .assign(geometry=lambda frame: frame["geometry"].apply(wkt.loads))
)
df_highway_tolland = gpd.GeoDataFrame(df_highway_tolland, geometry="geometry", crs="EPSG:4326")

# Attach the city of every ZIP polygon a highway intersects (left join keeps the rest).
df_highway_tolland = (
    df_highway_tolland
    .sjoin(connecticut_zip[["City", "geometry"]], how="left", predicate="intersects")
    .drop(columns=["tiger:county"])
    .rename(columns={"name": "addr", "City": "city"})
)
df_highway_tolland["city"] = df_highway_tolland["city"].str.title()
# Highway rows carry no house number; a constant "0" is used instead.
df_highway_tolland["housenr"] = "0"

# Second pass: retry the rows the first pass missed against geodata + highways.
df_geo_tolland_comb = pd.concat([df_geo_tolland, df_highway_tolland])
missing_tolland = df_tolland.loc[~df_tolland["addr"].isin(match_tolland["addr"])]
match_tolland_2 = match_adress_to_location(missing_tolland, df_geo_tolland_comb)
final_match_tolland = pd.concat([match_tolland, match_tolland_2])

##### New London

In [None]:
# New London, first pass: match the cleaned addresses against the county geodata.
df_new_london = df_cleaned.loc[df_cleaned["county"] == "New London"]
df_geo_london = pd.read_csv("df_geo_new_london_2.csv", index_col=[0])
match_london = match_adress_to_location(df_cleaned, df_geo_london, county="New London")

# One row per addr_mod among the New London rows the first pass did not match.
# NOTE: missing_london is rebound (without this column subset) in the next cell.
missing_london = (
    df_new_london
    .loc[~df_new_london["addr"].isin(match_london["addr"]), ["addr_mod", "addr", "city", "county"]]
    .drop_duplicates(subset="addr_mod")
)

In [None]:
# New London, highway fallback: keep rows with a name and a geometry and parse the WKT text.
df_highway_london = (
    pd.read_csv("/Volumes/Seagate/Bavillion/highway/new_london_highway.csv")[["geometry", "tiger:county", "name"]]
    .dropna(subset=["name", "geometry"])
    .assign(geometry=lambda frame: frame["geometry"].apply(wkt.loads))
)
df_highway_london = gpd.GeoDataFrame(df_highway_london, geometry="geometry", crs="EPSG:4326")

# Attach the city of every ZIP polygon a highway intersects (left join keeps the rest).
df_highway_london = (
    df_highway_london
    .sjoin(connecticut_zip[["City", "geometry"]], how="left", predicate="intersects")
    .drop(columns=["tiger:county"])
    .rename(columns={"name": "addr", "City": "city"})
)
df_highway_london["city"] = df_highway_london["city"].str.title()
# Highway rows carry no house number; a constant "0" is used instead.
df_highway_london["housenr"] = "0"

# Second pass: retry the rows the first pass missed against geodata + highways.
df_geo_london_comb = pd.concat([df_geo_london, df_highway_london])
missing_london = df_new_london.loc[~df_new_london["addr"].isin(match_london["addr"])]
match_london_2 = match_adress_to_location(missing_london, df_geo_london_comb)
final_match_london = pd.concat([match_london, match_london_2])

##### New Haven

In [None]:
# New Haven, first pass: match the cleaned addresses against the county geodata.
df_new_haven = df_cleaned.loc[df_cleaned["county"] == "New Haven"]
df_geo_haven = pd.read_csv("df_geo_new_haven_2.csv", index_col=[0])
match_haven = match_adress_to_location(df_cleaned, df_geo_haven, county="New Haven")

# One row per addr_mod among the New Haven rows the first pass did not match.
# NOTE: missing_haven is rebound (without this column subset) in the next cell.
missing_haven = (
    df_new_haven
    .loc[~df_new_haven["addr"].isin(match_haven["addr"]), ["addr_mod", "addr", "city", "county"]]
    .drop_duplicates(subset="addr_mod")
)

In [None]:
# New Haven, highway fallback: keep rows with a name and a geometry and parse the WKT text.
df_highway_haven = (
    pd.read_csv("/Volumes/Seagate/Bavillion/highway/new_haven_highway.csv")[["geometry", "tiger:county", "name"]]
    .dropna(subset=["name", "geometry"])
    .assign(geometry=lambda frame: frame["geometry"].apply(wkt.loads))
)
df_highway_haven = gpd.GeoDataFrame(df_highway_haven, geometry="geometry", crs="EPSG:4326")

# Attach the city of every ZIP polygon a highway intersects (left join keeps the rest).
df_highway_haven = (
    df_highway_haven
    .sjoin(connecticut_zip[["City", "geometry"]], how="left", predicate="intersects")
    .drop(columns=["tiger:county"])
    .rename(columns={"name": "addr", "City": "city"})
)
df_highway_haven["city"] = df_highway_haven["city"].str.title()
# Highway rows carry no house number; a constant "0" is used instead.
df_highway_haven["housenr"] = "0"

# Second pass: retry the rows the first pass missed against geodata + highways.
df_geo_haven_comb = pd.concat([df_geo_haven, df_highway_haven])
missing_haven = df_new_haven.loc[~df_new_haven["addr"].isin(match_haven["addr"])]
match_haven_2 = match_adress_to_location(missing_haven, df_geo_haven_comb)
final_match_haven = pd.concat([match_haven, match_haven_2])

##### Litchfield

In [None]:
# Litchfield, first pass: match the cleaned addresses against the county geodata.
df_litchfield = df_cleaned.loc[df_cleaned["county"] == "Litchfield"]
df_geo_litchfield = pd.read_csv("df_geo_litchfield_2.csv", index_col=[0])
match_litchfield = match_adress_to_location(df_cleaned, df_geo_litchfield, county="Litchfield")

# One row per addr_mod among the Litchfield rows the first pass did not match.
# NOTE: missing_litchfield is rebound (without this column subset) in the next cell.
missing_litchfield = (
    df_litchfield
    .loc[~df_litchfield["addr"].isin(match_litchfield["addr"]), ["addr_mod", "addr", "city", "county"]]
    .drop_duplicates(subset="addr_mod")
)

In [None]:
# Litchfield, highway fallback: keep rows with a name and a geometry and parse the WKT text.
df_highway_litchfield = (
    pd.read_csv("/Volumes/Seagate/Bavillion/highway/litchfield_highway.csv")[["geometry", "tiger:county", "name"]]
    .dropna(subset=["name", "geometry"])
    .assign(geometry=lambda frame: frame["geometry"].apply(wkt.loads))
)
df_highway_litchfield = gpd.GeoDataFrame(df_highway_litchfield, geometry="geometry", crs="EPSG:4326")

# Attach the city of every ZIP polygon a highway intersects (left join keeps the rest).
df_highway_litchfield = (
    df_highway_litchfield
    .sjoin(connecticut_zip[["City", "geometry"]], how="left", predicate="intersects")
    .drop(columns=["tiger:county"])
    .rename(columns={"name": "addr", "City": "city"})
)
df_highway_litchfield["city"] = df_highway_litchfield["city"].str.title()
# Highway rows carry no house number; a constant "0" is used instead.
df_highway_litchfield["housenr"] = "0"

# Second pass: retry the rows the first pass missed against geodata + highways.
df_geo_litchfield_comb = pd.concat([df_geo_litchfield, df_highway_litchfield])
missing_litchfield = df_litchfield.loc[~df_litchfield["addr"].isin(match_litchfield["addr"])]
match_litchfield_2 = match_adress_to_location(missing_litchfield, df_geo_litchfield_comb)
final_match_litchfield = pd.concat([match_litchfield, match_litchfield_2])

##### Hartford

In [None]:
# Hartford, first pass: match the cleaned addresses against the county geodata.
df_hartford = df_cleaned.loc[df_cleaned["county"] == "Hartford"]
df_geo_hartford = pd.read_csv("df_geo_hartford_2.csv", index_col=[0])
match_hartford = match_adress_to_location(df_cleaned, df_geo_hartford, county="Hartford")

# One row per addr_mod among the Hartford rows the first pass did not match.
# NOTE: missing_hartford is rebound (without this column subset) in the next cell.
missing_hartford = (
    df_hartford
    .loc[~df_hartford["addr"].isin(match_hartford["addr"]), ["addr_mod", "addr", "city", "county"]]
    .drop_duplicates(subset="addr_mod")
)

In [None]:
# Hartford, highway fallback: keep rows with a name and a geometry and parse the WKT text.
df_highway_hartford = (
    pd.read_csv("/Volumes/Seagate/Bavillion/highway/hartford_highway.csv")[["geometry", "tiger:county", "name"]]
    .dropna(subset=["name", "geometry"])
    .assign(geometry=lambda frame: frame["geometry"].apply(wkt.loads))
)
df_highway_hartford = gpd.GeoDataFrame(df_highway_hartford, geometry="geometry", crs="EPSG:4326")

# Attach the city of every ZIP polygon a highway intersects (left join keeps the rest).
df_highway_hartford = (
    df_highway_hartford
    .sjoin(connecticut_zip[["City", "geometry"]], how="left", predicate="intersects")
    .drop(columns=["tiger:county"])
    .rename(columns={"name": "addr", "City": "city"})
)
df_highway_hartford["city"] = df_highway_hartford["city"].str.title()
# Highway rows carry no house number; a constant "0" is used instead.
df_highway_hartford["housenr"] = "0"

# Second pass: retry the rows the first pass missed against geodata + highways.
df_geo_hartford_comb = pd.concat([df_geo_hartford, df_highway_hartford])
missing_hartford = df_hartford.loc[~df_hartford["addr"].isin(match_hartford["addr"])]
match_hartford_2 = match_adress_to_location(missing_hartford, df_geo_hartford_comb)
final_match_hartford = pd.concat([match_hartford, match_hartford_2])

In [None]:
# (rows, columns) of the Hartford matches after both passes.
final_match_hartford.shape

In [None]:
# (rows, columns) of the first-pass Hartford matches, for comparison with the combined result.
match_hartford.shape

##### Middlesex

In [None]:
# Middlesex, first pass: match the cleaned addresses against the county geodata.
df_middlesex = df_cleaned.loc[df_cleaned["county"] == "Middlesex"]
df_geo_middlesex = pd.read_csv("df_geo_middlesex_2.csv", index_col=[0])
match_middlesex = match_adress_to_location(df_cleaned, df_geo_middlesex, county="Middlesex")

# One row per addr_mod among the Middlesex rows the first pass did not match.
# NOTE: missing_middlesex is rebound (without this column subset) in the next cell.
missing_middlesex = (
    df_middlesex
    .loc[~df_middlesex["addr"].isin(match_middlesex["addr"]), ["addr_mod", "addr", "city", "county"]]
    .drop_duplicates(subset="addr_mod")
)

In [None]:
# Middlesex, highway fallback: keep rows with a name and a geometry and parse the WKT text.
df_highway_middlesex = (
    pd.read_csv("/Volumes/Seagate/Bavillion/highway/middlesex_highway.csv")[["geometry", "tiger:county", "name"]]
    .dropna(subset=["name", "geometry"])
    .assign(geometry=lambda frame: frame["geometry"].apply(wkt.loads))
)
df_highway_middlesex = gpd.GeoDataFrame(df_highway_middlesex, geometry="geometry", crs="EPSG:4326")

# Attach the city of every ZIP polygon a highway intersects (left join keeps the rest).
df_highway_middlesex = (
    df_highway_middlesex
    .sjoin(connecticut_zip[["City", "geometry"]], how="left", predicate="intersects")
    .drop(columns=["tiger:county"])
    .rename(columns={"name": "addr", "City": "city"})
)
df_highway_middlesex["city"] = df_highway_middlesex["city"].str.title()
# Highway rows carry no house number; a constant "0" is used instead.
df_highway_middlesex["housenr"] = "0"

# Second pass: retry the rows the first pass missed against geodata + highways.
df_geo_middlesex_comb = pd.concat([df_geo_middlesex, df_highway_middlesex])
missing_middlesex = df_middlesex.loc[~df_middlesex["addr"].isin(match_middlesex["addr"])]
match_middlesex_2 = match_adress_to_location(missing_middlesex, df_geo_middlesex_comb)
final_match_middlesex = pd.concat([match_middlesex, match_middlesex_2])

##### Fairfield

In [None]:
# Fairfield, first pass: match the cleaned addresses against the county geodata.
df_fairfield = df_cleaned.loc[df_cleaned["county"] == "Fairfield"]
df_geo_fairfield = pd.read_csv("df_geo_fairfield_2.csv", index_col=[0])
match_fairfield = match_adress_to_location(df_cleaned, df_geo_fairfield, county="Fairfield")

# One row per addr_mod among the Fairfield rows the first pass did not match.
# NOTE: missing_fairfield is rebound (without this column subset) in the next cell.
missing_fairfield = (
    df_fairfield
    .loc[~df_fairfield["addr"].isin(match_fairfield["addr"]), ["addr_mod", "addr", "city", "county"]]
    .drop_duplicates(subset="addr_mod")
)

In [None]:
# Fairfield, highway fallback: keep rows with a name and a geometry and parse the WKT text.
df_highway_fairfield = (
    pd.read_csv("/Volumes/Seagate/Bavillion/highway/fairfield_highway.csv")[["geometry", "tiger:county", "name"]]
    .dropna(subset=["name", "geometry"])
    .assign(geometry=lambda frame: frame["geometry"].apply(wkt.loads))
)
df_highway_fairfield = gpd.GeoDataFrame(df_highway_fairfield, geometry="geometry", crs="EPSG:4326")

# Attach the city of every ZIP polygon a highway intersects (left join keeps the rest).
df_highway_fairfield = (
    df_highway_fairfield
    .sjoin(connecticut_zip[["City", "geometry"]], how="left", predicate="intersects")
    .drop(columns=["tiger:county"])
    .rename(columns={"name": "addr", "City": "city"})
)
df_highway_fairfield["city"] = df_highway_fairfield["city"].str.title()
# Highway rows carry no house number; a constant "0" is used instead.
df_highway_fairfield["housenr"] = "0"

# Second pass: retry the rows the first pass missed against geodata + highways.
df_geo_fairfield_comb = pd.concat([df_geo_fairfield, df_highway_fairfield])
missing_fairfield = df_fairfield.loc[~df_fairfield["addr"].isin(match_fairfield["addr"])]
match_fairfield_2 = match_adress_to_location(missing_fairfield, df_geo_fairfield_comb)
final_match_fairfield = pd.concat([match_fairfield, match_fairfield_2])

#### Merge together

In [None]:
# Stack the two-pass results of all nine counties (Fairfax plus the eight
# Connecticut counties) into one table. Each part keeps its own row index here;
# the index is reset in the following cell.
final_match = pd.concat([
    final_match_fairfield,
    final_match_fairfax,
    final_match_hartford,
    final_match_haven,
    final_match_litchfield,
    final_match_london,
    final_match_middlesex,
    final_match_tolland,
    final_match_windham,
])

In [None]:
# Replace the index inherited from the per-county parts with a fresh 0..n-1 range.
final_match = final_match.reset_index(drop=True)

In [None]:
# Persist the combined matches; to_csv also writes the row index as the first column.
final_match.to_csv("final_match.csv")

##### Missing addresses

In [None]:
# Per Connecticut county, collect the addresses the first matching pass does not
# find. Keys are the file-name stems of the geodata CSVs, values the county
# names as they appear in df_cleaned.county.
counties = {
    "new_haven": "New Haven",
    "new_london": "New London",
    "middlesex": "Middlesex",
    "litchfield": "Litchfield",
    "hartford": "Hartford",
    "fairfield": "Fairfield",
    "tolland": "Tolland",
    "windham": "Windham",
}

# Gather the per-county pieces in a list and concatenate once: growing a frame
# with pd.concat inside the loop copies all earlier rows on every iteration, and
# seeding it with an empty DataFrame triggers a deprecation warning in recent pandas.
missing_parts = []
for file_stem, county_name in counties.items():
    df = df_cleaned[df_cleaned.county == county_name]
    df_geo = pd.read_csv(f"df_geo_{file_stem}_2.csv", index_col=[0])
    match = match_adress_to_location(df_cleaned, df_geo, county=county_name)
    # One row per addr_mod among the county rows whose addr was not matched.
    missing_parts.append(
        df[~df.addr.isin(match.addr)][["addr_mod", "addr", "city", "county"]].drop_duplicates(subset="addr_mod")
    )

missing = pd.concat(missing_parts)

In [None]:
# Same check across all Connecticut counties at once: pool the eight geodata
# tables, match, and keep one row per addr_mod among the unmatched rows.
# NOTE: this rebinds `missing`, replacing the per-county result of the previous cell.
df = df_cleaned.loc[df_cleaned["county"] != "Fairfax"]
df_geo = pd.concat([
    pd.read_csv(geo_csv, index_col=[0])
    for geo_csv in (
        "df_geo_new_haven_2.csv",
        "df_geo_new_london_2.csv",
        "df_geo_middlesex_2.csv",
        "df_geo_litchfield_2.csv",
        "df_geo_hartford_2.csv",
        "df_geo_fairfield_2.csv",
        "df_geo_tolland_2.csv",
        "df_geo_windham_2.csv",
    )
])
# NOTE(review): the helper receives the full df_cleaned (Fairfax included) and no
# county argument — confirm this is intended.
match = match_adress_to_location(df_cleaned, df_geo)
missing = (
    df
    .loc[~df["addr"].isin(match["addr"]), ["addr_mod", "addr", "city", "county"]]
    .drop_duplicates(subset="addr_mod")
)

In [None]:
# (rows, columns) of the unmatched-address table built in the previous cell.
missing.shape