# 02 Feature Engineering (Geo Features)

In this notebook we calculate the geo-frequency as well as the geo-distance features.

In [None]:
import geopandas as gpd
import pandas as pd
from shapely import wkt
import numpy as np

##### Geo Frequency Features

In [None]:
# Load final_match.csv; its first column becomes the index.
df = pd.read_csv(
    "../data/cleaned/final_match.csv",
    index_col=[0],
)

In [None]:
# Read the POI CSV exports, one DataFrame per file, in the order listed.
(
    df_poi_fairfield,
    df_poi_fairfax,
    df_poi_hartford,
    df_poi_litchfield,
    df_poi_middlesex,
    df_poi_new_haven,
    df_poi_new_london,
    df_poi_tolland,
    df_poi_windham,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/Volumes/Seagate/bavillion/poi/fairfield.csv",
        "/Volumes/Seagate/bavillion/poi/fairfax.csv",
        "/Volumes/Seagate/bavillion/poi/hartford.csv",
        "/Volumes/Seagate/bavillion/poi/litchfield.csv",
        "/Volumes/Seagate/bavillion/poi/middlesex.csv",
        "/Volumes/Seagate/bavillion/poi/new_haven.csv",
        "/Volumes/Seagate/bavillion/poi/new_london.csv",
        "/Volumes/Seagate/bavillion/poi/tolland.csv",
        "/Volumes/Seagate/bavillion/poi/windham.csv",
    )
)

In [None]:
# Tag columns (plus the geometry) kept from the POI table.
columns = [
    "building",
    "amenity",
    "atm",
    "bus",
    "public_transport",
    "tourism",
    "geometry",
    "aeroway",
    "ferry",
    "healthcare",
    "government",
    "bar",
    "railway",
    "highway",
    "school",
    "preschool",
    "museum",
]
# Restrict the POI table to exactly these columns, in this order.
df_poi = df_poi[columns]

In [None]:
# Rows with a bus / ferry tag get "<mode>_<public_transport value>"; rows
# without one keep their missing value.
df_poi.loc[:, "bus"] = np.where(
    df_poi["bus"].notna(),
    "bus_" + df_poi["public_transport"],
    df_poi["bus"],
)
df_poi.loc[:, "ferry"] = np.where(
    df_poi["ferry"].notna(),
    "ferry_" + df_poi["public_transport"],
    df_poi["ferry"],
)
# Every railway value is prefixed with "railway_".
df_poi.loc[:, "railway"] = "railway_" + df_poi["railway"]
# A plain "yes" flag is replaced by the name of the tag itself.
df_poi.loc[:, "school"] = np.where(
    df_poi["school"].eq("yes"),
    "school",
    df_poi["school"],
)
df_poi.loc[:, "preschool"] = np.where(
    df_poi["preschool"].eq("yes"),
    "preschool",
    df_poi["preschool"],
)
# Any non-missing museum value collapses to the single label "museum".
df_poi.loc[:, "museum"] = np.where(
    df_poi["museum"].notna(),
    "museum",
    df_poi["museum"],
)

In [None]:
# Fill missing "amenity" values from the other tag columns. The columns are
# tried in this order, so the first one holding a value wins.
for fallback_column in (
    "tourism",
    "aeroway",
    "healthcare",
    "bus",
    "ferry",
    "railway",
    "school",
    "preschool",
    "museum",
):
    df_poi.loc[:, "amenity"] = np.where(
        df_poi["amenity"].isna(),
        df_poi[fallback_column],
        df_poi["amenity"],
    )

**1. Religious Institutions:**
- place_of_worship

**2. Educational Facilities:**
- school
- university
- college
- library
- kindergarten
- music_school
- prep_school
- driving_school
- childcare

**3. Healthcare Services:**
- dentist
- clinic
- hospital
- doctors
- nursing_home

**4. Emergency Services:**
- fire_station
- police

**5. Animalcare Facilities:**
- shelter
- animal_shelter
- animal_boarding

**6. Community Venues:**
- social_facility
- community_centre
- social_centre
- townhall

**7. Community Services:**
- grave_yard
- ranger_station
- waste_transfer_station
- waste_disposal
- post_depot
- public_bookcase

**8. Shopping Facilities:**
- marketplace

**9. Food and Drink Establishments:**
- restaurant
- fast_food
- ice_cream
- cafe
- bbq

**10. Financial Services:**
- bank
- atm
- finance
- money_transfer
- check_cashing

**11. Transportation Services:**
- aerodrome
- railway_platform
- ferry_terminal
- bus_station
- boat_storage
- bus_platform
- taxi
- railway_station
- bus_stop_position
- ferry_stop_position

**12. Entertainment Venues:**
- museum
- arts_centre
- stadium_seating
- cinema
- theatre
- attraction
- amusement
- events_venue

**13. Adult Entertainment:**
- nightclub
- pub
- stripclub
- bar

**14. Sports Facilities:**
- dancing_school
- dojo

**15. Utilities:**
- charging_station

**16. Accommodation:**
- motel
- hotel

**17. Government and Civic Services:**
- courthouse
- prison

**18. Recreational Facilities:**
- park
- campground
- camp_site 
- picnic_site
- zoo
- aquarium
- viewpoint
- boat_rental

In [None]:
# The geometry columns hold WKT (well-known text) strings; parse them into
# geometry objects and wrap each table in a GeoDataFrame tagged as EPSG:4326.
df["geometry"] = df["geometry"].map(wkt.loads)
df_geo = gpd.GeoDataFrame(df, crs="epsg:4326")
# Rows whose county is "Fairfield".
df_geo_fairfield = df_geo[df_geo["county"] == "Fairfield"]

# Same conversion for the POI table.
df_poi["geometry"] = df_poi["geometry"].map(wkt.loads)
df_poi = gpd.GeoDataFrame(df_poi, crs="epsg:4326")

In [None]:
def nearest_points(point, gdf, distance_in_km=5):
    """Count the geometries of *gdf* whose centroid lies within a radius of *point*.

    Args:
        point: Geometry to measure from. It is not reprojected here, so
            presumably callers pass it in EPSG:3857 already.
        gdf: GeoDataFrame of candidate geometries. Its geometry column is
            reprojected to EPSG:3857 before the centroids are taken.
        distance_in_km: Radius in kilometres; multiplied by 1000 to compare
            against the EPSG:3857 distances.

    Returns:
        The number of rows of *gdf* whose centroid is strictly closer to
        *point* than the radius.
    """
    # This function is applied once per row, so it must stay silent: the
    # former per-call debug print flooded the notebook output.
    centroids = gdf.geometry.to_crs("EPSG:3857").centroid
    return (centroids.distance(point) < distance_in_km * 1000).sum()

In [None]:
# Amenity labels grouped into the categories used for the geo-frequency
# features: each key becomes one "n_<key>" count column.
# Every entry ends with a comma on purpose: adjacent string literals without a
# comma are silently concatenated by Python, which previously fused
# "public_bath" and "public_bookcase" into one label that matches nothing.
points_of_interest = {
    "reli_inst": [
        "place_of_worship",
        "monastery",
    ],
    "edu_fac": [
        "school",
        "university",
        "college",
        "library",
        "kindergarten",
        "music_school",
        "prep_school",
        "driving_school",
        "childcare",
    ],
    "healthcare": [
        "dentist",
        "clinic",
        "hospital",
        "doctors",
        "nursing_home",
    ],
    "emergency": [
        "fire_station",
        "police",
    ],
    "animalcare": [
        "shelter",
        "animal_shelter",
        "animal_boarding",
    ],
    "commu_venu": [
        "social_facility",
        "community_centre",
        "exhibition_centre",
        "conference_centre",
        "social_centre",
        "townhall",
        "coworking_space",
    ],
    "commu_serv": [
        "charity",
        "public_building",
        "grave_yard",
        "crematorium",
        "mortuary",
        "ranger_station",
        "post_depot",
        "mail_room",
        "public_bath",
        "public_bookcase",
    ],
    "shopping": [
        "marketplace",
        "market",
    ],
    "food_drink": [
        "restaurant",
        "fast_food",
        "ice_cream",
        "cafe",
        "bbq",
        "canteen",
    ],
    "financial": [
        "bank",
        "atm",
        "finance",
        "money_transfer",
        "check_cashing",
    ],
    "transport": [
        "aerodrome",
        "railway_platform",
        "ferry_terminal",
        "bus_station",
        "boat_storage",
        "bus_platform",
        "taxi",
        "railway_halt",
        "railway_car_shuttle",
        "car_sharing",
        "railway_station",
        "bus_stop_position",
        "ferry_stop_position",
    ],
    "entertainment": [
        "museum",
        "arts_centre",
        "theme_park",
        "stadium_seating",
        "cinema",
        "theatre",
        "attraction",
        "amusement",
        "events_venue",
        "karaoke_box",
        "music_venue",
        "planetarium",
        "lounge",
        "internet_cafe",
    ],
    "adults_entertain": [
        "hookah",
        "hookah_lounge",
        "biergarten",
        "casino",
        "nightclub",
        "pub",
        "stripclub",
        "bar",
        "love_hotel",
    ],
    "sports": [
        "dancing_school",
        "dojo",
        "ski_school",
        "ski_rental",
    ],
    "utilities": [
        "charging_station",
        "compressed_air",
        "sanitary_dump_station",
        "vacuum_cleaner",
        "waste_transfer_station",
        "waste_disposal",
    ],
    "accommodation": [
        "motel",
        "hotel",
        "hostel",
    ],
    "government_civic": [
        "courthouse",
        "prison",
    ],
    "recreational": [
        "park",
        "campground",
        "camp_site",
        "picnic_site",
        "zoo",
        "aquarium",
        "viewpoint",
        "boat_rental",
        "bicycle_rental",
    ],
}


In [None]:
def points_within(point, gdf, radius):
    """Count the geometries of *gdf* that intersect a buffer around *point*.

    Args:
        point: Geometry the buffer is built around.
        gdf: Geo-table whose geometries are tested against the buffer.
        radius: Buffer distance, in the units of the coordinates of *point*.

    Returns:
        The number of rows of *gdf* intersecting the buffer.
    """
    search_area = point.buffer(radius)
    hits = gdf.intersects(search_area)
    return hits.sum()

In [None]:
# County name -> POI table for that county; insertion order is preserved.
dfs_poi = dict(
    (
        ("Fairfield", df_poi_fairfield),
        ("Fairfax", df_poi_fairfax),
        ("Hartford", df_poi_hartford),
        ("New Haven", df_poi_new_haven),
        ("Litchfield", df_poi_litchfield),
        ("New London", df_poi_new_london),
        ("Middlesex", df_poi_middlesex),
        ("Tolland", df_poi_tolland),
        ("Windham", df_poi_windham),
    )
)

In [None]:
# Tag columns (plus the geometry) kept from each raw POI table.
columns = [
    "building",
    "amenity",
    "atm",
    "bus",
    "public_transport",
    "tourism",
    "geometry",
    "aeroway",
    "ferry",
    "healthcare",
    "government",
    "bar",
    "railway",
    "highway",
    "school",
    "preschool",
    "museum",
]
# Columns used to fill a missing "amenity"; the first one with a value wins.
amenity_fallbacks = [
    "tourism",
    "aeroway",
    "healthcare",
    "bus",
    "ferry",
    "railway",
    "school",
    "preschool",
    "museum",
]

# The listing geometries are the same for every county, so the WKT (well-known
# text) strings are parsed once here instead of once per loop iteration.
df_tmp = df.copy()
df_tmp["geometry"] = df["geometry"].apply(wkt.loads)
df_geo = gpd.GeoDataFrame(df_tmp, crs="epsg:4326")

for k, df_poi in dfs_poi.items():
    # Explicit copy: the assignments below must not target a slice of the raw
    # POI table (avoids pandas' chained-assignment warnings).
    df_poi = df_poi[columns].copy()

    # Rows with a bus / ferry tag become "<mode>_<public_transport value>".
    df_poi.loc[:, "bus"] = np.where(df_poi.bus.notna(), "bus_" + df_poi.public_transport, df_poi.bus)
    df_poi.loc[:, "ferry"] = np.where(df_poi.ferry.notna(), "ferry_" + df_poi.public_transport, df_poi.ferry)
    df_poi.loc[:, "railway"] = "railway_" + df_poi.loc[:, "railway"]
    # "yes" flags are replaced by the tag name; any museum value becomes "museum".
    df_poi.loc[:, "school"] = np.where(df_poi.school == "yes", "school", df_poi.school)
    df_poi.loc[:, "preschool"] = np.where(df_poi.preschool == "yes", "preschool", df_poi.preschool)
    df_poi.loc[:, "museum"] = np.where(df_poi.museum.notna(), "museum", df_poi.museum)

    # Fill missing amenities from the fallback columns, in priority order.
    for fallback in amenity_fallbacks:
        df_poi.loc[:, "amenity"] = np.where(df_poi.amenity.isna(), df_poi[fallback], df_poi.amenity)

    # Listings of the current county; copied because feature columns are added.
    df_geo_county = df_geo[df_geo.county == k].copy()

    # POIs without a geometry cannot be parsed or located, so drop them first.
    df_poi = df_poi.dropna(subset=["geometry"]).copy()
    df_poi["geometry"] = df_poi["geometry"].apply(wkt.loads)
    df_poi = gpd.GeoDataFrame(df_poi, crs="epsg:4326")

    # Reproject the listings once per county rather than once per POI group.
    # NOTE(review): EPSG:3857 units are stretched relative to ground metres
    # away from the equator, so the 5000-unit radius below is presumably
    # shorter than 5 km on the ground - confirm this is acceptable.
    county_points = df_geo_county.geometry.to_crs("EPSG:3857")

    for key, value in points_of_interest.items():
        # POIs of the current group, in the same projected CRS as the listings.
        df_poi_tmp = df_poi[df_poi.amenity.isin(value)].to_crs("EPSG:3857")
        df_geo_county[f"n_{key}"] = county_points.apply(lambda x: points_within(x, df_poi_tmp, 5000))

    df_geo_county.to_csv(f"df_{k}_geo_features.csv")

In [None]:
# Educational feature: for each Fairfield row, count the education-related
# POIs whose centroid lies within 5 km.
# Every list entry is followed by a comma: without it Python concatenates
# adjacent string literals, which previously turned "driving_school" and
# "childcare" into the single label "driving_schoolchildcare" and dropped
# both amenities from the filter.
# Reprojecting the filtered frame with to_crs() returns a new frame, so no
# assignment is made onto a slice of df_poi.
df_poi_ed = df_poi[
    df_poi.amenity.isin(
        [
            "school",
            "university",
            "college",
            "library",
            "kindergarten",
            "music_school",
            "prep_school",
            "driving_school",
            "childcare",
        ]
    )
].to_crs("EPSG:3857")


df_geo_fairfield["n_ed_distance"] = df_geo_fairfield.geometry.to_crs("EPSG:3857").apply(lambda x: nearest_points(x, df_poi_ed, distance_in_km=5))

In [None]:
def points_within(point, gdf, radius):
    """Return how many geometries of *gdf* intersect ``point.buffer(radius)``.

    *radius* is interpreted in the coordinate units of *point*.
    """
    return gdf.intersects(point.buffer(radius)).sum()

##### Geo Distance Features

In [None]:
# Reload final_match.csv (first column as index) so "geometry" holds the raw
# strings from the file again.
df = pd.read_csv(
    "../data/cleaned/final_match.csv",
    index_col=[0],
)

In [None]:
# Read the POI CSV exports in the order listed, one DataFrame per file.
(
    df_poi_fairfield,
    df_poi_fairfax,
    df_poi_hartford,
    df_poi_litchfield,
    df_poi_middlesex,
    df_poi_new_haven,
    df_poi_new_london,
    df_poi_tolland,
    df_poi_windham,
) = map(
    pd.read_csv,
    [
        "/Volumes/Seagate/bavillion/poi/fairfield.csv",
        "/Volumes/Seagate/bavillion/poi/fairfax.csv",
        "/Volumes/Seagate/bavillion/poi/hartford.csv",
        "/Volumes/Seagate/bavillion/poi/litchfield.csv",
        "/Volumes/Seagate/bavillion/poi/middlesex.csv",
        "/Volumes/Seagate/bavillion/poi/new_haven.csv",
        "/Volumes/Seagate/bavillion/poi/new_london.csv",
        "/Volumes/Seagate/bavillion/poi/tolland.csv",
        "/Volumes/Seagate/bavillion/poi/windham.csv",
    ],
)

In [None]:
# Stack the eight county POI tables below into one frame, row-wise
# (df_poi_fairfax is not part of it).
df_poi_connecticut = pd.concat(
    (
        df_poi_fairfield,
        df_poi_hartford,
        df_poi_litchfield,
        df_poi_middlesex,
        df_poi_new_haven,
        df_poi_new_london,
        df_poi_tolland,
        df_poi_windham,
    )
)

In [None]:
# POI groups for the geo-distance features: each key names one
# "distance_<key>" column, each value lists the amenity labels in that group.
points_of_interest = dict(
    aerodrome=["aerodrome"],
    ferry_terminal=["ferry_terminal"],
    railway_station=["railway_station", "railway_platform"],
    market=["market", "marketplace"],
    hospital=["hospital"],
    hotel=["hotel"],
    museum=["museum"],
)

In [None]:
def nearest_points_distance(point, gdf):
    """Return the smallest distance between *point* and the centroids of *gdf*.

    The geometries of *gdf* are reprojected to EPSG:3857 before their
    centroids are taken; *point* is used as given.
    """
    projected = gdf.geometry.to_crs("EPSG:3857")
    distances = projected.centroid.distance(point)
    return distances.min()
    


In [None]:
# Display the rows whose county is "Fairfax".
df[df["county"] == "Fairfax"]

In [None]:
# Write the current df_geo_county frame to the external drive as CSV.
df_geo_county.to_csv(
    "/Volumes/Seagate/bavillion/df_Fairfax_geo_distance_features.csv",
)

In [None]:
k = "fairfax"
# Tag columns (plus the geometry) kept from the raw Fairfax POI table.
columns = [
    "building",
    "amenity",
    "atm",
    "bus",
    "public_transport",
    "tourism",
    "geometry",
    "aeroway",
    "ferry",
    "healthcare",
    "government",
    "bar",
    "railway",
    "highway",
    "school",
    "preschool",
    "museum",
]
# Explicit copy of the selected columns: the assignments below must not
# target a slice of df_poi_fairfax (avoids chained-assignment warnings).
df_poi = df_poi_fairfax[columns].copy()

# Rows with a bus / ferry tag become "<mode>_<public_transport value>".
df_poi.loc[:, "bus"] = np.where(df_poi.bus.notna(), "bus_" + df_poi.public_transport, df_poi.bus)
df_poi.loc[:, "ferry"] = np.where(df_poi.ferry.notna(), "ferry_" + df_poi.public_transport, df_poi.ferry)
df_poi.loc[:, "railway"] = "railway_" + df_poi.loc[:, "railway"]
# "yes" flags are replaced by the tag name; any museum value becomes "museum".
df_poi.loc[:, "school"] = np.where(df_poi.school == "yes", "school", df_poi.school)
df_poi.loc[:, "preschool"] = np.where(df_poi.preschool == "yes", "preschool", df_poi.preschool)
df_poi.loc[:, "museum"] = np.where(df_poi.museum.notna(), "museum", df_poi.museum)

# Fill missing amenities from the other tag columns; the first column in this
# order that holds a value wins.
for fallback in [
    "tourism",
    "aeroway",
    "healthcare",
    "bus",
    "ferry",
    "railway",
    "school",
    "preschool",
    "museum",
]:
    df_poi.loc[:, "amenity"] = np.where(df_poi.amenity.isna(), df_poi[fallback], df_poi.amenity)

# Well-known text (WKT) is a text markup language for representing vector
# geometry objects; parse it on a copy so df keeps its raw strings.
df_tmp = df.copy()
df_tmp["geometry"] = df["geometry"].apply(wkt.loads)
df_geo = gpd.GeoDataFrame(df_tmp, crs="epsg:4326")
# Copied because the distance columns are added to this frame below.
df_geo_county = df_geo[df_geo.county == "Fairfax"].copy()

# POIs without a geometry cannot be parsed or located, so drop them first.
df_poi = df_poi.dropna(subset=["geometry"]).copy()
df_poi["geometry"] = df_poi["geometry"].apply(wkt.loads)
df_poi = gpd.GeoDataFrame(df_poi, crs="epsg:4326")

# Reproject the listings once instead of once per POI group.
# NOTE(review): EPSG:3857 distances are stretched relative to ground metres
# away from the equator, so the resulting distances are presumably larger
# than true metres - confirm this is acceptable for the features.
county_points = df_geo_county.geometry.to_crs("EPSG:3857")

for key, value in points_of_interest.items():
    # POIs of the current group, in the same projected CRS as the listings.
    df_poi_tmp = df_poi[df_poi.amenity.isin(value)].to_crs("EPSG:3857")
    df_geo_county[f"distance_{key}"] = county_points.apply(lambda x: nearest_points_distance(x, df_poi_tmp))

df_geo_county.to_csv("df_Fairfax_geo_distance_features.csv")