# Data Analysis (Geo Analysis)

In this notebook, we analyze the geographic distribution of house prices and of location-based features (e.g. distances to points of interest).

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import math
import numpy as np
import folium
from shapely import wkt
import matplotlib.colors as mcolors
import h3
from shapely.geometry import Polygon
import geopandas as gpd
import mapclassify

### Preprocessing

In [None]:
# Cleaned economic tables (monthly and yearly); the first CSV column becomes the index.
df_econ_monthly, df_econ_yearly = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../data/cleaned/monthly_econ_data.csv",
        "../data/cleaned/yearly_econ_data.csv",
    )
)

In [None]:
# Main cleaned dataset; the first CSV column becomes the index.
df = pd.read_csv(filepath_or_buffer="../data/cleaned/df_final.csv", index_col=0)

In [None]:
# Parse the WKT strings in `geometry_y` into shapely geometries.
# Only string values are parsed, so re-running this cell on already-parsed
# geometries is a no-op instead of raising inside wkt.loads.
df["geometry_y"] = df["geometry_y"].map(
    lambda value: wkt.loads(value) if isinstance(value, str) else value
)

In [None]:
# Connecticut rows, and the 2020 subset of them.
df_conn = df.loc[df["state_y"].eq("Connecticut")]
df_conn_2020 = df_conn.loc[df_conn["year"].eq(2020)]

In [None]:
# Fairfax county rows, and the 2021 subset of them.
df_fair = df.loc[df["county"].eq("Fairfax")]
df_fair_2021 = df_fair.loc[df_fair["year"].eq(2021)]

In [None]:
# Small working sample: the first 10,000 rows of the 2020 Connecticut data.
df_conn_test = df_conn_2020.iloc[:10000]

In [None]:
# Drop price outliers: keep rows with 1000 < price < 800000 (both bounds exclusive).
df_conn_test = df_conn_test.loc[
    df_conn_test["price"].between(1000, 800000, inclusive="neither")
]

#### Price

In [None]:
# PRICE
# One circle per sampled sale, coloured by its raw price.

# Create a map centered around Connecticut
m = folium.Map(location=[41.6032, -73.0877], zoom_start=8)

# The colour scale spans the observed price range
min_price = df_conn_test['price'].min()
max_price = df_conn_test['price'].max()

colormap = folium.LinearColormap(colors=['#f0f8ff', '#0000ff'], vmin=min_price, vmax=max_price)
colormap.caption = 'House Prices in Connecticut'

# Add points to the map
for _, row in df_conn_test.iterrows():
    # Use the centroid (as the later maps in this notebook do) so that
    # non-point geometries can be plotted as well.
    point = row['geometry_y'].centroid
    marker_color = colormap(row['price'])
    folium.CircleMarker(
        location=(point.y, point.x),
        radius=5,
        popup=f'Price: ${row["price"]}',
        color=marker_color,
        fill=True,
        fill_color=marker_color
    ).add_to(m)

# Add color scale to map
colormap.add_to(m)
m

In [None]:
# NORM PRICE
# Same scatter as the raw-price map, coloured by the min-max normalised price.

# Create a map centered around Connecticut
m = folium.Map(location=[41.6032, -73.0877], zoom_start=8)

# Min-max normalise the prices for color mapping
min_price = df_conn_test['price'].min()
max_price = df_conn_test['price'].max()
norm_prices = (df_conn_test['price'] - min_price) / (max_price - min_price)

# Define a color scale over the normalised range
colormap = folium.LinearColormap(colors=['blue', 'green', 'yellow', 'orange', 'red'],  vmin=norm_prices.min(), vmax=norm_prices.max())
colormap.caption = 'House Prices in Connecticut'

# Add points to the map; the normalised value is taken from `norm_prices`
# rather than being recomputed for every row.
for (_, row), norm_price in zip(df_conn_test.iterrows(), norm_prices):
    # Centroid keeps this cell consistent with the later maps and lets
    # non-point geometries be plotted.
    point = row['geometry_y'].centroid
    marker_color = colormap(norm_price)
    folium.CircleMarker(
        location=(point.y, point.x),
        radius=5,
        popup=f'Price: ${row["price"]}',
        color=marker_color,
        fill=True,
        fill_color=marker_color
    ).add_to(m)

# Add color scale to map
colormap.add_to(m)
m

In [None]:
# LOG PRICE
# Same scatter again, coloured by the natural log of the price.

# Create a map centered around Connecticut
m = folium.Map(location=[41.6032, -73.0877], zoom_start=8)

# assign() returns a new frame, so the column is added without writing into
# a filtered slice (which triggers SettingWithCopyWarning).
df_conn_test = df_conn_test.assign(log_price=np.log(df_conn_test["price"]))

# The colour scale spans the observed log-price range
min_price = df_conn_test['log_price'].min()
max_price = df_conn_test['log_price'].max()

# Define a color scale
colormap = folium.LinearColormap(colors=['blue', 'green', 'yellow', 'orange', 'red'], vmin=min_price, vmax=max_price)
colormap.caption = 'House Prices in Connecticut'

# Add points to the map
for _, row in df_conn_test.iterrows():
    # Read the centroid into a local instead of writing back into the row copy.
    point = row['geometry_y'].centroid
    marker_color = colormap(row['log_price'])
    folium.CircleMarker(
        location=(point.y, point.x),
        radius=5,
        popup=f'Price: ${row["price"]}',
        color=marker_color,
        fill=True,
        fill_color=marker_color
    ).add_to(m)

# Add color scale to map
colormap.add_to(m)
m

##### Map plotting helper

In [None]:

def plot_map(df, feature, one_color=False, log=False, norm=False, location=None, zoom_start=8):
    """Draw one circle marker per row of ``df``, coloured by ``feature``.

    Parameters
    ----------
    df : DataFrame
        Must contain ``feature`` and a ``geometry_y`` column of shapely
        geometries; each marker is placed at the geometry's centroid.
        The frame is copied and never modified.
    feature : str
        Column used for the marker colour, the popup text and the legend caption.
    one_color : bool
        If True use a two-stop light-to-dark blue scale, otherwise the
        five-stop blue-green-yellow-orange-red scale.
    log : bool
        If True, colour by ``np.log`` of the feature.
    norm : bool
        If True, min-max normalise the (optionally logged) feature first.
    location : sequence of two floats, optional
        Map centre as ``[lat, lon]``. When omitted the map is centred on the
        mean of the plotted points, so data outside Connecticut is in view;
        an empty frame falls back to the Connecticut centre.
    zoom_start : int
        Initial folium zoom level.

    Returns
    -------
    folium.Map
        The map with markers and colour legend added.
    """
    df_map = df.copy()
    if log:
        df_map[feature] = np.log(df_map[feature])
    if norm:
        lower = df_map[feature].min()
        span = df_map[feature].max() - lower
        # A constant feature has span 0; map it to 0 rather than 0/0 = NaN.
        df_map[feature] = (df_map[feature] - lower) / span if span else 0.0

    centroids = df_map["geometry_y"].map(lambda geom: geom.centroid)

    if location is None:
        if len(centroids):
            location = [
                centroids.map(lambda point: point.y).mean(),
                centroids.map(lambda point: point.x).mean(),
            ]
        else:
            location = [41.6032, -73.0877]
    m = folium.Map(location=location, zoom_start=zoom_start)

    # Colour scale bounds (named so the builtins min/max are not shadowed)
    vmin = df_map[feature].min()
    vmax = df_map[feature].max()
    if one_color:
        palette = ['#f0f8ff', '#0000ff']
    else:
        palette = ['blue', 'green', 'yellow', 'orange', 'red']
    colormap = folium.LinearColormap(colors=palette, vmin=vmin, vmax=vmax)
    colormap.caption = feature

    # Add points to the map
    for point, value in zip(centroids, df_map[feature]):
        marker_color = colormap(value)
        folium.CircleMarker(
            location=(point.y, point.x),
            radius=5,
            popup=f'{feature}: {value}',
            color=marker_color,
            fill=True,
            fill_color=marker_color
        ).add_to(m)

    # Add color scale to map
    colormap.add_to(m)
    return m

##### Price

In [None]:
# Log-scaled prices, Connecticut 2020.
plot_map(df=df_conn_2020, feature="price", log=True)

In [None]:
# Log-scaled prices, Connecticut 2000.
plot_map(df=df_conn.loc[df_conn["year"] == 2000], feature="price", log=True)

In [None]:
# Log-scaled prices, Connecticut 2005.
plot_map(df=df_conn.loc[df_conn["year"] == 2005], feature="price", log=True)

In [None]:
# Log-scaled prices, Connecticut 2021.
plot_map(df=df_conn.loc[df_conn["year"] == 2021], feature="price", log=True)

In [None]:
# Log-scaled prices, Fairfax 2021.
plot_map(df=df_fair_2021, feature="price", log=True)

##### Features

In [None]:
# 2021 subset of the Connecticut rows, used for the feature maps below.
df_conn_2021 = df_conn.loc[df_conn["year"].eq(2021)]

In [None]:
# Show the available column names (candidates for the `feature` argument of plot_map).
df_conn_2021.columns

In [None]:
# `livarea` per property, Connecticut 2021.
plot_map(df=df_conn_2021, feature="livarea")

In [None]:
# `distance_aerodrome`, Connecticut 2021.
plot_map(df=df_conn_2021, feature="distance_aerodrome")

In [None]:
# `distance_aerodrome`, Fairfax 2021.
plot_map(df=df_fair_2021, feature="distance_aerodrome")

In [None]:
# `distance_ferry_terminal`, Connecticut 2021.
plot_map(df=df_conn_2021, feature="distance_ferry_terminal")

In [None]:
# `distance_ferry_terminal`, Fairfax 2021.
plot_map(df=df_fair_2021, feature="distance_ferry_terminal")

In [None]:
# `distance_railway_station`, Connecticut 2021.
plot_map(df=df_conn_2021, feature="distance_railway_station")

In [None]:
# `distance_railway_station`, Fairfax 2021.
plot_map(df=df_fair_2021, feature="distance_railway_station")

In [None]:
# `distance_market`, Connecticut 2021.
plot_map(df=df_conn_2021, feature="distance_market")

In [None]:
# `distance_market`, Fairfax 2021.
plot_map(df=df_fair_2021, feature="distance_market")

In [None]:
# `distance_hospital`, Connecticut 2021.
plot_map(df=df_conn_2021, feature="distance_hospital")

In [None]:
# `distance_hospital`, Fairfax 2021.
plot_map(df=df_fair_2021, feature="distance_hospital")

In [None]:
# `distance_hotel`, Connecticut 2021.
plot_map(df=df_conn_2021, feature="distance_hotel")

In [None]:
# `distance_hotel`, Fairfax 2021.
plot_map(df=df_fair_2021, feature="distance_hotel")

In [None]:
# `distance_museum`, Connecticut 2021.
plot_map(df=df_conn_2021, feature="distance_museum")

In [None]:
# `distance_museum`, Fairfax 2021.
plot_map(df=df_fair_2021, feature="distance_museum")

### Uber H3

In [None]:
def geometry_to_h3(geometry, resolution):
    """Return the H3 cell index that contains ``geometry`` at ``resolution``.

    The geometry's centroid is used as the lookup coordinate, so polygons and
    other non-point shapes are accepted as well as points (a point is its own
    centroid). ``.y`` is passed as latitude and ``.x`` as longitude.
    """
    point = geometry.centroid
    return h3.geo_to_h3(point.y, point.x, resolution=resolution)

In [None]:
def h3_to_geometry(h3_index):
    """Build a shapely ``Polygon`` from the boundary of the H3 cell ``h3_index``.

    The boundary is requested with ``geo_json=True``
    (NOTE(review): presumably this yields (lng, lat) vertex order, matching
    shapely's (x, y) convention; confirm against the h3 docs).
    """
    cell_boundary = h3.h3_to_geo_boundary(h3_index, geo_json=True)
    return Polygon(cell_boundary)

In [None]:
def get_grouped_h3_df(df, feature, resolution, log=False, county=None):
    """Average ``feature`` per H3 cell and return the cells as a GeoDataFrame.

    Parameters
    ----------
    df : DataFrame
        Needs ``feature`` and a ``geometry_y`` column accepted by
        ``geometry_to_h3`` (and a ``county`` column when ``county`` is given).
        The caller's frame is left untouched.
    feature : str
        Column whose mean is computed per cell.
    resolution : int
        H3 resolution passed to ``geometry_to_h3``.
    log : bool
        If True, take ``np.log`` of the per-cell mean (log of the mean, not
        the mean of the logs).
    county : str, optional
        If given, only rows with this ``county`` value are used.

    Returns
    -------
    geopandas.GeoDataFrame
        Columns ``h3``, ``feature`` and ``geometry`` (cell polygon), CRS EPSG:4326.
    """
    if county:
        df = df[df["county"] == county]
    # Work on a private copy: the original wrote the "h3" column into the
    # caller's frame (or into a filtered slice, raising SettingWithCopyWarning).
    df = df.copy()
    df["h3"] = df["geometry_y"].map(lambda geom: geometry_to_h3(geom, resolution))
    grouped = df.groupby(by="h3")[feature].mean().to_frame().reset_index()
    grouped["geometry"] = grouped["h3"].map(h3_to_geometry)
    if log:
        grouped[feature] = np.log(grouped[feature])
    return gpd.GeoDataFrame(grouped, crs='EPSG:4326', geometry='geometry')

In [None]:
# Reduce every geometry to its centroid so each row has a single lat/lon point.
# assign() builds a new frame instead of writing into a slice of df_conn,
# which avoids SettingWithCopyWarning.
df_conn_2021 = df_conn_2021.assign(
    geometry_y=df_conn_2021["geometry_y"].map(lambda geom: geom.centroid)
)

In [None]:
# Reduce every geometry to its centroid so each row has a single lat/lon point.
# assign() builds a new frame instead of writing into a slice of df_fair,
# which avoids SettingWithCopyWarning.
df_fair_2021 = df_fair_2021.assign(
    geometry_y=df_fair_2021["geometry_y"].map(lambda geom: geom.centroid)
)

##### Price

In [None]:
# Mean price per resolution-7 H3 cell for the 2021 Fairfax subset.
# The map is centred on the coordinates the other Fairfax cells in this
# notebook use; the previous centre was the Connecticut one, which left the
# hexagons out of view.
grouped = get_grouped_h3_df(df_fair_2021, "price", 7, log=False, county="Fairfax")
m = folium.Map(location=[38.877716, -77.297486], zoom_start=10)
grouped.explore(column=grouped['price'], vmax = grouped['price'].quantile(0.75), tooltip=True, cmap="viridis", m = m)

In [None]:
# Mean price per resolution-7 H3 cell, Connecticut 2021; colours are capped at the 75th percentile.
grouped = get_grouped_h3_df(df_conn_2021, feature="price", resolution=7, log=False)
m = folium.Map(location=[41.6032, -73.0877], zoom_start=11)
grouped.explore(
    column=grouped["price"],
    cmap="viridis",
    tooltip=True,
    vmax=grouped["price"].quantile(0.75),
    m=m,
)

##### Features

In [None]:
# Mean `distance_aerodrome` per resolution-7 H3 cell, Connecticut 2021; colours capped at the 75th percentile.
grouped = get_grouped_h3_df(df_conn_2021, feature="distance_aerodrome", resolution=7)
m = folium.Map(location=[41.6032, -73.0877], zoom_start=11)
grouped.explore(
    column=grouped["distance_aerodrome"],
    cmap="viridis",
    tooltip=True,
    vmax=grouped["distance_aerodrome"].quantile(0.75),
    m=m,
)

In [None]:
# Mean of the chosen feature per resolution-7 H3 cell for the 2021 Fairfax subset.
# Alternative map centre used elsewhere in this notebook: [41.6032, -73.0877]
location = [38.877716, -77.297486]
feature = "n_utilities"
grouped = get_grouped_h3_df(df_fair_2021, feature=feature, resolution=7)
m = folium.Map(location=location, zoom_start=10)
grouped.explore(
    column=grouped[feature],
    cmap="viridis",
    tooltip=True,
    vmax=grouped[feature].quantile(0.75),
    m=m,
)

In [None]:
# Mean `distance_aerodrome` per resolution-7 H3 cell for the Fairfield rows of the test sample.
grouped = get_grouped_h3_df(
    df_conn_test, feature="distance_aerodrome", resolution=7, county="Fairfield"
)
m = folium.Map(location=[41.6032, -73.0877], zoom_start=11)
grouped.explore(
    column=grouped["distance_aerodrome"],
    cmap="viridis",
    tooltip=True,
    vmax=grouped["distance_aerodrome"].quantile(0.75),
    m=m,
)

In [None]:
# Mean `distance_aerodrome` per resolution-7 H3 cell for the whole test sample.
grouped = get_grouped_h3_df(df_conn_test, feature="distance_aerodrome", resolution=7)
m = folium.Map(location=[41.6032, -73.0877], zoom_start=11)
grouped.explore(
    column=grouped["distance_aerodrome"],
    cmap="viridis",
    tooltip=True,
    vmax=grouped["distance_aerodrome"].quantile(0.75),
    m=m,
)

#### Geo Feature

In [None]:
# Per-county point-of-interest tables, one CSV per county.
df_poi_fairfield = pd.read_csv("../data/poi_geo/fairfield.csv")
df_poi_fairfax = pd.read_csv("../data/poi_geo/fairfax.csv")
df_poi_hartford = pd.read_csv("../data/poi_geo/hartford.csv")
df_poi_litchfield = pd.read_csv("../data/poi_geo/litchfield.csv")
df_poi_middlesex = pd.read_csv("../data/poi_geo/middlesex.csv")
df_poi_new_haven = pd.read_csv("../data/poi_geo/new_haven.csv")
df_poi_new_london = pd.read_csv("../data/poi_geo/new_london.csv")
df_poi_tolland = pd.read_csv("../data/poi_geo/tolland.csv")
# Path previously contained a doubled slash ("poi_geo//windham.csv").
df_poi_windham = pd.read_csv("../data/poi_geo/windham.csv")

In [None]:
# Tag each per-county POI table with its county name (appended as a "county" column).
df_poi_fairfield = df_poi_fairfield.assign(county="Fairfield")
df_poi_fairfax = df_poi_fairfax.assign(county="Fairfax")
df_poi_hartford = df_poi_hartford.assign(county="Hartford")
df_poi_litchfield = df_poi_litchfield.assign(county="Litchfield")
df_poi_middlesex = df_poi_middlesex.assign(county="Middlesex")
df_poi_new_haven = df_poi_new_haven.assign(county="New Haven")
df_poi_new_london = df_poi_new_london.assign(county="New London")
df_poi_tolland = df_poi_tolland.assign(county="Tolland")
df_poi_windham = df_poi_windham.assign(county="Windham")

In [None]:
# Stack the per-county tables into one POI table.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of the individual CSVs repeat, which makes label-based lookups ambiguous.
df_poi = pd.concat(
    [
        df_poi_fairfield,
        df_poi_fairfax,
        df_poi_hartford,
        df_poi_litchfield,
        df_poi_middlesex,
        df_poi_new_haven,
        df_poi_new_london,
        df_poi_tolland,
        df_poi_windham,
    ],
    ignore_index=True,
)

In [None]:
# Keep only the OSM tag columns used below, plus geometry and county.
columns = [
    "building",
    "amenity",
    "atm",
    "bus",
    "public_transport",
    "tourism",
    "geometry",
    "aeroway",
    "ferry",
    "healthcare",
    "government",
    "bar",
    "railway",
    "highway",
    "school",
    "preschool",
    "museum",
    "county",
]
# Explicit copy: the following cell writes into these columns, and writing
# into a column selection of another frame triggers SettingWithCopyWarning.
df_poi = df_poi[columns].copy()

In [None]:
# Normalise the individual tag columns to amenity-style labels ...
df_poi.loc[:, "bus"] = np.where(df_poi["bus"].notna(), "bus_" + df_poi["public_transport"], df_poi["bus"])
df_poi.loc[:, "ferry"] = np.where(df_poi["ferry"].notna(), "ferry_" + df_poi["public_transport"], df_poi["ferry"])
df_poi.loc[:, "railway"] = "railway_" + df_poi["railway"]
df_poi.loc[:, "school"] = np.where(df_poi["school"] == "yes", "school", df_poi["school"])
df_poi.loc[:, "preschool"] = np.where(df_poi["preschool"] == "yes", "preschool", df_poi["preschool"])
df_poi.loc[:, "museum"] = np.where(df_poi["museum"].notna(), "museum", df_poi["museum"])

# ... then fill missing `amenity` values from those columns, in priority order:
# the first non-null fallback wins because later passes only touch rows still missing.
for _fallback_col in (
    "tourism",
    "aeroway",
    "healthcare",
    "bus",
    "ferry",
    "railway",
    "school",
    "preschool",
    "museum",
):
    df_poi.loc[:, "amenity"] = np.where(
        df_poi["amenity"].isna(), df_poi[_fallback_col], df_poi["amenity"]
    )

In [None]:
# Discard POIs that have no geometry.
df_poi = df_poi.dropna(axis="index", subset=["geometry"])

In [None]:
# Parse the WKT strings, wrap the table as a GeoDataFrame and
# reduce every shape to its centroid so each POI is a single point.
df_poi["geometry"] = df_poi["geometry"].map(wkt.loads)
df_geo = gpd.GeoDataFrame(df_poi, crs='epsg:4326')
df_geo["geometry"] = df_geo.geometry.map(lambda shape: shape.centroid)

In [None]:
# Mapping from amenity group name to the amenity labels that belong to it.
# Used to derive the `amenity_group` column from `amenity`.
points_of_interest = {
    "reli_inst": ["place_of_worship",
                  "monastery"],
    "edu_fac": ["school",
                "university",
                "college",
                "library",
                "kindergarten",
                "music_school",
                "prep_school",
                "driving_school",
                "childcare"],
    "healthcare": ["dentist",
                   "clinic",
                   "hospital",
                   "doctors",
                   "nursing_home"],
    "emergency": ["fire_station", "police"],
    "animalcare": ["shelter", "animal_shelter", "animal_boarding"],
    "commu_venu": ["social_facility",
                   "community_centre",
                   "exhibition_centre",
                   "conference_centre",
                   "social_centre",
                   "townhall",
                   "coworking_space"],
    "commu_serv": ["charity",
                   "public_building",
                   "grave_yard",
                   "crematorium",
                   "mortuary",
                   "ranger_station",
                   "post_depot",
                   "mail_room",
                   # A missing comma here used to merge the next two entries
                   # into the single string "public_bathpublic_bookcase".
                   "public_bath",
                   "public_bookcase"],
    "shopping": ["marketplace", "market"],
    "food_drink": ["restaurant",
                   "fast_food",
                   "ice_cream",
                   "cafe",
                   "bbq",
                   "canteen"],
    "financial": ["bank",
                  "atm",
                  "finance",
                  "money_transfer",
                  "check_cashing"],
    "transport": ["aerodrome",
                  "railway_platform",
                  "ferry_terminal",
                  "bus_station",
                  "boat_storage",
                  "bus_platform",
                  "taxi",
                  "railway_halt",
                  "railway_car_shuttle",
                  "car_sharing",
                  "railway_station",
                  "bus_stop_position",
                  "ferry_stop_position"],
    "entertainment": ["museum",
                      "arts_centre",
                      "theme_park",
                      "stadium_seating",
                      "cinema",
                      "theatre",
                      "attraction",
                      "amusement",
                      "events_venue",
                      "karaoke_box",
                      "music_venue",
                      "planetarium",
                      "lounge",
                      "internet_cafe"],
    "adults_entertain": ['hookah',
                         'hookah_lounge',
                         "biergarten",
                         "casino",
                         "nightclub",
                         "pub",
                         "stripclub",
                         "bar",
                         "love_hotel"],
    "sports": ["dancing_school",
               "dojo",
               "ski_school",
               "ski_rental"],
    "utilities": ["charging_station",
                  'compressed_air',
                  'sanitary_dump_station',
                  "vacuum_cleaner",
                  "waste_transfer_station",
                  "waste_disposal"],
    "accommodation": ["motel", "hotel", "hostel"],
    "government_civic": ["courthouse", "prison"],
    "recreational": ["park",
                     "campground",
                     "camp_site",
                     "picnic_site",
                     "zoo",
                     "aquarium",
                     "viewpoint",
                     "boat_rental",
                     "bicycle_rental"]
}


In [None]:
# Start with an all-missing, object-typed column: group names (strings) are
# written into it next, and filling a float NaN column with strings forces a
# dtype change that newer pandas versions warn about.
df_geo["amenity_group"] = np.full(len(df_geo), np.nan, dtype=object)

In [None]:
# Label each POI with the group whose amenity list contains its `amenity`.
# If an amenity appears in several groups, the group iterated last wins.
for key in points_of_interest:
    value = points_of_interest[key]
    in_group = df_geo["amenity"].isin(value)
    df_geo.loc[in_group, "amenity_group"] = key

In [None]:
# Which amenity groups actually occur in the data?
df_geo["amenity_group"].unique()

In [None]:
# Scatter all POIs of one amenity group on a map centred on `location`.
# Alternative map centre used elsewhere in this notebook: [41.6032, -73.0877]
location = [38.877716, -77.297486]
feature = "shopping"

df_geo_am = df_geo[df_geo["amenity_group"].eq(feature)]

m = folium.Map(location=location, zoom_start=10)

# One blue circle per POI; the popup shows its amenity label
for _, row in df_geo_am.iterrows():
    poi_point = row["geometry"]
    folium.CircleMarker(
        location=(poi_point.y, poi_point.x),
        radius=5,
        popup=f'{row["amenity"]}',
        color="blue",
    ).add_to(m)

m

In [None]:
# H3 cell (resolution 9) of every POI.
df_geo["h3"] = [geometry_to_h3(geom, resolution=9) for geom in df_geo.geometry]

In [None]:
# POIs of the "edu_fac" group located in Fairfield county.
df_geo_am = df_geo[
    (df_geo["amenity_group"] == "edu_fac") & (df_geo["county"] == "Fairfield")
]

In [None]:
# Amenity groups present in the data.
df_geo["amenity_group"].unique()

In [None]:
# Scatter all "shopping" POIs.
# (A `groupby("h3")` whose result was discarded has been removed; it had no effect.)
df_geo_am = df_geo[df_geo["amenity_group"] == "shopping"]

# Create a map centered around Connecticut
m = folium.Map(location=[41.6032, -73.0877], zoom_start=8)

# Add points to the map; the popup shows the POI's amenity label
for _, row in df_geo_am.iterrows():
    folium.CircleMarker(
        location=(row['geometry'].y, row['geometry'].x),
        radius=5,
        popup=f'{row["amenity"]}',
        color="blue",
    ).add_to(m)

m

In [None]:
def poi_h3_df(df, resolution=9, county=None, amenity_group=None, log=None):
    """Count POIs per H3 cell and return the cells as a GeoDataFrame.

    ``df`` is copied first. Rows are optionally restricted to one
    ``amenity_group`` and/or one ``county``; each remaining row is assigned to
    the H3 cell of its geometry at ``resolution`` via ``geometry_to_h3``.
    The result has the columns ``h3``, ``freq`` (number of rows in the cell,
    or ``np.log`` of it when ``log`` is truthy) and ``geometry`` (cell polygon
    from ``h3_to_geometry``), with CRS EPSG:4326.
    """
    pois = df.copy()
    if amenity_group:
        pois = pois[pois["amenity_group"] == amenity_group]
    if county:
        pois = pois[pois["county"] == county]
    pois["h3"] = pois.geometry.map(lambda geom: geometry_to_h3(geom, resolution=resolution))

    cell_counts = pois.groupby(by="h3").size().reset_index(name="freq")
    cell_counts["geometry"] = cell_counts["h3"].map(h3_to_geometry)
    if log:
        cell_counts["freq"] = np.log(cell_counts["freq"])
    return gpd.GeoDataFrame(cell_counts, crs='EPSG:4326', geometry='geometry')



In [None]:
# Log count of "edu_fac" POIs per resolution-7 H3 cell in Fairfield.
grouped = poi_h3_df(df_geo, resolution=7, county="Fairfield", amenity_group="edu_fac", log=True)
m = folium.Map(location=[41.6032, -73.0877], zoom_start=11)
grouped.explore(
    column=grouped["freq"],
    cmap="viridis",
    tooltip=True,
    vmax=grouped["freq"].max(),
    m=m,
)

In [None]:
# Amenity groups available for the maps below.
df_geo["amenity_group"].unique()

In [None]:
# Log count of POIs of the chosen amenity group per resolution-7 H3 cell (all counties).
# Alternative map centre used elsewhere in this notebook: [41.6032, -73.0877]
location = [38.877716, -77.297486]
feature = "shopping"

grouped = poi_h3_df(df_geo, resolution=7, amenity_group=feature, log=True)
m = folium.Map(location=location, zoom_start=10)
grouped.explore(
    column=grouped["freq"],
    cmap="viridis",
    tooltip=True,
    vmax=grouped["freq"].max(),
    m=m,
)

In [None]:
# Log count of "reli_inst" POIs per resolution-7 H3 cell in Fairfield.
grouped = poi_h3_df(df_geo, resolution=7, county="Fairfield", amenity_group="reli_inst", log=True)
m = folium.Map(location=[41.6032, -73.0877], zoom_start=11)
grouped.explore(
    column=grouped["freq"],
    cmap="viridis",
    tooltip=True,
    vmax=grouped["freq"].max(),
    m=m,
)

In [None]:
# Log count of "food_drink" POIs per resolution-7 H3 cell in Fairfield.
grouped = poi_h3_df(df_geo, resolution=7, county="Fairfield", amenity_group="food_drink", log=True)
m = folium.Map(location=[41.6032, -73.0877], zoom_start=11)
grouped.explore(
    column=grouped["freq"],
    cmap="viridis",
    tooltip=True,
    vmax=grouped["freq"].max(),
    m=m,
)