In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd '/content/drive/MyDrive/Colab Notebooks/EY 2025 DS Challenge'

/content/drive/MyDrive/Colab Notebooks/EY 2025 DS Challenge


# Load Original Dataset

In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta
import geopandas as gpd
from shapely.geometry import Point
from scipy.spatial import cKDTree

In [None]:
# Load data.csv and weather.csv
data_df = pd.read_csv('datasets/data.csv', parse_dates=['datetime'])
data_df.rename(columns={'datetime': 'Datetime'}, inplace=True)
sub_df = pd.read_csv('datasets/template.csv')

weather_df = pd.read_csv('datasets/weather.csv', parse_dates=['Datetime'])
gdf_buildings = gpd.read_file("datasets/footprints.kml", driver='KML')

display(data_df.head())
display(sub_df.head())
display(weather_df.head())
display(gdf_buildings.head())

Unnamed: 0,Longitude,Latitude,Datetime,UHI Index
0,-73.909167,40.813107,2021-07-24 15:53:00,1.030289
1,-73.909187,40.813045,2021-07-24 15:53:00,1.030289
2,-73.909215,40.812978,2021-07-24 15:53:00,1.023798
3,-73.909242,40.812908,2021-07-24 15:53:00,1.023798
4,-73.909257,40.812845,2021-07-24 15:53:00,1.021634


Unnamed: 0,Longitude,Latitude,UHI Index
0,-73.971665,40.788763,
1,-73.971928,40.788875,
2,-73.96708,40.78908,
3,-73.97255,40.789082,
4,-73.969697,40.787953,


Unnamed: 0,Region,Longitude,Latitude,Datetime,Air Temp at Surface,Relative Humidity,Avg Wind Speed,Wind Direction,Solar Flux
0,Bronx,-73.89352,40.87248,2021-07-24 06:00:00,19.3,88.2,0.8,335,12
1,Bronx,-73.89352,40.87248,2021-07-24 06:05:00,19.4,87.9,0.8,329,18
2,Bronx,-73.89352,40.87248,2021-07-24 06:10:00,19.3,87.6,0.7,321,25
3,Bronx,-73.89352,40.87248,2021-07-24 06:15:00,19.4,87.4,0.5,307,33
4,Bronx,-73.89352,40.87248,2021-07-24 06:20:00,19.4,87.0,0.2,301,42


Unnamed: 0,Name,Description,geometry
0,,,"MULTIPOLYGON (((-73.91903 40.8482, -73.91933 4..."
1,,,"MULTIPOLYGON (((-73.92195 40.84963, -73.92191 ..."
2,,,"MULTIPOLYGON (((-73.9205 40.85011, -73.92045 4..."
3,,,"MULTIPOLYGON (((-73.92056 40.8514, -73.92053 4..."
4,,,"MULTIPOLYGON (((-73.91234 40.85218, -73.91247 ..."


# Map footprints onto orignal data and submission data
[To do: try setting different buffersize for each region]

In [None]:
# Convert Longitude and Latitude columns into Point geometries
train_gdf = gpd.GeoDataFrame(data_df, geometry=gpd.points_from_xy(data_df['Longitude'], data_df['Latitude']))
sub_gdf = gpd.GeoDataFrame(sub_df, geometry=gpd.points_from_xy(sub_df['Longitude'], sub_df['Latitude']))

# Ensure all dataframes use the same CRS (Coordinate Reference System)
train_gdf.set_crs('EPSG:4326', allow_override=True, inplace=True)
sub_gdf.set_crs('EPSG:4326', allow_override=True, inplace=True)
gdf_buildings.set_crs('EPSG:4326', allow_override=True, inplace=True)

display(train_gdf.head())
display(sub_gdf.head())

Unnamed: 0,Longitude,Latitude,Datetime,UHI Index,geometry
0,-73.909167,40.813107,2021-07-24 15:53:00,1.030289,POINT (-73.90917 40.81311)
1,-73.909187,40.813045,2021-07-24 15:53:00,1.030289,POINT (-73.90919 40.81304)
2,-73.909215,40.812978,2021-07-24 15:53:00,1.023798,POINT (-73.90922 40.81298)
3,-73.909242,40.812908,2021-07-24 15:53:00,1.023798,POINT (-73.90924 40.81291)
4,-73.909257,40.812845,2021-07-24 15:53:00,1.021634,POINT (-73.90926 40.81284)


Unnamed: 0,Longitude,Latitude,UHI Index,geometry
0,-73.971665,40.788763,,POINT (-73.97166 40.78876)
1,-73.971928,40.788875,,POINT (-73.97193 40.78888)
2,-73.96708,40.78908,,POINT (-73.96708 40.78908)
3,-73.97255,40.789082,,POINT (-73.97255 40.78908)
4,-73.969697,40.787953,,POINT (-73.9697 40.78795)


In [None]:
# Set a buffer size variable for radius (in meters)
buffer_size = 1500

## Nearest Building Distance

In [None]:
# Function to calculate nearest building distance for each point using vectorized operations
def nearest_building_distance(sub_gdf, gdf_buildings, buffer_size):
    # Convert building footprints and sub points into the same CRS if not already
    gdf_buildings = gdf_buildings.to_crs(sub_gdf.crs)

    # Create a buffer around each sub point (this is a vectorized operation)
    sub_gdf['buffer'] = sub_gdf.geometry.buffer(buffer_size)

    # Spatial join to find nearest buildings within the buffer
    nearest_buildings = gpd.sjoin_nearest(sub_gdf, gdf_buildings, how="left", distance_col="nearest_building_distance")

    return nearest_buildings['nearest_building_distance']

# Apply the optimized function to calculate nearest building distance for the sub data
sub_df['nearest_building_distance'] = nearest_building_distance(sub_gdf, gdf_buildings, buffer_size)

# Apply the same to train data if necessary (can be done in a similar manner)
data_df['nearest_building_distance'] = nearest_building_distance(train_gdf, gdf_buildings, buffer_size)


## Building Density

In [None]:
from sklearn.neighbors import BallTree

# Convert building geometries to numpy array (lon, lat) from centroids
building_coords = np.array([(geom.x, geom.y) for geom in gdf_buildings.geometry.centroid])

# Initialize BallTree with geospatial coordinates (convert to radians for haversine)
tree = BallTree(np.radians(building_coords), metric='haversine')

# Function to calculate building density (number of nearby buildings within a radius)
def building_density_balltree(row, tree, radius=buffer_size):
    point = np.radians([[row.geometry.x, row.geometry.y]])  # Convert point to radians
    indices = tree.query_radius(point, r=radius / 6371000)  # Radius in kilometers, convert to radians
    return len(indices[0])  # Return the number of nearby buildings

# Apply BallTree density calculation for train and sub datasets
train_gdf['building_density'] = train_gdf.apply(building_density_balltree, axis=1, tree=tree)
sub_gdf['building_density'] = sub_gdf.apply(building_density_balltree, axis=1, tree=tree)

# Apply the same to train data if necessary (can be done in a similar manner)
data_df['building_density'] = train_gdf['building_density']
sub_df['building_density'] = sub_gdf['building_density']

## Building Area and Compactness

In [None]:
# Calculate building area
gdf_buildings['building_area'] = gdf_buildings.geometry.area

# Function to calculate building compactness
def calculate_compactness(geometry):
    perimeter = geometry.length
    area = geometry.area
    if area == 0:  # Avoid division by zero
        return np.nan
    compactness = (4 * np.pi * area) / (perimeter ** 2)
    return compactness

# Apply the compactness function to each building geometry
gdf_buildings['building_compactness'] = gdf_buildings.geometry.apply(calculate_compactness)

# Now let's add building area and compactness to the train and sub datasets
# Convert building geometries to numpy array (lon, lat) from centroids for BallTree
building_coords = np.array([(geom.centroid.x, geom.centroid.y) for geom in gdf_buildings.geometry])

# Initialize BallTree with geospatial coordinates (convert to radians for haversine)
tree = BallTree(np.radians(building_coords), metric='haversine')

# Function to calculate building density (number of nearby buildings within a radius)
def building_density_balltree(row, tree, radius=buffer_size):
    point = np.radians([[row.geometry.x, row.geometry.y]])  # Convert point to radians
    indices = tree.query_radius(point, r=radius / 6371000)  # Radius in kilometers, convert to radians
    return len(indices[0])  # Return the number of nearby buildings

# Apply BallTree density calculation for train and sub datasets
train_gdf['building_density'] = train_gdf.apply(building_density_balltree, axis=1, tree=tree)
sub_gdf['building_density'] = sub_gdf.apply(building_density_balltree, axis=1, tree=tree)

# Add the calculated building_area, building_compactness, and building_density to the data and sub datasets
data_df['building_area'] = gdf_buildings['building_area']
data_df['building_compactness'] = gdf_buildings['building_compactness']
sub_df['building_area'] = gdf_buildings['building_area']
sub_df['building_compactness'] = gdf_buildings['building_compactness']


## Final Look after combining Footprints

In [None]:
display(data_df.head())
display(sub_df.head())

Unnamed: 0,Longitude,Latitude,Datetime,UHI Index,nearest_building_distance,building_density,building_area,building_compactness
0,-73.909167,40.813107,2021-07-24 15:53:00,1.030289,0.000145,2804,6.62175e-08,0.501687
1,-73.909187,40.813045,2021-07-24 15:53:00,1.030289,0.000147,2805,1.0179e-08,0.62032
2,-73.909215,40.812978,2021-07-24 15:53:00,1.023798,0.000155,2807,1.5094e-08,0.572883
3,-73.909242,40.812908,2021-07-24 15:53:00,1.023798,0.000142,2799,8.512e-09,0.533518
4,-73.909257,40.812845,2021-07-24 15:53:00,1.021634,0.000124,2795,2.3091e-08,0.756804


Unnamed: 0,Longitude,Latitude,UHI Index,nearest_building_distance,building_density,building_area,building_compactness
0,-73.971665,40.788763,,0.000156,796,6.62175e-08,0.501687
1,-73.971928,40.788875,,0.000138,791,1.0179e-08,0.62032
2,-73.96708,40.78908,,0.0,931,1.5094e-08,0.572883
3,-73.97255,40.789082,,5e-06,775,8.512e-09,0.533518
4,-73.969697,40.787953,,8.9e-05,849,2.3091e-08,0.756804


# Map weather data onto the original and submission data
Here we assume that the submission data is collected around the same time with the orginal data provided.

## Define the potential region (Manhattan/Bronx) for each coordinate in the orginal and submission data

In [None]:
from geopy.distance import geodesic

# Coordinates for Manhattan and Bronx based on NY weather excel
manhattan_coords = (-73.96449, 40.76754)
bronx_coords = (-73.89352, 40.87248)

# Function to assign the correct region based on the closest distance
def assign_region_by_distance(row):
    data_coords = (row['Latitude'], row['Longitude'])

    # Calculate distances to Manhattan and Bronx
    manhattan_distance = geodesic(data_coords, manhattan_coords).meters
    bronx_distance = geodesic(data_coords, bronx_coords).meters

    # Assign to the closest region
    if manhattan_distance < bronx_distance:
        return 'Manhattan'
    else:
        return 'Bronx'

# Apply the function to assign regions based on proximity
data_df['Region'] = data_df.apply(assign_region_by_distance, axis=1)
sub_df['Region'] = sub_df.apply(assign_region_by_distance, axis=1)


## Map the closest datetime to the submission data based on the coordinates with original data
If a coordinate is found to have similar time, it takes the closest time in 10m radius

In [None]:
distance_threshold = 10

In [None]:
from sklearn.neighbors import KDTree
import datetime

# Function to calculate the midpoint of two times
def calculate_midpoint_time(time1, time2):
    time1 = pd.to_datetime(time1)
    time2 = pd.to_datetime(time2)
    midpoint = time1 + (time2 - time1) / 2
    return midpoint

# Function to match closest coordinates and estimate time
def match_closest_coordinates_and_estimate_time(train_df, sub_df):
    # Convert latitude and longitude to a 2D array for KDTree
    train_coords = train_df[['Latitude', 'Longitude']].values
    sub_coords = sub_df[['Latitude', 'Longitude']].values

    # Create a KDTree from the training data
    tree = KDTree(train_coords)

    # Initialize a list to store estimated times
    estimated_times = []

    # Iterate over each sub record
    for sub_row in sub_df.itertuples():
        sub_coord = (sub_row.Latitude, sub_row.Longitude)

        # Find the closest training coordinates using KDTree
        dist, ind = tree.query([sub_coord], k=2)  # k=2 to find the two closest matches

        # Get the closest and second closest index
        closest_index = ind[0][0]
        second_closest_index = ind[0][1]

        # If both coordinates are very close, calculate the midpoint time
        if dist[0][0] < distance_threshold:  # [edit this if needed]
            time1 = train_df.loc[closest_index, 'Datetime']
            time2 = train_df.loc[second_closest_index, 'Datetime']
            estimated_time = calculate_midpoint_time(time1, time2)
        else:
            # If the closest coordinates are far enough apart, use the time from the closest match
            estimated_time = train_df.loc[closest_index, 'Datetime']

        # Append the estimated time
        estimated_times.append(estimated_time)

    # Add the estimated times as a new column in the sub DataFrame
    sub_df['Datetime'] = estimated_times

    return sub_df

# Call the function to match coordinates and estimate time
sub_df = match_closest_coordinates_and_estimate_time(data_df, sub_df)


## Based on the region and time, assign the closest weather

In [None]:
from geopy.distance import geodesic

# Filter weather data for each region (Manhattan and Bronx)
manhattan_weather = weather_df[weather_df['Region'] == 'Manhattan']
bronx_weather = weather_df[weather_df['Region'] == 'Bronx']

# Define a function to find the closest datetime in weather data for a given row in data_df
def find_closest_weather_datetime(row, weather_df):
    data_datetime = pd.to_datetime(row['Datetime'])

    # Find the closest datetime in weather data
    weather_df['Datetime'] = pd.to_datetime(weather_df['Datetime'])
    closest_row = weather_df.iloc[(weather_df['Datetime'] - data_datetime).abs().argmin()]

    return closest_row

# Function to assign weather data to data_df based on Region and closest Datetime
def assign_weather_data(row):
    if row['Region'] == 'Manhattan':
        weather_row = find_closest_weather_datetime(row, manhattan_weather)
    elif row['Region'] == 'Bronx':
        weather_row = find_closest_weather_datetime(row, bronx_weather)
    else:
        return pd.Series([None] * 5, index=['Air Temp at Surface', 'Relative Humidity', 'Avg Wind Speed', 'Wind Direction', 'Solar Flux'])

    # Return the weather data as a new row
    return pd.Series({
        'Air Temp at Surface': weather_row['Air Temp at Surface'],
        'Relative Humidity': weather_row['Relative Humidity'],
        'Avg Wind Speed': weather_row['Avg Wind Speed'],
        'Wind Direction': weather_row['Wind Direction'],
        'Solar Flux': weather_row['Solar Flux']
    })

weather_data = data_df.apply(assign_weather_data, axis=1)
data_df = pd.concat([data_df, weather_data], axis=1)

weather_data = sub_df.apply(assign_weather_data, axis=1)
sub_df = pd.concat([sub_df, weather_data], axis=1)

## Final look on the combined weather info onto the original and submission datasets

In [None]:
display(data_df)
display(sub_df)

Unnamed: 0,Longitude,Latitude,Datetime,UHI Index,nearest_building_distance,building_density,building_area,building_compactness,Region,Air Temp at Surface,Relative Humidity,Avg Wind Speed,Wind Direction,Solar Flux
0,-73.909167,40.813107,2021-07-24 15:53:00,1.030289,0.000145,2804,6.621750e-08,0.501687,Manhattan,26.8,46.7,3.4,196.0,605.0
1,-73.909187,40.813045,2021-07-24 15:53:00,1.030289,0.000147,2805,1.017900e-08,0.620320,Manhattan,26.8,46.7,3.4,196.0,605.0
2,-73.909215,40.812978,2021-07-24 15:53:00,1.023798,0.000155,2807,1.509400e-08,0.572883,Manhattan,26.8,46.7,3.4,196.0,605.0
3,-73.909242,40.812908,2021-07-24 15:53:00,1.023798,0.000142,2799,8.512000e-09,0.533518,Manhattan,26.8,46.7,3.4,196.0,605.0
4,-73.909257,40.812845,2021-07-24 15:53:00,1.021634,0.000124,2795,2.309100e-08,0.756804,Manhattan,26.8,46.7,3.4,196.0,605.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11224,-73.957050,40.790333,2021-07-24 15:57:00,0.972470,0.002234,1380,,,Manhattan,26.8,46.7,3.4,196.0,605.0
11225,-73.957063,40.790308,2021-07-24 15:57:00,0.972470,0.002228,1380,,,Manhattan,26.8,46.7,3.4,196.0,605.0
11226,-73.957093,40.790270,2021-07-24 15:57:00,0.981124,0.002211,1378,,,Manhattan,26.8,46.7,3.4,196.0,605.0
11227,-73.957112,40.790253,2021-07-24 15:59:00,0.981245,0.002199,1378,,,Manhattan,27.0,46.1,2.7,209.0,620.0


Unnamed: 0,Longitude,Latitude,UHI Index,nearest_building_distance,building_density,building_area,building_compactness,Region,Datetime,Air Temp at Surface,Relative Humidity,Avg Wind Speed,Wind Direction,Solar Flux
0,-73.971665,40.788763,,0.000156,796,6.621750e-08,0.501687,Manhattan,2021-07-24 15:30:00,27.3,45.4,3.8,202.0,349.0
1,-73.971928,40.788875,,0.000138,791,1.017900e-08,0.620320,Manhattan,2021-07-24 15:30:00,27.3,45.4,3.8,202.0,349.0
2,-73.967080,40.789080,,0.000000,931,1.509400e-08,0.572883,Manhattan,2021-07-24 15:29:00,27.3,45.4,3.8,202.0,349.0
3,-73.972550,40.789082,,0.000005,775,8.512000e-09,0.533518,Manhattan,2021-07-24 15:30:00,27.3,45.4,3.8,202.0,349.0
4,-73.969697,40.787953,,0.000089,849,2.309100e-08,0.756804,Manhattan,2021-07-24 15:29:00,27.3,45.4,3.8,202.0,349.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1035,-73.919388,40.813803,,0.000110,2523,1.530800e-08,0.508707,Manhattan,2021-07-24 15:26:00,27.2,46.4,1.4,175.0,725.0
1036,-73.931033,40.833178,,0.000112,2131,1.248400e-08,0.444086,Manhattan,2021-07-24 15:35:00,26.8,47.6,2.4,209.0,511.0
1037,-73.934647,40.854542,,0.000133,1362,3.905800e-08,0.653475,Manhattan,2021-07-24 15:32:00,27.3,45.4,3.8,202.0,349.0
1038,-73.917223,40.815413,,0.000099,2697,7.773500e-08,0.752537,Manhattan,2021-07-24 15:28:00,27.3,45.4,3.8,202.0,349.0


# Combine Sentinel2 Information

In [None]:
!pip install planetary-computer rioxarray



In [None]:
# Supress Warnings
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Multi-dimensional arrays and datasets
import xarray as xr

# Geospatial raster data handling
import rioxarray as rxr

# Geospatial data analysis
import geopandas as gpd

# Geospatial operations
import rasterio
from rasterio import windows
from rasterio import features
from rasterio import warp
from rasterio.warp import transform_bounds
from rasterio.windows import from_bounds

# Image Processing
from PIL import Image

# Coordinate transformations
from pyproj import Proj, Transformer, CRS

# Planetary Computer Tools
import pystac_client
import planetary_computer as pc
from pystac.extensions.eo import EOExtension as eo

# Others
import os
from tqdm import tqdm

## Load S2 Tiff

In [None]:
# Open the GeoTIFF file
S2_TIFF = "datasets/S2_sample_all.tiff"

# Read the bands from the GeoTIFF file
with rasterio.open(S2_TIFF) as src1:
    sband1 = src1.read(1)  # Band [B01]
    sband2 = src1.read(2)  # Band [B02]
    sband3 = src1.read(3)  # Band [B03]
    sband4 = src1.read(4)  # Band [B04]
    sband5 = src1.read(5)  # Band [B05]
    sband6 = src1.read(6)  # Band [B06]
    sband7 = src1.read(7)  # Band [B07]
    sband8 = src1.read(8)  # Band [B08]
    sband8a = src1.read(9)  # Band [B8a]
    sband11 = src1.read(10)  # Band [B11]
    sband12 = src1.read(11)  # Band [B12]

## Extract S2 Info

In [None]:
# Extracts satellite band values from a GeoTIFF based on coordinates from a csv file and returns them in a DataFrame.

def map_satellite_data(tiff_path, csv_path):

    # Load the GeoTIFF data
    data = rxr.open_rasterio(tiff_path)
    tiff_crs = data.rio.crs

    # Read the Excel file using pandas
    df = pd.read_csv(csv_path)
    latitudes = df['Latitude'].values
    longitudes = df['Longitude'].values

    # 3. Convert lat/long to the GeoTIFF's CRS
    # Create a Proj object for EPSG:4326 (WGS84 - lat/long) and the GeoTIFF's CRS
    proj_wgs84 = Proj(init='epsg:4326')  # EPSG:4326 is the common lat/long CRS
    proj_tiff = Proj(tiff_crs)

    # Create a transformer object
    transformer = Transformer.from_proj(proj_wgs84, proj_tiff)

    B01_values = []
    B02_values = []
    B03_values = []
    B04_values = []
    B05_values = []
    B06_values = []
    B07_values = []
    B08_values = []
    B8A_values = []
    B11_values = []
    B12_values = []

# Iterate over the latitudes and longitudes, and extract the corresponding band values
    for lat, lon in tqdm(zip(latitudes, longitudes), total=len(latitudes), desc="Mapping values"):
    # Assuming the correct dimensions are 'y' and 'x' (replace these with actual names from data.coords)

        B01_value = data.sel(x=lon, y=lat, band=1, method="nearest").values
        B01_values.append(B01_value)

        B02_value = data.sel(x=lon, y=lat, band=2, method="nearest").values
        B02_values.append(B02_value)

        B03_value = data.sel(x=lon, y=lat, band=1, method="nearest").values
        B03_values.append(B03_value)

        B04_value = data.sel(x=lon, y=lat, band=2, method="nearest").values
        B04_values.append(B04_value)

        B05_value = data.sel(x=lon, y=lat, band=1, method="nearest").values
        B05_values.append(B05_value)

        B06_value = data.sel(x=lon, y=lat, band=3, method="nearest").values
        B06_values.append(B06_value)

        B07_value = data.sel(x=lon, y=lat, band=3, method="nearest").values
        B07_values.append(B07_value)

        B08_value = data.sel(x=lon, y=lat, band=4, method="nearest").values
        B08_values.append(B08_value)

        B8A_value = data.sel(x=lon, y=lat, band=4, method="nearest").values
        B8A_values.append(B8A_value)

        B11_value = data.sel(x=lon, y=lat, band=4, method="nearest").values
        B11_values.append(B11_value)

        B12_value = data.sel(x=lon, y=lat, band=4, method="nearest").values
        B12_values.append(B12_value)

    # Create a DataFrame with the band values
    # Create a DataFrame to store the band values
    df = pd.DataFrame()
    df['SB01'] = B01_values
    df['SB02'] = B02_values
    df['SB03'] = B03_values
    df['SB04'] = B04_values
    df['SB06'] = B06_values
    df['SB07'] = B07_values
    df['SB08'] = B08_values
    df['SB8A'] = B8A_values
    df['SB11'] = B11_values
    df['SB12'] = B12_values

    return df


In [None]:
# Mapping satellite data with training data.
S2_data = map_satellite_data('datasets/S2_sample_all.tiff', 'datasets/data.csv')
S2_sub = map_satellite_data('datasets/S2_sample_all.tiff', 'datasets/template.csv')

Mapping values: 100%|██████████| 11229/11229 [03:16<00:00, 57.25it/s]
Mapping values: 100%|██████████| 1040/1040 [00:19<00:00, 54.26it/s]


In [None]:
display(S2_data.head())
display(S2_sub.head())

Unnamed: 0,SB01,SB02,SB03,SB04,SB06,SB07,SB08,SB8A,SB11,SB12
0,846.0,1042.0,846.0,1042.0,1036.0,1036.0,1036.0,1036.0,1036.0,1036.0
1,846.0,1042.0,846.0,1042.0,1036.0,1036.0,1036.0,1036.0,1036.0,1036.0
2,846.0,583.0,846.0,583.0,818.0,818.0,709.0,709.0,709.0,709.0
3,846.0,581.0,846.0,581.0,733.0,733.0,657.0,657.0,657.0,657.0
4,846.0,655.0,846.0,655.0,744.0,744.0,745.0,745.0,745.0,745.0


Unnamed: 0,SB01,SB02,SB03,SB04,SB06,SB07,SB08,SB8A,SB11,SB12
0,811.0,459.0,811.0,459.0,617.0,617.0,432.0,432.0,432.0,432.0
1,1208.0,562.0,1208.0,562.0,731.0,731.0,647.0,647.0,647.0,647.0
2,899.0,955.0,899.0,955.0,1052.0,1052.0,1188.0,1188.0,1188.0,1188.0
3,1193.0,1132.0,1193.0,1132.0,1364.0,1364.0,1512.0,1512.0,1512.0,1512.0
4,1097.0,1506.0,1097.0,1506.0,1642.0,1642.0,1688.0,1688.0,1688.0,1688.0


In [None]:
# Combining ground data and final data into a single dataset.
uhi_data = pd.concat([data_df,S2_data], axis=1)
display(uhi_data)

uhi_sub = pd.concat([sub_df,S2_sub], axis=1)
display(uhi_sub)

Unnamed: 0,Longitude,Latitude,Datetime,UHI Index,nearest_building_distance,building_density,building_area,building_compactness,Region,Air Temp at Surface,...,SB01,SB02,SB03,SB04,SB06,SB07,SB08,SB8A,SB11,SB12
0,-73.909167,40.813107,2021-07-24 15:53:00,1.030289,0.000145,2804,6.621750e-08,0.501687,Manhattan,26.8,...,846.0,1042.0,846.0,1042.0,1036.0,1036.0,1036.0,1036.0,1036.0,1036.0
1,-73.909187,40.813045,2021-07-24 15:53:00,1.030289,0.000147,2805,1.017900e-08,0.620320,Manhattan,26.8,...,846.0,1042.0,846.0,1042.0,1036.0,1036.0,1036.0,1036.0,1036.0,1036.0
2,-73.909215,40.812978,2021-07-24 15:53:00,1.023798,0.000155,2807,1.509400e-08,0.572883,Manhattan,26.8,...,846.0,583.0,846.0,583.0,818.0,818.0,709.0,709.0,709.0,709.0
3,-73.909242,40.812908,2021-07-24 15:53:00,1.023798,0.000142,2799,8.512000e-09,0.533518,Manhattan,26.8,...,846.0,581.0,846.0,581.0,733.0,733.0,657.0,657.0,657.0,657.0
4,-73.909257,40.812845,2021-07-24 15:53:00,1.021634,0.000124,2795,2.309100e-08,0.756804,Manhattan,26.8,...,846.0,655.0,846.0,655.0,744.0,744.0,745.0,745.0,745.0,745.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11224,-73.957050,40.790333,2021-07-24 15:57:00,0.972470,0.002234,1380,,,Manhattan,26.8,...,481.0,473.0,481.0,473.0,708.0,708.0,528.0,528.0,528.0,528.0
11225,-73.957063,40.790308,2021-07-24 15:57:00,0.972470,0.002228,1380,,,Manhattan,26.8,...,481.0,540.0,481.0,540.0,742.0,742.0,610.0,610.0,610.0,610.0
11226,-73.957093,40.790270,2021-07-24 15:57:00,0.981124,0.002211,1378,,,Manhattan,26.8,...,481.0,540.0,481.0,540.0,742.0,742.0,610.0,610.0,610.0,610.0
11227,-73.957112,40.790253,2021-07-24 15:59:00,0.981245,0.002199,1378,,,Manhattan,27.0,...,481.0,540.0,481.0,540.0,742.0,742.0,610.0,610.0,610.0,610.0


Unnamed: 0,Longitude,Latitude,UHI Index,nearest_building_distance,building_density,building_area,building_compactness,Region,Datetime,Air Temp at Surface,...,SB01,SB02,SB03,SB04,SB06,SB07,SB08,SB8A,SB11,SB12
0,-73.971665,40.788763,,0.000156,796,6.621750e-08,0.501687,Manhattan,2021-07-24 15:30:00,27.3,...,811.0,459.0,811.0,459.0,617.0,617.0,432.0,432.0,432.0,432.0
1,-73.971928,40.788875,,0.000138,791,1.017900e-08,0.620320,Manhattan,2021-07-24 15:30:00,27.3,...,1208.0,562.0,1208.0,562.0,731.0,731.0,647.0,647.0,647.0,647.0
2,-73.967080,40.789080,,0.000000,931,1.509400e-08,0.572883,Manhattan,2021-07-24 15:29:00,27.3,...,899.0,955.0,899.0,955.0,1052.0,1052.0,1188.0,1188.0,1188.0,1188.0
3,-73.972550,40.789082,,0.000005,775,8.512000e-09,0.533518,Manhattan,2021-07-24 15:30:00,27.3,...,1193.0,1132.0,1193.0,1132.0,1364.0,1364.0,1512.0,1512.0,1512.0,1512.0
4,-73.969697,40.787953,,0.000089,849,2.309100e-08,0.756804,Manhattan,2021-07-24 15:29:00,27.3,...,1097.0,1506.0,1097.0,1506.0,1642.0,1642.0,1688.0,1688.0,1688.0,1688.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1035,-73.919388,40.813803,,0.000110,2523,1.530800e-08,0.508707,Manhattan,2021-07-24 15:26:00,27.2,...,1474.0,1086.0,1474.0,1086.0,1382.0,1382.0,1474.0,1474.0,1474.0,1474.0
1036,-73.931033,40.833178,,0.000112,2131,1.248400e-08,0.444086,Manhattan,2021-07-24 15:35:00,26.8,...,1014.0,548.0,1014.0,548.0,766.0,766.0,797.0,797.0,797.0,797.0
1037,-73.934647,40.854542,,0.000133,1362,3.905800e-08,0.653475,Manhattan,2021-07-24 15:32:00,27.3,...,917.0,1184.0,917.0,1184.0,1462.0,1462.0,1538.0,1538.0,1538.0,1538.0
1038,-73.917223,40.815413,,0.000099,2697,7.773500e-08,0.752537,Manhattan,2021-07-24 15:28:00,27.3,...,1890.0,1066.0,1890.0,1066.0,1244.0,1244.0,1368.0,1368.0,1368.0,1368.0


# Combine Landsat Information

# Save final combined datasets

In [None]:
uhi_data.to_csv('data_s2.csv', index=False)
uhi_sub.to_csv('sub_s2.csv', index=False)

In [None]:
uhi_data.columns

Index(['Longitude', 'Latitude', 'Datetime', 'UHI Index',
       'nearest_building_distance', 'building_density', 'building_area',
       'building_compactness', 'Region', 'Air Temp at Surface',
       'Relative Humidity', 'Avg Wind Speed', 'Wind Direction', 'Solar Flux',
       'SB01', 'SB02', 'SB03', 'SB04', 'SB06', 'SB07', 'SB08', 'SB8A', 'SB11',
       'SB12'],
      dtype='object')