In [1]:
pip install rioxarray rasterio pystac_client planetary_computer

Collecting rioxarray
  Downloading rioxarray-0.18.2-py3-none-any.whl.metadata (5.4 kB)
Collecting rasterio
  Downloading rasterio-1.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting pystac_client
  Downloading pystac_client-0.8.6-py3-none-any.whl.metadata (3.0 kB)
Collecting planetary_computer
  Downloading planetary_computer-1.0.0-py3-none-any.whl.metadata (7.4 kB)
Collecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting pystac>=1.10.0 (from pystac[validation]>=1.10.0->pystac_client)
  Downloading pystac-1.12.2-py3-none-any.whl.metadata (4.6 kB)
Collecting python-dotenv (from planetary_computer)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading rioxarray-0.18.2-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.9/61.9 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rasterio-1.4.3-cp310-cp310-man

In [2]:
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd

import xarray as xr
import rioxarray as rxr
import geopandas as gpd
import rasterio
from rasterio import windows
from rasterio import features
from rasterio import warp
from rasterio.warp import transform_bounds
from rasterio.windows import from_bounds
from scipy.spatial import cKDTree

from PIL import Image
from pyproj import Proj, Transformer, CRS

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
import xgboost as xgb
from sklearn.metrics import r2_score

import pystac_client
import planetary_computer as pc
from pystac.extensions.eo import EOExtension as eo

import os
from tqdm import tqdm

In [3]:
# Bronx weather sheet: parse the timestamp, derive an 'HH:MM' string column and keep
# only the readings whose time of day lies in 15:00-16:00 (both ends inclusive).
wind_df_bronx = (
    pd.read_excel('/kaggle/input/ey-challenge/NY_Mesonet_Weather.xlsx', sheet_name='Bronx')
    .assign(**{'Date / Time': lambda frame: pd.to_datetime(frame['Date / Time'])})
    .assign(Time=lambda frame: frame['Date / Time'].dt.strftime('%H:%M'))
    .loc[lambda frame: frame['Time'].between('15:00', '16:00')]
)

In [4]:
# Manhattan weather sheet: parse the timestamp, derive an 'HH:MM' string column and keep
# only the readings whose time of day lies in 15:00-16:00 (both ends inclusive).
wind_df_manhattan = (
    pd.read_excel('/kaggle/input/ey-challenge/NY_Mesonet_Weather.xlsx', sheet_name='Manhattan')
    .assign(**{'Date / Time': lambda frame: pd.to_datetime(frame['Date / Time'])})
    .assign(Time=lambda frame: frame['Date / Time'].dt.strftime('%H:%M'))
    .loc[lambda frame: frame['Time'].between('15:00', '16:00')]
)

In [5]:
# Load the ground data table. The preview below shows point coordinates, a timestamp,
# the 'UHI Index' column (used later as the regression target) and five
# meteorological columns: air, humidity, wind, dir, solar.
ground_df = pd.read_csv("/kaggle/input/ey-challenge/ground_data_ver2.csv")
ground_df

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,air,humidity,wind,dir,solar
0,-73.909167,40.813107,24-07-2021 15:53,1.030289,27.151579,46.340323,3.029015,151.687438,443.527262
1,-73.909187,40.813045,24-07-2021 15:53,1.030289,27.151246,46.342010,3.028945,151.703688,443.525144
2,-73.909215,40.812978,24-07-2021 15:53,1.023798,27.150867,46.343927,3.028867,151.722148,443.522738
3,-73.909242,40.812908,24-07-2021 15:53,1.023798,27.150478,46.345897,3.028786,151.741114,443.520266
4,-73.909257,40.812845,24-07-2021 15:53,1.021634,27.150151,46.347551,3.028717,151.757046,443.518189
...,...,...,...,...,...,...,...,...,...
11224,-73.957050,40.790333,24-07-2021 15:57,0.972470,26.892365,47.652594,2.975012,164.324124,441.880173
11225,-73.957063,40.790308,24-07-2021 15:57,0.972470,26.892204,47.653410,2.974978,164.331984,441.879148
11226,-73.957093,40.790270,24-07-2021 15:57,0.981124,26.891936,47.654764,2.974923,164.345020,441.877449
11227,-73.957112,40.790253,24-07-2021 15:59,0.981245,26.891809,47.655409,2.974896,164.351234,441.876639


In [6]:
# Collapse each station's filtered (15:00-16:00) readings into one scalar per variable.
# These module-level scalars are read by update_meteorological_data().
# NOTE(review): 'Wind Direction [degrees]' is averaged arithmetically; for an angle this
# is only meaningful when the readings do not wrap around 0/360 - confirm for this data.
wind_df_bronx_air = wind_df_bronx['Air Temp at Surface [degC]'].mean()
wind_df_bronx_humidity = wind_df_bronx['Relative Humidity [percent]'].mean()
wind_df_bronx_wind = wind_df_bronx['Avg Wind Speed [m/s]'].mean()
wind_df_bronx_dir = wind_df_bronx['Wind Direction [degrees]'].mean()
wind_df_bronx_solar = wind_df_bronx['Solar Flux [W/m^2]'].mean()

wind_df_manhattan_air = wind_df_manhattan['Air Temp at Surface [degC]'].mean()
wind_df_manhattan_humidity = wind_df_manhattan['Relative Humidity [percent]'].mean()
wind_df_manhattan_wind = wind_df_manhattan['Avg Wind Speed [m/s]'].mean()
wind_df_manhattan_dir = wind_df_manhattan['Wind Direction [degrees]'].mean()
wind_df_manhattan_solar = wind_df_manhattan['Solar Flux [W/m^2]'].mean()

# Reference coordinates (latitude, longitude) used to decide which station's averages a
# point receives. Presumably the two weather-station locations - verify against the source.
bronx_latitude, bronx_longitude = 40.87248, -73.89352
manhattan_latitude, manhattan_longitude = 40.76754, -73.96449

def euclidean_distance(lat1, lon1, lat2, lon2):
    """Return the straight-line distance between two (lat, lon) pairs.

    The coordinates are treated as plain planar numbers: no unit conversion or
    spherical correction is applied, so the result is in the same units as the inputs.
    """
    delta_lat = lat2 - lat1
    delta_lon = lon2 - lon1
    return np.sqrt(delta_lat ** 2 + delta_lon ** 2)

def update_meteorological_data(row):
    """Overwrite a row's meteorological fields with the nearer station's averages.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide 'Latitude' and 'Longitude'.

    Returns
    -------
    The same ``row`` object with 'air', 'humidity', 'wind', 'dir' and 'solar' set to
    the Bronx averages when the point is strictly closer to the Bronx reference
    coordinates, and to the Manhattan averages otherwise (ties go to Manhattan).
    """
    point_lat, point_lon = row['Latitude'], row['Longitude']
    distance_to_bronx = euclidean_distance(point_lat, point_lon, bronx_latitude, bronx_longitude)
    distance_to_manhattan = euclidean_distance(point_lat, point_lon, manhattan_latitude, manhattan_longitude)

    if distance_to_bronx < distance_to_manhattan:
        station_values = (
            wind_df_bronx_air,
            wind_df_bronx_humidity,
            wind_df_bronx_wind,
            wind_df_bronx_dir,
            wind_df_bronx_solar,
        )
    else:
        station_values = (
            wind_df_manhattan_air,
            wind_df_manhattan_humidity,
            wind_df_manhattan_wind,
            wind_df_manhattan_dir,
            wind_df_manhattan_solar,
        )

    # Assign in a fixed order so any newly created fields are appended consistently.
    for field, value in zip(('air', 'humidity', 'wind', 'dir', 'solar'), station_values):
        row[field] = value

    return row

# Row-wise pass: every point's air/humidity/wind/dir/solar values are replaced by the
# averages of whichever station update_meteorological_data() selects for it.
ground_df = ground_df.apply(update_meteorological_data, axis=1)
ground_df

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,air,humidity,wind,dir,solar
0,-73.909167,40.813107,24-07-2021 15:53,1.030289,27.492308,44.615385,3.100000,135.076923,445.692308
1,-73.909187,40.813045,24-07-2021 15:53,1.030289,27.492308,44.615385,3.100000,135.076923,445.692308
2,-73.909215,40.812978,24-07-2021 15:53,1.023798,27.492308,44.615385,3.100000,135.076923,445.692308
3,-73.909242,40.812908,24-07-2021 15:53,1.023798,27.492308,44.615385,3.100000,135.076923,445.692308
4,-73.909257,40.812845,24-07-2021 15:53,1.021634,27.492308,44.615385,3.100000,135.076923,445.692308
...,...,...,...,...,...,...,...,...,...
11224,-73.957050,40.790333,24-07-2021 15:57,0.972470,26.753846,48.353846,2.946154,171.076923,441.000000
11225,-73.957063,40.790308,24-07-2021 15:57,0.972470,26.753846,48.353846,2.946154,171.076923,441.000000
11226,-73.957093,40.790270,24-07-2021 15:57,0.981124,26.753846,48.353846,2.946154,171.076923,441.000000
11227,-73.957112,40.790253,24-07-2021 15:59,0.981245,26.753846,48.353846,2.946154,171.076923,441.000000


In [7]:
# Load the per-row band table (columns B01...B12 plus NDVI, NDBI, NDWI in the preview below).
# presumably one row per ground_df row, in the same order - verify against how the file was built
b_data = pd.read_csv("/kaggle/input/ey-challenge/B_data.csv")
b_data

Unnamed: 0,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B11,B12,NDVI,NDBI,NDWI
0,846.0,1042.0,1036.0,1036.0,1272.0,1502.0,1605.0,1906.0,1906.0,1265.0,1206.0,0.295717,-0.202144,-0.295717
1,846.0,1042.0,1036.0,1036.0,1272.0,1502.0,1605.0,1906.0,1906.0,1265.0,1206.0,0.295717,-0.202144,-0.295717
2,846.0,583.0,818.0,709.0,1054.0,1668.0,2097.0,2190.0,2190.0,991.0,777.0,0.510866,-0.376925,-0.456117
3,846.0,581.0,733.0,657.0,1054.0,1668.0,2097.0,2182.0,2182.0,991.0,741.5,0.537161,-0.375355,-0.497084
4,846.0,655.0,744.0,745.0,1021.0,1728.0,1943.0,2112.0,2112.0,1134.0,708.5,0.478474,-0.301294,-0.478992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11224,481.0,473.0,708.0,528.0,990.0,2382.0,2494.0,3284.0,3284.0,1079.0,501.0,0.722980,-0.505386,-0.645291
11225,481.0,540.0,742.0,610.0,990.0,2382.0,2494.0,2900.0,2900.0,1079.0,551.5,0.652422,-0.457653,-0.592532
11226,481.0,540.0,742.0,610.0,990.0,2382.0,2494.0,2900.0,2900.0,1079.0,551.5,0.652422,-0.457653,-0.592532
11227,481.0,540.0,742.0,610.0,990.0,2382.0,2494.0,2900.0,2900.0,1079.0,551.5,0.652422,-0.457653,-0.592532


In [8]:
# Load the building-footprint points and keep only their coordinates.
# count_nearby_points() pairs X with 'Longitude' and Y with 'Latitude'.
footprint_data = pd.read_csv("/kaggle/input/ey-challenge/Building_Footprint.csv")
footprint_data = footprint_data[['X', 'Y']]
footprint_data

Unnamed: 0,X,Y
0,-73.919277,40.848089
1,-73.921858,40.849620
2,-73.920382,40.850091
3,-73.920465,40.851392
4,-73.912466,40.852192
...,...,...
9441,-73.952052,40.779123
9442,-73.948788,40.776070
9443,-73.951236,40.768560
9444,-73.952602,40.759267


In [9]:
def count_nearby_points(ground_df, footprint_data, max_distance):
    """Add a 'Count' column: number of footprint points within ``max_distance`` of each row.

    Parameters
    ----------
    ground_df : pandas.DataFrame
        Must contain 'Longitude' and 'Latitude' columns. It is modified in place
        (a 'Count' column is added or overwritten).
    footprint_data : pandas.DataFrame
        Must contain 'X' and 'Y' columns; X is matched against 'Longitude' and
        Y against 'Latitude'.
    max_distance : float
        Search radius, expressed in the same units as the coordinates. Points at
        exactly this distance are included.

    Returns
    -------
    pandas.DataFrame
        The same ``ground_df`` object, now carrying the integer 'Count' column.
    """
    footprint_tree = cKDTree(footprint_data[['X', 'Y']].values)

    # Query all points in one batched call instead of one Python-level query per row;
    # return_length=True yields the neighbour counts directly.
    query_points = ground_df[['Longitude', 'Latitude']].to_numpy(dtype=float)
    ground_df['Count'] = footprint_tree.query_ball_point(
        query_points, r=max_distance, return_length=True
    )
    return ground_df

# Building-density feature: footprint points within a radius of 0.01 around each ground point.
# The radius is in coordinate units (the X/Y and Longitude/Latitude values are compared directly).
ground_df = count_nearby_points(ground_df, footprint_data, max_distance=0.01)
ground_df

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,air,humidity,wind,dir,solar,Count
0,-73.909167,40.813107,24-07-2021 15:53,1.030289,27.492308,44.615385,3.100000,135.076923,445.692308,632
1,-73.909187,40.813045,24-07-2021 15:53,1.030289,27.492308,44.615385,3.100000,135.076923,445.692308,628
2,-73.909215,40.812978,24-07-2021 15:53,1.023798,27.492308,44.615385,3.100000,135.076923,445.692308,622
3,-73.909242,40.812908,24-07-2021 15:53,1.023798,27.492308,44.615385,3.100000,135.076923,445.692308,618
4,-73.909257,40.812845,24-07-2021 15:53,1.021634,27.492308,44.615385,3.100000,135.076923,445.692308,614
...,...,...,...,...,...,...,...,...,...,...
11224,-73.957050,40.790333,24-07-2021 15:57,0.972470,26.753846,48.353846,2.946154,171.076923,441.000000,187
11225,-73.957063,40.790308,24-07-2021 15:57,0.972470,26.753846,48.353846,2.946154,171.076923,441.000000,187
11226,-73.957093,40.790270,24-07-2021 15:57,0.981124,26.753846,48.353846,2.946154,171.076923,441.000000,185
11227,-73.957112,40.790253,24-07-2021 15:59,0.981245,26.753846,48.353846,2.946154,171.076923,441.000000,185


In [10]:
# Place the band columns next to the ground columns. A column-wise concat aligns rows by
# index label, so this relies on ground_df and b_data sharing the same index and row order.
uhi_data = pd.concat([ground_df, b_data], axis=1)
uhi_data

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,air,humidity,wind,dir,solar,Count,...,B05,B06,B07,B08,B8A,B11,B12,NDVI,NDBI,NDWI
0,-73.909167,40.813107,24-07-2021 15:53,1.030289,27.492308,44.615385,3.100000,135.076923,445.692308,632,...,1272.0,1502.0,1605.0,1906.0,1906.0,1265.0,1206.0,0.295717,-0.202144,-0.295717
1,-73.909187,40.813045,24-07-2021 15:53,1.030289,27.492308,44.615385,3.100000,135.076923,445.692308,628,...,1272.0,1502.0,1605.0,1906.0,1906.0,1265.0,1206.0,0.295717,-0.202144,-0.295717
2,-73.909215,40.812978,24-07-2021 15:53,1.023798,27.492308,44.615385,3.100000,135.076923,445.692308,622,...,1054.0,1668.0,2097.0,2190.0,2190.0,991.0,777.0,0.510866,-0.376925,-0.456117
3,-73.909242,40.812908,24-07-2021 15:53,1.023798,27.492308,44.615385,3.100000,135.076923,445.692308,618,...,1054.0,1668.0,2097.0,2182.0,2182.0,991.0,741.5,0.537161,-0.375355,-0.497084
4,-73.909257,40.812845,24-07-2021 15:53,1.021634,27.492308,44.615385,3.100000,135.076923,445.692308,614,...,1021.0,1728.0,1943.0,2112.0,2112.0,1134.0,708.5,0.478474,-0.301294,-0.478992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11224,-73.957050,40.790333,24-07-2021 15:57,0.972470,26.753846,48.353846,2.946154,171.076923,441.000000,187,...,990.0,2382.0,2494.0,3284.0,3284.0,1079.0,501.0,0.722980,-0.505386,-0.645291
11225,-73.957063,40.790308,24-07-2021 15:57,0.972470,26.753846,48.353846,2.946154,171.076923,441.000000,187,...,990.0,2382.0,2494.0,2900.0,2900.0,1079.0,551.5,0.652422,-0.457653,-0.592532
11226,-73.957093,40.790270,24-07-2021 15:57,0.981124,26.753846,48.353846,2.946154,171.076923,441.000000,185,...,990.0,2382.0,2494.0,2900.0,2900.0,1079.0,551.5,0.652422,-0.457653,-0.592532
11227,-73.957112,40.790253,24-07-2021 15:59,0.981245,26.753846,48.353846,2.946154,171.076923,441.000000,185,...,990.0,2382.0,2494.0,2900.0,2900.0,1079.0,551.5,0.652422,-0.457653,-0.592532


In [11]:
# columns_to_check = ["B01", "B02", "B03", "B04", "B05", "B06", "B07", "B08", "B8A", "B11", "B12", 'NDVI', 'NDBI', 'NDWI']
# for col in columns_to_check:
#     uhi_data[col] = uhi_data[col].apply(lambda x: tuple(x) if isinstance(x, np.ndarray) and x.ndim > 0 else x)

# uhi_data = uhi_data.drop_duplicates(subset=columns_to_check, keep='first')
# uhi_data = uhi_data.reset_index(drop=True)
# uhi_data

In [12]:
# Correlation of every remaining column with 'UHI Index'.
# The five meteorological columns only ever hold one of two per-station value sets, so they
# all encode the same two-way split - which is why they show the same |r| in the output below.
correlation_with_uhi = uhi_data.drop(columns=['datetime', 'Longitude', 'Latitude']).corr()['UHI Index']
correlation_with_uhi

UHI Index    1.000000
air          0.303876
humidity    -0.303876
wind         0.303876
dir         -0.303876
solar        0.303876
Count        0.370112
B01          0.193179
B02          0.163307
B03          0.168818
B04          0.175350
B05          0.170996
B06          0.117021
B07          0.094459
B08          0.082083
B8A          0.082083
B11          0.187564
B12          0.149961
NDVI        -0.254888
NDBI         0.187625
NDWI         0.250060
Name: UHI Index, dtype: float64

In [13]:
# Keep the model features plus the target, then split 90/10 with a fixed seed.
# The feature order here must match the column list used when preparing the submission data.
uhi_data = uhi_data.loc[:, [
    'air', 'humidity', 'wind', 'dir', 'solar', 'Count',
    'B01', 'B02', 'B03', 'B04', 'B05', 'B11',
    'NDVI', 'NDBI', 'NDWI',
    'UHI Index',
]]

y = uhi_data['UHI Index'].to_numpy()
X = uhi_data.drop(columns='UHI Index').to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=123)

In [14]:
# Fit the scaler on the training split only, then reuse that fitted scaler for the
# held-out split (and later for the submission features).
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [15]:
# Active model: ExtraTreesRegressor. The commented-out blocks are alternative regressors
# kept for reference; un-comment exactly one block to swap the estimator.

# model = AdaBoostRegressor(n_estimators=50, random_state=42)
# model.fit(X_train, y_train)

# model = xgb.XGBRegressor(n_estimators=100, random_state=42)
# model.fit(X_train, y_train)

# NOTE(review): newer scikit-learn releases name this argument `estimator` rather than
# `base_estimator` - confirm against the installed version before re-enabling.
# model = BaggingRegressor(base_estimator=DecisionTreeRegressor(), n_estimators=50, random_state=42)
# model.fit(X_train, y_train)

model = ExtraTreesRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# model = RandomForestRegressor(n_estimators=100, random_state=42)
# model.fit(X_train, y_train)

In [16]:
# R^2 on the rows the model was fitted on (in-sample score).
insample_predictions = model.predict(X_train)
Y_train = y_train.tolist()
r2_score(Y_train, insample_predictions)

0.9989023822688963

In [17]:
# R^2 on the 10% split held out by train_test_split (not seen during fitting).
outsample_predictions = model.predict(X_test)
Y_test = y_test.tolist()
r2_score(Y_test, outsample_predictions)

0.8267497350384059

In [18]:
# Load the points to predict. In the preview below 'UHI Index' is empty; the
# meteorological and band columns are already present in the file.
test_file = pd.read_csv("/kaggle/input/ey-challenge/cal_data.csv")
test_file

Unnamed: 0,Longitude,Latitude,UHI Index,air,humidity,wind,dir,solar,B01,B02,...,B05,B06,B07,B08,B8A,B11,B12,NDVI,NDBI,NDWI
0,-73.971665,40.788763,,26.874672,47.742166,2.971326,165.186669,441.767747,811.0,459.0,...,984.0,2089.0,2405.0,2502.0,2502.0,893.0,523.5,0.705521,-0.473932,-0.604360
1,-73.971928,40.788875,,26.875446,47.738249,2.971487,165.148950,441.772663,1208.0,562.0,...,1112.0,2076.0,2248.0,2906.0,2906.0,1188.0,566.0,0.635801,-0.419638,-0.598020
2,-73.967080,40.789080,,26.874395,47.743566,2.971268,165.200155,441.765989,899.0,955.0,...,979.0,995.0,1158.0,1246.0,1246.0,763.0,1155.0,0.023829,-0.240418,-0.084421
3,-73.972550,40.789082,,26.877017,47.730294,2.971814,165.072351,441.782647,1193.0,1132.0,...,1866.0,1939.0,2076.0,1774.0,1774.0,2346.0,1189.0,0.079732,0.138835,-0.130656
4,-73.969697,40.787953,,26.869208,47.769827,2.970188,165.453031,441.733029,1097.0,1506.0,...,1294.0,2204.0,2411.0,2834.0,2834.0,1848.0,1293.5,0.253428,-0.210594,-0.266309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1035,-73.919388,40.813803,,27.124466,46.477585,3.023366,153.009221,443.354978,1474.0,1086.0,...,1824.0,1553.0,1995.0,1578.0,1578.0,2089.0,1562.0,0.034076,0.139351,-0.066216
1036,-73.931033,40.833178,,27.178869,46.202166,3.034700,150.357037,443.700669,1014.0,548.0,...,2034.0,2393.0,2701.0,2798.0,2798.0,1201.0,710.0,0.556606,-0.399350,-0.570146
1037,-73.934647,40.854542,,27.250184,45.841134,3.049558,146.880437,444.153815,917.0,1184.0,...,1633.0,1939.0,2413.0,2244.0,2244.0,1597.0,1402.0,0.186674,-0.168446,-0.211009
1038,-73.917223,40.815413,,27.138759,46.405225,3.026344,152.312420,443.445801,1890.0,1066.0,...,2302.0,2587.0,2621.0,2094.0,2094.0,2379.0,2430.0,0.209705,0.063716,-0.254643


In [19]:
# Same treatment as the training rows: overwrite air/humidity/wind/dir/solar with the
# averages of the station selected by update_meteorological_data().
test_file = test_file.apply(update_meteorological_data, axis=1)
test_file

Unnamed: 0,Longitude,Latitude,UHI Index,air,humidity,wind,dir,solar,B01,B02,...,B05,B06,B07,B08,B8A,B11,B12,NDVI,NDBI,NDWI
0,-73.971665,40.788763,,26.753846,48.353846,2.946154,171.076923,441.000000,811.0,459.0,...,984.0,2089.0,2405.0,2502.0,2502.0,893.0,523.5,0.705521,-0.473932,-0.604360
1,-73.971928,40.788875,,26.753846,48.353846,2.946154,171.076923,441.000000,1208.0,562.0,...,1112.0,2076.0,2248.0,2906.0,2906.0,1188.0,566.0,0.635801,-0.419638,-0.598020
2,-73.967080,40.789080,,26.753846,48.353846,2.946154,171.076923,441.000000,899.0,955.0,...,979.0,995.0,1158.0,1246.0,1246.0,763.0,1155.0,0.023829,-0.240418,-0.084421
3,-73.972550,40.789082,,26.753846,48.353846,2.946154,171.076923,441.000000,1193.0,1132.0,...,1866.0,1939.0,2076.0,1774.0,1774.0,2346.0,1189.0,0.079732,0.138835,-0.130656
4,-73.969697,40.787953,,26.753846,48.353846,2.946154,171.076923,441.000000,1097.0,1506.0,...,1294.0,2204.0,2411.0,2834.0,2834.0,1848.0,1293.5,0.253428,-0.210594,-0.266309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1035,-73.919388,40.813803,,27.492308,44.615385,3.100000,135.076923,445.692308,1474.0,1086.0,...,1824.0,1553.0,1995.0,1578.0,1578.0,2089.0,1562.0,0.034076,0.139351,-0.066216
1036,-73.931033,40.833178,,27.492308,44.615385,3.100000,135.076923,445.692308,1014.0,548.0,...,2034.0,2393.0,2701.0,2798.0,2798.0,1201.0,710.0,0.556606,-0.399350,-0.570146
1037,-73.934647,40.854542,,27.492308,44.615385,3.100000,135.076923,445.692308,917.0,1184.0,...,1633.0,1939.0,2413.0,2244.0,2244.0,1597.0,1402.0,0.186674,-0.168446,-0.211009
1038,-73.917223,40.815413,,27.492308,44.615385,3.100000,135.076923,445.692308,1890.0,1066.0,...,2302.0,2587.0,2621.0,2094.0,2094.0,2379.0,2430.0,0.209705,0.063716,-0.254643


In [20]:
# Add the 'Count' feature with the same 0.01 radius that was used for the training rows.
test_file = count_nearby_points(test_file, footprint_data, max_distance=0.01)
test_file

Unnamed: 0,Longitude,Latitude,UHI Index,air,humidity,wind,dir,solar,B01,B02,...,B06,B07,B08,B8A,B11,B12,NDVI,NDBI,NDWI,Count
0,-73.971665,40.788763,,26.753846,48.353846,2.946154,171.076923,441.000000,811.0,459.0,...,2089.0,2405.0,2502.0,2502.0,893.0,523.5,0.705521,-0.473932,-0.604360,227
1,-73.971928,40.788875,,26.753846,48.353846,2.946154,171.076923,441.000000,1208.0,562.0,...,2076.0,2248.0,2906.0,2906.0,1188.0,566.0,0.635801,-0.419638,-0.598020,225
2,-73.967080,40.789080,,26.753846,48.353846,2.946154,171.076923,441.000000,899.0,955.0,...,995.0,1158.0,1246.0,1246.0,763.0,1155.0,0.023829,-0.240418,-0.084421,207
3,-73.972550,40.789082,,26.753846,48.353846,2.946154,171.076923,441.000000,1193.0,1132.0,...,1939.0,2076.0,1774.0,1774.0,2346.0,1189.0,0.079732,0.138835,-0.130656,226
4,-73.969697,40.787953,,26.753846,48.353846,2.946154,171.076923,441.000000,1097.0,1506.0,...,2204.0,2411.0,2834.0,2834.0,1848.0,1293.5,0.253428,-0.210594,-0.266309,215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1035,-73.919388,40.813803,,27.492308,44.615385,3.100000,135.076923,445.692308,1474.0,1086.0,...,1553.0,1995.0,1578.0,1578.0,2089.0,1562.0,0.034076,0.139351,-0.066216,536
1036,-73.931033,40.833178,,27.492308,44.615385,3.100000,135.076923,445.692308,1014.0,548.0,...,2393.0,2701.0,2798.0,2798.0,1201.0,710.0,0.556606,-0.399350,-0.570146,391
1037,-73.934647,40.854542,,27.492308,44.615385,3.100000,135.076923,445.692308,917.0,1184.0,...,1939.0,2413.0,2244.0,2244.0,1597.0,1402.0,0.186674,-0.168446,-0.211009,303
1038,-73.917223,40.815413,,27.492308,44.615385,3.100000,135.076923,445.692308,1890.0,1066.0,...,2587.0,2621.0,2094.0,2094.0,2379.0,2430.0,0.209705,0.063716,-0.254643,583


In [21]:
# Extract the model features in the same column order the scaler and model were fitted on,
# then standardise them with the scaler that was fitted on the training split.
submission_val_data = test_file[[
    'air', 'humidity', 'wind', 'dir', 'solar', 'Count',
    'B01', 'B02', 'B03', 'B04', 'B05', 'B11',
    'NDVI', 'NDBI', 'NDWI',
]].to_numpy()
transformed_submission_data = sc.transform(submission_val_data)

In [22]:
# Predict the UHI Index for every submission point.
final_predictions = model.predict(transformed_submission_data)
final_prediction_series = pd.Series(final_predictions)

# Assemble the submission table from plain arrays so it gets a fresh 0..n-1 index.
submission_df = pd.DataFrame(
    {
        'Longitude': test_file['Longitude'].to_numpy(),
        'Latitude': test_file['Latitude'].to_numpy(),
        'UHI Index': final_prediction_series.to_numpy(),
    }
)
submission_df

Unnamed: 0,Longitude,Latitude,UHI Index
0,-73.971665,40.788763,0.966623
1,-73.971928,40.788875,0.962733
2,-73.967080,40.789080,0.997684
3,-73.972550,40.789082,0.986971
4,-73.969697,40.787953,0.960449
...,...,...,...
1035,-73.919388,40.813803,1.020908
1036,-73.931033,40.833178,1.044113
1037,-73.934647,40.854542,1.013349
1038,-73.917223,40.815413,1.009450


In [23]:
# Write the predictions to submission.csv in the working directory, without the index column.
submission_df.to_csv("submission.csv",index = False)

In [24]:
# ground_df.to_csv("ground_data_ver4.csv",index = False)