<a href="https://colab.research.google.com/github/valliansayoga/Dash-by-Plotly/blob/master/EY2025_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [3]:
# Columns removed before modelling, and the name of the regression target column.
to_drop = ["Latitude", "Longitude", "datetime"]
target = "UHI Index"

# errors="ignore" lets the load succeed even when some of the `to_drop`
# columns are not present in the CSV.
df = (
    pd.read_csv("Train_Final.csv")
    .drop(columns=to_drop, errors="ignore")
)

# Optional: uncomment to also remove every column whose name contains "count".
# df = df.drop(columns=df.columns[df.columns.str.contains("count", regex=False)])


df.head()

Unnamed: 0,UHI Index,is_a_building,nearest_building_distance,10m_nearby_building_count,20m_nearby_building_count,30m_nearby_building_count,40m_nearby_building_count,50m_nearby_building_count,average_distance,std_distance,...,nearest_polygon_angle,distance_variation,distance_range,avg_polygon_complexity,red,green,blue,nir,ndvi_median,temp_median
0,1.030289,0,19.079136,0,1,1,1,1,3029.494982,3010.415846,...,-157.415935,3010.415846,6020.831692,29.0,0.124005,0.112592,0.093287,0.196825,0.226974,38.431539
1,1.030289,0,19.233293,0,1,1,1,1,3034.387105,3015.153812,...,-153.90797,3015.153812,6030.307623,29.0,0.124005,0.112592,0.093287,0.196825,0.226974,38.431539
2,1.023798,0,20.268009,0,0,1,1,1,3040.018287,3019.750278,...,-148.916535,3019.750278,6039.500556,29.0,0.071398,0.07357,0.052175,0.197622,0.469203,37.785534
3,1.023798,0,20.968705,0,0,1,1,2,3045.783857,3024.815152,...,-142.868592,3024.815152,6049.630304,29.0,0.071398,0.07357,0.052175,0.197622,0.469203,37.785534
4,1.021634,0,16.324876,0,1,1,2,2,3048.490175,3032.165299,...,-137.014238,3032.165299,6064.330598,29.0,0.071398,0.07357,0.052175,0.197622,0.469203,37.785534


In [4]:
def create_train(df_features, scaler, train_size=0.8):
    """Deduplicate, split and scale the modelling table.

    Parameters
    ----------
    df_features : pandas.DataFrame
        Feature columns plus the column named by the notebook-level ``target``.
    scaler : object
        Transformer exposing ``fit_transform`` and ``transform``. It is fitted
        on the training split only and then applied to the test split.
    train_size : float, default 0.8
        Forwarded to ``train_test_split`` (with ``random_state=0``).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, scaler)``. The scaled feature
        frames keep their column names and the row index of the input frame,
        so they stay aligned with ``y_train`` / ``y_test``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df_features``.
    """
    print("Removing duplicates...")
    rows_before = df_features.shape[0]
    # Compare rows on every column except the target. Selecting by name (not by
    # position) keeps this correct wherever the target column sits in the frame.
    check_dupl = df_features.columns.drop(target)
    df_features = df_features.drop_duplicates(subset=check_dupl, keep='first')
    rows_after = df_features.shape[0]
    print(f"Removed {rows_before-rows_after} duplicate rows!")

    X = df_features.drop(target, axis=1)
    y = df_features[target]

    print("Scaling...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=train_size)
    # Rebuilding the frames without `index=` would give them a fresh 0..n-1 index
    # while y_train / y_test keep the shuffled original one; any index-aligned
    # pandas operation (concat, assignment, arithmetic) would then mispair rows.
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), columns=X_test.columns, index=X_test.index
    )
    print("Done")
    return X_train, X_test, y_train, y_test, scaler
# Build the train/test split with the default train_size; create_train fits the
# scaler on the training split and returns the fitted instance for later reuse.
scaler = StandardScaler()
X_train, X_test, y_train, y_test, scaler = create_train(df, scaler)

Removing duplicates...
Removed 0 duplicate rows!
Scaling...
Done


# Modelling

In [5]:
from sklearn.metrics import r2_score


def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it with R² on both splits.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, y_train
        Training features and target used for fitting and the in-sample score.
    X_test, y_test
        Held-out features and target used for the out-of-sample score.

    Returns
    -------
    tuple
        ``(insample, outsample)`` R² scores.
    """
    model.fit(X_train, y_train)
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)
    return r2_score(y_train, train_predictions), r2_score(y_test, test_predictions)

In [6]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from tqdm import tqdm
# Candidate regressors. Each one is wrapped in a dict so that its scores can be
# attached next to it and the whole list turned into a results table.
models = (
    [
        {"model": RandomForestRegressor(n_trees, random_state=0, n_jobs=-1)}
        for n_trees in (200, 250, 300, 150)
    ]
    + [{"model": ExtraTreesRegressor(n_estimators=200, random_state=0, n_jobs=-1, bootstrap=True, oob_score=True)}]
    + [
        {"model": ExtraTreesRegressor(n_estimators=n_trees, random_state=0, n_jobs=-1)}
        for n_trees in (200, 250, 100)
    ]
    + [{"model": KNeighborsRegressor(k, n_jobs=-1)} for k in (3, 5)]
    + [{"model": DecisionTreeRegressor(random_state=0)}]
    # + [{"model": MLPRegressor((8, 16, 32), random_state=0)}]
)

# Fit every candidate and record its in-sample / out-of-sample R² in its dict.
for model in tqdm(models):
    insample, outsample = evaluate_model(model["model"], X_train, X_test, y_train, y_test)
    model.update(insample=insample, outsample=outsample)

# Rank candidates by held-out R², best first.
results = pd.DataFrame(models).sort_values(by="outsample", ascending=False, ignore_index=True)
results

Unnamed: 0,model,insample,outsample
0,"(ExtraTreeRegressor(random_state=209652396), E...",1.0,0.93418
1,"(ExtraTreeRegressor(random_state=209652396), E...",1.0,0.933886
2,"(ExtraTreeRegressor(random_state=209652396), E...",1.0,0.932674
3,"(DecisionTreeRegressor(max_features=1.0, rando...",0.987908,0.916276
4,"(DecisionTreeRegressor(max_features=1.0, rando...",0.98779,0.916233
5,"(DecisionTreeRegressor(max_features=1.0, rando...",0.988062,0.915998
6,"(DecisionTreeRegressor(max_features=1.0, rando...",0.988034,0.915953
7,"(ExtraTreeRegressor(random_state=209652396), E...",0.988206,0.914058
8,DecisionTreeRegressor(random_state=0),1.0,0.839239
9,"KNeighborsRegressor(n_jobs=-1, n_neighbors=3)",0.882012,0.73083


In [7]:
# `results` is sorted by out-of-sample R² (descending), so its first row is the
# top-ranked candidate. `best_model` is that row; the estimator is its "model" entry.
best_model = results.iloc[0]
best_model["model"]

In [8]:
# Pair each training column with the selected estimator's importance score and
# list the most important features first.
importance = pd.DataFrame(
    {
        "Features": X_train.columns,
        "Importance": best_model["model"].feature_importances_,
    }
).sort_values(by="Importance", ascending=False, ignore_index=True)
importance

Unnamed: 0,Features,Importance
0,distance_range,0.140832
1,average_distance,0.139142
2,distance_variation,0.134033
3,std_distance,0.133304
4,temp_median,0.065169
5,avg_polygon_complexity,0.042848
6,neighboring_intersection,0.041318
7,nearest_building_size,0.040205
8,building_area_density,0.03362
9,ndvi_median,0.032785


# Predicting Submission

In [9]:
def create_submission(filename: str, model, scaler,
                      submission_path="Submission_Final.csv", feature_columns=None):
    """Predict the target for the submission rows and write them to ``filename``.

    Parameters
    ----------
    filename : str
        Path of the CSV to write. It contains ``Latitude``, ``Longitude`` and
        the predicted ``UHI Index``.
    model : object
        Fitted estimator exposing ``predict``.
    scaler : object
        Fitted transformer exposing ``transform``.
    submission_path : str, default "Submission_Final.csv"
        CSV holding the rows to predict.
    feature_columns : sequence of str, optional
        Feature columns, in the order the scaler and model expect. When omitted,
        the columns of the notebook-level ``X_train`` are used.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If the submission file lacks an identifier or feature column.
    """
    id_columns = ["Latitude", "Longitude"]
    if feature_columns is None:
        # Backward-compatible default: fall back to the training frame defined
        # at notebook level.
        feature_columns = X_train.columns
    feature_columns = list(feature_columns)

    sub_df = pd.read_csv(submission_path)
    # Fail early with a message that names the missing columns.
    missing = [col for col in id_columns + feature_columns if col not in sub_df.columns]
    if missing:
        raise KeyError(f"{submission_path} is missing required columns: {missing}")

    final_df = sub_df[id_columns].copy()
    print("Predicting", sub_df.shape[0], "rows...")
    to_predict = pd.DataFrame(
        scaler.transform(sub_df.loc[:, feature_columns]),
        columns=feature_columns,
        index=sub_df.index,  # keep rows aligned with final_df
    )

    print("Predicting...")
    final_df["UHI Index"] = model.predict(to_predict)
    final_df.to_csv(filename, index=False)
    print("Done!")
    return
# Write the submission using the top-ranked estimator from `results` and the
# scaler returned by create_train.
create_submission("BestModel_BldngFt_NoDups_Cmplx.csv", best_model.model, scaler)

Predicting 1040 rows...
Predicting...
Done!
