<a href="https://colab.research.google.com/github/valliansayoga/Dash-by-Plotly/blob/master/EY2025_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Show every column when a DataFrame is displayed (no truncation).
pd.set_option("display.max_columns", None)

In [3]:
# Integer class code -> readable label. The position of each label in the
# tuple is its code (0..11).
# NOTE(review): presumably the scene-classification (SCL) codes of the
# satellite product behind `scl_median` — confirm against the data source.
scl_mapping = dict(enumerate((
    'no_data',
    'saturated_or_defective_pixel',
    'topographic_casted_shadows',
    'cloud_shadows',
    'vegetation',
    'not_vegetated',
    'water',
    'unclassified',
    'cloud_medium_probability',
    'cloud_high_probability',
    'thin_cirrus',
    'snow_or_ice',
)))
scl_mapping

{0: 'no_data',
 1: 'saturated_or_defective_pixel',
 2: 'topographic_casted_shadows',
 3: 'cloud_shadows',
 4: 'vegetation',
 5: 'not_vegetated',
 6: 'water',
 7: 'unclassified',
 8: 'cloud_medium_probability',
 9: 'cloud_high_probability',
 10: 'thin_cirrus',
 11: 'snow_or_ice'}

In [13]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on both splits.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is fitted in place, so the caller's object is trained afterwards.
    X_train, X_test : feature matrices for the training and hold-out splits.
    y_train, y_test : targets matching ``X_train`` and ``X_test``.

    Returns
    -------
    tuple
        ``(insample, outsample)``: the ``r2_score`` of the fitted model on
        the training split and on the hold-out split, in that order.
    """
    model.fit(X_train, y_train)
    insample = r2_score(y_train, model.predict(X_train))
    outsample = r2_score(y_test, model.predict(X_test))
    return insample, outsample

def add_features(df):
    """Return a copy of ``df`` extended with the engineered interaction columns.

    The input frame is left untouched, so re-running a cell that calls this
    function always starts from the same data. Callers must use the return
    value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``evi_median``, ``lwir_median``,
        ``ndbi_median``, ``building_density``, ``red_median``,
        ``green_median``, ``blue_median``, ``nir_median``, ``swir16_median``
        and ``swir22_median``.

    Returns
    -------
    pandas.DataFrame
        A new frame with five extra columns appended in this order:
        ``evi_x_lwir``, ``ndbi_x_lwir``, ``ndbi_/_bldg_dnsty``,
        ``pan_chromatic``, ``infra_red_combo``.
    """
    # Dict unpacking is needed because one column name contains "/" and so
    # cannot be written as a keyword argument.
    return df.assign(
        **{
            "evi_x_lwir": df.evi_median * df.lwir_median,
            "ndbi_x_lwir": df.ndbi_median * df.lwir_median,
            # NOTE(review): the name reads like a ratio, but the value is a
            # product. Kept unchanged so the feature stays consistent between
            # training and submission — confirm the intended formula.
            "ndbi_/_bldg_dnsty": df.ndbi_median * df.building_density,
            "pan_chromatic": df.red_median * df.green_median * df.blue_median,
            "infra_red_combo": df.nir_median * df.swir16_median * df.swir22_median,
        }
    )

def create_train(df_features, scaler, target="UHI Index", train_size=0.8, indices=None):
    """Deduplicate, split and scale a modelling frame.

    Parameters
    ----------
    df_features : pandas.DataFrame
        Frame holding the ``target`` column plus the feature columns.
    scaler : object exposing ``fit_transform`` and ``transform``.
        It is fitted in place on the training features only.
    target : str
        Name of the column to predict.
    train_size : float
        Passed to ``train_test_split``; ignored when ``indices`` is given.
    indices : tuple of (train_positions, test_positions), optional
        Positional (``iloc``) row selections applied after duplicates are
        removed. When omitted, a random split with ``random_state=0`` is used.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, scaler)``. The scaled feature
        frames keep the row labels of the rows they came from, so they stay
        index-aligned with ``y_train`` / ``y_test``.
    """
    print("Removing duplicates...")
    rows_before = df_features.shape[0]
    # NOTE(review): duplicates are detected on every column except the first,
    # presumably because the target sits in the first column — confirm.
    check_dupl = df_features.columns[1:]
    df_features = df_features.drop_duplicates(subset=check_dupl, keep='first')
    rows_after = df_features.shape[0]
    print(f"Removed {rows_before-rows_after} duplicate rows!")

    X = df_features.drop(target, axis=1)
    y = df_features[target]

    print("Scaling...")
    if indices is not None:
        train_pos, test_pos = indices[0], indices[1]
        X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
        y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=train_size)

    # The scaler returns bare arrays. Re-attach both the column names and the
    # original row labels; without the index, X_* would get a fresh RangeIndex
    # while y_* keeps the shuffled labels, and the two would no longer align.
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
    print("Done")
    return X_train, X_test, y_train, y_test, scaler

_SEPARATOR = "-" * 66
_SPACES = " " * 24
_EQUALS = "=" * 32


def _print_section(title, payload):
    """Print ``payload`` under an ``=``-framed title, then a separator line."""
    print(_EQUALS, title, _EQUALS)
    print(payload)
    print(_SEPARATOR)


def _run_round(label, df, models, scaler, target):
    """Split/scale ``df``, score every model, and rank the best model's feature importances."""
    print(_SPACES, label, _SPACES)
    print(_SEPARATOR)
    X_train, X_test, y_train, y_test, scaler = create_train(df, scaler, target=target)

    # Each model dict is updated in place with this round's scores.
    for model in tqdm(models):
        insample, outsample = evaluate_model(model["model"], X_train, X_test, y_train, y_test)
        model["insample"] = insample
        model["outsample"] = outsample

    results = pd.DataFrame(models).sort_values("outsample", ascending=False).reset_index(drop=True)
    _print_section("Model Scores", results)
    best_model = results.iloc[0]
    _print_section("Best Model", best_model.model)

    importance = pd.DataFrame(
        {"Features": X_train.columns, "Importance": best_model.model.feature_importances_},
    ).sort_values("Importance", ascending=False).reset_index(drop=True)
    importance["cumulative_importance"] = importance.Importance.cumsum() / importance.Importance.sum()
    _print_section("Feature Importance", importance)
    return best_model, X_train, importance


def load_preprocess_predict_score(models, scaler, train_path="Train_Final.csv", pareto_threshold=0.8):
    """Run the two-round model selection on the training CSV.

    Round 1 fits every candidate on all features and ranks the features of
    the best-scoring model. Round 2 refits every candidate on the "pareto + 1"
    subset: the top features whose cumulative importance stays at or below
    ``pareto_threshold``, plus the next feature in the ranking.

    Parameters
    ----------
    models : list of dict
        Each dict holds an estimator under ``"model"``. The dicts are updated
        in place with ``"insample"`` / ``"outsample"`` scores, and the
        estimators are refitted each round. The best estimator of a round
        must expose ``feature_importances_`` once fitted.
    scaler : object exposing ``fit_transform`` / ``transform``.
        Refitted each round, so afterwards it matches the round-2 features.
    train_path : str
        CSV to load; defaults to the original hard-coded file name.
    pareto_threshold : float
        Cumulative-importance cut-off for the round-2 feature subset.

    Returns
    -------
    tuple
        ``(best_model, X_train)``: the top row of the round-2 score table
        (fields ``model``, ``insample``, ``outsample``) and the scaled
        round-2 training features.
    """
    to_drop = ["Latitude", "Longitude", "datetime"]
    target = "UHI Index"

    # Loaded once and shared by both rounds: create_train works on derived
    # frames and never modifies this one.
    df = pd.read_csv(train_path).drop(to_drop, axis=1, errors="ignore").pipe(add_features)

    # Round 1 to get pareto + 1 features
    _, _, importance = _run_round("Starting round 1", df, models, scaler, target)

    # `importance` is sorted by descending importance, so the rows at or
    # below the threshold form a prefix. Slicing one row further gives the
    # "+ 1" feature and also works when no row is below the threshold
    # (a single dominant feature), where indexing the last row would fail.
    n_pareto = int((importance.cumulative_importance <= pareto_threshold).sum())
    pareto = importance.iloc[: n_pareto + 1]
    _print_section("Pareto Features + 1", pareto)

    df_pareto = df.loc[:, [target, *pareto.Features]]
    best_model, X_train, _ = _run_round("Starting round 2", df_pareto, models, scaler, target)
    return best_model, X_train

# Modelling

In [14]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

# StandardScaler is already imported in the notebook's import cell, so it is
# not re-imported here.

# Candidate estimators; every one is fitted and scored in both rounds.
# All share random_state=0 for reproducibility and n_jobs=-1.
models = [
    {"model": RandomForestRegressor(random_state=0, n_jobs=-1)},
    {"model": RandomForestRegressor(n_estimators=250, max_features=0.5, random_state=0, n_jobs=-1)},
    {"model": RandomForestRegressor(n_estimators=150, max_features=0.5, random_state=0, n_jobs=-1)},
    {"model": ExtraTreesRegressor(n_estimators=200, max_features=0.5, random_state=0, n_jobs=-1)},
    {"model": ExtraTreesRegressor(n_estimators=250, max_features=0.5, random_state=0, n_jobs=-1)},
    {"model": ExtraTreesRegressor(n_estimators=100, max_features=0.5, random_state=0, n_jobs=-1)},
]
scaler = StandardScaler()
best_model, X_train = load_preprocess_predict_score(models, scaler)

                         Starting round 1                         
------------------------------------------------------------------
Removing duplicates...
Removed 0 duplicate rows!
Scaling...
Done


100%|██████████| 6/6 [01:38<00:00, 16.41s/it]


                                               model  insample  outsample
0  (ExtraTreeRegressor(max_features=0.5, random_s...  1.000000   0.959008
1  (ExtraTreeRegressor(max_features=0.5, random_s...  1.000000   0.958958
2  (ExtraTreeRegressor(max_features=0.5, random_s...  1.000000   0.958325
3  (DecisionTreeRegressor(max_features=0.5, rando...  0.992495   0.946768
4  (DecisionTreeRegressor(max_features=0.5, rando...  0.992568   0.946714
5  (DecisionTreeRegressor(max_features=1.0, rando...  0.992141   0.944762
------------------------------------------------------------------
ExtraTreesRegressor(max_features=0.5, n_estimators=250, n_jobs=-1,
                    random_state=0)
------------------------------------------------------------------
                     Features  Importance  cumulative_importance
0              distance_range    0.122817               0.122817
1                std_distance    0.118752               0.241569
2          distance_variation    0.115638         

100%|██████████| 6/6 [00:59<00:00,  9.97s/it]

                                               model  insample  outsample
0  (ExtraTreeRegressor(max_features=0.5, random_s...  1.000000   0.966532
1  (ExtraTreeRegressor(max_features=0.5, random_s...  1.000000   0.966305
2  (ExtraTreeRegressor(max_features=0.5, random_s...  1.000000   0.966272
3  (DecisionTreeRegressor(max_features=0.5, rando...  0.993428   0.953088
4  (DecisionTreeRegressor(max_features=0.5, rando...  0.993287   0.952487
5  (DecisionTreeRegressor(max_features=1.0, rando...  0.993052   0.951557
------------------------------------------------------------------
ExtraTreesRegressor(max_features=0.5, n_estimators=200, n_jobs=-1,
                    random_state=0)
------------------------------------------------------------------
                    Features  Importance  cumulative_importance
0               std_distance    0.135573               0.135573
1             distance_range    0.122472               0.258046
2           average_distance    0.121731             




# Predicting Submission

In [15]:
def create_submission(filename: str, model, scaler, submission_path="Submission_Final.csv", feature_columns=None):
    """Predict the UHI index for the submission rows and write them to ``filename``.

    Parameters
    ----------
    filename : str
        Destination CSV. It receives the columns ``Latitude``, ``Longitude``
        and ``UHI Index``.
    model : fitted estimator exposing ``predict``.
    scaler : fitted scaler exposing ``transform``.
    submission_path : str
        CSV with the rows to predict; defaults to the original hard-coded
        file name.
    feature_columns : sequence of str, optional
        Columns, in order, to feed to the scaler and model. When omitted they
        are taken from ``scaler.feature_names_in_`` (set by scikit-learn when
        a scaler is fitted on a DataFrame with string column names). If the
        scaler has no such attribute, the notebook-level ``X_train.columns``
        is used, as before.

    Raises
    ------
    KeyError
        If a required feature column is absent from the submission data.
    """
    sub_df = pd.read_csv(submission_path)
    final_df = sub_df[["Latitude", "Longitude"]].copy()
    print("Predicting", sub_df.shape[0], "rows...")

    sub_df = add_features(sub_df)

    # Optional step, currently disabled: map `scl_median` through
    # `scl_mapping` and one-hot encode it with a fitted `ohe` encoder before
    # the feature columns are selected.

    if feature_columns is None:
        # Prefer the scaler's own record of its training columns over a
        # notebook global, so the function does not depend on hidden state.
        feature_columns = getattr(scaler, "feature_names_in_", None)
    if feature_columns is None:
        feature_columns = X_train.columns  # legacy fallback: notebook global
    feature_columns = list(feature_columns)

    missing = [col for col in feature_columns if col not in sub_df.columns]
    if missing:
        raise KeyError(f"Submission data is missing feature columns: {missing}")

    to_predict = pd.DataFrame(
        scaler.transform(sub_df.loc[:, feature_columns]),
        columns=feature_columns
    )

    print("Predicting...")
    final_df["UHI Index"] = model.predict(to_predict)
    final_df.to_csv(filename, index=False)
    print("Done!")
    return
# Write the submission file using the best round-2 model and its scaler.
create_submission(
    filename="BestModel_Radius100_Pareto_Engineered.csv",
    model=best_model.model,
    scaler=scaler,
)

Predicting 1040 rows...
Predicting...
Done!


---