In [1]:
import warnings
import time
import os
import geopandas as gpd
# GDAL / rasterio settings for reading remote rasters. They are written into the
# process environment here, ahead of the `import rasterio` statement further down.
RASTERIO_BEST_PRACTICES = dict(  # See https://github.com/pangeo-data/cog-best-practices
    CURL_CA_BUNDLE="/etc/ssl/certs/ca-certificates.crt",
    GDAL_DISABLE_READDIR_ON_OPEN="EMPTY_DIR",
    AWS_NO_SIGN_REQUEST="YES",
    GDAL_MAX_RAW_BLOCK_CACHE_SIZE="200000000",
    GDAL_SWATH_SIZE="200000000",
    VSI_CURL_CACHE_SIZE="200000000",
)
os.environ.update(RASTERIO_BEST_PRACTICES)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import rasterio
import rasterio.warp
import rasterio.mask
import shapely.geometry
import geopandas
import dask_geopandas
from sklearn.linear_model import RidgeCV, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from scipy.stats import spearmanr
from scipy.linalg import LinAlgWarning
from dask.distributed import Client
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import mean_squared_error

warnings.filterwarnings(action="ignore", category=LinAlgWarning, module="sklearn")

import pystac_client
import planetary_computer as pc

# Function definitions

In [2]:
def featurize(input_img, model, device):
    """Helper method for running a single image patch through the model.

    Args:
        input_img (np.ndarray): Image in (C x H x W) format with a dtype of uint8.
        model (torch.nn.Module): Feature extractor network.
        device (torch.device): Device the image is moved to before the forward pass.

    Returns:
        np.ndarray: Output of ``model`` for a batch that contains only ``input_img``.

    Raises:
        AssertionError: If ``input_img`` does not have exactly three dimensions.
    """
    # The image must be (C, H, W): a batch axis is added below before calling the model.
    assert len(input_img.shape) == 3, "input_img must be in (C x H x W) format"
    # Scale the uint8 range [0, 255] to [0, 1] and convert to float32.
    input_img = torch.from_numpy(input_img / 255.0).float()
    input_img = input_img.to(device)
    with torch.no_grad():
        feats = model(input_img.unsqueeze(0)).cpu().numpy()
    return feats



# RCF RGB

In [3]:
class RCF(nn.Module):
    """A model for extracting Random Convolution Features (RCF) from input imagery.

    A single convolution layer with normally distributed weights and a constant bias
    of -1 is applied to the input. The ReLU of the response and the ReLU of the
    negated response are each averaged over the spatial dimensions and concatenated,
    giving ``num_features`` values per image.
    """

    def __init__(self, num_features=16, kernel_size=3, num_input_channels=1):
        """
        Args:
            num_features (int): Length of the output feature vector; must be even.
            kernel_size (int): Size of the convolution kernel.
            num_input_channels (int): Number of channels of the input imagery.
        """
        super(RCF, self).__init__()

        # We create `num_features / 2` filters so require `num_features` to be divisible by 2
        assert num_features % 2 == 0

        self.conv1 = nn.Conv2d(
            num_input_channels,
            num_features // 2,
            kernel_size=kernel_size,
            stride=1,
            padding=0,
            dilation=1,
            bias=True,
        )

        nn.init.normal_(self.conv1.weight, mean=0.0, std=1.0)
        nn.init.constant_(self.conv1.bias, -1.0)

    def forward(self, x):
        """Compute the random convolution features of ``x``.

        Args:
            x (torch.Tensor): Images as (N, C, H, W), or one image as (C, H, W).

        Returns:
            torch.Tensor: Shape ``(num_features,)`` for a single image and
            ``(N, num_features)`` for a batch of more than one image.
        """
        if x.dim() == 3:  # un-batched image: add the batch axis explicitly
            x = x.unsqueeze(0)

        # Run the convolution once and reuse the response for both activations.
        response = self.conv1(x)
        x1a = F.relu(response)
        x1b = F.relu(-response)

        # Flatten only the pooled spatial axes. A bare `.squeeze()` would also drop a
        # channel axis of size one (num_features == 2), which made the shape checks
        # below it fall through and return None.
        x1a = F.adaptive_avg_pool2d(x1a, (1, 1)).flatten(start_dim=1)
        x1b = F.adaptive_avg_pool2d(x1b, (1, 1)).flatten(start_dim=1)

        feats = torch.cat((x1a, x1b), dim=1)
        if feats.shape[0] == 1:  # case where we passed a single input
            return feats.squeeze(0)
        return feats  # case where we passed a batch of > 1 inputs
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [4]:
def query(points, num_images_per_point=1):
    """
    Find STAC items for points in the `points` DataFrame

    A single search is issued for the convex hull of all points; every point is then
    matched against the footprints of the returned scenes, least cloudy first.

    Parameters
    ----------
    points : geopandas.GeoDataFrame
        A GeoDataFrame
    num_images_per_point : int
        Number of STAC items to retrieve for each point

    Returns
    -------
    geopandas.GeoDataFrame
        A new geopandas.GeoDataFrame with a `stac_item` column containing a list of
        STAC items (as dicts) that cover each point. The list is empty when no
        returned scene covers the point.
    """
    intersects = shapely.geometry.mapping(points.unary_union.convex_hull)

    # The time frame in which we search for non-cloudy imagery
    search_start = "2017-01-01"
    search_end = "2018-12-31"
    catalog = pystac_client.Client.open(
        "https://planetarycomputer.microsoft.com/api/stac/v1"
    )

    search = catalog.search(
        collections=["sentinel-2-l2a"],
        intersects=intersects,
        datetime=[search_start, search_end],
        query={"eo:cloud_cover": {"lt": 20}},
        limit=100,
    )
    # NOTE(review): newer pystac-client releases appear to expose this call as
    # `item_collection_as_dict()` — confirm against the installed version.
    ic = search.get_all_items_as_dict()

    features = ic["features"]
    print("Number of search results:", len(features))

    # Lookup from item id to the full STAC item. It is deliberately not printed:
    # dumping every item floods the notebook output.
    features_d = {item["id"]: item for item in features}

    data = {
        "eo:cloud_cover": [],
        "geometry": [],
    }
    index = []
    for item in features:
        data["eo:cloud_cover"].append(item["properties"]["eo:cloud_cover"])
        data["geometry"].append(shapely.geometry.shape(item["geometry"]))
        index.append(item["id"])

    # Scene footprints ordered from least to most cloudy, so that slicing the first
    # `num_images_per_point` matches picks the clearest scenes.
    items = geopandas.GeoDataFrame(data, index=index, geometry="geometry").sort_values(
        "eo:cloud_cover"
    )

    point_items = []
    for point in points.geometry.tolist():
        covered_by = items[items.covers(point)]
        # Stays an empty list when no scene matching our conditions covers the point
        # (too cloudy).
        point_items.append(
            [features_d[item_id] for item_id in covered_by.index[:num_images_per_point]]
        )

    return points.assign(stac_item=point_items)


In [5]:
class CustomDataset(Dataset):
    """Dataset that crops a square image patch around each point from a raster file.

    Args:
        points: Indexable collection of ``(lon, lat)`` pairs exposing ``.shape``.
        fns: Raster path / URL for each point, or ``None`` when a point has no raster.
        buffer: Half-width of the cropped square, in units of the raster's CRS.
    """

    def __init__(self, points, fns, buffer=500):
        self.points = points
        self.fns = fns
        self.buffer = buffer

    def __len__(self):
        return self.points.shape[0]

    def __getitem__(self, idx):
        """Return the patch for point ``idx`` as a float tensor scaled by 1/255.

        Returns ``None`` when the point has no raster or when the patch does not
        overlap the raster. Any other ``ValueError`` from the crop is re-raised.
        """
        lon, lat = self.points[idx]
        fn = self.fns[idx]

        if fn is None:
            return None

        point_geom = shapely.geometry.mapping(shapely.geometry.Point(lon, lat))

        with rasterio.Env():
            with rasterio.open(fn, "r") as f:
                # Re-project the lon/lat point into the raster's CRS, then buffer it
                # and take the bounding box to obtain a square crop window.
                point_geom = rasterio.warp.transform_geom(
                    "epsg:4326", f.crs.to_string(), point_geom
                )
                point_shape = shapely.geometry.shape(point_geom)
                mask_shape = point_shape.buffer(self.buffer).envelope
                mask_geom = shapely.geometry.mapping(mask_shape)
                try:
                    out_image, out_transform = rasterio.mask.mask(
                        f, [mask_geom], crop=True
                    )
                except ValueError as e:
                    if "Input shapes do not overlap raster." in str(e):
                        return None
                    # Previously any other ValueError was swallowed and surfaced
                    # later as an unbound `out_image`; propagate the real error.
                    raise

        out_image = out_image / 255.0
        out_image = torch.from_numpy(out_image).float()
        return out_image

def extract_features(train_dataset, model, num_features):
    """Compute one feature vector per dataset item with ``model``.

    Uses the notebook-level ``device`` for the forward passes and prints a progress
    line every 1000 items.

    Args:
        train_dataset: Dataset with a ``points`` attribute exposing ``.shape`` whose
            items are image tensors or ``None``.
        model (torch.nn.Module): Feature extractor, called on a batch of one image.
        num_features (int): Number of columns of the returned matrix.

    Returns:
        np.ndarray: Float array of shape ``(number of points, num_features)``. Rows
        stay all-zero for items that are ``None`` or smaller than 20 pixels in either
        spatial dimension.
    """
    # os.cpu_count() returns None when the CPU count cannot be determined.
    num_workers = (os.cpu_count() or 1) * 2
    dataloader = DataLoader(
        train_dataset,
        batch_size=4,
        shuffle=False,
        num_workers=num_workers,
        collate_fn=lambda x: x,  # keep the raw list so None items survive batching
        pin_memory=False,
    )

    total = train_dataset.points.shape[0]
    x_train = np.zeros((total, num_features), dtype=float)

    tic = time.time()
    i = 0

    for images in dataloader:
        for image in images:
            # `image` is None if we do not find a S2 scene for some point.
            #
            # A full image should be ~101x101 pixels (i.e. ~1km^2 at a 10m/px spatial
            # resolution), however we can receive smaller images if an input point
            # happens to be at the edge of a S2 scene (a literal edge case). To deal
            # with these (edge) cases we crudely drop all images where the spatial
            # dimensions aren't both at least 20 pixels.
            if image is not None and image.shape[1] >= 20 and image.shape[2] >= 20:
                image = image.to(device)
                with torch.no_grad():
                    feats = model(image.unsqueeze(0)).cpu().numpy()

                x_train[i] = feats

            if i % 1000 == 0:
                print(
                    f"{i}/{total} -- {i / total * 100:0.2f}%"
                    + f" -- {time.time()-tic:0.2f} seconds"
                )
                tic = time.time()
            i += 1

    return x_train


# Read dataset

In [6]:
# Load the house-price table. Values 0 and -999 are read as missing, and every row
# that contains a missing value is dropped.
df = pd.read_csv(

    "https://drive.google.com/uc?export=download&id=1vaZxJap_x1iyf-ytkDo13Dy_7ETdWsO3",  # noqa: E501
    index_col=0,
    na_values=[0,-999]
).dropna()
points = df[["lon", "lat"]]
houseprice = df["houseprice"]

# Point geometries are built from the lon/lat columns; no CRS is assigned here.
gdf = geopandas.GeoDataFrame(df, geometry=geopandas.points_from_xy(df.lon, df.lat))
gdf


Unnamed: 0_level_0,ID,lon,lat,houseprice,City,geometry
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
243,15363,35.253528,0.461746,0.064764,Eldoret,POINT (35.25353 0.46175)
244,15377,35.266255,0.461746,0.086711,Eldoret,POINT (35.26626 0.46175)
254,15258,35.248983,0.462655,0.167129,Eldoret,POINT (35.24898 0.46266)
255,15272,35.261710,0.462655,1.091892,Eldoret,POINT (35.26171 0.46266)
265,15153,35.244437,0.463564,0.745180,Eldoret,POINT (35.24444 0.46356)
...,...,...,...,...,...,...
1573,2766,37.067096,-1.025716,7.017600,Thika,POINT (37.06710 -1.02572)
1574,2778,37.078005,-1.025716,7.017600,Thika,POINT (37.07800 -1.02572)
1590,2672,37.072550,-1.024807,5.070685,Thika,POINT (37.07255 -1.02481)
1606,2566,37.067096,-1.023898,1.713330,Thika,POINT (37.06710 -1.02390)


In [7]:
# Number of non-null entries in each column, per city.
gdf.groupby('City').count()

Unnamed: 0_level_0,ID,lon,lat,houseprice,geometry
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Eldoret,590,590,590,590,590
Embu,329,329,329,329,329
Garissa,275,275,275,275,275
Kakamega,411,411,411,411,411
Kericho,244,244,244,244,244
Kisumu,167,167,167,167,167
Kitui,106,106,106,106,106
Machakos,264,264,264,264,264
Malindi,204,204,204,204,204
Mombasa,324,324,324,324,324


# Load data

In [8]:
# NOTE(review): NPARTITIONS is defined but the frame below is built with
# npartitions=1 — confirm which partition count is intended.
NPARTITIONS = 250

ddf = dask_geopandas.from_geopandas(gdf, npartitions=1)

In [9]:
%%time

# Start a local Dask cluster and run `query` on every partition of `ddf`. `meta`
# describes the result frame: the input columns plus an empty `stac_item` column.
with Client(n_workers=16) as client:
    print(client.dashboard_link)
    meta = ddf._meta.assign(stac_item=[])
    df2 = ddf.map_partitions(query, meta=meta).compute()

/user/zhaxinge@upenn.edu/proxy/8787/status


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)




CPU times: user 2.86 s, sys: 2.74 s, total: 5.6 s
Wall time: 37.2 s


In [10]:
# Load the nightlight table. Values 0 and -999 are read as missing, and every row
# that contains a missing value is dropped.
light = pd.read_csv(
    "https://drive.google.com/uc?export=download&id=1Xg3RrqW4YJVJqBXWHCX-hoNqnbBq_pIn",  # noqa: E501, this link to the Google drive contains the light data
    #index_col=0,
    na_values=[0,-999]
).dropna()
# Point geometries are built from the lon/lat columns; no CRS is assigned here.
ldf = gpd.GeoDataFrame(light, geometry=geopandas.points_from_xy(light.lon, light.lat))



In [11]:
# Number of nightlight rows left after dropping rows with missing values.
len(light)

16291

In [12]:

# Columns of the left frame (df2) that received a `_left` suffix in the spatial join
# get their plain names back.
column_mapping = {
    'lon_left': 'lon',
    'lat_left': 'lat',
    'City_left': 'City', # change
}

# Spatial inner join of the house-price points with the nightlight points, followed
# by the rename above and removal of the duplicated right-hand columns.
join = (
    gpd.sjoin(df2, ldf, how="inner", predicate="intersects")
    .rename(columns=column_mapping)
    .drop(columns=["index_right", "lat_right", "lon_right", "City_right"])
)


join

Unnamed: 0_level_0,ID_left,lon,lat,houseprice,City,geometry,stac_item,Unnamed: 0,ID_right,nightlight
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
166,24690,39.999240,-3.367984,0.129686,Malindi,POINT (39.99924 -3.36798),[{'id': 'S2B_MSIL2A_20180307T072739_R049_T37MF...,166,24690,1.328667
167,24099,37.246460,-1.593632,0.338897,Machakos,POINT (37.24646 -1.59363),[{'id': 'S2A_MSIL2A_20180114T075201_R092_T37MC...,167,24099,0.000379
181,24495,40.003786,-3.366168,2.258490,Malindi,POINT (40.00379 -3.36617),[{'id': 'S2B_MSIL2A_20180307T072739_R049_T37MF...,181,24495,1.587732
183,234105,39.637599,-4.118565,0.003528,Mombasa,POINT (39.63760 -4.11856),[{'id': 'S2B_MSIL2A_20180605T072609_R049_T37ME...,183,234105,1.352132
190,23790,37.238278,-1.590905,1.127589,Machakos,POINT (37.23828 -1.59091),[{'id': 'S2A_MSIL2A_20180114T075201_R092_T37MC...,190,23790,0.080019
...,...,...,...,...,...,...,...,...,...,...
1802,27126,39.656690,-3.930847,0.054200,Mombasa,POINT (39.65669 -3.93085),[{'id': 'S2B_MSIL2A_20180605T072609_R049_T37ME...,1802,27126,0.266560
1807,2969,37.219187,-1.401879,0.219142,Machakos,POINT (37.21919 -1.40188),[{'id': 'S2A_MSIL2A_20180114T075201_R092_T37MC...,1807,2969,0.039606
1815,2875,37.224642,-1.400970,0.003344,Machakos,POINT (37.22464 -1.40097),[{'id': 'S2A_MSIL2A_20180114T075201_R092_T37MC...,1815,2875,0.021165
1822,2754,37.205551,-1.400061,0.178383,Machakos,POINT (37.20555 -1.40006),[{'id': 'S2A_MSIL2A_20180114T075201_R092_T37MC...,1822,2754,0.027492


# Extract features

In [13]:

class CustomRGBNDataset(Dataset):
    """Dataset that stacks the RGB patch and the NIR patch of each point.

    Wraps two ``CustomDataset`` instances built over the same points: one reading the
    files in ``fns_rgb`` and one reading the files in ``fns_n``.
    """

    def __init__(self,points,fns_rgb,fns_n):
        self.rgb_dataset = CustomDataset(points, fns_rgb)
        self.nir_dataset = CustomDataset(points, fns_n)
        self.points = points
        assert len(self.rgb_dataset)==len(self.nir_dataset)

        self.length=len(self.rgb_dataset)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        """Return the RGB and NIR patches concatenated along the channel axis.

        Returns ``None`` when either underlying dataset has no patch for ``idx``;
        ``torch.cat`` would otherwise raise a ``TypeError`` on the ``None`` item.
        """
        rgb = self.rgb_dataset[idx]
        nir = self.nir_dataset[idx]
        if rgb is None or nir is None:
            return None
        return torch.cat([rgb, nir],dim=0)
    
    

In [14]:
# Look up the signed asset URLs on the Planetary Computer for every point.

# `query` stores an empty list for points without a covering scene, and `dropna`
# does not remove empty lists. Keep only rows that hold at least one STAC item so
# the `item[0]` lookups below cannot raise an IndexError.
df3 = join[
    join["stac_item"].map(lambda items: hasattr(items, "__len__") and len(items) > 0)
]

# Three channel (RGB "visual" asset)
matching_urls = [
    pc.sign(item[0]["assets"]["visual"]["href"]) for item in df3.stac_item
]
# NIR channel (band B08)
matching_nir = [
    pc.sign(item[0]["assets"]['B08']["href"]) for item in df3.stac_item
]

`df4` holds the light dataset. There are two types of light data here: one extracted with the mosaic methods, and one taken from the direct nightlight values.

In [None]:
# NOTE(review): the join output above shows `ID_left` / `ID_right` rather than `ID`,
# so the 'ID' entry appears to exclude nothing — confirm. Only the `nightlight`
# column is kept in `df4` in the end.
columns_to_exclude = ['ID', 'lon', 'lat', 'houseprice', 'City', 'geometry', 'stac_item']
selected_columns = [col for col in df3.columns if col not in columns_to_exclude]

df4 = df3[selected_columns]
df4 = df4 [['nightlight']]

In [None]:
# Use the points from df3 so that they line up with the asset URLs looked up above.
points = df3[["lon", "lat"]].to_numpy()
houseprice_log = np.log10(df3["houseprice"].to_numpy() + 1)

# Create a group array based on the "City" column
groups = df3["City"].to_numpy()

# Perform leave-one-city-out splitting
logo = LeaveOneGroupOut()
train_sets = []
test_sets = []
group_scores = {}

# Define the hyperparameter search space
param_distributions = {
    'alpha': np.logspace(-8, 8, base=10, num=17),
    'solver': ['auto']
}
# The search space is a finite grid, so never ask for more draws than it contains.
n_search_iter = min(
    20, len(param_distributions['alpha']) * len(param_distributions['solver'])
)

for train_indices, test_indices in logo.split(points, groups=groups):
    train_sets.append(train_indices)
    test_sets.append(test_indices)
    city = groups[test_indices[0]]  # the held-out city of this fold
    print(city)

    # The file lists must be indexed with the same indices as the points. The test
    # datasets previously paired test points with the URLs of training points.
    train_dataset1 = CustomDataset(points[train_indices], [matching_nir[idx] for idx in train_indices])
    test_dataset1 = CustomDataset(points[test_indices], [matching_nir[idx] for idx in test_indices])

    train_dataset2 = CustomDataset(points[train_indices], [matching_urls[idx] for idx in train_indices])
    test_dataset2 = CustomDataset(points[test_indices], [matching_urls[idx] for idx in test_indices])

    # NOTE(review): the random filters are re-drawn for every fold and no seed is
    # set, so features (and scores) differ between runs — confirm this is intended.
    model1 = RCF(num_features=128, num_input_channels=1).eval().to(device)
    model2 = RCF(num_features=256, num_input_channels=3).eval().to(device)
    # Extract features for the train and test datasets: x_*1 for the NIR band,
    # x_*2 for the RGB image, x_*3 for the nightlight column.
    x_train1 = extract_features(train_dataset1,model1,num_features=128)
    x_train2 = extract_features(train_dataset2,model2,num_features=256)
    x_train3 = df4.iloc[train_indices]

    x_test1 = extract_features(test_dataset1, model1,num_features=128)
    x_test2 = extract_features(test_dataset2, model2,num_features=256)
    x_test3 = df4.iloc[test_indices]

    # Stack the NIR and RGB features. The nightlight columns (x_train3 / x_test3)
    # are not part of the design matrix.
    x_train = np.concatenate((x_train1,x_train2), axis=1)
    print(x_train.shape)
    x_test = np.concatenate((x_test1,x_test2), axis=1)

    y_train = houseprice_log.copy()[train_indices]
    y_test = houseprice_log.copy()[test_indices]
    print(x_train2.shape)
    print(y_train.shape)

    # Perform random search for hyperparameter tuning
    ridge = Ridge()
    ridge_random = RandomizedSearchCV(
        ridge, param_distributions, cv=5, n_iter=n_search_iter, random_state=42
    )
    ridge_random.fit(x_train, y_train)

    # Get the best hyperparameters
    best_alpha = ridge_random.best_params_['alpha']
    best_solver = ridge_random.best_params_['solver']

    # Initialize Ridge Regression with the best hyperparameters
    ridge_cv_best = Ridge(alpha=best_alpha, solver=best_solver)
    ridge_cv_best.fit(x_train, y_train)

    # Evaluate the model on the held-out city
    test_predictions = ridge_cv_best.predict(x_test)
    test_mse = mean_squared_error(y_test, test_predictions)
    test_score = r2_score(y_test, test_predictions)

    # Calculate the train score on the remaining cities
    train_predictions = ridge_cv_best.predict(x_train)
    train_mse = mean_squared_error(y_train, train_predictions)
    train_score = r2_score(y_train, train_predictions)

    # Store the test score and train score for the current city group
    group_scores[city] = {'train_score': train_score, 'train_mse':train_mse,'test_score': test_score, 'test_mse':test_mse}

Embu




0/3344 -- 0.00% -- 1.59 seconds
1000/3344 -- 29.90% -- 2.97 seconds
2000/3344 -- 59.81% -- 3.24 seconds
3000/3344 -- 89.71% -- 2.87 seconds
0/3344 -- 0.00% -- 0.65 seconds
1000/3344 -- 29.90% -- 4.20 seconds
2000/3344 -- 59.81% -- 4.25 seconds
3000/3344 -- 89.71% -- 3.92 seconds
0/293 -- 0.00% -- 0.43 seconds
0/293 -- 0.00% -- 0.38 seconds
(3344, 384)
(3344, 256)
(3344,)




Garissa




0/3373 -- 0.00% -- 0.78 seconds
1000/3373 -- 29.65% -- 2.92 seconds
2000/3373 -- 59.29% -- 2.93 seconds
3000/3373 -- 88.94% -- 3.25 seconds
0/3373 -- 0.00% -- 0.65 seconds
1000/3373 -- 29.65% -- 3.93 seconds
2000/3373 -- 59.29% -- 3.85 seconds
3000/3373 -- 88.94% -- 3.94 seconds
0/264 -- 0.00% -- 0.42 seconds
0/264 -- 0.00% -- 0.40 seconds
(3373, 384)
(3373, 256)
(3373,)




Kakamega




0/3226 -- 0.00% -- 0.72 seconds
1000/3226 -- 31.00% -- 2.90 seconds
2000/3226 -- 62.00% -- 2.97 seconds
3000/3226 -- 92.99% -- 2.75 seconds
0/3226 -- 0.00% -- 0.63 seconds
1000/3226 -- 31.00% -- 3.67 seconds
2000/3226 -- 62.00% -- 4.16 seconds
3000/3226 -- 92.99% -- 3.97 seconds
0/411 -- 0.00% -- 0.40 seconds
0/411 -- 0.00% -- 0.38 seconds
(3226, 384)
(3226, 256)
(3226,)




Kericho




0/3405 -- 0.00% -- 0.69 seconds
1000/3405 -- 29.37% -- 3.03 seconds
2000/3405 -- 58.74% -- 2.89 seconds
3000/3405 -- 88.11% -- 2.89 seconds
0/3405 -- 0.00% -- 0.65 seconds
1000/3405 -- 29.37% -- 3.87 seconds
2000/3405 -- 58.74% -- 3.49 seconds
3000/3405 -- 88.11% -- 3.89 seconds
0/232 -- 0.00% -- 0.40 seconds
0/232 -- 0.00% -- 0.40 seconds
(3405, 384)
(3405, 256)
(3405,)




Kisumu




0/3483 -- 0.00% -- 0.70 seconds
1000/3483 -- 28.71% -- 3.19 seconds
2000/3483 -- 57.42% -- 2.76 seconds
3000/3483 -- 86.13% -- 2.97 seconds
0/3483 -- 0.00% -- 0.58 seconds
1000/3483 -- 28.71% -- 3.85 seconds
2000/3483 -- 57.42% -- 3.92 seconds
3000/3483 -- 86.13% -- 4.12 seconds
0/154 -- 0.00% -- 0.40 seconds
0/154 -- 0.00% -- 0.40 seconds
(3483, 384)
(3483, 256)
(3483,)




Kitui




0/3586 -- 0.00% -- 0.73 seconds
1000/3586 -- 27.89% -- 3.26 seconds
2000/3586 -- 55.77% -- 2.81 seconds
3000/3586 -- 83.66% -- 2.73 seconds
0/3586 -- 0.00% -- 0.67 seconds
1000/3586 -- 27.89% -- 3.81 seconds
2000/3586 -- 55.77% -- 4.16 seconds
3000/3586 -- 83.66% -- 3.87 seconds
0/51 -- 0.00% -- 0.46 seconds
0/51 -- 0.00% -- 0.44 seconds
(3586, 384)
(3586, 256)
(3586,)




Machakos




0/3451 -- 0.00% -- 0.71 seconds
1000/3451 -- 28.98% -- 3.16 seconds
2000/3451 -- 57.95% -- 2.94 seconds
3000/3451 -- 86.93% -- 2.89 seconds
0/3451 -- 0.00% -- 0.59 seconds
1000/3451 -- 28.98% -- 4.23 seconds
2000/3451 -- 57.95% -- 3.76 seconds
3000/3451 -- 86.93% -- 3.82 seconds
0/186 -- 0.00% -- 0.43 seconds
0/186 -- 0.00% -- 0.47 seconds
(3451, 384)
(3451, 256)
(3451,)




Malindi




0/3454 -- 0.00% -- 0.48 seconds
1000/3454 -- 28.95% -- 3.15 seconds
2000/3454 -- 57.90% -- 2.76 seconds
3000/3454 -- 86.86% -- 2.75 seconds
0/3454 -- 0.00% -- 0.51 seconds
1000/3454 -- 28.95% -- 3.83 seconds
2000/3454 -- 57.90% -- 3.64 seconds
3000/3454 -- 86.86% -- 3.76 seconds
0/183 -- 0.00% -- 0.38 seconds
0/183 -- 0.00% -- 0.36 seconds
(3454, 384)
(3454, 256)
(3454,)




Mombasa




0/3313 -- 0.00% -- 0.61 seconds
1000/3313 -- 30.18% -- 3.13 seconds
2000/3313 -- 60.37% -- 2.82 seconds
3000/3313 -- 90.55% -- 2.82 seconds
0/3313 -- 0.00% -- 0.66 seconds
1000/3313 -- 30.18% -- 3.89 seconds
2000/3313 -- 60.37% -- 3.73 seconds
3000/3313 -- 90.55% -- 4.16 seconds
0/324 -- 0.00% -- 0.46 seconds
0/324 -- 0.00% -- 0.43 seconds
(3313, 384)
(3313, 256)
(3313,)




Nairobi




0/3289 -- 0.00% -- 0.80 seconds
1000/3289 -- 30.40% -- 2.97 seconds
2000/3289 -- 60.81% -- 3.04 seconds
3000/3289 -- 91.21% -- 2.83 seconds
0/3289 -- 0.00% -- 1.07 seconds
1000/3289 -- 30.40% -- 3.80 seconds
2000/3289 -- 60.81% -- 3.46 seconds
3000/3289 -- 91.21% -- 4.26 seconds
0/348 -- 0.00% -- 0.40 seconds
0/348 -- 0.00% -- 0.41 seconds
(3289, 384)
(3289, 256)
(3289,)




Naivasha




0/3555 -- 0.00% -- 0.67 seconds
1000/3555 -- 28.13% -- 3.18 seconds
2000/3555 -- 56.26% -- 2.89 seconds
3000/3555 -- 84.39% -- 2.73 seconds
0/3555 -- 0.00% -- 0.64 seconds
1000/3555 -- 28.13% -- 4.03 seconds
2000/3555 -- 56.26% -- 4.12 seconds
3000/3555 -- 84.39% -- 3.85 seconds
0/82 -- 0.00% -- 0.41 seconds


In [None]:
# Print the evaluation scores for each city group. `score` is the whole metrics dict
# stored by the loop above (train/test R2 and MSE), not only the test score.
for city, score in group_scores.items():
    print(f"City: {city}, Test Score: {score}")

In [None]:
# One row per held-out city, one column per stored metric.
pd.DataFrame.from_dict(group_scores,  orient='index')

# Full dataset

This part provides the full dataset for training and testing.

In [None]:
# Dataset over every point, reading the RGB "visual" asset URLs.
dataset = CustomDataset(points, matching_urls)

dataloader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=False,
    # os.cpu_count() returns None when the CPU count cannot be determined.
    num_workers=(os.cpu_count() or 1) * 2,
    collate_fn=lambda x: x,  # keep the raw list so None items survive batching
    pin_memory=False,
)

In [None]:
# NOTE(review): no `model` / `num_features` is defined at notebook scope in the cells
# above, so this cell failed on a fresh kernel. 256 features on 3 input channels
# mirrors the RGB extractor of the leave-one-city-out loop — adjust if a different
# extractor was intended.
num_features = 256
model = RCF(num_features=num_features, num_input_channels=3).eval().to(device)

# Reuse the shared helper instead of repeating its loop. It returns one row per
# point and leaves rows all-zero where no usable image was found.
x_all = extract_features(dataset, model, num_features)

In [None]:
# Regression target for every point: log10(1 + houseprice), copied so that later
# filtering does not touch `houseprice_log`.
y_all = houseprice_log.copy()

In [None]:
# True for rows whose features do not sum to zero, i.e. the rows to keep. Despite
# the name, False marks the rows that are treated as having no features.
nofeature_mask = ~(x_all.sum(axis=1) == 0)

In [None]:
# Shapes before the rows without features are removed.
x_all.shape, y_all.shape

In [None]:
# Keep only the rows selected by the mask. The arrays are overwritten in place, so
# re-running this cell applies the old mask to the already-filtered arrays.
x_all = x_all[nofeature_mask]
y_all = y_all[nofeature_mask]

In [None]:
# Shapes after the rows without features were removed.
x_all.shape, y_all.shape

In [None]:
# Random 80/20 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0
)

In [None]:
# Ridge regression with 5-fold cross-validation over 17 log-spaced alphas between
# 1e-8 and 1e8, fitted on the training part of the random split.
ridge_cv_random = RidgeCV(cv=5, alphas=np.logspace(-8, 8, base=10, num=17))
ridge_cv_random.fit(x_train, y_train)

In [None]:
# Score that RidgeCV recorded for the selected alpha.
print(f"Validation R2 performance {ridge_cv_random.best_score_:0.2f}")

In [None]:
# Predictions on the random test split, clipped at zero from below.
y_pred = np.maximum(ridge_cv_random.predict(x_test), 0)

# Scatter plot of predicted against true log10(1 + houseprice).
plt.figure()
plt.scatter(y_pred, y_test, alpha=0.2, s=4)
plt.xlabel("Predicted", fontsize=15)
plt.ylabel("Ground Truth", fontsize=15)
plt.title(r"$\log_{10}(1 + $houseprice$)$", fontsize=15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

plt.xlim([0, 6])
plt.ylim([0, 6])

# R^2 of the clipped predictions, written into the upper-left part of the axes.
plt.text(
    0.5,
    5,
    s="R$^2$ = %0.2f" % (r2_score(y_test, y_pred)),
    fontsize=15,
    fontweight="bold",
)
# Least-squares line of ground truth against prediction.
m, b = np.polyfit(y_pred, y_test, 1)
plt.plot(y_pred, m * y_pred + b, color="black")
plt.gca().spines.right.set_visible(False)
plt.gca().spines.top.set_visible(False)

plt.show()
plt.close()

In [None]:
# Spearman rank correlation between the clipped predictions and the ground truth.
spearmanr(y_pred, y_test)

In [None]:
import statsmodels.api as sm

# Perform OLS regression on the training part of the random split. The design matrix
# with the intercept column is stored under its own name: overwriting `x_train` would
# change its feature count and break re-running the ridge cells above.
x_train_ols = sm.add_constant(x_train)  # Add constant term for the intercept
ols_model = sm.OLS(y_train, x_train_ols)
ols_results = ols_model.fit()

# Get the summary tables
summary_tables = ols_results.summary().tables

# Print the upper part of the OLS Regression Results
print(summary_tables[0])


In [None]:

# Perform OLS regression on the test part of the random split. The design matrix
# with the intercept column is stored under its own name: overwriting `x_test` would
# change its feature count and break re-running the ridge prediction cells above.
x_test_ols = sm.add_constant(x_test)  # Add constant term for the intercept
ols_model = sm.OLS(y_test, x_test_ols)
ols_results = ols_model.fit()

# Get the summary tables
summary_tables = ols_results.summary().tables

# Print the upper part of the OLS Regression Results
print(summary_tables[0])

In [None]:
# Apply the same row mask to the coordinates so `points` lines up with the filtered
# x_all / y_all. `points` is overwritten, so re-running this cell applies the mask to
# the already-filtered array.
points = points[nofeature_mask]

In [None]:
# Longitude (column 0 of `points`) at the 80th percentile: rows at or below it form
# the training set, rows above it form the test set.
split_lon = np.percentile(points[:, 0], 80)
train_idxs = np.flatnonzero(points[:, 0] <= split_lon)
test_idxs = np.flatnonzero(points[:, 0] > split_lon)

x_train, x_test = x_all[train_idxs], x_all[test_idxs]
y_train, y_test = y_all[train_idxs], y_all[test_idxs]

In [None]:
# Map of all points coloured by the target, with a vertical line at the longitude
# that separates the training points from the test points.
plt.figure()
plt.scatter(points[:, 0], points[:, 1], c=y_all, s=1)
plt.vlines(
    split_lon,
    ymin=points[:, 1].min(),
    ymax=points[:, 1].max(),
    color="black",
    linewidth=4,
)
plt.axis("off")
plt.show()
plt.close()

In [None]:
# Ridge regression with 5-fold cross-validation over 17 log-spaced alphas between
# 1e-8 and 1e8, fitted on the training part of the longitude split.
ridge_cv = RidgeCV(cv=5, alphas=np.logspace(-8, 8, base=10, num=17))
ridge_cv.fit(x_train, y_train)

In [None]:
# Score that RidgeCV recorded for the selected alpha.
print(f"Validation R2 performance {ridge_cv.best_score_:0.2f}")

In [None]:
# Predictions on the test part of the longitude split, clipped at zero from below.
y_pred = np.maximum(ridge_cv.predict(x_test), 0)

# Scatter plot of predicted against true values. The target is log10(1 + houseprice);
# the title previously carried a population-density label.
plt.figure()
plt.scatter(y_pred, y_test, alpha=0.2, s=4)
plt.xlabel("Predicted", fontsize=15)
plt.ylabel("Ground Truth", fontsize=15)
plt.title(r"$\log_{10}(1 + $houseprice$)$", fontsize=15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

plt.xlim([0, 6])
plt.ylim([0, 6])

# R^2 of the clipped predictions, written into the upper-left part of the axes.
plt.text(
    0.5,
    5,
    s="R$^2$ = %0.2f" % (r2_score(y_test, y_pred)),
    fontsize=15,
    fontweight="bold",
)
# Least-squares line of ground truth against prediction.
m, b = np.polyfit(y_pred, y_test, 1)
plt.plot(y_pred, m * y_pred + b, color="black")
plt.gca().spines.right.set_visible(False)
plt.gca().spines.top.set_visible(False)

plt.show()
plt.close()

In [None]:
# Spearman rank correlation between the ground truth and the clipped predictions.
spearmanr(y_test, y_pred)

In [None]:
# Distribution of the target, log10(1 + houseprice), in the two parts of the
# longitude split. The labels previously referred to population density and to the
# western / eastern US, which do not describe this data.
bins = np.linspace(0, 5, num=50)

plt.figure()
plt.hist(y_train, bins=bins)
plt.ylabel("Frequency")
plt.xlabel(r"$\log_{10}(1 + $houseprice$)$")
plt.title("Train points -- western 80% of points by longitude")
plt.gca().spines.right.set_visible(False)
plt.gca().spines.top.set_visible(False)
plt.show()
plt.close()

plt.figure()
plt.hist(y_test, bins=bins)
plt.ylabel("Frequency")
plt.xlabel(r"$\log_{10}(1 + $houseprice$)$")
plt.title("Test points -- eastern 20% of points by longitude")
plt.gca().spines.right.set_visible(False)
plt.gca().spines.top.set_visible(False)
plt.show()
plt.close()

In [None]:
# Predictions on the test part of the longitude split, clipped at zero from below.
y_pred = np.maximum(ridge_cv.predict(x_test), 0)

# Scatter plot of predicted against true values. The target is log10(1 + houseprice);
# the title previously carried a population-density label.
plt.figure()
plt.scatter(y_pred, y_test, alpha=0.2, s=4)
plt.xlabel("Predicted", fontsize=15)
plt.ylabel("Ground Truth", fontsize=15)
plt.title(r"$\log_{10}(1 + $houseprice$)$", fontsize=15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

plt.xlim([0, 6])
plt.ylim([0, 6])

# R^2 of the clipped predictions, written into the upper-left part of the axes.
plt.text(
    0.5,
    5,
    s="R$^2$ = %0.2f" % (r2_score(y_test, y_pred)),
    fontsize=15,
    fontweight="bold",
)
# Least-squares line of ground truth against prediction.
m, b = np.polyfit(y_pred, y_test, 1)
plt.plot(y_pred, m * y_pred + b, color="black")
plt.gca().spines.right.set_visible(False)
plt.gca().spines.top.set_visible(False)

plt.show()
plt.close()