In [63]:
import pandas as pd
import numpy as np
import rasterio as rs
from rasterio.merge import merge
from rasterio.mask import mask
from datetime import datetime
from sklearn.utils import resample


In [27]:
# create single df of all training samples
v_list = ['08', '14', '15', '19', '20', '21', '22',] # '23'
df_list = []

for i in v_list:
    df = pd.read_csv(f'../data/ceo-plantations-train-v{i}.csv')    
    df_list.append(df)

master_sample = pd.concat(df_list, ignore_index=True)

plantation_counts = df['PLANTATION'].value_counts()
plantation_percs = df['PLANTATION'].value_counts(normalize=True) * 100

# Combine counts and percentages into a single dataframe
plantation_stats = pd.DataFrame({
    'Count': plantation_counts,
    'Percentage (%)': round(plantation_percs)
})
plantation_stats

Unnamed: 0,Count,Percentage (%)
2,36934,76.0
255,5880,12.0
1,3361,7.0
0,2629,5.0


In [11]:
master_sample

Unnamed: 0.1,Unnamed: 0,PLOT_ID,SAMPLE_ID,LON,LAT,SYSTEM,PLANTATION,PLOT_FNAME
0,0,1,1,-0.879759,6.295327,Agroforestry,2,8001
1,1,1,2,-0.879759,6.295417,Agroforestry,2,8001
2,2,1,3,-0.879759,6.295506,Agroforestry,2,8001
3,3,1,4,-0.879759,6.295595,Agroforestry,2,8001
4,4,1,5,-0.879759,6.295684,Agroforestry,2,8001
...,...,...,...,...,...,...,...,...
263027,48799,248,48604,-2.175841,9.940482,Not plantation,0,22249
263028,48800,248,48605,-2.175841,9.940571,Agroforestry,2,22249
263029,48801,248,48606,-2.175841,9.940661,Agroforestry,2,22249
263030,48802,248,48607,-2.175841,9.940751,Agroforestry,2,22249


In [23]:
# create a comb mosaic
def mosaic_tif(tifs_to_mosaic, outpath):

    ''''
    Takes in a list of raster files and
    merges them to form a single tif.

    '''
    dir = f'../tmp/ghana/preds/mosaic/'
    reader_mode = []

    for file in tifs_to_mosaic:
        src = rs.open(dir+file)
        reader_mode.append(src) 
    print(f'Merging {len(reader_mode)} tifs.')

    mosaic, out_transform = merge(reader_mode)
    date = datetime.today().strftime('%Y-%m-%d')
    outpath = f"{dir}{outpath}_{date}.tif"
    out_meta = src.meta.copy()  
    out_meta.update({'driver': "GTiff",
                     'dtype': 'uint8',
                     'height': mosaic.shape[1],
                     'width': mosaic.shape[2],
                     'transform': out_transform,
                     'compress':'lzw',
                     'nodata': 255})

    with rs.open(outpath, "w", **out_meta) as dest:
        dest.write(mosaic)

    # Ensure to close all files
    for src in reader_mode:
        src.close()

    return None

In [25]:
tifs = ['pd_north_v27_2024-08-23.tif',
        'pd_east_v27_2024-08-23.tif',
        'pd_west_v27_2024-08-23.tif',
       ]

mosaic_tif(tifs, 'ghana_v27')

[<open DatasetReader name='../tmp/ghana/preds/mosaic/pd_north_v27_2024-08-23.tif' mode='r'>, <open DatasetReader name='../tmp/ghana/preds/mosaic/pd_east_v27_2024-08-23.tif' mode='r'>, <open DatasetReader name='../tmp/ghana/preds/mosaic/pd_west_v27_2024-08-23.tif' mode='r'>]
Merging 3 tifs.


In [65]:
def calculate_class_distribution(land_use_map, classes):
    '''
    Get the counts and proportions of each land use class (0, 1, 2, 3)
    bincounts occurrences of non-negative integers and can optionally include 
    zeros or no-data values, depending on the minlength argument (4 or 256).
    '''    
    class_dist = {}
    class_prop = {}
    counts = np.bincount(land_use_map.flatten(), minlength=4) 
    valid_counts = counts[classes]
    total_count = valid_counts.sum()
    for value in classes:
        count = counts[value]
        percentage = (count / total_count) * 100
        class_dist[value] = count
        class_prop[value] = round(percentage,2)
        
    return class_dist, class_prop, total_count
    

def stratified_random_sample(land_use_map, classes, total_samples, class_distribution, class_proportions):
    '''
    Perform stratified random sample
    '''
    sample_points = []  
    
    for cls, proportion in class_proportions.items():
        # Samples per class based on proportion
        # this needs to be investigated ## TODO
        cls_sample_count = int(proportion * total_samples)  
        cls_indices = np.argwhere(land_use_map == cls)  
        
        # Randomly sample points for this class
        sampled_indices = resample(cls_indices, 
                                   n_samples=cls_sample_count, 
                                   replace=False, 
                                   random_state=42)
        
        sample_points.extend(sampled_indices.tolist())
    
    return np.array(sample_points)

def run_validation_workflow(map_file, total_samples):

    '''
    Steps:
        1. Calculates total area of map and class distribution
        2. Performs stratified random sampling
    '''
    
    with rs.open(map_file) as src:
        land_use_map = src.read(1)

    classes = [0, 1, 2, 3]
    class_dist, class_prop, total_count = calculate_class_distribution(land_use_map, classes)
    print(f"Class distribution: {class_dist}")
    print(f"Class proportions: {class_prop}")
    print(f"Total count (pixels): {total_count}")
    
    sample_points = stratified_random_sample(land_use_map, 
                                             classes,
                                             total_count, 
                                             class_dist, 
                                             class_prop)
    print(f"Sample points (coordinates): {sample_points}")
    
    return sample_points

In [67]:
map_file = f'../tmp/ghana/preds/mosaic/ghana_v27_2024-09-27.tif'
total_samples = 1000  

sampled_points = run_validation_workflow(map_file, total_samples)

In [57]:
class_dist, class_prop, total_count = calculate_class_distribution(ghana, [0, 1, 2, 3])

In [58]:
class_dist

{0: 269113871, 1: 1272342, 2: 138084189, 3: 99051836}

In [59]:
class_prop

{0: 53.02504025449226,
 1: 0.250696798038631,
 2: 27.207514993658265,
 3: 19.516747953810842}

In [60]:
total_count

507522238