In [82]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from simpledbf import Dbf5

import collections
import random

import arcpy

### First, access the points that have extracted NDVI values

In [2]:
# Point the arcpy workspace at the folder that holds the point
# shapefiles carrying the extracted NDVI values.
arcpy.env.workspace = '../Step_2_Extract_NDVI_to_point/'

# List every shapefile found in that workspace; the bare name on the
# last line displays the list as the cell output.
Point_with_NDVI = arcpy.ListFeatureClasses('*.shp')
Point_with_NDVI

['Extract_value_2005_2007_50K.shp',
 'Extract_value_2008_2010_50K.shp',
 'Extract_value_2011_2013_50K.shp',
 'Extract_value_2014_2016_50K.shp',
 'Extract_value_2017_2019_50K.shp']

In [12]:
# Read the attribute table of each point shapefile into a dict keyed by
# the shapefile name. Only the [CID] and [RASTERVALU] fields are needed,
# so each record is stored as a (CID, RASTERVALU) tuple.

Point_att = {}

for point in Point_with_NDVI:

    # Open the cursor in a `with` block so it is released (together with
    # any lock it holds on the shapefile) as soon as the rows are read.
    with arcpy.da.SearchCursor(point, ['CID', 'RASTERVALU']) as cursor:
        Point_att[point] = [(row[0], row[1]) for row in cursor]

### Calculate the frequency of the NDVI value in each 50K_point shp

In [52]:
# For every shapefile, count how often each NDVI value occurs among its
# (CID, NDVI) records and keep the counts ordered by ascending NDVI value.
NDVI_frequency = {
    shp_name: collections.OrderedDict(
        sorted(
            collections.Counter(ndvi for _, ndvi in Point_att[shp_name]).items()
        )
    )
    for shp_name in Point_with_NDVI
}

### Compute how many points should be selected according to NDVI frequency

In [60]:
# Size of each input shapefile and the size of the sample to draw from it.
# Together they give the sampling fraction (10000 / 50000 = 1/5).
TOTAL_POINTS = 50000
SAMPLE_POINTS = 10000

# Put the frequencies into a dataframe: one column per shapefile, one row
# per distinct NDVI value. An NDVI value that is absent from a shapefile
# shows up as NaN, so fill those with 0 to keep the arithmetic valid.
NDVI_frequency_df = pd.DataFrame.from_dict(NDVI_frequency).fillna(0)

# Compute how many points should be selected for each NDVI value.
for name in Point_with_NDVI:

    # fillna() leaves the counts as floats; convert them back to integers so
    # the quota is computed exactly. Truncating the floating-point product
    # x / 50000 * 10000 can land just below the true integer and drop a point.
    counts = NDVI_frequency_df[name].astype('int64')

    # Floor of count * SAMPLE_POINTS / TOTAL_POINTS, in integer arithmetic.
    NDVI_frequency_df[f'{name}_ratio'] = counts * SAMPLE_POINTS // TOTAL_POINTS
    

### Randomly select samples from NDVI_point

In [85]:
# set the workspace
arcpy.env.workspace = '../Step_2_Extract_NDVI_to_point/'

# Use a dedicated, seeded generator so that re-running the cell selects the
# same points every time without touching the global `random` state.
SAMPLE_SEED = 42
rng = random.Random(SAMPLE_SEED)

# Initiate an empty dict to hold the selected CIDs of each shapefile.
sample_CID = {}

for point in Point_with_NDVI:

    # Group the CIDs by NDVI value in a single pass over the records,
    # instead of rescanning every record for each distinct NDVI value.
    CID_by_NDVI = collections.defaultdict(list)
    for cid, ndvi in Point_att[point]:
        CID_by_NDVI[ndvi].append(cid)

    # Temporary list that holds the CIDs selected from this shapefile.
    CID_list = []

    # Walk through each NDVI value together with the number of points
    # that should be drawn for it.
    for NDVI, num in NDVI_frequency_df[f'{point}_ratio'].items():

        # CIDs whose NDVI equals this value; empty when the value does not
        # occur in this shapefile (its quota is 0 in that case).
        CID = CID_by_NDVI.get(NDVI, [])

        # Randomly draw the requested number of CIDs. The quota comes out of
        # the dataframe as a numpy integer, so cast it to a plain int.
        CID_list.extend(rng.sample(CID, int(num)))

    # Update the sample dictionary.
    sample_CID[point] = CID_list
        

In [106]:
# set the workspace
arcpy.env.workspace = '../Step_2_Extract_NDVI_to_point/'
arcpy.env.overwriteOutput = True


# Use the sampled CIDs as a filter to select from each point shapefile
# and export the selection as a new shapefile.

for point in Point_with_NDVI:

    # Build the output name from the two year tokens of the input name,
    # e.g. 'Extract_value_2005_2007_50K.shp' -> 'Test_2005_2007_10K.shp'.
    spl = '_'
    name_parts = point.split(spl)
    out_name = f'Test_{name_parts[2]}_{name_parts[3]}_10K.shp'

    # Get the CIDs sampled for this shapefile.
    CID_select = sample_CID[point]

    # An empty IN () list is not valid SQL, so skip shapefiles for which
    # nothing was sampled instead of failing inside arcpy.
    if not CID_select:
        print(f'Skipping {out_name}: no CIDs were sampled')
        continue

    # Build the IN list explicitly. Interpolating a Python tuple breaks for
    # a single element, because its repr is '(5,)' with a trailing comma.
    # repr() keeps each value formatted exactly as the tuple repr would.
    cid_sql_list = ', '.join(repr(cid) for cid in CID_select)
    where_clause = f'"CID" IN ({cid_sql_list})'

    # Use the CIDs to select from the point shapefile.
    select_lry = arcpy.MakeFeatureLayer_management(in_features  = point,
                                                   out_layer    = 'select_point_lry',
                                                   where_clause = where_clause)

    arcpy.CopyFeatures_management(select_lry,f'../Step_3_Select_10K_from_50K/{out_name}')

    # print out the process
    print(f'Exporting {out_name}')

Exporting Test_2005_2007_10K.shp
Exporting Test_2008_2010_10K.shp
Exporting Test_2011_2013_10K.shp
Exporting Test_2014_2016_10K.shp
Exporting Test_2017_2019_10K.shp
