In [1]:
import ee
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import json
import random

In [2]:
# Initialize the Earth Engine client library before any ee.* object is built
ee.Initialize()

### Define basic parameters

In [3]:
# Labels of the consecutive three-year windows, "<first>_<last>" (1990_1992 ... 2017_2019)
year_span = ['{}_{}'.format(start, start + 2) for start in range(1990, 2020, 3)]

# Number of non-built control points
num_non_built = 5000

# Spatial constraint: feature collection used as the study-area boundary
North_china_plain_boundary = ee.FeatureCollection("users/wangjinzhulala/North_China_Plain_Python/Boundary_shp/North_China_Plain_Boundary")

In [4]:
# Show the generated period labels
year_span

['1990_1992',
 '1993_1995',
 '1996_1998',
 '1999_2001',
 '2002_2004',
 '2005_2007',
 '2008_2010',
 '2011_2013',
 '2014_2016',
 '2017_2019']

### Calculate the sample_point_num for each NDVI value

##### Step_1: Compute the histogram of NDVI for each Landsat image

In [5]:
# Maps each period label to a list of (rounded NDVI bucket mean, rounded pixel count) pairs
NDVI_hist = {}

for span in year_span:

    # Mean-NDVI image of this period, clipped to the study-area boundary
    ndvi_img = ee.Image(f'users/wensomone666/Jinzhu/Mean_NDVI/Year_{span}_Mean_NDVI').clip(North_china_plain_boundary)

    # Reduce the image with a histogram reducer (argument 200) over the
    # bounding box of the boundary, and pull the result to the client
    hist_info = ndvi_img.reduceRegion(
        reducer=ee.Reducer.histogram(200),
        geometry=North_china_plain_boundary.geometry().bounds(),
        scale=30,
        maxPixels=int(1e13),
    ).getInfo()['nd']

    # Round both the bucket means and the bucket counts to integers,
    # then pair them up bucket by bucket
    rounded_means = map(round, hist_info['bucketMeans'])
    rounded_counts = map(round, hist_info['histogram'])
    NDVI_hist[span] = list(zip(rounded_means, rounded_counts))

    # Report progress
    print(f'Histogram calculation of Year_{span}_Mean_NDVI completed!')
    

Histogram calculation of Year_1990_1992_Mean_NDVI completed!
Histogram calculation of Year_1993_1995_Mean_NDVI completed!
Histogram calculation of Year_1996_1998_Mean_NDVI completed!
Histogram calculation of Year_1999_2001_Mean_NDVI completed!
Histogram calculation of Year_2002_2004_Mean_NDVI completed!
Histogram calculation of Year_2005_2007_Mean_NDVI completed!
Histogram calculation of Year_2008_2010_Mean_NDVI completed!
Histogram calculation of Year_2011_2013_Mean_NDVI completed!
Histogram calculation of Year_2014_2016_Mean_NDVI completed!
Histogram calculation of Year_2017_2019_Mean_NDVI completed!


In [7]:
# Build one histogram table per period and combine them with a single concat.
# (Calling pd.concat inside the loop re-copies every earlier row on each
# iteration, so the frames are collected in a list instead.)
hist_frames = []

for year_name, nd_freq in NDVI_hist.items():

    # One row per histogram bucket, indexed by the period label
    tmp_df = pd.DataFrame(data=nd_freq,
                          index=[year_name]*len(nd_freq),
                          columns=['NDVI','Freq'])

    # Total frequency of this period, computed once rather than once per row
    freq_total = tmp_df['Freq'].sum()

    # Select_num = num_non_built scaled by the share of each NDVI bucket, rounded
    tmp_df['Select_num'] = tmp_df['Freq'].apply(lambda x: round(x*num_non_built/freq_total))

    hist_frames.append(tmp_df)

# pd.concat raises on an empty list, so fall back to an empty DataFrame
NDVI_hist_df = pd.concat(hist_frames) if hist_frames else pd.DataFrame()
    

In [8]:
# Write the table to local disk, labelling the index column 'Year_range'
NDVI_hist_df.rename_axis('Year_range').to_csv('./Result_df/NDVI_area_propotion.csv')

# Read it back from local disk; no index column is specified, so
# 'Year_range' is loaded as an ordinary column
NDVI_hist_df = pd.read_csv('./Result_df/NDVI_area_propotion.csv')
NDVI_hist_df

Unnamed: 0,Year_range,NDVI,Freq,Select_num
0,1990_1992,-34,2,0.0
1,1990_1992,-33,13,0.0
2,1990_1992,-32,34,0.0
3,1990_1992,-31,75,0.0
4,1990_1992,-30,174,0.0
...,...,...,...,...
737,2017_2019,36,623,0.0
738,2017_2019,37,162,0.0
739,2017_2019,38,28,0.0
740,2017_2019,39,3,0.0


##### Step_2: Create 50K random sample points and extract the NDVI value at each point

In [10]:
Path = 'users/wangjinzhulala/North_China_Plain_Python/Sample_50K'

# The export is disabled by default. Set this to True to submit one
# export-to-asset task per period.
EXPORT_SAMPLE_50K = False

for year in year_span:

    # Name of the asset that would hold this period's sample
    asset_name = f'Sample_50K_year_{year}'

    # Mean-NDVI image of this period, clipped to the study-area boundary
    NDVI_img = ee.Image(f'users/wensomone666/Jinzhu/Mean_NDVI/Year_{year}_Mean_NDVI').clip(North_china_plain_boundary)

    # Sample the image at 50K points inside the boundary, keeping point geometries
    Sample_50k = NDVI_img.sample(region = North_china_plain_boundary.geometry(),
                                 scale = 30,
                                 numPixels=50000,
                                 geometries = True)

    if EXPORT_SAMPLE_50K:
        task = ee.batch.Export.table.toAsset(collection  = Sample_50k,
                                             description = f'Exporting {asset_name}',
                                             assetId     = f'{Path}/{asset_name}')
        task.start()

        # start() only submits the task; it does not show that the export succeeded
        print(f'{asset_name} export task submitted')
    else:
        # Nothing is exported in this branch, so do not report an export
        print(f'{asset_name} sampled (export disabled)')

Sample_50K_year_1990_1992 exprted sucessfully
Sample_50K_year_1993_1995 exprted sucessfully
Sample_50K_year_1996_1998 exprted sucessfully
Sample_50K_year_1999_2001 exprted sucessfully
Sample_50K_year_2002_2004 exprted sucessfully
Sample_50K_year_2005_2007 exprted sucessfully
Sample_50K_year_2008_2010 exprted sucessfully
Sample_50K_year_2011_2013 exprted sucessfully
Sample_50K_year_2014_2016 exprted sucessfully
Sample_50K_year_2017_2019 exprted sucessfully


##### Step_3: Randomly select samples from the 50K points

In [11]:
# create a function for random sampling

def sample_list(x):
    """Randomly pick points for one joined histogram row.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide '.geo', a sequence of candidate points, and
        'Select_num', the number of points to draw (converted with int()).

    Returns
    -------
    list
        Points drawn without replacement from x['.geo']. When 'Select_num'
        is larger than the number of candidates, all candidates are returned
        (in random order) instead of random.sample raising ValueError.
    """
    candidates = x['.geo']

    # random.sample rejects a sample size larger than the population,
    # so cap the request at the number of points actually available
    n = min(int(x['Select_num']), len(candidates))

    return random.sample(candidates, n)

In [12]:
# One stratified-random sample table per period is appended to this list
sample_df_list = []

for year in year_span:

    #________________________Step_1: Preprocessing for 50K sample_______________________

    # read the 50K random points
    df = pd.read_csv(f'./Random_50K/Sample_50K_{year}.csv')

    # Drop the 'system:index' column. The `columns` keyword is used because
    # DataFrame.drop no longer accepts the axis as a positional argument
    # in pandas >= 2.0.
    df = df.drop(columns='system:index')

    # parse each .geo string as JSON
    df['.geo'] = df['.geo'].apply(json.loads)

    # collapse all json points with the same NDVI value into one list
    # and store in the df_50K dataframe (index: 'nd', column: '.geo')
    df_50K = df.groupby('nd')['.geo'].apply(list).to_frame()

    #_________________________Step_2: Join df_histgrame with df_50K______________________

    # Select the df_hist rows that belong to the same period as df_50K
    df_hist = NDVI_hist_df[NDVI_hist_df['Year_range'] == year]

    # Join df_hist and df_50K on the NDVI value, then remove the rows with a
    # 0 Select_num. The .copy() makes df_join an independent frame, so the
    # column assignment below does not write into a slice of another frame.
    df_join = df_hist.join(df_50K, on='NDVI', how='inner')
    df_join = df_join[df_join['Select_num'] > 0].copy()

    #_________________________Step_3: Perform the random sampling

    # For every row, randomly draw Select_num points from that row's point list
    df_join['Sample'] = df_join.apply(sample_list, axis=1)

    # Keep only the necessary columns and explode the Sample column,
    # so that each row holds exactly one random point
    df_join_sample = df_join[['Year_range','NDVI','Sample']].explode('Sample')

    # add the result to sample_df_list
    sample_df_list.append(df_join_sample)

In [13]:
# The commented-out lines below concatenate sample_df_list, split each point
# into lon/lat columns and write ./Result_df/Sample_point.csv. They are
# disabled, so this cell only reloads a previously saved file.
# NOTE(review): if re-enabled, `drop('Sample',1,inplace=True)` passes the axis
# positionally -- confirm that works with the installed pandas version.

# # concat all random-stratified samples into one dataframe
# sample_df = pd.concat(sample_df_list).reset_index(drop=True)

# # unravel the sample to get lon/lat
# sample_df['lon'] = sample_df['Sample'].apply(lambda x: x['coordinates'][0])
# sample_df['lat'] = sample_df['Sample'].apply(lambda x: x['coordinates'][1])
# sample_df.drop('Sample',1,inplace=True)

# # Save the sample_df to disk
# sample_df.to_csv('./Result_df/Sample_point.csv',index=False)

# Load the sample_df from local disk
sample_df = pd.read_csv('./Result_df/Sample_point.csv')

In [14]:
# Display the sample points loaded from ./Result_df/Sample_point.csv
sample_df

Unnamed: 0,Year_range,NDVI,lon,lat
0,1990_1992,-27,120.053909,31.255406
1,1990_1992,-26,120.094264,31.383617
2,1990_1992,-26,120.139539,31.381661
3,1990_1992,-25,120.008927,31.236220
4,1990_1992,-24,119.975689,31.223462
...,...,...,...,...
49977,2017_2019,28,118.155237,29.995881
49978,2017_2019,29,117.612038,29.674390
49979,2017_2019,29,118.401913,29.692320
49980,2017_2019,30,118.474074,30.210384
