<a href="https://colab.research.google.com/github/surasakcho/A_A/blob/master/01_07_colab_ls8_split_pixel_values_raw_prov_cd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install libraries & mount Google Drive

In [1]:
# Mount Google Drive at /content/drive; the data paths defined later in this
# notebook are all rooted under "/content/drive/My Drive/".
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install the geospatial packages imported in the next cell.
# NOTE(review): versions are not pinned, so a re-run may pick up newer releases.
!pip install rasterio
!pip install geopandas




In [0]:
import numpy as np
import pandas as pd
import os, shutil
from glob import glob
import re
import datetime
from datetime import datetime
import random
from tqdm import tqdm_notebook
import pickle
import matplotlib.pyplot as plt
import geopandas as gpd
from pyproj import CRS
import rasterio
from rasterio import plot
from rasterio.transform import from_origin
from rasterio.warp import reproject, Resampling
from rasterio.mask import mask
from shapely.geometry import Polygon, mapping
from datetime import datetime, timedelta
%matplotlib inline

# Define functions

In [0]:

def list_files_re(rootpath, filename_re=None, folder_re=None ):
    '''
    Recursively list the files under rootpath whose name and containing
    folder both match the given regular expressions.

    rootpath : root path to lookup files (walked with os.walk)
    filename_re : regular expression searched (re.search semantics) in each
                  file name; None matches every file
    folder_re : regular expression searched (re.search semantics) in the path
                of the folder containing the file; None matches every folder

    return : a list of filepaths (folder joined with file name), in the order
             os.walk yields them
    '''
    # Resolve the defaults and compile the patterns once, rather than
    # re-evaluating them for every file visited.
    filename_pattern = re.compile('.*' if filename_re is None else filename_re)
    folder_pattern = re.compile('.*' if folder_re is None else folder_re)

    list_files = []
    for folder, _, files in os.walk(rootpath):
        # The folder test gives the same answer for every file in this folder,
        # so decide it once and skip the whole folder when it does not match.
        if folder_pattern.search(folder) is None:
            continue
        for file in files:
            if filename_pattern.search(file) is not None:
                list_files.append(os.path.join(folder, file))

    return list_files
        
        
    

In [0]:

def read_pickle(in_pickle_path):
    '''
    Read a pickle file.

    Parameters
    ----------------------------------
    in_pickle_path : path of pickle to be read

    Return
    ----------------------------------
    An object / variable of read pickle file

    Raises
    ----------------------------------
    Whatever open() or pickle.load() raise (e.g. FileNotFoundError,
    pickle.UnpicklingError); the file is closed in every case.
    '''
    # NOTE: unpickling can execute arbitrary code - only load files you trust.
    # The context manager closes the handle even when pickle.load() fails.
    with open(in_pickle_path, 'rb') as pkl_file:
        return pickle.load(pkl_file)

# Define paths & parameters

In [0]:
# Step, in days, between consecutive date columns when the split loop rebuilds
# the complete date axis (presumably Landsat 8's 16-day revisit cycle - confirm).
CYCLE_DAYS = 16


# Root of the mounted Google Drive.
my_drive_path = "/content/drive/My Drive/"

# Working folder that holds every input and output of this notebook.
landsat_folder = os.path.join(my_drive_path, '!landsat8-prep')

# Inputs: per-province and per-scene tambon pixel maps (parquet files read in the split loop).
pixel_rowcol_map_province_folder = os.path.join(landsat_folder, 'pixel_rowcol_map_province')
pixel_rowcol_map_tambon_folder = os.path.join(landsat_folder, 'pixel_rowcol_map_tambon')

# Input: raw pixel values per scene and year; output: pixel values split by province and scene.
pixel_values_raw_folder = os.path.join(landsat_folder, 'pixel_values_raw')
pixel_values_raw_prov_folder = os.path.join(landsat_folder, 'pixel_values_raw_prov')
# Create the output folder up front; a pre-existing folder is not an error.
os.makedirs(pixel_values_raw_prov_folder, exist_ok=True)

# Load data

In [0]:
# Province list; the split loop iterates over its PROVINCE_CODE column.
df_province_cd = pd.read_csv(os.path.join(landsat_folder, 'DOAE_PROVINCE.csv'))

# Shuffle the rows (unseeded, so the order differs on every run) - presumably so
# that sessions running in parallel start on different provinces; confirm.
df_province_cd = df_province_cd.sample(frac=1.0)


# Split pixel values by prov_cd

In [0]:
# Number of attempts made to read one yearly raw parquet file before giving up.
MAX_READ_ATTEMPTS = 10

# For every province and every scene mapped to it, join the yearly raw pixel
# values onto the province pixel map, attach the tambon key, rebuild a regular
# date axis and write one parquet file per (province, scene). Existing output
# files are skipped so that several sessions can work on the same folder.
for prov_cd in tqdm_notebook(df_province_cd['PROVINCE_CODE']):

    print(f'Processing : p{prov_cd}')

    #get list of province row col map of prov_cd
    # Raw f-string: "\d" must reach the regex engine as-is, and the dot before
    # the extension is escaped so it only matches a literal ".".
    list_pixel_rowcol_map_province_path = list_files_re(pixel_rowcol_map_province_folder, rf'df_pixel_map_p{prov_cd}_\d*\.parquet')
    #shuffle for multi-session processing
    random.shuffle(list_pixel_rowcol_map_province_path)

    for pixel_rowcol_map_province_path in list_pixel_rowcol_map_province_path:
        #iterate each scene in prov_cd
        # scene_id = last 6 characters of the file name without its extension.
        # Work on the base name so a dot in a directory name cannot break the parsing.
        scene_id = os.path.splitext(os.path.basename(pixel_rowcol_map_province_path))[0][-6:]

        out_path = os.path.join(pixel_values_raw_prov_folder, f'df_pixel_values_raw_p{prov_cd}_{scene_id}.parquet')
        #check whether file already exists
        if os.path.exists(out_path):
            continue

        print(f'Loading scene_id : {scene_id}')
        df_pixel_values_prov_scene = pd.read_parquet(pixel_rowcol_map_province_path).set_index(['scene_id', 'row', 'col'])

        #Merge all years into one dataframe
        list_pixel_values_raw_path = list_files_re(pixel_values_raw_folder, rf'df_pixel_values_{scene_id}_(2015|2016|2017|2018|2019)\.parquet')

        # Without any yearly file there are no date columns, and the date-axis
        # step below would fail; skip the scene with a message instead.
        if not list_pixel_values_raw_path:
            print(f'No raw pixel values found for scene_id : {scene_id}, skipping')
            del(df_pixel_values_prov_scene)
            continue

        continue_next_scene = False

        for pixel_values_raw_path in list_pixel_values_raw_path:
            #iterate each year
            # year = last 4 characters of the file name without its extension.
            year = os.path.splitext(os.path.basename(pixel_values_raw_path))[0][-4:]

            #If current scene is created by other session, skip it.
            if os.path.exists(out_path):
                continue_next_scene = True
                break
            else:
                print(f'Loading year : {year}')

            #join another year to existing df
            # Retry the read a few times; if every attempt fails, stop with an
            # explicit error instead of running on with an undefined df_tmp.
            df_tmp = None
            last_error = None
            for n in range(MAX_READ_ATTEMPTS):
                try:
                    df_tmp = pd.read_parquet(pixel_values_raw_path).set_index(['scene_id', 'row', 'col'])
                    break
                except Exception as err:
                    # "Exception" (not a bare except) so Ctrl-C / kernel interrupt still works.
                    last_error = err
                    print(f'Read failed (attempt-{n+1}) : {pixel_values_raw_path}')
            if df_tmp is None:
                raise RuntimeError(f'Could not read {pixel_values_raw_path} after {MAX_READ_ATTEMPTS} attempts') from last_error

            df_tmp['zeros'] = np.where(df_tmp.values == 0, 1, 0).sum(axis=1)
            # NOTE(review): nbr_columns is taken after 'zeros' was added, so the
            # denominator is one larger than the number of value columns (the final
            # filter below counts columns before adding helpers). Left unchanged to
            # keep results consistent with files already produced - confirm intent.
            nbr_columns = len(df_tmp.columns)
            df_tmp['pct_zeros'] = (df_tmp['zeros'].values) / nbr_columns
            #remove pixel with > 20% zeros or missing
            df_tmp = df_tmp[df_tmp['pct_zeros'] <= 0.2]
            df_tmp = df_tmp.drop(columns=['zeros', 'pct_zeros'])

            #print(f'Joining year : {year}')
            # Inner join on (scene_id, row, col): only pixels present in the
            # province map and in every year loaded so far are kept.
            df_pixel_values_prov_scene = df_pixel_values_prov_scene.merge(df_tmp, how='inner', left_index=True, right_index=True)
            del(df_tmp)

        if continue_next_scene:
            del(df_pixel_values_prov_scene)
            continue

        print('Joining tambon_key')
        if len(df_pixel_values_prov_scene) > 0:
            df_tambon_scene_rowcol_map = pd.read_parquet(os.path.join(pixel_rowcol_map_tambon_folder, f'df_pixel_map_tambon_{scene_id}.parquet')).set_index(['scene_id', 'row', 'col'])
            # Restrict the tambon map to the pixels that survived the joins above.
            df_tambon_scene_rowcol_map = df_tambon_scene_rowcol_map.copy().loc[df_pixel_values_prov_scene.index]

            df_pixel_values_prov_scene = df_pixel_values_prov_scene.merge(df_tambon_scene_rowcol_map, how='inner', left_index=True, right_index=True)
            del(df_tambon_scene_rowcol_map)
        else:
            # No pixel left: still create the column so the set_index below works.
            df_pixel_values_prov_scene['tambon_key'] = np.nan

        df_pixel_values_prov_scene = df_pixel_values_prov_scene.reset_index().set_index(['scene_id', 'row', 'col', 'prov_cd', 'tambon_key'], drop=True)
        # Keep only the last 8 characters of each column name; they are parsed as %Y%m%d below.
        df_pixel_values_prov_scene.columns = [col[-8:] for col in df_pixel_values_prov_scene.columns]

        start_date = datetime.strptime(min(df_pixel_values_prov_scene.columns), '%Y%m%d')
        last_date = datetime.strptime(max(df_pixel_values_prov_scene.columns), '%Y%m%d')

        list_complete_dates = []

        # Regular axis starting at start_date with a CYCLE_DAYS step.
        # NOTE(review): range() stops before last_date, so the latest date column is
        # dropped by the reindex below. This looks like an off-by-one, but it is left
        # unchanged to stay consistent with files already produced - confirm intent.
        for i in range(int((last_date - start_date).days / CYCLE_DAYS)):
            list_complete_dates.append(datetime.strftime(start_date + timedelta(i * CYCLE_DAYS), '%Y%m%d'))

        # Dates with no matching column become all-NaN columns; columns off the axis are dropped.
        df_pixel_values_prov_scene= df_pixel_values_prov_scene.reindex(columns=list_complete_dates)

        nbr_columns = len(df_pixel_values_prov_scene.columns)

        df_pixel_values_prov_scene['zeros'] = np.where(df_pixel_values_prov_scene.values == 0, 1, 0).sum(axis=1)
        df_pixel_values_prov_scene['nans'] = np.where(np.isnan(df_pixel_values_prov_scene.values), 1, 0).sum(axis=1)
        df_pixel_values_prov_scene['pct_missing'] = (df_pixel_values_prov_scene['zeros'].values + df_pixel_values_prov_scene['nans'].values) / nbr_columns
        #remove pixel with > 20% zeros or missing
        # NOTE(review): the helper columns 'zeros', 'nans' and 'pct_missing' are kept
        # and therefore written to the output file - confirm this is wanted.
        df_pixel_values_prov_scene = df_pixel_values_prov_scene[df_pixel_values_prov_scene['pct_missing'] <= 0.2]

        #recheck whether there is the same file created between processing
        # "not", not "~": ~True is -2 and ~False is -1, both truthy, so the
        # bitwise form always wrote the file and the recheck never took effect.
        if not os.path.exists(out_path):
            df_pixel_values_prov_scene.to_parquet(out_path)

        print(f'scene_id : {scene_id} done')
        print()
        del(df_pixel_values_prov_scene)

    print('############')
    print()

HBox(children=(IntProgress(value=0, max=77), HTML(value='')))

Processing : p66
Loading scene_id : 130049
Loading year : 2015
Loading year : 2016
Loading year : 2017


# Test

In [9]:
# Scratch cell: snapshot of the split loop's working frame. The loop deletes
# df_pixel_values_prov_scene at the end of every scene, so this raises NameError
# unless the loop was interrupted in the middle of a scene.
df_test = df_pixel_values_prov_scene.copy()

NameError: ignored

In [0]:
# Scratch cell: display the tambon map left over from the split loop. The loop
# deletes this variable right after merging it, so it may be undefined here.
df_tambon_scene_rowcol_map

In [0]:
# Scratch cell: attach tambon_key by index alignment and display the result.
# Depends on df_test and df_tambon_scene_rowcol_map from the scratch cells above.
df_test['tambon_key'] = df_tambon_scene_rowcol_map['tambon_key']
df_test


In [0]:
# Scratch cell: display df_test again (defined only if the first scratch cell succeeded).
df_test