In [None]:
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import numpy as np
import pandas as pd
import seaborn as sns
import datetime as dt
from pathlib import Path
import warnings
import os
import random
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler
from collections import deque

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides numeric RuntimeWarnings (e.g. all-NaN slices
# or divide-by-zero in the normalisation cells below) — confirm that is intended.
warnings.simplefilter('ignore')

In [None]:
def fix_all_seeds(seed):
    """Seed NumPy's global RNG and the stdlib ``random`` module with ``seed``.

    The seed is also written to the ``PYTHONHASHSEED`` environment variable.

    NOTE(review): ``PYTHONHASHSEED`` is normally read when the interpreter
    starts, so assigning it here presumably only affects child processes and
    not string hashing in the running kernel — confirm.
    """
    # Same order as before: NumPy first, then the stdlib generator.
    for seed_rng in (np.random.seed, random.seed):
        seed_rng(seed)
    os.environ["PYTHONHASHSEED"] = f"{seed}"


fix_all_seeds(0)

In [None]:
# When True, the aggregated feature frames are pickled at the end of the notebook.
SAVE_DF = True
# Directory the feature pickles are written to.
SAVE_DF_DIR = Path("/content/drive/MyDrive/Kaggle/BlueCarbon/proc/FE_20230417")

# データ読み込み

In [None]:
# Load the train and test frames from their pickles.
# NOTE(review): unpickling can execute arbitrary code; these files are
# presumably produced by an earlier, trusted preprocessing step — confirm.
train, test = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        "/content/drive/MyDrive/Kaggle/BlueCarbon/proc/train.pkl",
        "/content/drive/MyDrive/Kaggle/BlueCarbon/proc/test.pkl",
    )
)

# 特徴抽出

### Clip and Normalize

In [None]:
# Fraction clipped from each tail: every column is limited to the
# [low_lim, 1 - low_lim] quantile range of its *training* values.
low_lim = 0.02

# Landsat column names, selected by position.
# NOTE(review): assumes the Landsat features start at column 313 and that the
# last column is not a Landsat feature — confirm against the train schema.
cols_landsat = train.columns[313:-1]

# Collect one normalised row per column and stack once after the loop.
# Calling np.vstack inside the loop re-copies the whole accumulated array on
# every iteration, which is quadratic in the number of columns.
rows_train = []
rows_test = []
for col in tqdm(cols_landsat):
    col_train = train[col]
    col_test = test[col]

    # Clip bounds come from the training data only and are reused for test.
    lower = col_train.quantile(low_lim)
    upper = col_train.quantile(1 - low_lim)
    col_train = np.clip(col_train, lower, upper)
    col_test = np.clip(col_test, lower, upper)

    # NaN-aware statistics of the clipped training column; test is
    # standardised with the same mean/std.
    mean_train = np.nanmean(col_train)
    std_train = np.nanstd(col_train)

    rows_train.append((col_train - mean_train) / std_train)
    rows_test.append((col_test - mean_train) / std_train)

# Shape: (number of Landsat columns, number of rows in train / test).
landsat_norm_train = np.vstack(rows_train)
landsat_norm_test = np.vstack(rows_test)

landsat_norm_train.shape

  0%|          | 0/3150 [00:00<?, ?it/s]

(3150, 14140)

In [None]:
# The stacked arrays are (columns, rows); transpose them so each Landsat
# column becomes a DataFrame column again, labelled with its original name.
df_landsat_norm_train, df_landsat_norm_test = (
    pd.DataFrame(stacked.T, columns=cols_landsat)
    for stacked in (landsat_norm_train, landsat_norm_test)
)
df_landsat_norm_train

Unnamed: 0,MAX_Blue_2000,MAX_Green_2000,MAX_Red_2000,MAX_NIR_2000,MAX_SWIR1_2000,MAX_SWIR2_2000,MAX_TIRS1_2000,MAX_TIRS2_2000,MAX_TSAVI_2000,MAX_Alteration_2000,...,MIN_NLI_2020,MIN_NormG_2020,MIN_NormR_2020,MIN_PPR_2020,MIN_PSNDc2_2020,MIN_RDVI_2020,MIN_IF_2020,MIN_SLAVI_2020,MIN_SIPI2_2020,MIN_VARIgreen_2020
0,1.238857,0.705623,1.233627,0.264463,0.687586,1.118129,-0.004696,0.171091,0.242550,-0.449300,...,0.527802,0.118077,0.481797,-0.817520,0.181264,0.646705,0.040186,-0.235696,0.363263,0.343531
1,,,,,,,,,,,...,0.172206,-1.866172,-1.076442,1.862362,0.686274,-0.369408,0.844530,0.269415,-2.620580,0.503828
2,-0.916785,-0.537404,-0.978152,-0.631595,-0.684639,-0.837061,-0.931228,-0.965711,-1.377890,-0.478357,...,-0.937974,1.102768,0.259413,0.483606,-0.447797,-0.237235,0.377018,-0.441428,0.055005,-2.730476
3,,,,,,,,,,,...,-0.277354,2.051741,-1.228557,-0.310734,-0.544051,0.612809,0.260509,-0.510139,1.919950,0.158640
4,,,,,,,,,,,...,0.025151,2.172476,-2.163589,-1.456677,-0.841693,0.913016,0.069136,-0.519096,1.919950,0.387752
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14135,-0.309752,-0.587302,-0.431760,-0.038567,-0.067907,-0.076436,1.097760,1.007906,1.016264,-0.347834,...,1.947130,-1.242074,-0.344220,-0.725990,1.620315,1.282197,-0.053973,0.448418,-3.214474,-5.070567
14136,,,,,,,,,,,...,-0.941755,1.768773,-1.065837,0.907607,-0.831231,-1.071211,0.710213,-0.415702,-0.082688,-0.547066
14137,1.015533,0.459600,0.995199,0.176501,0.547977,1.114172,-0.776474,-0.861787,0.200167,-0.540294,...,-0.416193,1.531398,-0.800448,-0.887800,-0.437571,0.507627,0.031476,-0.495861,1.036631,-0.414670
14138,,,,,,,,,,,...,1.943519,-0.434808,1.882670,2.212303,2.246075,-1.716885,-5.381442,5.122770,-1.038454,0.497121


In [None]:
# Display the normalised test frame for a visual check.
df_landsat_norm_test

Unnamed: 0,MAX_Blue_2000,MAX_Green_2000,MAX_Red_2000,MAX_NIR_2000,MAX_SWIR1_2000,MAX_SWIR2_2000,MAX_TIRS1_2000,MAX_TIRS2_2000,MAX_TSAVI_2000,MAX_Alteration_2000,...,MIN_NLI_2020,MIN_NormG_2020,MIN_NormR_2020,MIN_PPR_2020,MIN_PSNDc2_2020,MIN_RDVI_2020,MIN_IF_2020,MIN_SLAVI_2020,MIN_SIPI2_2020,MIN_VARIgreen_2020
0,0.191871,-0.377705,0.252070,-0.089682,0.014489,0.151485,-2.135342,-2.166718,0.041714,-0.452123,...,2.363367,-1.244954,1.800069,2.212303,3.588233,-0.284307,0.714475,5.040695,-1.065250,0.489556
1,0.085993,0.048776,-0.560234,-0.357503,-0.518990,-0.773985,0.659405,0.756121,-0.172622,0.046137,...,-0.013220,0.571039,0.050087,-1.016342,-0.050399,0.775629,0.008996,-0.473460,0.742203,-3.973820
2,,,,,,,,,,,...,-0.332357,1.414979,-0.810705,-1.427170,-0.554540,0.950325,-0.047833,-0.568544,1.700352,0.362836
3,-0.406347,2.341615,2.423697,0.848576,1.326019,1.984733,0.122974,-0.045951,0.631859,-0.149379,...,-0.659216,-0.211606,-0.138415,-1.456415,-0.580179,0.250423,-4.394400,-0.553388,-1.151226,-0.337352
4,,,,,,,,,,,...,-0.339771,-0.808004,1.473510,-0.116846,-0.049004,-2.493579,0.113180,-0.329377,-0.955698,0.122796
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4034,,,,,,,,,,,...,-0.831237,0.054869,-0.490553,0.548552,-0.550006,-0.287923,0.484304,-0.452683,-0.104935,-1.113927
4035,0.305963,-0.295541,0.228749,-0.070319,0.068661,0.295770,0.272879,0.405485,0.132231,0.925778,...,0.018764,0.291887,0.443149,-0.605412,-0.170380,0.599422,0.089571,-0.362005,0.514109,0.359959
4036,,,,,,,,,,,...,-1.362090,0.605961,1.554227,1.520603,-0.313380,-1.573241,0.751496,-0.093781,-0.902666,0.512388
4037,0.735295,0.424112,0.423360,0.114860,0.035879,0.095213,0.674251,0.626276,0.522792,-0.139566,...,0.847866,-1.165502,1.045538,-0.787547,0.448482,-0.951882,-0.011527,-0.067512,-0.640553,0.364644


## 2000年から2020年までのデータまとめる

In [None]:
# Strip the 3-character statistic prefix and the 4-character year suffix from
# every Landsat column name (e.g. "MAX_Blue_2000" -> "_Blue_") and keep each
# band key once.
# dict.fromkeys de-duplicates while preserving first-seen order. A set's
# iteration order for strings depends on hash randomisation, so the previous
# list(set(...)) could order the keys (and therefore the saved feature
# columns) differently from one kernel session to the next.
cols_landsat_mod = list(dict.fromkeys(col[3:-4] for col in cols_landsat))
print(len(cols_landsat_mod))
cols_landsat_mod[:5]

50


['_TSAVI_', '_Chlgreen_', '_Cigreen_', '_NIR_', '_PSNDc2_']

In [None]:
def _aggregate_landsat_over_years(df_norm, band_keys):
    """Collapse the per-year Landsat columns of ``df_norm`` into per-band features.

    For every key in ``band_keys`` (e.g. ``"_NIR_"``) four columns are produced,
    in this order:

    * ``MED{key}median`` – row-wise median of the columns containing ``MED{key}``
    * ``MED{key}std``    – row-wise pandas ``std`` of the same columns
    * ``MINMAX{key}_mean`` – row-wise NaN-aware mean of (MAX - MIN)
    * ``MINMAX{key}_std``  – row-wise NaN-aware ``np.nanstd`` of (MAX - MIN)

    The MAX and MIN columns are subtracted positionally, so both groups must
    list their columns in the same order.
    NOTE(review): the two std features use different estimators (pandas
    ``std`` vs ``np.nanstd``); kept as-is so the feature values do not change.

    Parameters
    ----------
    df_norm : pandas.DataFrame
        Normalised Landsat frame whose column names embed the statistic
        prefix and the band key.
    band_keys : iterable of str
        Band keys to aggregate.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df_norm`` and four columns per band key.
    """
    columns = df_norm.columns
    features = {}
    for key in band_keys:
        # Literal (non-regex) substring match on the column names.
        med = df_norm.loc[:, columns.str.contains(f"MED{key}", regex=False)]
        features[f"MED{key}median"] = med.median(axis=1)
        features[f"MED{key}std"] = med.std(axis=1)

        max_vals = df_norm.loc[:, columns.str.contains(f"MAX{key}", regex=False)].values
        min_vals = df_norm.loc[:, columns.str.contains(f"MIN{key}", regex=False)].values
        spread = max_vals - min_vals

        # Output column names are kept exactly as before (including the
        # double underscore) because they are written to the saved pickles.
        features[f"MINMAX{key}_mean"] = np.nanmean(spread, axis=1)
        features[f"MINMAX{key}_std"] = np.nanstd(spread, axis=1)

    # Build the frame once instead of inserting columns one at a time.
    return pd.DataFrame(features, index=df_norm.index)


df_train_landsat = _aggregate_landsat_over_years(df_landsat_norm_train, cols_landsat_mod)
df_test_landsat = _aggregate_landsat_over_years(df_landsat_norm_test, cols_landsat_mod)

df_train_landsat

Unnamed: 0,MED_TSAVI_median,MED_TSAVI_std,MINMAX_TSAVI__mean,MINMAX_TSAVI__std,MED_Chlgreen_median,MED_Chlgreen_std,MINMAX_Chlgreen__mean,MINMAX_Chlgreen__std,MED_Cigreen_median,MED_Cigreen_std,...,MINMAX_MCARI2__mean,MINMAX_MCARI2__std,MED_NormR_median,MED_NormR_std,MINMAX_NormR__mean,MINMAX_NormR__std,MED_TIRS2_median,MED_TIRS2_std,MINMAX_TIRS2__mean,MINMAX_TIRS2__std
0,-0.264021,0.823646,-0.021288,1.024085,-0.002227,0.675996,0.040346,1.075656,-0.313951,0.929893,...,-0.006201,1.087738,0.222131,0.699859,-0.086431,1.010979,0.470200,0.632177,-0.219153,0.892955
1,0.702258,2.091681,0.678227,1.671567,-0.788013,1.243748,-0.432950,0.767470,1.692979,2.074727,...,-0.775479,0.251864,0.904064,1.205264,-0.222484,0.889441,-0.002017,0.795383,-0.719698,0.899809
2,-0.385988,1.206409,0.240489,1.225477,-0.182977,1.097435,0.003412,0.908989,-0.268344,1.283726,...,-0.222863,0.631919,0.577412,0.740036,-0.346585,0.713161,-0.283922,0.805219,-0.193737,1.388989
3,-0.351543,0.277463,-1.280552,0.362322,0.682482,0.593529,-1.509468,0.387104,-0.776386,0.259690,...,-1.483318,0.564553,-1.497466,0.694455,-0.937995,0.424149,-0.056761,0.946398,-0.946858,1.012781
4,-0.052152,0.425814,-0.905696,0.666561,0.561257,0.763137,-0.841724,1.018473,-0.653161,0.450328,...,-0.628876,1.382746,-1.824046,0.859363,0.079033,1.272757,-0.347286,0.710107,-1.102535,0.914676
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14135,2.264544,0.463507,0.092743,0.763524,-1.506323,0.297590,-0.291874,0.435703,1.941632,0.880250,...,-0.289405,0.373113,0.069610,0.526907,0.316190,0.750954,0.702377,0.655675,0.829005,1.313422
14136,-0.401058,1.158282,-0.096538,0.891352,0.422664,1.563723,-0.153280,1.200973,-0.499391,1.087028,...,-0.047921,1.239300,-0.400305,1.047133,0.048165,1.089812,-0.759813,1.391150,-0.660395,0.907341
14137,0.209582,0.638428,-0.612528,0.449728,-0.047015,0.798177,-0.410247,0.630817,-0.251746,0.596488,...,-0.150387,0.691455,-0.749524,0.885000,-0.011504,0.673395,0.392629,0.490484,-0.585708,0.736146
14138,0.135508,0.515852,-0.789388,0.778067,-0.881785,0.409206,-0.876818,0.361304,0.527070,0.626638,...,-0.478672,0.186568,1.736692,0.393014,-0.646045,0.216272,0.059529,1.139055,-0.055946,0.843535


In [None]:
if SAVE_DF:
    # Create the output directory first; to_pickle does not create missing
    # parent directories and would otherwise fail after all the computation.
    SAVE_DF_DIR.mkdir(parents=True, exist_ok=True)
    df_train_landsat.to_pickle(SAVE_DF_DIR / "20230429_train_landsat2_2000to2020.pkl")
    df_test_landsat.to_pickle(SAVE_DF_DIR / "20230429_test_landsat2_2000to2020.pkl")