In [13]:
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import numpy as np
import pandas as pd
import seaborn as sns
import datetime as dt
from pathlib import Path
import warnings
import os
import random
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler
from collections import deque

warnings.simplefilter('ignore')

In [14]:
def fix_all_seeds(seed):
    np.random.seed(seed)
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    
fix_all_seeds(0)

In [15]:
SAVE_DF = True
SAVE_DF_DIR = Path("/content/drive/MyDrive/Kaggle/BlueCarbon/proc/FE_20230417")

# データ読み込み

In [16]:
train = pd.read_pickle("/content/drive/MyDrive/Kaggle/BlueCarbon/proc/train.pkl")
test = pd.read_pickle("/content/drive/MyDrive/Kaggle/BlueCarbon/proc/test.pkl")

# 特徴抽出

## あああ

In [17]:
# Landsatの列名抽出
cols_landsat = train.columns[313:-1]
cols_landsat_mod = list(set([col[3:-4] for col in cols_landsat]))
print(len(cols_landsat_mod))
cols_landsat_mod[:5]

50


['_CTVI_', '_PPR_', '_TIRS2_', '_EVI_', '_SIPI2_']

In [18]:
# clipのパーセンタイル
low_lim = 0.02

df_landsat_minmax_train = pd.DataFrame()
df_landsat_minmax_test = pd.DataFrame()
for col in tqdm(cols_landsat_mod):
    for year in range(2000, 2021):
        minmax_train = train[f"MAX{col}{year}"] - train[f"MIN{col}{year}"]
        minmax_test = test[f"MAX{col}{year}"] - test[f"MIN{col}{year}"]
        
        # clipping
        p01 = minmax_train.quantile(low_lim)
        p99 = minmax_train.quantile(1-low_lim)
        minmax_train = np.clip(minmax_train, p01, p99)
        minmax_test = np.clip(minmax_test, p01, p99)

        # いったん収納しておく
        if year == 2000:
            array_minmax_train = minmax_train
            array_minmax_test = minmax_test
        else:
            array_minmax_train = np.vstack((array_minmax_train, minmax_train))
            array_minmax_test = np.vstack((array_minmax_test, minmax_test))
    
    df_landsat_minmax_train[f"MINMAX{col}mean"] = np.nanmean(array_minmax_train, axis=0).T
    df_landsat_minmax_train[f"MINMAX{col}std"] = np.nanstd(array_minmax_train, axis=0).T
    df_landsat_minmax_test[f"MINMAX{col}mean"] = np.nanmean(array_minmax_test, axis=0).T
    df_landsat_minmax_test[f"MINMAX{col}std"] = np.nanstd(array_minmax_test, axis=0).T

print(df_landsat_minmax_train.shape)
print(df_landsat_minmax_test.shape)
df_landsat_minmax_train

  0%|          | 0/50 [00:00<?, ?it/s]

(14140, 100)
(4039, 100)


Unnamed: 0,MINMAX_CTVI_mean,MINMAX_CTVI_std,MINMAX_PPR_mean,MINMAX_PPR_std,MINMAX_TIRS2_mean,MINMAX_TIRS2_std,MINMAX_EVI_mean,MINMAX_EVI_std,MINMAX_SIPI2_mean,MINMAX_SIPI2_std,...,MINMAX_MCARI2_mean,MINMAX_MCARI2_std,MINMAX_BWDRVI_mean,MINMAX_BWDRVI_std,MINMAX_MCARI1_mean,MINMAX_MCARI1_std,MINMAX_SWIR2_mean,MINMAX_SWIR2_std,MINMAX_Ferric_Oxides_mean,MINMAX_Ferric_Oxides_std
0,0.134025,0.111810,0.080669,0.086960,0.339952,0.359248,0.042444,0.058448,1.817571,2.293036,...,0.973567,0.848198,0.028040,0.035760,11.012710,19.343378,1.181849,1.944656,0.056972,0.049894
1,0.035624,0.047987,0.046483,0.045414,0.318083,0.318512,0.187296,0.193427,4.727555,6.033570,...,0.258074,0.266281,0.045798,0.048732,21.390165,24.260161,0.475902,0.772107,0.048084,0.053776
2,0.110606,0.105105,0.063955,0.070680,0.568996,0.729613,0.049042,0.047594,1.311667,1.461022,...,0.768115,0.812829,0.030637,0.031524,11.053711,11.609394,1.522616,1.940104,0.053763,0.043699
3,0.148821,0.130655,0.045470,0.026575,0.388788,0.281831,0.011244,0.007940,0.882064,0.645597,...,1.188964,0.946789,0.006080,0.003360,10.688221,7.478360,0.226605,0.130852,0.061704,0.029714
4,0.297710,0.296254,0.068537,0.083459,0.320385,0.266144,0.030044,0.049183,2.199035,2.444870,...,1.877824,1.799169,0.017573,0.027399,17.219248,15.935239,0.923518,1.892338,0.069362,0.042049
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14135,0.140422,0.077910,0.105416,0.065849,0.737821,0.604875,0.057583,0.033938,6.292223,3.924704,...,0.674811,0.390088,0.043387,0.027852,9.746412,5.901077,1.230795,1.036235,0.066596,0.035719
14136,0.212546,0.223163,0.056019,0.046221,0.448990,0.448564,0.046487,0.048123,1.155556,0.951521,...,1.907602,1.737644,0.024809,0.029874,17.319141,18.417098,1.374111,2.036652,0.076402,0.057151
14137,0.200882,0.182917,0.055836,0.052780,0.364006,0.454103,0.024932,0.029184,1.253052,1.182426,...,1.474519,1.347642,0.017267,0.021393,9.592783,8.797777,0.720077,1.154106,0.053727,0.055083
14138,0.058338,0.040333,0.053886,0.063374,0.502219,0.405666,0.081421,0.057140,0.606873,0.545062,...,0.365177,0.268816,0.025086,0.015930,15.293949,10.400612,0.712352,0.617933,0.035995,0.029827


In [19]:
df_landsat_minmax_test

Unnamed: 0,MINMAX_CTVI_mean,MINMAX_CTVI_std,MINMAX_PPR_mean,MINMAX_PPR_std,MINMAX_TIRS2_mean,MINMAX_TIRS2_std,MINMAX_EVI_mean,MINMAX_EVI_std,MINMAX_SIPI2_mean,MINMAX_SIPI2_std,...,MINMAX_MCARI2_mean,MINMAX_MCARI2_std,MINMAX_BWDRVI_mean,MINMAX_BWDRVI_std,MINMAX_MCARI1_mean,MINMAX_MCARI1_std,MINMAX_SWIR2_mean,MINMAX_SWIR2_std,MINMAX_Ferric_Oxides_mean,MINMAX_Ferric_Oxides_std
0,0.072787,0.080870,0.044474,0.066272,0.454942,0.563355,0.128205,0.163944,0.834633,1.281859,...,0.782622,1.071780,0.038899,0.042891,15.854944,19.600599,0.784007,1.158674,0.046059,0.055227
1,0.197061,0.179977,0.081745,0.075347,0.774618,0.801628,0.047764,0.062688,1.543976,1.431317,...,1.410039,1.291747,0.032407,0.032757,19.833653,28.124037,1.527841,1.916612,0.056070,0.050574
2,0.189923,0.137192,0.081572,0.071685,0.819618,0.806008,0.047035,0.053124,2.490470,1.793133,...,1.315295,0.918028,0.027586,0.028267,13.078486,15.718500,1.203211,1.638407,0.068449,0.041700
3,0.201564,0.116781,0.264746,0.139802,0.847323,0.686603,0.193459,0.216763,4.398454,2.311915,...,1.323031,0.864513,0.052872,0.031814,19.012957,24.585742,1.817561,2.143595,0.097849,0.054854
4,0.107232,0.076184,0.055331,0.050499,0.614532,0.348140,0.092255,0.071378,0.728987,0.598661,...,0.984949,0.689745,0.037963,0.026961,19.137446,13.483389,1.677229,1.798152,0.093493,0.061266
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4034,0.149796,0.133851,0.052919,0.042010,0.652500,0.635344,0.037573,0.028046,0.998657,0.674379,...,1.373790,1.243960,0.024741,0.026347,11.337802,9.652782,0.969942,1.184428,0.073976,0.070706
4035,0.097304,0.105022,0.040799,0.049738,0.422139,0.396536,0.020893,0.028531,0.926408,0.968210,...,0.741593,0.745435,0.015763,0.022110,5.431432,6.567868,0.534014,0.969326,0.050759,0.037751
4036,0.107008,0.081886,0.061136,0.054778,0.760448,0.680436,0.117333,0.082338,2.233984,2.690919,...,0.790839,0.637402,0.046207,0.035405,19.021522,18.025514,1.220428,1.959582,0.069396,0.050937
4037,0.222550,0.127988,0.128875,0.070134,0.910488,0.829414,0.089457,0.078640,2.948019,1.698894,...,1.421396,0.946331,0.050107,0.035185,27.917286,25.214812,2.711211,2.498678,0.091023,0.052830


In [21]:
if SAVE_DF:
    df_landsat_minmax_train.to_pickle(SAVE_DF_DIR / "20230430_train_landsat4_minmax.pkl")
    df_landsat_minmax_test.to_pickle(SAVE_DF_DIR / "20230430_test_landsat4_minmax.pkl")