In [10]:
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import numpy as np
import pandas as pd
import seaborn as sns
import datetime as dt
from pathlib import Path
import warnings
import os
import random
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler
from collections import deque

warnings.simplefilter('ignore')

In [2]:
def fix_all_seeds(seed):
    np.random.seed(seed)
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    
fix_all_seeds(0)

In [19]:
SAVE_DF = True
SAVE_DF_DIR = Path("/content/drive/MyDrive/Kaggle/BlueCarbon/proc/FE_20230417")

# データ読み込み

In [3]:
train = pd.read_pickle("/content/drive/MyDrive/Kaggle/BlueCarbon/proc/train.pkl")
test = pd.read_pickle("/content/drive/MyDrive/Kaggle/BlueCarbon/proc/test.pkl")

# 特徴抽出

## 2000年から2020年までのデータまとめる

In [12]:
# Landsatの列名抽出
# cols_landsat = train.columns[313:-1].str.contains("Blue")
cols_landsat = train.columns[313:-1]
cols_landsat_mod_dq = deque()
for col in cols_landsat:
    cols_landsat_mod_dq.append(col[:-4])
cols_landsat_mod = list(set(cols_landsat_mod_dq))
print(len(cols_landsat_mod))
cols_landsat_mod[:5]

150


['MAX_BWDRVI_', 'MED_GLI_', 'MED_DSWI_5_', 'MIN_RDVI_', 'MAX_IR550_']

In [18]:
df_train_landsat = pd.DataFrame()
df_test_landsat = pd.DataFrame()
for col in cols_landsat_mod:
    if "MED" in col:
        df_train_landsat[f"{col}_mean"] = train.loc[:, train.columns.str.contains(col)].mean(axis=1)
        df_test_landsat[f"{col}_mean"] = test.loc[:, test.columns.str.contains(col)].mean(axis=1)
    else:
        df_train_landsat[f"{col}_median"] = train.loc[:, train.columns.str.contains(col)].median(axis=1)
        df_test_landsat[f"{col}_median"] = test.loc[:, test.columns.str.contains(col)].median(axis=1)

df_train_landsat

Unnamed: 0,MAX_BWDRVI__median,MED_GLI__mean,MED_DSWI_5__mean,MIN_RDVI__median,MAX_IR550__median,MIN_DSWI__median,MAX_NormR__median,MAX_Cirededge__median,MAX_PPR__median,MAX_IF__median,...,MAX_D678_500__median,MIN_Fe3__median,MAX_Chlgreen__median,MED_CI__mean,MED_Cigreen__mean,MED_NormG__mean,MAX_Alteration__median,MAX_CVI__median,MED_GVMI__mean,MED_H__mean
0,-0.959719,-0.046951,-1.481308,-2.081978,0.038897,0.368567,0.309828,-0.473129,-0.200044,2.612512,...,-34.231005,0.464154,5.596622,-2.412849,-0.765163,0.578941,4.325302,0.177657,0.931891,1.465471
1,-0.898490,0.036395,-0.818406,-3.282602,0.023128,0.870851,0.321514,0.075726,-0.062271,7.475000,...,-26.482969,0.589454,5.961674,-0.998966,-0.499455,0.485871,6.066018,0.463270,0.965646,1.502791
2,-0.943603,0.019099,-1.299259,-2.561742,0.025912,0.654901,0.317555,-0.395339,-0.118680,3.767862,...,-36.248434,0.507211,4.979018,-1.520350,-0.713419,0.551065,4.422743,0.238979,0.937107,1.466764
3,-0.977513,0.001191,-2.072684,-2.349705,0.028352,0.383208,0.271770,-0.574975,-0.198460,3.413092,...,-46.433923,0.367477,7.717710,-3.061309,-0.847304,0.646592,4.864719,0.073679,0.941980,1.553396
4,-0.979579,-0.043618,-2.185394,-2.090157,0.027283,-0.023088,0.262953,-0.525219,-0.273007,2.708534,...,-49.537252,0.314573,8.660221,-3.953649,-0.833108,0.648515,4.636998,0.079681,0.945385,1.557802
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14135,-0.924761,-0.047299,-0.840780,-1.512795,0.034739,0.523504,0.312128,-0.098848,-0.176121,2.615508,...,-32.848623,0.497847,3.303192,-1.768460,-0.540018,0.490379,5.076734,0.364744,0.933152,1.545935
14136,-0.946709,0.040333,-1.659805,-3.057790,0.022020,0.538428,0.317451,-0.453606,-0.108976,4.598316,...,-45.381822,0.375252,10.061944,-1.856695,-0.782972,0.595485,4.500612,0.212451,0.932744,1.549193
14137,-0.965645,-0.014158,-1.651801,-2.204425,0.027754,0.434924,0.289903,-0.488275,-0.206824,3.317007,...,-42.120060,0.399249,6.730815,-2.491551,-0.787046,0.598702,4.287888,0.136929,0.930613,1.553506
14138,-0.933280,0.048111,-1.020379,-4.449991,0.011872,0.332225,0.352655,-0.508030,-0.013804,19.070181,...,-31.877151,0.601503,4.434409,-0.767629,-0.705402,0.512771,6.177199,0.250720,0.950823,1.268756


In [20]:
if SAVE_DF:
    df_train_landsat.to_pickle(SAVE_DF_DIR / "20230428_train_landsat_2000to2020.pkl")
    df_test_landsat.to_pickle(SAVE_DF_DIR / "20230428_test_landsat_2000to2020.pkl")

## MinMax Scaling

In [24]:
df_train_landsat_minmax = pd.DataFrame()
df_test_landsat_minmax = pd.DataFrame()
cols_landsat_indicator = list(set([col[4:-1] for col in cols_landsat_mod]))
for col in cols_landsat_indicator:
    df_train_landsat_minmax[f"MINMAX_{col}"] = (df_train_landsat[f"MAX_{col}__median"] - train[col]) / (df_train_landsat[f"MAX_{col}__median"] - df_train_landsat[f"MIN_{col}__median"])
    df_test_landsat_minmax[f"MINMAX_{col}"] = (df_test_landsat[f"MAX_{col}__median"] - test[col]) / (df_test_landsat[f"MAX_{col}__median"] - df_test_landsat[f"MIN_{col}__median"])

df_train_landsat_minmax

Unnamed: 0,MINMAX_MVI,MINMAX_Chlred_edge,MINMAX_NLI,MINMAX_IF,MINMAX_GEMI,MINMAX_Chlgreen,MINMAX_CVI,MINMAX_DSWI_5,MINMAX_TSAVI,MINMAX_PSNDc2,...,MINMAX_EVI,MINMAX_GVMI,MINMAX_RDVI,MINMAX_PPR,MINMAX_SIPI2,MINMAX_Cigreen,MINMAX_D678_500,MINMAX_DVIMSS,MINMAX_Green,MINMAX_Alteration
0,-0.498732,-0.031920,1.075848,1.225176,0.018427,0.065996,0.911240,0.839658,1.035656,1.000000,...,0.558870,0.176156,0.345818,1.035576,0.350692,0.960665,-0.223362,1.000000,1.052035,3.302330
1,0.698608,1.080161,-0.089137,-1.235533,0.525556,1.083810,-0.574570,-0.393703,-0.195017,-0.575291,...,2.011747,0.262679,-0.418989,-0.841783,0.968533,-0.482458,-0.112053,-1.535718,-0.357916,-0.125750
2,-0.589716,0.241959,0.929373,1.032613,-0.132978,0.234190,0.861605,0.799017,0.805075,0.866200,...,0.828010,-0.588490,0.120361,1.229726,0.187842,0.861691,-0.738389,0.900105,1.188305,-2.167550
3,,,,,,,,,,,...,,,,,,,,,,
4,-1.765885,0.822871,0.791095,0.858119,-0.886130,0.583884,0.730754,0.850581,0.193386,0.619524,...,1.450578,-0.663438,-2.234468,0.894568,-0.647034,0.531371,-0.053657,0.410993,1.393185,-2.763286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14135,0.395220,0.906247,0.480544,0.570060,-0.031857,0.690461,0.692777,0.591066,0.116087,0.581328,...,1.093502,-0.044403,0.008718,0.831227,-1.234763,0.448413,0.130258,0.694313,1.007949,-0.850034
14136,0.613476,0.409877,0.480043,-0.294568,1.117685,0.431783,0.821281,0.553034,0.728353,0.753598,...,0.225930,0.449560,1.694484,0.057133,0.851927,0.818559,0.619002,1.045189,0.315951,0.829107
14137,,,,,,,,,,,...,,,,,,,,,,
14138,,,,,,,,,,,...,,,,,,,,,,


In [23]:
df_train_landsat_minmax

Unnamed: 0,MINMAX_MVI,MINMAX_NLI,MINMAX_IF,MINMAX_GEMI,MINMAX_Chlgreen,MINMAX_CVI,MINMAX_TSAVI,MINMAX_PSNDc2,MINMAX_TIRS1,MINMAX_Gossan,MINMAX_mCRIG,MINMAX_MCARI1,MINMAX_SLAVI,MINMAX_IR550,MINMAX_GLI,MINMAX_CTVI,MINMAX_DSWI,MINMAX_NormR,MINMAX_Cirededge,MINMAX_NIR
0,-0.498732,1.075848,1.225176,0.018427,0.065996,0.911240,1.035656,1.000000,1.647973,1.121101,0.354796,1.114975,0.996289,-0.105476,1.209264,0.623091,-2.130664,0.375551,1.022163,1.000000
1,0.698608,-0.089137,-1.235533,0.525556,1.083810,-0.574570,-0.195017,-0.575291,0.540794,-0.620475,0.258118,-2.022961,-6.211305,1.212433,0.005991,-0.466332,-7.345806,1.348359,-0.385230,-2.242454
2,-0.589716,0.929373,1.032613,-0.132978,0.234190,0.861605,0.805075,0.866200,1.094944,0.955602,0.595199,0.860152,0.996759,-0.323631,1.045710,0.747430,-13.654279,0.740797,0.831071,0.991347
3,,,,,,,,,,,,,,,,,,,,
4,-1.765885,0.791095,0.858119,-0.886130,0.583884,0.730754,0.193386,0.619524,1.309486,1.263467,0.662397,0.733488,1.140726,-0.673618,0.842392,0.885495,-2.158023,0.971586,0.220573,1.031365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14135,0.395220,0.480544,0.570060,-0.031857,0.690461,0.692777,0.116087,0.581328,1.336709,0.513477,0.901465,0.154937,0.978129,-0.014592,0.590517,0.899343,-4.026987,1.082598,0.138837,0.898466
14136,0.613476,0.480043,-0.294568,1.117685,0.431783,0.821281,0.728353,0.753598,0.817270,0.825930,0.104435,0.528651,0.575819,0.782368,0.081353,0.436093,-14.190859,0.346558,0.776134,0.684001
14137,,,,,,,,,,,,,,,,,,,,
14138,,,,,,,,,,,,,,,,,,,,


In [25]:
df_train_landsat_minmax.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14140 entries, 0 to 14139
Data columns (total 50 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   MINMAX_MVI                10524 non-null  float64
 1   MINMAX_Chlred_edge        10524 non-null  float64
 2   MINMAX_NLI                10524 non-null  float64
 3   MINMAX_IF                 10535 non-null  float64
 4   MINMAX_GEMI               10524 non-null  float64
 5   MINMAX_Chlgreen           10524 non-null  float64
 6   MINMAX_CVI                10524 non-null  float64
 7   MINMAX_DSWI_5             10524 non-null  float64
 8   MINMAX_TSAVI              10524 non-null  float64
 9   MINMAX_PSNDc2             10524 non-null  float64
 10  MINMAX_TIRS1              10558 non-null  float64
 11  MINMAX_Gossan             10535 non-null  float64
 12  MINMAX_mCRIG              10524 non-null  float64
 13  MINMAX_MCARI1             10524 non-null  float64
 14  MINMAX

In [28]:
if SAVE_DF:
    df_train_landsat_minmax.to_pickle(SAVE_DF_DIR / "20230428_train_landsat_minmax.pkl")
    df_test_landsat_minmax.to_pickle(SAVE_DF_DIR / "20230428_test_landsat_minmax.pkl")