In [2]:
import glob
import tifffile
import numpy as np
import lightgbm as lgb
import warnings
import tqdm
import pandas as pd
warnings.simplefilter('ignore')

In [3]:
# Folders holding the training images and their masks.
train_path = '../train/s2_image/'
mask_path = '../train/mask/'

# Sorted listings of both folders. The loading cell pairs them by position, so
# the two folders are assumed to sort into matching order (TODO confirm).
trains = sorted(glob.glob(f'{train_path}/*'))
masks = sorted(glob.glob(f'{mask_path}/*'))

In [4]:
# Flatten every (image, mask) pair into per-pixel rows:
#   X -> pixel features, 12 values per row
#   y -> mask value of each pixel
#   g -> index of the source image (used as the group id for GroupKFold)
if not trains:
    raise FileNotFoundError(f'no training images found under {train_path}')
if len(trains) != len(masks):
    # zip() would silently drop the unmatched tail and hide the problem.
    raise ValueError(
        f'found {len(trains)} training images but {len(masks)} masks; '
        'the two listings must pair up one-to-one'
    )

X = []
y = []
g = []

for i, (t, m) in enumerate(zip(trains, masks)):
    # reshape(-1, 12) assumes the last image axis holds the 12 bands (TODO confirm).
    pixels = tifffile.imread(t).astype(np.float64).reshape(-1, 12)
    labels = tifffile.imread(m).astype(np.float64).reshape(-1)
    if pixels.shape[0] != labels.shape[0]:
        # A size mismatch would otherwise misalign the rows of X and y.
        raise ValueError(
            f'{t} has {pixels.shape[0]} pixels but {m} has {labels.shape[0]} labels'
        )
    X.append(pixels)
    y.append(labels)
    # One group id per pixel, kept as float64 like the other arrays.
    g.append(np.full(labels.shape, i, dtype=np.float64))

X = np.vstack(X)
y = np.hstack(y)
g = np.hstack(g)

In [5]:
# Sanity check: (total number of pixel rows, features per row).
X.shape

(1133572, 12)

In [6]:
# Name the 12 raw feature columns so the ratio feature can be built by name.
band_names = ['B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B9', 'B11', 'B12']
df_x = pd.DataFrame(X, columns=band_names)

# Normalised difference of B11 and B12, stored under the name 'NDRE_1'.
# Rows where B11 + B12 == 0 come out as NaN/inf; nothing in this cell filters them.
band_diff = df_x['B11'] - df_x['B12']
band_sum = df_x['B11'] + df_x['B12']
df_x['NDRE_1'] = band_diff / band_sum

# Disabled experiment: dropping B8, B3, B11, B4, B2 and B5 before training.
# df_x = df_x.drop(columns=['B8', 'B3', 'B11', 'B4', 'B2', 'B5'], axis=1)
X = np.array(df_x)
print(X.shape)

(1133572, 13)


In [7]:
from catboost import CatBoostClassifier, Pool, cv

In [8]:
# CatBoost hyper-parameters for the classifier used in cross-validation.
# eval_metric is not set, so CatBoost's default applies (an 'F1' override was
# tried earlier and is currently disabled).
catboost_params = {
    'iterations': 8000,
    'task_type': "GPU",
    'verbose': 100,
    'learning_rate': 0.03,
    'devices': '0:1',
}
model = CatBoostClassifier(**catboost_params)

In [9]:
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold

# 4-fold cross-validation grouped by `g`, so all rows sharing a group id land
# on the same side of each train/validation split.
gkfold = GroupKFold(n_splits=4)

# One independently trained classifier per fold.
models = []

for i, (train_idx, valid_idx) in enumerate(gkfold.split(X, y, g)):
    train_x = X[train_idx]
    train_y = y[train_idx]

    val_x = X[valid_idx]
    val_y = y[valid_idx]

    # fit() trains the estimator in place and returns that same object. Fitting
    # the shared `model` on every fold would therefore leave `models` holding
    # four references to a single model trained on the last fold only. Build a
    # fresh estimator with the same hyper-parameters for each fold instead.
    fold_model = CatBoostClassifier(**model.get_params())
    fold_model.fit(train_x,
                   train_y, eval_set=(val_x, val_y),
                   verbose=1)

    models.append(fold_model)

0:	learn: 0.6279636	test: 0.6278055	best: 0.6278055 (0)	total: 8.78ms	remaining: 1m 10s
1:	learn: 0.5755120	test: 0.5747143	best: 0.5747143 (1)	total: 17.7ms	remaining: 1m 10s
2:	learn: 0.5232098	test: 0.5223628	best: 0.5223628 (2)	total: 26.1ms	remaining: 1m 9s
3:	learn: 0.4801480	test: 0.4789352	best: 0.4789352 (3)	total: 33.2ms	remaining: 1m 6s
4:	learn: 0.4430993	test: 0.4414157	best: 0.4414157 (4)	total: 41.7ms	remaining: 1m 6s
5:	learn: 0.4099987	test: 0.4078176	best: 0.4078176 (5)	total: 50.1ms	remaining: 1m 6s
6:	learn: 0.3800457	test: 0.3774802	best: 0.3774802 (6)	total: 57.8ms	remaining: 1m 5s
7:	learn: 0.3535676	test: 0.3506973	best: 0.3506973 (7)	total: 64.7ms	remaining: 1m 4s
8:	learn: 0.3300724	test: 0.3268787	best: 0.3268787 (8)	total: 72.5ms	remaining: 1m 4s
9:	learn: 0.3082891	test: 0.3048834	best: 0.3048834 (9)	total: 79.4ms	remaining: 1m 3s
10:	learn: 0.2900335	test: 0.2863500	best: 0.2863500 (10)	total: 86.8ms	remaining: 1m 3s
11:	learn: 0.2750357	test: 0.2711047	be

In [None]:
#0.083, 0.091, 0.0920, 0.090
#0.082, 

In [10]:
# Folders holding the evaluation images and the sample masks.
test_path = '../evaluation/'
test_mask_path = '../sample/'

# Sorted listings of both folders. The prediction cell pairs them by position,
# so the two folders are assumed to sort into matching order (TODO confirm).
# Note: `masks` is rebound here and no longer refers to the training masks.
tests = sorted(glob.glob(f'{test_path}/*'))
masks = sorted(glob.glob(f'{test_mask_path}/*'))

In [11]:
import os

# Make sure the folder that receives the predicted masks exists.
os.makedirs('output', exist_ok=True)

In [19]:
# Cut-off applied to the fold-averaged positive-class probability.
# The loop used to overwrite `pred` with `predict_proba(X) / len(models)` on
# every pass, so the old 0.09 cut-off was compared against a single model's
# probability divided by the 4 folds. Multiplying by 4 keeps the same decision
# boundary on the real probability; re-tune if the fold count or models change.
threshold = 0.09 * 4

pred_mask_list = []  # binary uint8 masks, one per evaluation image
proba_list = []      # averaged positive-class probability maps, one per image
for m, t in tqdm.tqdm(zip(masks, tests), total=min(len(masks), len(tests))):
    basename = os.path.basename(m)
    output_file = f'output/{basename}'

    img = tifffile.imread(t).astype(np.float64)
    # The sample mask only supplies the output height/width and file name.
    shape_mask = tifffile.imread(m).shape

    # Same features as in training: the 12 raw columns plus 'NDRE_1'.
    # Local names are used so the training `X`, `df_x` and `model` survive.
    feature_frame = pd.DataFrame(
        img.reshape(-1, 12),
        columns=['B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B9', 'B11', 'B12'],
    )
    feature_frame['NDRE_1'] = (
        (feature_frame['B11'] - feature_frame['B12'])
        / (feature_frame['B11'] + feature_frame['B12'])
    )
    features = np.array(feature_frame)

    # Average the class probabilities over all fold models.
    pred = np.mean([fold_model.predict_proba(features) for fold_model in models], axis=0)

    # Column 1 is the positive class; restore the mask layout (e.g. (23, 23)).
    pred_mask = pred[:, 1].reshape(shape_mask[0], shape_mask[1])
    proba_list.append(pred_mask)
    pred_mask = (pred_mask > threshold).astype(np.uint8)

    pred_mask_list.append(pred_mask)
    tifffile.imwrite(output_file, pred_mask)
    

2066it [00:22, 90.64it/s]


In [20]:
import zipfile

output_folder_path = 'output'
# Pack every file found under the output folder into the submission archive.
# Entries are stored under their bare file name, with no folder prefix.
with zipfile.ZipFile('output_zip_1025_ml_cat_4.zip', 'w') as zipf:
    for folder, _subdirs, filenames in os.walk(output_folder_path):
        for filename in filenames:
            source_path = os.path.join(folder, filename)
            zipf.write(source_path, filename)

In [None]:
import pickle

# Persist the per-image probability maps to the file "test" ...
with open("test", "wb") as sink:
    pickle.dump(proba_list, sink)
# ... and read them straight back into `b`.
with open("test", "rb") as source:
    b = pickle.load(source)

import matplotlib.pyplot as plt
# Preview the first probability map, multiplied by 4 before display.
first_map = np.array(proba_list[0])
plt.imshow(first_map * 4)
