In [2]:
import glob
import tifffile
import numpy as np
import lightgbm as lgb
import warnings
import tqdm
import pandas as pd
import matplotlib.pyplot as plt
warnings.simplefilter('ignore')

In [3]:
train_path =  '../train/s2_image/'
mask_path = '../train/mask/'

masks = glob.glob(f'{mask_path}/*')
trains = glob.glob(f'{train_path}/*')
masks.sort()
trains.sort()

In [4]:
X = []
y = []
g = []

for i, (t, m) in enumerate(zip(trains, masks)):
    img = tifffile.imread(t).astype(np.float64)
    mask = tifffile.imread(m).astype(np.float64)
    X.append(img.reshape(-1,12))
    y.append(mask.reshape(-1))
    g.append(np.ones_like(mask.reshape(-1))*i)
    
X = np.vstack(X)
y = np.hstack(y)
g = np.hstack(g) 

In [5]:
df_x = pd.DataFrame(X, columns=['B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B9', 'B11', 'B12'])
df_x['NDRE_1'] = (df_x['B11']-df_x['B12'])/(df_x['B11']+df_x['B12'])

# df_x = df_x.drop(columns=['B8', 'B3', 'B11', 'B4', 'B2', 'B5'], axis=1)
X = np.array(df_x)
print(X.shape)

(1133572, 13)


In [6]:
from catboost import CatBoostClassifier, Pool, cv
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold

gkfold = GroupKFold(n_splits=4)

models = []

for i, (train_idx, valid_idx) in enumerate(gkfold.split(X, y, g)):
    train_x = X[train_idx]
    train_y = y[train_idx]
    
    val_x = X[valid_idx]
    val_y = y[valid_idx]

    model = CatBoostClassifier(iterations=8000,
                                task_type="GPU",
                                verbose=100,
                                learning_rate=0.03,
                                # class_weights= class_weights,
                                #    eval_metric = 'F1',
                                devices='0:1')

    m = model.fit(train_x,
            train_y,eval_set=(val_x, val_y),
            verbose=1)
    
    models.append(m)

0:	learn: 0.6279633	test: 0.6278054	best: 0.6278054 (0)	total: 7.59ms	remaining: 1m
1:	learn: 0.5755003	test: 0.5747029	best: 0.5747029 (1)	total: 15.1ms	remaining: 1m
2:	learn: 0.5232007	test: 0.5223533	best: 0.5223533 (2)	total: 22.5ms	remaining: 1m
3:	learn: 0.4801398	test: 0.4789271	best: 0.4789271 (3)	total: 29.5ms	remaining: 59s
4:	learn: 0.4430921	test: 0.4414088	best: 0.4414088 (4)	total: 36.9ms	remaining: 59s
5:	learn: 0.4099928	test: 0.4078119	best: 0.4078119 (5)	total: 43.5ms	remaining: 57.9s
6:	learn: 0.3800406	test: 0.3774755	best: 0.3774755 (6)	total: 50.8ms	remaining: 58s
7:	learn: 0.3535631	test: 0.3506932	best: 0.3506932 (7)	total: 58.4ms	remaining: 58.3s
8:	learn: 0.3300684	test: 0.3268752	best: 0.3268752 (8)	total: 65.7ms	remaining: 58.3s
9:	learn: 0.3082858	test: 0.3048802	best: 0.3048802 (9)	total: 72.3ms	remaining: 57.8s
10:	learn: 0.2900311	test: 0.2863474	best: 0.2863474 (10)	total: 79.2ms	remaining: 57.5s
11:	learn: 0.2750335	test: 0.2711024	best: 0.2711024 (11

In [7]:
#0.083, 0.091, 0.0920, 0.090
#0.082, 

In [8]:
test_path =  '../evaluation/'
test_mask_path = '../sample/'

masks = glob.glob(f'{test_mask_path}/*')
tests = glob.glob(f'{test_path}/*')
masks.sort()
tests.sort()

In [9]:
import os
if not os.path.isdir('output'):
    os.mkdir('output')

In [11]:
# threshold = 0.09
threshold = 0.5

pred_mask_list=[]
proba_list=[]

for i, (m, t) in tqdm.tqdm(enumerate(zip(masks, tests))):
    basename = os.path.basename(m)
    output_file = f'output/{basename}'
    
    img = tifffile.imread(t).astype(np.float64)
    mask = tifffile.imread(m).astype(np.float64)

    test_X = img.reshape(-1, 12)
    #######################
    df_x = pd.DataFrame(test_X, columns=['B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B9', 'B11', 'B12'])
    df_x['NDRE_1'] = (df_x['B11']-df_x['B12'])/(df_x['B11']+df_x['B12'])
    test_X = np.array(df_x)
    #######################
    shape_mask = mask.shape
    
    pred = 0
    for model in models:
        pred += model.predict_proba(test_X) / len(models)

    pred_mask = pred[:,1].reshape(shape_mask[0], shape_mask[1]) #(23,23)
    proba_list.append(pred_mask)
    pred_mask = (pred_mask > threshold).astype(np.uint8)

    pred_mask_list.append(pred_mask)
    tifffile.imwrite(output_file, pred_mask)
    

2066it [00:23, 86.14it/s]


In [None]:
import zipfile

output_folder_path = 'output'
with zipfile.ZipFile('model_1025.zip', 'w') as zipf:
    for root, dirs, files in os.walk(output_folder_path):
        for file in files:
            zipf.write(os.path.join(root, file), file)

In [12]:
lgb_params = {    
    'boosting_type':'gbdt',
    'num_leaves':31,
    'max_depth':-1,
    'learning_rate' : 0.03,
    'n_estimators':5000,
    'random_state':136,
    }

from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold

gkfold = GroupKFold(n_splits=4)

models_lgb_list = []
model_lgb = lgb.LGBMClassifier(**lgb_params)

for i, (train_idx, valid_idx) in enumerate(gkfold.split(X, y, g)):
    train_x = X[train_idx]
    train_y = y[train_idx]
    
    val_x = X[valid_idx]
    val_y = y[valid_idx]
    
    

    model_lgb.fit(train_x,
            train_y,eval_set=(val_x, val_y)
            ,callbacks=[lgb.log_evaluation(200)])
    
    models_lgb_list.append(model_lgb)

[LightGBM] [Info] Number of positive: 81882, number of negative: 768528
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013329 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3315
[LightGBM] [Info] Number of data points in the train set: 850410, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.096285 -> initscore=-2.239198
[LightGBM] [Info] Start training from score -2.239198
[200]	valid_0's binary_logloss: 0.0916588
[400]	valid_0's binary_logloss: 0.088421
[600]	valid_0's binary_logloss: 0.087011
[800]	valid_0's binary_logloss: 0.0860149
[1000]	valid_0's binary_logloss: 0.0852253
[1200]	valid_0's binary_logloss: 0.0846262
[1400]	valid_0's binary_logloss: 0.0842206
[1600]	valid_0's binary_logloss: 0.0839047
[1800]	valid_0's binary_logloss: 0.083598
[2000]	valid_0's binary_logloss: 0.0833329
[2200]	valid_0's binary_logloss: 0.0830949
[2400]	valid_0's binary_logloss: 0.082

In [13]:
import os
if not os.path.isdir('output_lgb'):
    os.mkdir('output_lgb')


In [14]:
threshold = 0.5

pred_lgb_mask_list=[]
proba_lgb_list=[]

for i, (m, t) in tqdm.tqdm(enumerate(zip(masks, tests))):
    basename = os.path.basename(m)
    output_file = f'output_lgb/{basename}'
    
    img = tifffile.imread(t).astype(np.float64)
    mask = tifffile.imread(m).astype(np.float64)

    test_X = img.reshape(-1, 12)
    #######################
    df_x = pd.DataFrame(test_X, columns=['B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B9', 'B11', 'B12'])
    df_x['NDRE_1'] = (df_x['B11']-df_x['B12'])/(df_x['B11']+df_x['B12'])
    test_X = np.array(df_x)
    #######################
    shape_mask = mask.shape
    
    pred_lgb = 0
    for model_lgb in models_lgb_list:
        pred_lgb += model_lgb.predict_proba(test_X) / len(models_lgb_list)

    pred_mask_lgb = pred_lgb[:,1].reshape(shape_mask[0], shape_mask[1])
    proba_lgb_list.append(pred_mask_lgb)

    pred_mask_lgb = (pred_mask_lgb > threshold).astype(np.uint8)
    pred_lgb_mask_list.append(pred_mask_lgb)
    ####

    tifffile.imwrite(output_file, pred_mask_lgb)

    

2066it [02:54, 11.84it/s]


In [15]:
import zipfile

output_folder_path = 'output_lgb'
with zipfile.ZipFile('model_lgb_1025.zip', 'w') as zipf:
    for root, dirs, files in os.walk(output_folder_path):
        for file in files:
            zipf.write(os.path.join(root, file), file)

In [16]:
if not os.path.isdir('output_ensemble'):
    os.mkdir('output_ensemble')

In [25]:
temp =  (np.array(proba_list[0]) + np.array(proba_lgb_list[0])) /2

In [22]:
for i, (m, t) in tqdm.tqdm(enumerate(zip(masks, tests))):
    basename = os.path.basename(m)
    output_file = f'output_ensemble/{basename}'

    final_pred = (np.array(proba_list[i]) + np.array(proba_lgb_list[i])) /2
    final_pred_mask =(final_pred > 0.5).astype(np.uint8)
    tifffile.imwrite(output_file, final_pred_mask)


2066it [00:01, 1088.59it/s]


In [23]:
import zipfile

output_folder_path = 'output_ensemble'
with zipfile.ZipFile('model_ensemble_1025_last.zip', 'w') as zipf:
    for root, dirs, files in os.walk(output_folder_path):
        for file in files:
            zipf.write(os.path.join(root, file), file)