<a href="https://colab.research.google.com/github/yuriao/API/blob/master/solafune_solarPanelDetection_LGBM_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import glob
import os
import warnings

import lightgbm as lgb
import numpy as np
import optuna
import tifffile
import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold, train_test_split
from sklearn.model_selection import GroupShuffleSplit
warnings.simplefilter('ignore')

### 1. read data

In [None]:
# Folders holding the training images and their label masks.
train_path = 'D:/D_drive_proj/DS_projects/solafune_solarPanelDetection/train/s2_image'
mask_path = 'D:/D_drive_proj/DS_projects/solafune_solarPanelDetection/train/mask'

# Both listings are sorted; the loading cell pairs them positionally
# (i-th image with i-th mask), so the two folders are presumably named in parallel.
trains = sorted(glob.glob(train_path + '/*'))
masks = sorted(glob.glob(mask_path + '/*'))

In [None]:
# Build the pixel-level training set:
#   X - one row per pixel, 12 values per row (the image is flattened with reshape(-1, 12))
#   y - the flattened mask value for each pixel
#   g - index of the source image for each pixel, used later as the GroupKFold group
# The pairing of images and masks is purely positional, so mismatched listings would
# silently attach masks to the wrong images; fail loudly instead.
if len(trains) != len(masks):
    raise ValueError(
        f'found {len(trains)} training images but {len(masks)} masks; '
        'the two folders must pair up one-to-one'
    )

X_parts = []
y_parts = []
g_parts = []

for i, (t, m) in enumerate(zip(trains, masks)):
    img = tifffile.imread(t).astype(float)
    mask = tifffile.imread(m).astype(float)

    pixels = img.reshape(-1, 12)
    labels = mask.reshape(-1)
    # Every pixel row needs exactly one label, otherwise X and y drift out of alignment.
    if pixels.shape[0] != labels.shape[0]:
        raise ValueError(
            f'pixel count mismatch: {t} has {pixels.shape[0]} pixels '
            f'but {m} has {labels.shape[0]} labels'
        )

    X_parts.append(pixels)
    y_parts.append(labels)
    # Same dtype as the labels, matching what ones_like(mask) * i produced before.
    g_parts.append(np.full(labels.shape, i, dtype=labels.dtype))

X = np.vstack(X_parts)
y = np.hstack(y_parts)
g = np.hstack(g_parts)

### 2. EDA: check for NaNs in the data

In [None]:
# Count NaN entries in the feature matrix and in the label vector.
nan_in_image = np.isnan(X).sum()
nan_in_mask = np.isnan(y).sum()
print(f'# of nans in image = {nan_in_image}')
print(f'# of nans in mask = {nan_in_mask}')

# of nans in image = 0
# of nans in mask = 0


### 3. training and testing utility functions

In [None]:
def GroupKFold_model_training(X, y, g, lgb_params, n_splits=4, early_stopping_rounds=None):
    """Train one LightGBM classifier per GroupKFold fold.

    Each fold is fitted on its training indices and monitored (logloss) on the
    held-out indices of the same fold; the groups in ``g`` decide which rows are
    kept together by ``GroupKFold``.

    Parameters
    ----------
    X, y, g : indexable by integer arrays (``X[train_idx]``), e.g. numpy arrays
        Features, labels and group ids, all with the same number of rows.
    lgb_params : dict
        Keyword arguments forwarded to ``lgb.LGBMClassifier``.
    n_splits : int, default 4
        Number of GroupKFold folds, i.e. number of models returned.
    early_stopping_rounds : int or None, default None
        When given, an ``lgb.early_stopping`` callback with this patience is
        added to every fit, using the fold's validation split. ``None`` keeps
        the previous behaviour of always training ``n_estimators`` rounds.

    Returns
    -------
    list
        The fitted ``LGBMClassifier`` instances, one per fold, in fold order.
    """
    splitter = GroupKFold(n_splits=n_splits)
    models = []
    for train_idx, valid_idx in splitter.split(X, y, g):
        # Fresh callback objects per fold so no state is shared between fits.
        callbacks = [lgb.log_evaluation(100)]
        if early_stopping_rounds is not None:
            callbacks.append(lgb.early_stopping(early_stopping_rounds))

        m = lgb.LGBMClassifier(**lgb_params)
        m.fit(
            X[train_idx], y[train_idx],
            eval_metric='logloss',
            eval_set=[(X[valid_idx], y[valid_idx])],
            callbacks=callbacks,
        )
        models.append(m)

    return models

In [None]:
def GroupKFold_model_test(X,models):
    """Average the fold models' class probabilities and return hard 0/1 labels.

    Parameters
    ----------
    X : array-like accepted by each model's ``predict_proba``
        Rows to classify.
    models : sequence
        Fitted models exposing ``predict_proba(X)`` that returns an
        ``(n_rows, 2)`` numpy array.

    Returns
    -------
    list of int
        One label per row: 0 when the averaged class-0 probability is strictly
        greater than the class-1 probability, otherwise 1 (ties go to 1).

    Raises
    ------
    ValueError
        If ``models`` is empty (there is nothing to average).
    """
    if len(models) == 0:
        raise ValueError('models must contain at least one fitted model')

    # Same accumulation order as a running mean: each model contributes 1/len(models).
    pred = 0
    for model in models:
        pred = pred+model.predict_proba(X) / len(models)

    # Vectorised form of "0 if p0 > p1 else 1", evaluated for every row at once.
    return np.where(pred[:, 0] > pred[:, 1], 0, 1).tolist()


### 4. Hyperparameter tuning with Optuna

In [None]:
def objective(trial):
    """Optuna objective: F1 score of a GroupKFold LightGBM ensemble on a hold-out.

    Samples one LightGBM parameter set from ``trial``, trains the fold models on
    the training part of the module-level ``X``/``y``/``g`` and scores their
    averaged prediction on the held-out part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        ``f1_score`` on the hold-out rows (higher is better).
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 1, 1000),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        # NOTE(review): LightGBM applies row subsampling only when subsample_freq > 0;
        # with the default of 0 this value likely has no effect - confirm and tune both.
        'subsample': trial.suggest_categorical('subsample', [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.00001,0.00005,0.001,0.005,0.01,0.02,0.05,0.1]),
        'max_depth': trial.suggest_int('max_depth', 1 , 200),
        # LightGBM requires num_leaves > 1, so the search starts at 2.
        'num_leaves' : trial.suggest_int('num_leaves', 2, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
    }

    # Hold out whole groups (images) rather than individual pixels: a pixel-wise
    # split puts pixels of the same image on both sides and inflates the score.
    # test_size is the fraction of groups held out; the fixed seed gives every
    # trial the same hold-out so their scores are comparable.
    splitter = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
    train_idx, test_idx = next(splitter.split(X, y, groups=g))

    models = GroupKFold_model_training(X[train_idx], y[train_idx], g[train_idx], param)
    y_pred = GroupKFold_model_test(X[test_idx], models)

    return f1_score(y[test_idx], y_pred)

In [None]:
# Only CRITICAL Optuna log messages are shown.
optuna.logging.set_verbosity(optuna.logging.CRITICAL)

# Study that maximises the value returned by the objective.
study = optuna.create_study(
    direction='maximize',
)

# The search is left disabled here; a later cell hard-codes best_params.
#study.optimize(objective, n_trials=100)
#best_params=study.best_trial.params

[LightGBM] [Info] Number of positive: 56286, number of negative: 538676
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000959 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3060
[LightGBM] [Info] Number of data points in the train set: 594962, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.094604 -> initscore=-2.258668
[LightGBM] [Info] Start training from score -2.258668
[100]	valid_0's binary_logloss: 0.268576
[200]	valid_0's binary_logloss: 0.243537
[300]	valid_0's binary_logloss: 0.223601
[400]	valid_0's binary_logloss: 0.207636
[500]	valid_0's binary_logloss: 0.194857
[600]	valid_0's binary_logloss: 0.183587
[LightGBM] [Info] Number of positive: 55335, number of negative: 539634
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007934 seconds.
You can set `force

In [None]:
# LightGBM settings used for the final training run, written out by hand.
best_params = dict(
    n_estimators=748,
    reg_alpha=0.01033323111097222,
    reg_lambda=0.001696493306975717,
    colsample_bytree=1.0,
    subsample=0.8,
    learning_rate=0.1,
    max_depth=36,
    num_leaves=966,
    min_child_samples=20,
)

{'n_estimators': 748,
 'reg_alpha': 0.01033323111097222,
 'reg_lambda': 0.001696493306975717,
 'colsample_bytree': 1.0,
 'subsample': 0.8,
 'learning_rate': 0.1,
 'max_depth': 36,
 'num_leaves': 966,
 'min_child_samples': 20,
 'min_data_per_groups': 15}

### 5. training

In [None]:
# Fit the fold models on the full training set with the chosen parameters.
models = GroupKFold_model_training(
    X=X,
    y=y,
    g=g,
    lgb_params=best_params,
)

[LightGBM] [Info] Number of positive: 81882, number of negative: 768528
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007940 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3060
[LightGBM] [Info] Number of data points in the train set: 850410, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.096285 -> initscore=-2.239198
[LightGBM] [Info] Start training from score -2.239198
[100]	valid_0's binary_logloss: 0.0868067
[200]	valid_0's binary_logloss: 0.0897498
[300]	valid_0's binary_logloss: 0.0945851
[400]	valid_0's binary_logloss: 0.1007
[500]	valid_0's binary_logloss: 0.107158
[600]	valid_0's binary_logloss: 0.114206
[700]	valid_0's binary_logloss: 0.121026
[LightGBM] [Info] Number of positive: 78111, number of negative: 771836
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007590 seconds.
You can set `force_col_wise=true` to rem

### 6. generate results

In [None]:
# Folders holding the evaluation images and the sample masks.
test_path = 'D:/D_drive_proj/DS_projects/solafune_solarPanelDetection/evaluation'
test_mask_path = 'D:/D_drive_proj/DS_projects/solafune_solarPanelDetection/sample'

# Sorted listings; the prediction loop pairs them positionally.
# Note that `masks` is rebound here and no longer refers to the training masks.
tests = sorted(glob.glob(test_path + '/*'))
masks = sorted(glob.glob(test_mask_path + '/*'))

In [None]:
# Make sure the prediction output folder exists. makedirs with exist_ok=True is a
# no-op when the folder is already there and also creates missing parent folders,
# replacing the separate isdir check followed by mkdir.
os.makedirs('D:/D_drive_proj/DS_projects/solafune_solarPanelDetection/output2', exist_ok=True)

In [None]:
# NOTE(review): this value is not consumed anywhere in this cell; GroupKFold_model_test
# labels a pixel 1 unless the averaged class-0 probability is strictly greater.
threshold = 0.5

# Sample masks and evaluation images are paired positionally; a count mismatch would
# silently write predictions under the wrong file names.
if len(masks) != len(tests):
    raise ValueError(
        f'found {len(masks)} sample masks but {len(tests)} evaluation images; '
        'the two folders must pair up one-to-one'
    )

for m, t in tqdm.tqdm(zip(masks, tests), total=len(masks)):
    # The prediction is written under the same file name as the sample mask.
    basename = os.path.basename(m)
    output_file = f'D:/D_drive_proj/DS_projects/solafune_solarPanelDetection/output2/{basename}'

    img = tifffile.imread(t).astype(float)
    # The sample mask is read only for its shape, which the output must match.
    shape_mask = tifffile.imread(m).shape

    # Use a dedicated name: rebinding `X` here would overwrite the training matrix
    # that the training and tuning cells read.
    X_eval = img.reshape(-1, 12)

    pred = GroupKFold_model_test(X_eval, models)

    pred_mask = np.array(pred).astype(np.uint8)
    pred_mask = pred_mask.reshape(shape_mask[0], shape_mask[1])

    tifffile.imwrite(output_file, pred_mask)


3it [00:00, 29.10it/s]



11it [00:00, 33.64it/s]



20it [00:00, 37.38it/s]



28it [00:00, 36.74it/s]



37it [00:01, 37.61it/s]



45it [00:01, 37.87it/s]



55it [00:01, 39.58it/s]



63it [00:01, 38.82it/s]



71it [00:01, 38.58it/s]



80it [00:02, 39.43it/s]



84it [00:02, 39.02it/s]



92it [00:02, 36.97it/s]



101it [00:02, 38.17it/s]



109it [00:02, 37.14it/s]



117it [00:03, 34.70it/s]



122it [00:03, 36.89it/s]



131it [00:03, 36.16it/s]



139it [00:03, 36.72it/s]



147it [00:03, 36.72it/s]



155it [00:04, 36.19it/s]



163it [00:04, 36.38it/s]



172it [00:04, 37.18it/s]



181it [00:04, 38.28it/s]



189it [00:05, 37.07it/s]



194it [00:05, 37.34it/s]



202it [00:05, 35.71it/s]



210it [00:05, 36.79it/s]



218it [00:05, 37.35it/s]



226it [00:06, 37.82it/s]



234it [00:06, 33.92it/s]



242it [00:06, 33.18it/s]



246it [00:06, 31.57it/s]



254it [00:06, 32.63it/s]



262it [00:07, 32.75it/s]



270it [00:07, 33.42it/s]



278it [00:07, 35.72it/s]



286it [00:07, 36.19it/s]



294it [00:08, 37.63it/s]



302it [00:08, 38.06it/s]



311it [00:08, 39.08it/s]



315it [00:08, 39.01it/s]



324it [00:08, 38.53it/s]



332it [00:09, 38.14it/s]



340it [00:09, 36.58it/s]



348it [00:09, 36.60it/s]



356it [00:09, 36.99it/s]



364it [00:09, 36.58it/s]



372it [00:10, 36.28it/s]



380it [00:10, 35.22it/s]



384it [00:10, 35.23it/s]



393it [00:10, 36.41it/s]



402it [00:10, 37.62it/s]



410it [00:11, 36.94it/s]



414it [00:11, 35.52it/s]



422it [00:11, 34.29it/s]



430it [00:11, 34.92it/s]



438it [00:12, 33.28it/s]



442it [00:12, 34.15it/s]



450it [00:12, 34.53it/s]



459it [00:12, 37.29it/s]



468it [00:12, 37.89it/s]



472it [00:12, 38.37it/s]



482it [00:13, 38.64it/s]



490it [00:13, 38.82it/s]



498it [00:13, 37.82it/s]



506it [00:13, 37.57it/s]



514it [00:14, 38.28it/s]



523it [00:14, 39.45it/s]



531it [00:14, 38.27it/s]



539it [00:14, 37.31it/s]



547it [00:14, 38.15it/s]



556it [00:15, 38.54it/s]



564it [00:15, 38.25it/s]



572it [00:15, 36.79it/s]



581it [00:15, 36.14it/s]



590it [00:16, 36.46it/s]



594it [00:16, 37.09it/s]



602it [00:16, 34.73it/s]



610it [00:16, 33.29it/s]



614it [00:16, 31.40it/s]



622it [00:17, 32.42it/s]



626it [00:17, 31.50it/s]



634it [00:17, 32.30it/s]



642it [00:17, 33.69it/s]



651it [00:17, 34.41it/s]



655it [00:18, 32.52it/s]



663it [00:18, 30.31it/s]



667it [00:18, 31.68it/s]



676it [00:18, 35.99it/s]



684it [00:18, 36.84it/s]



693it [00:19, 37.79it/s]



702it [00:19, 38.45it/s]



710it [00:19, 37.34it/s]



714it [00:19, 37.60it/s]



722it [00:19, 38.01it/s]



730it [00:20, 37.88it/s]



738it [00:20, 38.36it/s]



746it [00:20, 37.14it/s]



754it [00:20, 37.86it/s]



762it [00:20, 36.88it/s]



770it [00:21, 36.08it/s]



779it [00:21, 38.15it/s]



787it [00:21, 38.31it/s]



796it [00:21, 39.16it/s]



800it [00:21, 35.99it/s]



808it [00:22, 33.35it/s]



812it [00:22, 30.18it/s]



820it [00:22, 33.05it/s]



828it [00:22, 35.10it/s]



836it [00:23, 36.37it/s]



844it [00:23, 36.75it/s]



852it [00:23, 36.87it/s]



856it [00:23, 35.59it/s]



864it [00:23, 37.14it/s]



873it [00:24, 38.23it/s]



881it [00:24, 36.93it/s]



889it [00:24, 34.71it/s]



898it [00:24, 37.27it/s]



902it [00:24, 37.45it/s]



910it [00:25, 33.44it/s]



918it [00:25, 33.64it/s]



923it [00:25, 35.77it/s]



932it [00:25, 37.14it/s]



940it [00:25, 37.92it/s]



950it [00:26, 39.48it/s]



954it [00:26, 38.14it/s]



963it [00:26, 36.53it/s]



972it [00:26, 37.70it/s]



976it [00:26, 37.75it/s]



984it [00:27, 36.15it/s]



994it [00:27, 37.93it/s]



1003it [00:27, 36.76it/s]



1007it [00:27, 34.30it/s]



1015it [00:27, 32.66it/s]



1023it [00:28, 33.00it/s]



1027it [00:28, 34.22it/s]



1035it [00:28, 32.74it/s]



1043it [00:28, 33.78it/s]



1047it [00:28, 31.76it/s]



1055it [00:29, 29.83it/s]



1059it [00:29, 30.11it/s]



1067it [00:29, 30.37it/s]



1075it [00:29, 31.81it/s]



1083it [00:30, 32.06it/s]



1087it [00:30, 33.36it/s]



1095it [00:30, 29.96it/s]



1099it [00:30, 30.17it/s]



1107it [00:30, 30.77it/s]



1111it [00:30, 30.77it/s]



1119it [00:31, 32.01it/s]



1127it [00:31, 32.92it/s]



1135it [00:31, 32.01it/s]



1139it [00:31, 30.02it/s]



1147it [00:32, 30.15it/s]



1155it [00:32, 33.89it/s]



1159it [00:32, 31.76it/s]



1167it [00:32, 32.26it/s]



1175it [00:32, 32.50it/s]



1179it [00:33, 30.43it/s]



1187it [00:33, 30.73it/s]



1195it [00:33, 30.36it/s]



1203it [00:33, 33.93it/s]



1211it [00:34, 32.30it/s]



1215it [00:34, 30.60it/s]



1223it [00:34, 29.13it/s]



1230it [00:34, 29.81it/s]



1234it [00:34, 30.57it/s]



1242it [00:35, 32.28it/s]



1250it [00:35, 32.16it/s]



1255it [00:35, 34.63it/s]



1263it [00:35, 31.13it/s]



1267it [00:35, 31.02it/s]



1275it [00:36, 33.39it/s]



1283it [00:36, 33.82it/s]



1291it [00:36, 35.66it/s]



1299it [00:36, 34.43it/s]



1307it [00:37, 35.02it/s]



1315it [00:37, 32.74it/s]



1319it [00:37, 32.61it/s]



1327it [00:37, 32.82it/s]



1336it [00:37, 34.73it/s]



1341it [00:38, 35.84it/s]



1349it [00:38, 35.67it/s]



1357it [00:38, 35.35it/s]



1365it [00:38, 35.36it/s]



1373it [00:38, 36.84it/s]



1381it [00:39, 36.37it/s]



1389it [00:39, 36.24it/s]



1393it [00:39, 36.83it/s]



1401it [00:39, 33.68it/s]



1409it [00:40, 32.89it/s]



1413it [00:40, 33.04it/s]



1421it [00:40, 32.57it/s]



1429it [00:40, 34.13it/s]



1437it [00:40, 35.44it/s]



1445it [00:41, 35.93it/s]



1449it [00:41, 35.60it/s]



1457it [00:41, 33.70it/s]



1465it [00:41, 34.11it/s]



1473it [00:41, 35.96it/s]



1482it [00:42, 37.14it/s]



1490it [00:42, 37.92it/s]



1495it [00:42, 39.27it/s]



1503it [00:42, 38.65it/s]



1511it [00:42, 38.53it/s]



1520it [00:43, 38.73it/s]



1528it [00:43, 37.55it/s]



1538it [00:43, 38.96it/s]



1546it [00:43, 37.66it/s]



1554it [00:43, 35.06it/s]



1558it [00:44, 34.61it/s]



1566it [00:44, 34.79it/s]



1574it [00:44, 35.63it/s]



1582it [00:44, 34.83it/s]



1590it [00:45, 35.10it/s]



1594it [00:45, 35.90it/s]



1602it [00:45, 33.82it/s]



1610it [00:45, 34.36it/s]



1618it [00:45, 36.30it/s]



1626it [00:46, 36.32it/s]



1634it [00:46, 35.33it/s]



1642it [00:46, 36.68it/s]



1646it [00:46, 36.53it/s]



1654it [00:46, 35.33it/s]



1663it [00:47, 36.85it/s]



1671it [00:47, 35.78it/s]



1679it [00:47, 36.30it/s]



1687it [00:47, 35.84it/s]



1695it [00:47, 34.87it/s]



1703it [00:48, 33.81it/s]



1707it [00:48, 34.28it/s]



1715it [00:48, 36.02it/s]



1723it [00:48, 36.61it/s]



1731it [00:48, 36.27it/s]



1739it [00:49, 36.17it/s]



1747it [00:49, 33.88it/s]



1755it [00:49, 33.90it/s]



1763it [00:49, 35.24it/s]



1767it [00:50, 34.51it/s]



1775it [00:50, 32.17it/s]



1783it [00:50, 33.26it/s]



1787it [00:50, 33.45it/s]



1795it [00:50, 33.07it/s]



1803it [00:51, 33.35it/s]



1807it [00:51, 32.32it/s]



1815it [00:51, 33.76it/s]



1823it [00:51, 33.62it/s]



1831it [00:51, 34.50it/s]



1835it [00:52, 33.33it/s]



1843it [00:52, 31.96it/s]



1851it [00:52, 33.12it/s]



1859it [00:52, 35.79it/s]



1868it [00:53, 37.38it/s]



1878it [00:53, 37.74it/s]



1886it [00:53, 38.14it/s]



1890it [00:53, 37.87it/s]



1898it [00:53, 38.32it/s]



1906it [00:54, 37.08it/s]



1915it [00:54, 36.61it/s]



1924it [00:54, 37.79it/s]



1928it [00:54, 37.92it/s]



1937it [00:54, 39.41it/s]



1946it [00:55, 38.48it/s]



1955it [00:55, 38.48it/s]



1964it [00:55, 38.32it/s]



1972it [00:55, 38.21it/s]



1977it [00:55, 39.15it/s]



1985it [00:56, 38.22it/s]



1995it [00:56, 39.05it/s]



2003it [00:56, 36.81it/s]



2007it [00:56, 36.16it/s]



2017it [00:56, 38.60it/s]



2026it [00:57, 38.90it/s]



2034it [00:57, 37.62it/s]



2042it [00:57, 36.27it/s]



2047it [00:57, 37.61it/s]



2057it [00:57, 37.79it/s]



2066it [00:58, 39.11it/s]



2066it [00:58, 35.50it/s]
