## Base Machine Learning Model

In [1]:
import glob
import tifffile
import numpy as np
import lightgbm as lgb
import warnings
import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# import seaborn as sns 

warnings.simplefilter('ignore')

In [16]:
path = "/home/sebastien/Documents/projects/solafune-solar-panel-detection/data/train/"
images_train_path = os.path.join(path,'s2_image')
images_train = sorted(glob.glob(os.path.join(images_train_path, "**/*.tif"), recursive=True))
print("IMAGES {} ### Total:  {}".format(images_train_path, len(images_train)))
mask_train_path = os.path.join(path,'mask')
mask_train = sorted(glob.glob(os.path.join(mask_train_path, "**/*.tif"), recursive=True))
print("MASK {} ### Total:  {}".format(mask_train_path, len(mask_train)))
df_train = pd.DataFrame(columns=['rgb_path','mask_path','dataset'])
df_train['rgb_path'] = images_train
df_train['mask_path'] = mask_train

df_train, df_test = train_test_split(df_train, test_size=0.05, random_state=42)
print("train data: {}".format(len(df_train)))
print("test data: {}".format(len(df_test)))

IMAGES /home/sebastien/Documents/projects/solafune-solar-panel-detection/data/train/s2_image ### Total:  2066
MASK /home/sebastien/Documents/projects/solafune-solar-panel-detection/data/train/mask ### Total:  2066
train data: 1962
test data: 104


In [17]:
X = []
y = []

for i, (t, m) in enumerate(zip(df_train['rgb_path'].values, df_train['mask_path'].values)):
    img = tifffile.imread(t).astype(np.float64)   
    mask = tifffile.imread(m).astype(np.float64)
    
    X.append(img.reshape(-1,12))
    y.append(mask.reshape(-1))

X_train = np.vstack(X)
y_train = np.hstack(y)


In [18]:
X = []
y = []

for i, (t, m) in enumerate(zip(df_test['rgb_path'].values, df_test['mask_path'].values)):
    img = tifffile.imread(t).astype(np.float64)
    mask = tifffile.imread(m).astype(np.float64)
    
    X.append(img.reshape(-1,12))
    y.append(mask.reshape(-1))

X_valid = np.vstack(X)
y_valid = np.hstack(y)

In [19]:
def evaluate_macroF1_lgb(y_true, y_pred):  

    y_pred_label = np.round(y_pred)
    f1 = f1_score(y_true, y_pred_label, average='binary')
    print('f1 score:', f1)
    return ('f1_score', f1, True)


## Make Model

In [20]:
lgb_params = {    
    'boosting_type':'gbdt',
    'learning_rate' : 0.1,
    'num_leaves':31,
    'max_depth':-1,
    'random_state':136,
	'n_estimators':500,
	'objective':'binary',
    }

model = lgb.LGBMClassifier(**lgb_params)
model.fit(X_train, y_train, eval_metric='logloss', 
      eval_set=[(X_valid, y_valid)],callbacks=[lgb.log_evaluation(100)])

[LightGBM] [Info] Number of positive: 100220, number of negative: 976023
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014650 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3060
[LightGBM] [Info] Number of data points in the train set: 1076243, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.093120 -> initscore=-2.276118
[LightGBM] [Info] Start training from score -2.276118
[100]	valid_0's binary_logloss: 0.116058
[200]	valid_0's binary_logloss: 0.119354
[300]	valid_0's binary_logloss: 0.119441
[400]	valid_0's binary_logloss: 0.131518
[500]	valid_0's binary_logloss: 0.133105


In [29]:
from catboost import CatBoostClassifier, Pool, cv

model_cat = CatBoostClassifier(iterations=8000,
                           task_type="GPU",
                           verbose=100,
                           learning_rate=0.03,
                        #    eval_metric = 'F1',
                           devices='0:1')
model_cat.fit(X_train,
        y_train,eval_set=(X_valid, y_valid),
        verbose=1)

0:	learn: 0.6398161	test: 0.6409335	best: 0.6409335 (0)	total: 11.3ms	remaining: 1m 30s
1:	learn: 0.5916118	test: 0.5936684	best: 0.5936684 (1)	total: 19.3ms	remaining: 1m 17s
2:	learn: 0.5488598	test: 0.5518204	best: 0.5518204 (2)	total: 27.3ms	remaining: 1m 12s
3:	learn: 0.5082718	test: 0.5123630	best: 0.5123630 (3)	total: 35.4ms	remaining: 1m 10s
4:	learn: 0.4731216	test: 0.4774535	best: 0.4774535 (4)	total: 43.6ms	remaining: 1m 9s
5:	learn: 0.4421155	test: 0.4467604	best: 0.4467604 (5)	total: 51.8ms	remaining: 1m 8s
6:	learn: 0.4159051	test: 0.4208295	best: 0.4208295 (6)	total: 60.1ms	remaining: 1m 8s
7:	learn: 0.3911953	test: 0.3968334	best: 0.3968334 (7)	total: 68.2ms	remaining: 1m 8s
8:	learn: 0.3687103	test: 0.3746094	best: 0.3746094 (8)	total: 76.5ms	remaining: 1m 7s
9:	learn: 0.3488815	test: 0.3549284	best: 0.3549284 (9)	total: 84.6ms	remaining: 1m 7s
10:	learn: 0.3312517	test: 0.3377523	best: 0.3377523 (10)	total: 92.4ms	remaining: 1m 7s
11:	learn: 0.3149512	test: 0.3221760	

<catboost.core.CatBoostClassifier at 0x7fa1da7eaca0>

In [21]:
import pandas as pd
import matplotlib.pyplot as plt

df_feature_importance = pd.DataFrame({'Value':m.feature_importances_})
df_feature_importance = df_feature_importance.T
plt.figure(figsize=(40, 20))
sns.set(font_scale = 5)
sns.barplot(df_feature_importance)
plt.title('LightGBM Features Model 1 (avg over folds)')
plt.tight_layout()
# plt.savefig('lgbm_importances-01.png')
plt.show()

AttributeError: 'str' object has no attribute 'feature_importances_'

In [None]:
df_feature_importance = pd.DataFrame({'Value':models[1].feature_importances_})
df_feature_importance = df_feature_importance.T
plt.figure(figsize=(40, 20))
sns.set(font_scale = 5)
sns.barplot(df_feature_importance)
plt.title('LightGBM Features Model 2 (avg over folds)')
plt.tight_layout()
plt.savefig('lgbm_importances-02.png')
plt.show()

In [None]:
def normalize(band):
    band_min, band_max = (band.min(), band.max())
    return ((band-band_min)/
            ((band_max - band_min)))




# Prediction on sample data

In [30]:
test_path =  '/home/sebastien/Documents/projects/solafune-solar-panel-detection/data/evaluation/'
test_mask_path = '/home/sebastien/Documents/projects/solafune-solar-panel-detection/data/sample/'


masks = glob.glob(f'{test_mask_path}/*')

tests = glob.glob(f'{test_path}/*')
masks.sort()
tests.sort()

In [31]:
print("IMAGES Total:  {}".format(len(tests)))

IMAGES Total:  2066


In [32]:
import os
if not os.path.isdir('output_cat'):
    os.mkdir('output_cat')

In [33]:
for i, (m, t) in tqdm.tqdm(enumerate(zip(masks, tests))):

    
    basename = os.path.basename(m)
    basename = basename.replace("tif","png")
    output_file = f'output_cat/{basename}'
    img = tifffile.imread(t).astype(np.float64)
    mask_gt = tifffile.imread(m).astype(np.float64)
        
    X = img.reshape(-1, 12)
    shape_mask = mask_gt.shape   
    pred = model_cat.predict(X)
    
    pred_mask = pred.reshape(shape_mask[0], shape_mask[1])       
    tifffile.imwrite(output_file, pred_mask)
 

2066it [00:07, 277.08it/s]


# Output directory into zip file

In [34]:
import zipfile

output_folder_path = 'output_cat'
with zipfile.ZipFile('output_cat.zip', 'w') as zipf:
    for root, dirs, files in os.walk(output_folder_path):
        for file in files:
            zipf.write(os.path.join(root, file), file)