In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from PIL import Image
import joblib

In [2]:
df = pd.read_csv('../../corpus/calculated_metrics_full.csv', index_col=False)
df = df.rename(columns={"expert_evaluation": "review", "Sample 1": "review"})
df['true_mask_pixels'] = 0
df['pred_mask_pixels'] = 0

In [3]:
def calc_pixels(path):
    img = Image.open(f'../../{path}').convert('L')
    np_img = np.array(img)
    np_img[np_img > 0] = 1
    return np.count_nonzero(np_img)

In [4]:
for i in tqdm(df.index, ncols='100%'):
    true_mask_path = df.loc[i, ['path_truth_mask']][0][3:]
    pred_mask_path = df.loc[i, ['path_pred_mask']][0][3:]
    
    df.loc[i,['true_mask_pixels']] = calc_pixels(true_mask_path)
    df.loc[i,['pred_mask_pixels']] = calc_pixels(pred_mask_path)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, layout=Layout(flex='2'), max=146.0), HTML(value='')), …




In [5]:
df

Unnamed: 0,case,path_orig,path_pred_mask,path_truth_mask,review,DICE,JACRD,AUC,KAPPA,RNDIND,...,ACURCY,FALLOUT,TP,FP,TN,FN,REFVOL,SEGVOL,true_mask_pixels,pred_mask_pixels
0,00000150_002.png,../dataset/Origin/00000150_002.png,../dataset/sample_1/00000150_002_s1.png,../dataset/Expert/00000150_002_expert.png,5,0.520065,0.351411,0.675705,0.519415,0.996630,...,0.998312,0.000000,959.0,0.0,1045847.0,1770.0,2729.0,959.0,2729,959
1,00000181_061.png,../dataset/Origin/00000181_061.png,../dataset/sample_1/00000181_061_s1.png,../dataset/Expert/00000181_061_expert.png,4,0.519559,0.350949,0.935726,0.473689,0.807597,...,0.892172,0.113676,61136.0,112143.0,874374.0,923.0,62059.0,173279.0,62059,173279
2,00000211_019.png,../dataset/Origin/00000211_019.png,../dataset/sample_1/00000211_019_s1.png,../dataset/Expert/00000211_019_expert.png,4,0.708670,0.548791,0.936005,0.648735,0.806597,...,0.891534,0.124482,138332.0,113248.0,796509.0,487.0,138819.0,251580.0,138819,251580
3,00000211_041.png,../dataset/Origin/00000211_041.png,../dataset/sample_1/00000211_041_s1.png,../dataset/Expert/00000211_041_expert.png,3,0.562957,0.391747,0.872117,0.476487,0.716678,...,0.829149,0.184200,115382.0,170256.0,754044.0,8894.0,124276.0,285638.0,124276,285638
4,00000344_003.png,../dataset/Origin/00000344_003.png,../dataset/sample_1/00000344_003_s1.png,../dataset/Expert/00000344_003_expert.png,2,0.169319,0.092489,0.776563,0.123700,0.654238,...,0.777703,0.222226,23756.0,226212.0,791725.0,6883.0,30639.0,249968.0,30639,249968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,00010815_006.png,../dataset/Origin/00010815_006.png,../dataset/sample_3/00010815_006_s3.png,../dataset/Expert/00010815_006_expert.png,5,0.856152,0.748485,0.874242,0.849609,0.974763,...,0.987218,0.000000,39886.0,0.0,995287.0,13403.0,53289.0,39886.0,53289,39886
142,00011157_001.png,../dataset/Origin/00011157_001.png,../dataset/sample_3/00011157_001_s3.png,../dataset/Expert/00011157_001_expert.png,2,0.338908,0.204027,0.605593,0.323076,0.910738,...,0.953176,0.002939,12585.0,2909.0,986893.0,46189.0,58774.0,15494.0,58774,15494
143,00011237_006.png,../dataset/Origin/00011237_006.png,../dataset/sample_3/00011237_006_s3.png,../dataset/Expert/00011237_006_expert.png,5,0.519079,0.350511,0.837288,0.494596,0.904282,...,0.949601,0.041161,28520.0,41520.0,967209.0,11327.0,39847.0,70040.0,39847,70040
144,00011355_011.png,../dataset/Origin/00011355_011.png,../dataset/sample_3/00011355_011_s3.png,../dataset/Expert/00011355_011_expert.png,3,0.304123,0.179331,0.872778,0.257612,0.737018,...,0.844252,0.158072,35687.0,159508.0,849575.0,3806.0,39493.0,195195.0,39493,195195


In [6]:
model = RandomForestRegressor(n_estimators=100, max_features ='sqrt')

In [7]:
features = ['true_mask_pixels', 'pred_mask_pixels', 'HDRFDST', 'AVGDIST']
target = 'review'

In [34]:
X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print('Mean AE train: {:.3f}, test: {:.3f}'.format(
    mean_absolute_error(y_train, y_train_pred),
    mean_absolute_error(y_test, y_test_pred)))

Mean AE train: 0.235, test: 0.529


In [35]:
joblib.dump(model, '../../models/random_forest/rf_2m-0.529.cbm')

['../../models/random_forest/rf_2m-0.529.cbm']