# Example 3 - Train classification models (ML)

In this notebook, we'll train classification models (CNN, RF, PU-learning, Co-training), see how they performs? Also, saved those models for prediction on test set. 

In [57]:
import os
import numpy as np
import pandas as pd
import glob
import pickle
import random

from src.read_echogram import EchogramReader
from src.detect_ROI import ROIDetector
from src.ROI_features import FeatureExtractor
from src.transform_annotations import AnnotationTransformer
from src.match_annotations import OverlapAnnotation
from src.crop_ROI import ROICropper

%matplotlib inline

## Step 1. Separate train & test data

In this step, we'll separate 2019 data (861 annotated ones) into train & test files. In total, 50 files for testing. 

In [58]:
# load train
pkl_dir = "pkl/"
with open(pkl_dir + 'train_echogram_li.pickle', 'rb') as handle:
    train_echogram_li = pickle.load(handle)

In [59]:
# load test
pkl_dir = "pkl/"
with open(pkl_dir + 'test_echogram_li.pickle', 'rb') as handle:
    test_echogram_li = pickle.load(handle)

In [60]:
# add label map
label_map = {'Unclassified regions': 1, 'krill_schools': 2, 'fish_school': 3, 'AH_School': 4}

## Step 2. Train classification models (RF, PU, Co-training)

In this step, we'll train classification models to differentiate AH and other categories. 

In [61]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from src.classifiers import CoTrainingClassifier
from pulearn import ElkanotoPuClassifier, BaggingPuClassifier
from sklearn.metrics import classification_report

In [62]:
# label map
label_map = {'None': 0, 'Unclassified regions': 1, 'krill_schools': 2, 'fish_school': 3, 'AH_School': 4}

In [63]:
# load ROI features
pkl_dir = "pkl/"
df_roi_features = pd.read_pickle(pkl_dir + 'df_roi_features.pkl') # *_new, kernel_size = 1, for abundance estimates

In [64]:
df_roi_features['thickness'].min()

2.100876733660698

In [65]:
df_roi_features['length'].min()

1.027441581837877

In [66]:
print(df_roi_features.shape[0])

26438


Define feature set. 

In [84]:
acoustic_features = ['Sv_18kHz_min', 'Sv_18kHz_p5', 'Sv_18kHz_p25', 'Sv_18kHz_p50', 'Sv_18kHz_p75', 'Sv_18kHz_p95', 'Sv_18kHz_max', 'Sv_18kHz_std', 'Sv_38kHz_min', 'Sv_38kHz_p5', 'Sv_38kHz_p25', 'Sv_38kHz_p50', 'Sv_38kHz_p75', 'Sv_38kHz_p95', 'Sv_38kHz_max', 'Sv_38kHz_std', 'Sv_120kHz_min', 'Sv_120kHz_p5', 'Sv_120kHz_p25', 'Sv_120kHz_p50', 'Sv_120kHz_p75', 'Sv_120kHz_p95', 'Sv_120kHz_max', 'Sv_120kHz_std', 'Sv_200kHz_min', 'Sv_200kHz_p5', 'Sv_200kHz_p25', 'Sv_200kHz_p50', 'Sv_200kHz_p75', 'Sv_200kHz_p95', 'Sv_200kHz_max', 'Sv_200kHz_std', 'Sv_ref_18kHz', 'Sv_ref_120kHz', 'Sv_ref_200kHz']
geometric_features = ['length', 'thickness', 'area', 'perimeter', 'rectangularity', 'compact', 'circularity', 'elongation']
geographic_features_vertical = ['total_water_column', 'depth', 'relative_altitude']
geographic_features_horizontal = ['latitude', 'longitude']
sel_features = acoustic_features # + geometric_features + geographic_features_vertical + geographic_features_horizontal

In [85]:
# normalization
min_max_scaler = MinMaxScaler()
df_roi_features[sel_features] = min_max_scaler.fit_transform(df_roi_features[sel_features])
df_roi_features = df_roi_features.dropna(how='any')

Get train set. 

In [86]:
train_features = df_roi_features[df_roi_features['filename'].isin(train_echogram_li)]
# separate into pos, neg, unlabeled set
positive_set = train_features[train_features['label']==4]
negative_set = train_features[train_features['label'].isin([1, 2, 3])]
unlabeled_set = train_features[train_features['label']==0]

So there, do we need to keep the original ratio of the dataset?

In [87]:
print(train_features.shape[0], positive_set.shape[0], negative_set.shape[0], unlabeled_set.shape[0])

23391 2087 1007 20297


Get test set. 

In [88]:
test_features = df_roi_features[df_roi_features['filename'].isin(test_echogram_li)]
# select only labeled
test_set = test_features[test_features['label']!=0]
test_set['label'] = test_set['label'].apply(lambda x: 1 if x==4 else 0) # Use 1 and 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set['label'] = test_set['label'].apply(lambda x: 1 if x==4 else 0) # Use 1 and 0


In [89]:
print(test_features.shape[0])

3047


In [90]:
X_test = test_set[sel_features].to_numpy()
y_test = test_set['label'].tolist()

In [91]:
np.unique(y_test, return_counts=True)

(array([0, 1]), array([105, 453]))

### Use different number of positive samples

Try using different number of positive samples for model training. 

In [92]:
max_k = positive_set.shape[0]
print(max_k)
# k_li = list(range(500, max_k, 100))
k_li = [max_k]

2087


In [93]:
pkl_dir = "pkl/"

### RF model (only positive, only acoustic features)

In [94]:
res_RF = []
for idx, k in enumerate(k_li):
    positive_set_sel = positive_set.sample(k, random_state=0)
    unlabeled_set_sel = unlabeled_set.sample(1 * k - negative_set.shape[0])
    train_set = pd.concat([positive_set_sel, negative_set, unlabeled_set_sel], ignore_index=True)
    train_set['label'] = train_set['label'].apply(lambda x: 1 if x==4 else 0)
    X_train = train_set[sel_features].to_numpy()
    y = np.array(train_set['label'].tolist())
    base_rf = RandomForestClassifier(n_estimators=100, class_weight='balanced')
    base_rf.fit(X_train, y)
    c_y_predict = base_rf.predict(X_test)
    RF_report = classification_report(y_test, c_y_predict, output_dict=True)
    print(k, RF_report['1'])
    # add
    res_RF.append({'size': k, 'RF_recall': RF_report['1']['recall'], 'RF_precision': RF_report['1']['precision'], 'RF_f1': RF_report['1']['f1-score']})
    # save model
    if idx == (len(k_li) - 1):       
        with open(pkl_dir + "model_RF_only_acoustic.pkl", 'wb') as f:
            pickle.dump(base_rf, f)
# save
res_RF_pd = pd.DataFrame(res_RF)
res_RF_pd.to_pickle(pkl_dir + 'results_RF_only_acoustic.pkl')  

2087 {'precision': 0.9470899470899471, 'recall': 0.7902869757174393, 'f1-score': 0.861612515042118, 'support': 453}


### RF model (only positive)

In [77]:
res_RF = []
for idx, k in enumerate(k_li):
    positive_set_sel = positive_set.sample(k, random_state=0)
    unlabeled_set_sel = unlabeled_set.sample(k - negative_set.shape[0]) # minus negative set
    train_set = pd.concat([positive_set_sel, negative_set, unlabeled_set_sel], ignore_index=True)
    train_set['label'] = train_set['label'].apply(lambda x: 1 if x==4 else 0)
    X_train = train_set[sel_features].to_numpy()
    y = np.array(train_set['label'].tolist())
    base_rf = RandomForestClassifier(n_estimators=100, class_weight='balanced')
    base_rf.fit(X_train, y)
    c_y_predict = base_rf.predict(X_test)
    RF_report = classification_report(y_test, c_y_predict, output_dict=True)
    print(k, RF_report['1'])
    # add
    res_RF.append({'size': k, 'RF_recall': RF_report['1']['recall'], 'RF_precision': RF_report['1']['precision'], 'RF_f1': RF_report['1']['f1-score']})
    # save model
    if idx == (len(k_li) - 1):       
        with open(pkl_dir + "model_RF.pkl", 'wb') as f:
            pickle.dump(base_rf, f)

2087 {'precision': 0.9951100244498777, 'recall': 0.8984547461368654, 'f1-score': 0.9443155452436195, 'support': 453}


In [78]:
# save
res_RF_pd = pd.DataFrame(res_RF)
res_RF_pd.to_pickle(pkl_dir + 'results_RF.pkl')    

### PU learning model

In [79]:
res_PU = []
for idx, k in enumerate(k_li):
    positive_set_sel = positive_set.sample(k, random_state=0)
    unlabeled_set_sel = unlabeled_set.sample(k - negative_set.shape[0])
    train_set = pd.concat([positive_set_sel, negative_set, unlabeled_set_sel], ignore_index=True)
    train_set['label'] = train_set['label'].apply(lambda x: 1 if x==4 else -1)
    X_train = train_set[sel_features].to_numpy()
    y = np.array(train_set['label'].tolist())
    # change 0 to -1
    y_test_new = [-1 if x==0 else x for x in y_test]
    c = RandomForestClassifier(n_estimators=100, class_weight='balanced')
    pu_estimator = ElkanotoPuClassifier(estimator=c, hold_out_ratio=0.2)
    pu_estimator.fit(X_train, y)
    c_y_predict = pu_estimator.predict(X_test)
    PU_report = classification_report(y_test_new, c_y_predict, output_dict=True)
    print(k, PU_report['1'])
    # add
    res_PU.append({'size': k, 'PU_recall': PU_report['1']['recall'], 'PU_precision': PU_report['1']['precision'], 'PU_f1': PU_report['1']['f1-score']})
    # save model
    if idx == (len(k_li) - 1):       
        with open(pkl_dir + "model_PU.pkl", 'wb') as f:
            pickle.dump(pu_estimator, f)
# save res
res_PU_pd = pd.DataFrame(res_PU)
res_PU_pd.to_pickle(pkl_dir + 'results_PU.pkl')    

2087 {'precision': 0.9954022988505747, 'recall': 0.9558498896247241, 'f1-score': 0.9752252252252254, 'support': 453}


Let's test out how the labeled:unlabeled ratio impact the performance. 

In [80]:
res_times = []
times = list(range(1, 11, 1)) # can be max 10
for item in times:
    positive_set_sel = positive_set.sample(max_k, random_state=0)
    unlabeled_set_sel = unlabeled_set.sample(item * max_k - negative_set.shape[0], replace=True) # 
    train_set = pd.concat([positive_set_sel, negative_set, unlabeled_set_sel], ignore_index=True)
    train_set['label'] = train_set['label'].apply(lambda x: 1 if x==4 else -1)
    X_train = train_set[sel_features].to_numpy()
    y = np.array(train_set['label'].tolist())
    # change 0 to -1
    y_test_new = [-1 if x==0 else x for x in y_test]
    c = RandomForestClassifier(n_estimators=100, class_weight='balanced')
    pu_estimator = ElkanotoPuClassifier(estimator=c, hold_out_ratio=0.2)
    pu_estimator.fit(X_train, y)
    c_y_predict = pu_estimator.predict(X_test)
    PU_report = classification_report(y_test_new, c_y_predict, output_dict=True)
    print(item, PU_report['1'])
    res_times.append({'time': item, 'PU_recall': PU_report['1']['recall'], 'PU_precision': PU_report['1']['precision'], 'PU_f1': PU_report['1']['f1-score']})
# save res
res_times_pd = pd.DataFrame(res_times)
res_times_pd.to_pickle(pkl_dir + 'results_PU_times.pkl')    

1 {'precision': 0.9953810623556582, 'recall': 0.9514348785871964, 'f1-score': 0.9729119638826185, 'support': 453}
2 {'precision': 0.9951456310679612, 'recall': 0.9050772626931567, 'f1-score': 0.9479768786127167, 'support': 453}
3 {'precision': 0.9873737373737373, 'recall': 0.8631346578366446, 'f1-score': 0.9210836277974087, 'support': 453}
4 {'precision': 0.9894736842105263, 'recall': 0.8300220750551877, 'f1-score': 0.9027611044417767, 'support': 453}
5 {'precision': 0.9943502824858758, 'recall': 0.7770419426048565, 'f1-score': 0.8723667905824039, 'support': 453}
6 {'precision': 0.9915966386554622, 'recall': 0.7814569536423841, 'f1-score': 0.874074074074074, 'support': 453}
7 {'precision': 0.9940119760479041, 'recall': 0.7328918322295805, 'f1-score': 0.843710292249047, 'support': 453}
8 {'precision': 0.9940298507462687, 'recall': 0.7350993377483444, 'f1-score': 0.8451776649746192, 'support': 453}
9 {'precision': 0.9882697947214076, 'recall': 0.7439293598233996, 'f1-score': 0.8488664987

### Co-training model

In [83]:
res_co = []
for idx, k in enumerate(k_li):
    positive_set_sel = positive_set.sample(k, random_state=0)
    unlabeled_set_sel = unlabeled_set.sample(k - negative_set.shape[0])
    train_set = pd.concat([positive_set_sel, negative_set, unlabeled_set_sel], ignore_index=True)
    # get -1: unlabeled, 0: negative, 1: positive
    new_label_map = {0: -1, 1: 0, 2: 0, 3: 0, 4: 1}
    train_set['label'] = train_set['label'].apply(lambda x: new_label_map[x]) # reset
    X1 = train_set[acoustic_features + geometric_features].to_numpy()
    X2 = train_set[geographic_features_horizontal + geographic_features_vertical].to_numpy()
    y = np.array(train_set['label'].tolist())
    rf_co_clf = CoTrainingClassifier(RandomForestClassifier(n_estimators=100))
    rf_co_clf.fit(X1, X2, y)
    # add test
    X1_test = test_set[acoustic_features + geometric_features].to_numpy()
    X2_test = test_set[geographic_features_horizontal + geographic_features_vertical].to_numpy()
    c_y_predict = rf_co_clf.predict(X1_test, X2_test)
    CO_report = classification_report(y_test, c_y_predict, output_dict=True)
    print(k, CO_report['1'])
    res_co.append({'size': k, 'CO_recall': CO_report['1']['recall'], 'CO_precision': CO_report['1']['precision'], 'CO_f1': CO_report['1']['f1-score']})
    # save model
    if idx == (len(k_li) - 1):       
        with open(pkl_dir + "model_CO.pkl", 'wb') as f:
            pickle.dump(rf_co_clf, f)
# save res
res_co_pd = pd.DataFrame(res_co)
res_co_pd.to_pickle(pkl_dir + 'results_co.pkl')

2087 {'precision': 0.9933774834437086, 'recall': 0.9933774834437086, 'f1-score': 0.9933774834437086, 'support': 453}
