In [1]:
import pickle
import sklearn
import modAL
from modAL.models import ActiveLearner
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer
import numpy as np
from modAL.uncertainty import uncertainty_sampling
from modAL.uncertainty import entropy_sampling
from modAL.uncertainty import margin_sampling
from modAL.utils.selection import multi_argmax
import sys
import os

import audio_decryption
import IPython.display
import random

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Root of the project on the cluster; every other location is derived from it.
project_path = '/scratch/yw3004/projects/ICASSP2019-AL/'

# Pickled data splits are read from <project>/data and model checkpoints are
# written to / read from <project>/models.
data_path, model_path = (
    os.path.join(project_path, subdir) for subdir in ('data', 'models')
)

# 1. Load Data and Train Initial Classifier

## With all training data

In [3]:
# For each split index 1..5 (presumably the cross-validation folds), fit a
# random forest on the "train_all" features/labels and pickle the fitted model
# as models/model_train_all_<cv>.pickle.
for cv in range(1, 6):
    # Context managers close each pickle file as soon as it has been read,
    # instead of leaving the handle open until garbage collection.
    with open(os.path.join(data_path, 'X_train_all_'+str(cv)+'.pickle'), 'rb') as f:
        X_train = pickle.load(f)
    with open(os.path.join(data_path, 'y_train_all_'+str(cv)+'.pickle'), 'rb') as f:
        y_train = pickle.load(f)

    # random_state is fixed so that re-running the cell reproduces the same forest.
    clf_trainall = RandomForestClassifier(n_estimators=100, random_state=0)
    clf_trainall.fit(X_train, y_train)

    with open(os.path.join(model_path, 'model_train_all_'+str(cv)+'.pickle'), 'wb') as f:
        pickle.dump(clf_trainall, f, protocol=pickle.HIGHEST_PROTOCOL)

## With only two training data

In [13]:
# For each split index 1..5 (presumably the cross-validation folds), fit a
# random forest on the small "X_train_<cv>" / "y_train_<cv>" split and pickle it
# as models/model_<cv>_initial.pickle. These checkpoints are the starting point
# of the active-learning runs.
for cv in range(1, 6):
    # Context managers close each pickle file as soon as it has been read,
    # instead of leaving the handle open until garbage collection.
    with open(os.path.join(data_path, 'X_train_'+str(cv)+'.pickle'), 'rb') as f:
        X_train = pickle.load(f)
    with open(os.path.join(data_path, 'y_train_'+str(cv)+'.pickle'), 'rb') as f:
        y_train = pickle.load(f)

    # random_state is fixed so that re-running the cell reproduces the same forest.
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    clf.fit(X_train, y_train)

    with open(os.path.join(model_path, 'model_'+str(cv)+'_initial.pickle'), 'wb') as f:
        pickle.dump(clf, f, protocol=pickle.HIGHEST_PROTOCOL)

# 2. Active Learning

## Load initial model, and get its performance

In [33]:
# NOTE(review): this cell reads `cv_round`, `X_test` and `y_test`. None of the
# notebook-level cells visible here assigns them (they only exist as locals of
# active_learning), so they must be defined before this cell is run on a fresh
# kernel.
with open(os.path.join(model_path, 'model_'+str(cv_round)+'_initial.pickle'), "rb") as f:
    clf = pickle.load(f)

# Predict once and derive every metric from the same predictions.
pred = clf.predict(X_test)
score = np.mean(pred == y_test)
starting_precision = precision_score(y_test, pred)
starting_recall = recall_score(y_test, pred)
starting_confusion_matrix = confusion_matrix(y_test, pred)

print('accuracy: ', score)
print('starting precision: ', starting_precision)
print('starting recall: ', starting_recall)
print('starting confusion matrix: ', starting_confusion_matrix)

accuracy:  0.6583333333333333
starting precision:  1.0
starting recall:  0.31666666666666665
starting confusion matrix:  [[60  0]
 [41 19]]


## Customize query strategy

In [3]:
def uncertaity_with_moving_threshold(classifier, X_pool, threshold):
    """Score pool samples by how close their class-1 probability is to ``threshold``.

    Parameters
    ----------
    classifier : object exposing ``predict_proba(X)``
        Column 1 of the returned array is used as the positive-class probability.
    X_pool : array-like
        Candidate samples, passed unchanged to ``predict_proba``.
    threshold : float
        Decision threshold the probabilities are compared against.

    Returns
    -------
    numpy.ndarray
        ``1 - |threshold - p|`` for every sample; the value is 1 when the
        probability equals the threshold and decreases as it moves away.
    """
    class_probabilities = classifier.predict_proba(X_pool)
    distance_to_threshold = np.abs(threshold - class_probabilities[:, 1])
    return 1 - distance_to_threshold

In [4]:
def moving_threshold_sampling(classifier, X_pool, threshold, n_instances=1):
    """Query strategy that picks the pool samples nearest to a decision threshold.

    Parameters
    ----------
    classifier : object exposing ``predict_proba(X)``
        Model used to score the pool.
    X_pool : indexable array
        Candidate samples; indexed with the selected positions for the return value.
    threshold : float
        Threshold forwarded to ``uncertaity_with_moving_threshold``.
    n_instances : int, default 1
        Number of samples to select, forwarded to ``multi_argmax``.

    Returns
    -------
    tuple
        ``(indices, X_pool[indices])`` where ``indices`` is whatever
        ``multi_argmax`` returns for the uncertainty scores.
    """
    scores = uncertaity_with_moving_threshold(classifier, X_pool, threshold)
    selected = multi_argmax(scores, n_instances=n_instances)
    return selected, X_pool[selected]

In [5]:
def adjusted_classes(y_scores, t):
    """Turn scores into binary labels using the decision threshold ``t``.

    Only meaningful for binary classification: a score greater than or equal
    to ``t`` becomes 1, anything else becomes 0.

    Parameters
    ----------
    y_scores : iterable of float
        Scores to binarize (e.g. positive-class probabilities).
    t : float
        Decision threshold.

    Returns
    -------
    list of int
        One 0/1 label per input score, in input order.
    """
    labels = []
    for score in y_scores:
        labels.append(1 if score >= t else 0)
    return labels

In [6]:
def get_moving_threshold(cv_round, thresholds, X_val, y_val, iterations=0):
    """Find the decision threshold that maximizes F1 on the validation data.

    The model checkpoint for ``cv_round`` is loaded from ``model_path``:
    ``model_<cv_round>_initial.pickle`` when ``iterations`` is 0, otherwise
    ``model_<cv_round>_<iterations>.pickle``. Its class-1 probabilities on
    ``X_val`` are binarized at every candidate threshold and scored with
    ``f1_score`` against ``y_val``.

    Parameters
    ----------
    cv_round : int or str
        Split identifier used in the checkpoint file name.
    thresholds : sequence of float
        Candidate thresholds; must be indexable and non-empty.
    X_val, y_val : array-like
        Validation features and labels.
    iterations : int, default 0
        Which checkpoint to load (0 selects the initial model).

    Returns
    -------
    tuple
        ``(fmeasure_best, threshold_best)``: the highest F1 score and the
        first threshold that achieves it.
    """
    suffix = 'initial' if iterations == 0 else str(iterations)
    checkpoint = os.path.join(model_path, 'model_'+str(cv_round)+'_'+suffix+'.pickle')

    # Load the checkpoint and score the validation set once: neither depends on
    # the threshold, so repeating them per candidate only costs time.
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # checkpoints produced by this project.
    with open(checkpoint, 'rb') as f:
        model = pickle.load(f)
    positive_probs = model.predict_proba(X_val)[:, 1]

    fmeasures = np.asarray([
        f1_score(y_val, np.array(adjusted_classes(positive_probs, threshold)))
        for threshold in thresholds
    ])

    # argmax returns the first maximum, so ties resolve to the earliest threshold.
    best = np.argmax(fmeasures)
    return fmeasures[best], thresholds[best]

In [7]:
# Root directory of the audio recordings. active_learning() joins it with the
# last two path components of a queried clip's file path to build the path
# handed to the decryption helper.
audio_path = '/beegfs/work/sonyc/audio/'

In [24]:
def _load_pickle(path):
    """Unpickle the file at ``path``, closing the handle before returning."""
    # NOTE(security): pickle.load can execute arbitrary code; only use it on
    # files produced by this project.
    with open(path, 'rb') as f:
        return pickle.load(f)


def active_learning(cv_round, n_queries, sampling_strategy=uncertainty_sampling,
                    decrypt_url=None):
    """Run an interactive active-learning session for one data split.

    Starting from the pickled initial classifier of ``cv_round``, the learner
    repeatedly queries one sample from the pool. If the pool entry already
    carries an integer label it is used directly; otherwise the audio clip is
    decrypted and displayed, and the user is asked to type a label. Samples
    labelled 2 ("not sure") are dropped from the pool without being taught and
    do not count towards ``n_queries``.

    Side effects: after every taught query the learner is pickled to
    ``model_<cv_round>_<count>.pickle`` under ``model_path``; at the end the
    query log (and, for the moving-threshold strategy, the per-query best F1
    scores and thresholds) are pickled under ``<project_path>/evaluation``.

    Parameters
    ----------
    cv_round : int or str
        Split identifier used in all data/model file names.
    n_queries : int
        Number of labelled queries to teach the learner.
    sampling_strategy : callable, default ``uncertainty_sampling``
        modAL query strategy. With ``moving_threshold_sampling`` the decision
        threshold is re-tuned on the validation split before every query.
    decrypt_url : str, optional
        URL forwarded as ``url`` to ``audio_decryption.decrypt_and_load_audio``.
        Only needed when an unlabelled clip has to be played back.

    Returns
    -------
    tuple
        ``(queries, best_fmeasures, best_thresholds)``. ``queries`` maps each
        sensor id to an array of ``[timestamp, frame, label]`` rows; the two
        lists stay empty unless the moving-threshold strategy is used.

    Raises
    ------
    ValueError
        If a clip needs human annotation but ``decrypt_url`` was not given.
    """
    def split_file(name):
        # Data files are named <name>_<cv_round>.pickle inside data_path.
        return os.path.join(data_path, name + '_' + str(cv_round) + '.pickle')

    # Load data
    X_train = _load_pickle(split_file('X_train'))
    y_train = _load_pickle(split_file('y_train'))
    X_val = _load_pickle(split_file('X_val'))
    y_val = _load_pickle(split_file('y_val'))
    pool = _load_pickle(split_file('X_pool'))
    # The first 128 columns are the features; the remaining columns hold
    # per-sample info (sensor id, timestamp, frame, file path or known label).
    X_pool = pool[:, :128].astype(int)
    info_pool = pool[:, 128:]

    # Load initial classifier
    clf = _load_pickle(os.path.join(model_path, 'model_'+str(cv_round)+'_initial.pickle'))

    # Initialize parameters
    count = 1
    queries = {}
    best_fmeasures = []
    best_thresholds = []
    use_moving_threshold = sampling_strategy == moving_threshold_sampling
    # Candidate thresholds do not change between queries, so build them once.
    thresholds = np.arange(0.1, 0.9, 0.01)

    # Initialize active learner
    learner = ActiveLearner(
        estimator=clf,
        query_strategy=sampling_strategy,
        X_training=X_train, y_training=y_train
    )

    # Active learning loop: `count` only advances when a label is actually taught.
    while count < n_queries+1:
        if use_moving_threshold:
            # Tune the threshold on the most recent checkpoint (count-1; 0 is
            # the initial model) and sample the pool around it.
            fmeasure_best, threshold_best = get_moving_threshold(
                cv_round, thresholds, X_val, y_val, count-1)
            query_idx, query_instance = learner.query(X_pool, threshold=threshold_best)
            best_fmeasures.append(fmeasure_best)
            best_thresholds.append(threshold_best)
        else:
            query_idx, query_instance = learner.query(X_pool)

        sensor_id, timestamp, frame, filepath = info_pool[query_idx[0]][:4]
        frame = int(frame)

        # If the queried sample is already labelled, the "file path" slot holds
        # the integer label; otherwise ask a human annotator.
        # isinstance also accepts NumPy integer scalars, which is what indexing
        # a NumPy array may return.
        if isinstance(filepath, (int, np.integer)):
            y_new = int(filepath)
        else:
            if decrypt_url is None:
                raise ValueError(
                    'decrypt_url is required to play back unlabelled audio for annotation')
            path_parts = filepath.split('/')
            decrypt_path = os.path.join(audio_path, path_parts[-2], path_parts[-1])
            IPython.display.display(audio_decryption.decrypt_and_load_audio(
                decrypt_path,
                sensor_id,
                timestamp,
                sample_rate=44100,
                frame=frame,
                url=decrypt_url,
                cacert='/scratch/yw3004/projects/ICASSP2019-AL/CA.pem',
                cert='/scratch/yw3004/projects/ICASSP2019-AL/yuwang-decrypt.pem',
                key='/scratch/yw3004/projects/ICASSP2019-AL/<filename>_key.pem'))
            print('query #: ', count)
            # The prompt text (including its run of spaces) is unchanged.
            prompt = ("Please input label for\nsensor_id = '%s'\ntimestamp = %s\nframe = %d "
                      "                          (1 if noise is present, 0 if noise is not present, 2 if not sure)"
                      % (sensor_id, timestamp, frame))
            # input() returns a string: validate it and convert to int so the
            # learner is never taught string labels alongside integer ones,
            # and a typo does not abort the whole session.
            while True:
                answer = input(prompt).strip()
                if answer in ('0', '1', '2'):
                    break
                print('Invalid label %r; please enter 0, 1 or 2.' % answer)
            y_new = int(answer)

        # Teach the learner only when the label is certain (2 means "not sure").
        if y_new != 2:
            learner.teach(
                X=X_pool[query_idx],
                y=np.array(y_new).reshape(-1, ))

            # Update the queries dictionary: one row per taught query, grouped by sensor.
            record = np.array([timestamp, frame, y_new])
            if sensor_id in queries:
                queries[sensor_id] = np.vstack((queries[sensor_id], record))
            else:
                queries[sensor_id] = record

            # Save the model; get_moving_threshold reloads this checkpoint on
            # the next iteration.
            with open(os.path.join(model_path, 'model_'+str(cv_round)+ '_'+str(count)+'.pickle'), 'wb') as f:
                pickle.dump(learner, f, protocol=pickle.HIGHEST_PROTOCOL)

            count += 1

        # Update pool: the queried sample is removed whether or not it was taught.
        X_pool = np.delete(X_pool, query_idx, axis=0)
        info_pool = np.delete(info_pool, query_idx, axis=0)

    # Save parameters
    with open(os.path.join(project_path, 'evaluation', 'queries_'+str(cv_round)+ '.pickle'), 'wb') as f:
        pickle.dump(queries, f, protocol=pickle.HIGHEST_PROTOCOL)

    if use_moving_threshold:
        with open(os.path.join(project_path, 'evaluation', 'best_fmeasures_'+str(cv_round)+ '.pickle'), 'wb') as f:
            pickle.dump(best_fmeasures, f, protocol=pickle.HIGHEST_PROTOCOL)
        with open(os.path.join(project_path, 'evaluation', 'best_thresholds_'+str(cv_round)+ '.pickle'), 'wb') as f:
            pickle.dump(best_thresholds, f, protocol=pickle.HIGHEST_PROTOCOL)

    return queries, best_fmeasures, best_thresholds