In [7]:
from modAL.models import ActiveLearner
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import h5py
import pickle
from sklearn.ensemble import RandomForestClassifier
import random

## Sample unlabeled pool

In [8]:
# Open the feature store read-only. The file is not closed in this cell;
# `d_features` is read from again later inside build_pool.
# NOTE(review): absolute cluster path is hard-coded - consider a configurable data directory.
features = h5py.File('/scratch/yw3004/sonyc/sonyc_distortion_classification/_old_features.h5', 'r')
# Take the first object stored at the root of the file.
d_features = list(features.values())[0]

In [9]:
# Show the record layout (field names and types) of the feature dataset.
d_features.dtype

dtype([('path', 'S96'), ('identifier', 'S32'), ('features_z', 'u1', (10, 128))])

In [10]:
# Show how many records the feature dataset holds.
d_features.shape

(2556319,)

In [11]:
# Open the per-frame cluster assignments read-only. The file is not closed in
# this cell; `d_frames` is read from again later inside build_pool.
# NOTE(review): absolute cluster path is hard-coded - consider a configurable data directory.
frames = h5py.File('/scratch/yw3004/sonyc/sonyc_distortion_classification/clusters_frames.hdf5', 'r')
# Take the first object stored at the root of the file.
d_frames = list(frames.values())[0]

In [118]:
# Show the record layout (field names and types) of the frame dataset.
d_frames.dtype

dtype([('assignment', '<u4'), ('frame', 'u1'), ('sensor_id', 'S32'), ('timestamp', '<f8')])

In [12]:
# Show the length of the `frame` field of d_frames.
# NOTE(review): the count is 10x the number of feature records, which would match
# the 10 frames per `features_z` entry - confirm the two files line up this way.
d_frames['frame'].shape

(25563190,)

In [13]:
# Cluster assignments that build_pool treats as the positive class;
# a frame with any other assignment is treated as negative.
positive_clusters = [9,11,24,25,40,49,52,60,61,75,78,94,95,106,107,124,129]

In [66]:
def build_pool(num_samples, frames=None, features=None, clusters=None, seed=None):
    """Draw a class-balanced random pool of frames and look up their features.

    Frames are visited in random order until ``num_samples // 2`` frames whose
    cluster assignment is in ``clusters`` (positives) and ``num_samples // 2``
    frames whose assignment is not (negatives) have been collected, or until
    every frame has been visited. Each selected frame is then matched against
    the feature records through the identifier ``<sensor_id>_<timestamp>``.

    Parameters
    ----------
    num_samples : int
        Requested pool size; half of it is requested from each class.
    frames : structured array-like, optional
        Records with ``assignment``, ``frame``, ``sensor_id`` and ``timestamp``
        fields. Defaults to the notebook-level ``d_frames``.
    features : structured array-like, optional
        Records with ``identifier`` and ``features_z`` fields. Defaults to the
        notebook-level ``d_features``.
    clusters : iterable of int, optional
        Assignments counted as positive. Defaults to ``positive_clusters``.
    seed : int, optional
        Seed for a private random generator, for a reproducible draw. When
        omitted the global ``random`` module state is used.

    Returns
    -------
    id_pool : list of (bytes, int)
        ``(identifier, frame)`` for every sample that has features, positives
        first. Entry ``i`` describes row ``i`` of ``X_pool``.
    X_pool : numpy.ndarray, shape (len(id_pool), 128)
        Feature row of each sample in ``id_pool``.
    notfound_ids : list of bytes
        Identifiers of selected samples whose features could not be read.
    notfound_inds : list of int
        Positions of those samples in the selection *before* they were dropped.
    """
    frames = d_frames if frames is None else frames
    features = d_features if features is None else features
    positive_set = set(positive_clusters if clusters is None else clusters)
    per_class = num_samples // 2

    positive_samples = []
    negative_samples = []

    # Random visiting order over all frame records (each visited at most once).
    idx = list(range(len(frames)))
    if seed is None:
        random.shuffle(idx)
    else:
        random.Random(seed).shuffle(idx)

    for ind in idx:
        # Stop as soon as both classes are full.
        if len(positive_samples) == per_class and len(negative_samples) == per_class:
            break

        # Read the record once; every field access on the dataset is a separate read.
        record = frames[ind]
        if int(record['assignment']) in positive_set:
            bucket = positive_samples
        else:
            bucket = negative_samples
        if len(bucket) >= per_class:
            continue

        identifier = (record['sensor_id'].decode('UTF-8') + '_' + str(record['timestamp'])).encode('UTF-8')
        bucket.append((identifier, int(record['frame'])))

    # (identifier, frame) of every selected sample, positives first.
    selected = positive_samples + negative_samples

    # Load the identifier column once instead of once per selected sample.
    identifiers = features['identifier']

    # Size by what was actually collected: this can be fewer than num_samples
    # (odd num_samples, or a class with too few frames).
    X_pool = np.empty([len(selected), 128])
    notfound_inds = []
    notfound_ids = []
    for i, (identifier, frame) in enumerate(selected):
        matches = np.where(identifiers == identifier)[0]
        try:
            # IndexError here means no feature record matched (or the frame
            # index is out of range); such samples are skipped, not fatal.
            X_pool[i] = features[matches[0]]['features_z'][frame, :]
        except IndexError:
            notfound_inds.append(i)
            notfound_ids.append(identifier)

    # Drop the unmatched samples from BOTH outputs so they stay row-aligned.
    X_pool = np.delete(X_pool, notfound_inds, 0)
    dropped = set(notfound_inds)
    id_pool = [sample for i, sample in enumerate(selected) if i not in dropped]

    return id_pool, X_pool, notfound_ids, notfound_inds

In [67]:
# Request a pool of 1000 frames: build_pool aims for 1000 // 2 = 500 positives
# and 500 negatives, then looks up their features.
id_pool, X_pool, notfound_ids, notfound_inds = build_pool(1000)

In [69]:
# Shape of the pool feature matrix after samples without features were removed.
X_pool.shape

(998, 128)

In [71]:
# Identifiers of the selected samples for which build_pool could not read features.
notfound_ids

[b'74da385c687d_1481419829.11', b'74da385c683d_1483808189.4']

In [72]:
# Persist the pool features. The context manager closes the file, so the
# pickle is fully flushed to disk before the next cell runs.
with open("X_pool.pickle", "wb") as fh:
    pickle.dump(X_pool, fh)

In [73]:
# Persist the pool identifiers. The context manager closes the file, so the
# pickle is fully flushed to disk before the next cell runs.
with open("id_pool.pickle", "wb") as fh:
    pickle.dump(id_pool, fh)

## Load initial training set

In [99]:
# Load the initial labelled sets. Files are opened with a context manager so
# the handles are closed once the objects are loaded.
# NOTE(review): pickle.load can execute arbitrary code - only load pickles you produced or trust.
with open("positive_xy.pickle", "rb") as fh:
    positive_xy = pickle.load(fh)
with open("negative_xy.pickle", "rb") as fh:
    negative_xy = pickle.load(fh)

In [100]:
# Shape of the loaded positive set.
# NOTE(review): 129 columns is presumably 128 features plus one label column - TODO confirm.
positive_xy.shape

(170, 129)

## Active learner

In [71]:
# Toy data to exercise the modAL query/teach cycle.
# NOTE(review): this rebinds `X_pool`, replacing the pool built by build_pool
# earlier in the notebook - re-run that cell (or reload the pickle) before
# using the real pool again.
X_training = np.array([[1,2],[1,1],[-1,-2],[-3,-1]])
y_training = np.array([1,1,0,0])

X_pool = np.array([[-1,1],[-5,-3],[3,-2],[0,0]])

# initializing the learner
learner = ActiveLearner(
    estimator=RandomForestClassifier(),
    X_training=X_training, y_training=y_training)

# query for labels
query_idx, query_inst = learner.query(X_pool)

# ...obtaining new labels from the Oracle...

# supply label for queried instance
# Placeholder label standing in for the oracle's answer; it must be defined
# before teach() or this cell raises NameError on a fresh kernel.
y_new = np.array([0])
learner.teach(X_pool[query_idx], y_new)

In [87]:
# Index into X_pool of the instance the learner selected for labelling.
query_idx

array([3])

## Update training set and pool

In [64]:
def update_train_and_pool(X_training, y_training, X_pool, queried_samples, query_idx, new_labels):
    """Move the queried instances from the pool into the training set.

    The inputs are not modified; updated copies are returned, so the caller
    must rebind them::

        X_training, y_training, X_pool, queried_samples = update_train_and_pool(...)

    Parameters
    ----------
    X_training : array-like, shape (n_train, n_features)
    y_training : array-like, shape (n_train,)
    X_pool : array-like, shape (n_pool, n_features)
    queried_samples : array-like
        Instances queried so far, one per row; may be empty on the first round.
    query_idx : int or array-like of int
        Row index/indices into ``X_pool`` of the instances just labelled.
    new_labels : array-like
        Labels for the queried instances, in the same order as ``query_idx``.

    Returns
    -------
    tuple
        ``(X_training, y_training, X_pool, queried_samples)`` after the update.
    """
    X_pool = np.asarray(X_pool)
    # Index (not call) the pool; atleast_2d keeps a single int index as one row.
    picked = np.atleast_2d(X_pool[query_idx])

    # vstack takes ONE sequence of arrays, hence the inner tuple.
    X_training = np.vstack((X_training, picked))
    y_training = np.append(y_training, new_labels)

    # Stack row-wise so the history stays (n_queried, n_features); np.append
    # without an axis would flatten it.
    history = np.asarray(queried_samples)
    if history.size == 0:
        queried_samples = picked.copy()
    else:
        queried_samples = np.vstack((history, picked))

    X_pool = np.delete(X_pool, query_idx, axis=0)

    return X_training, y_training, X_pool, queried_samples

## Retrain the model and evaluate

In [37]:
#retrain the model with new training set
# ActiveLearner.fit refits the wrapped estimator on exactly the data passed in.
# NOTE(review): this replaces a call to `train(...)`, which is not defined in
# the visible notebook - restore it if such a helper exists elsewhere.
learner.fit(X_training, y_training)
# get_params() is an instance method, so it is called on the learner's
# estimator rather than on the RandomForestClassifier class. It returns the
# estimator's hyper-parameters, not learned weights.
w = learner.estimator.get_params()

In [48]:
#evaluate model performance on test set
# score() delegates to the wrapped classifier and returns its mean accuracy.
# Calling RandomForestClassifier(...) here would only construct a new,
# unfitted estimator.
# NOTE(review): X_val / y_val are not defined in the visible notebook - confirm where they are loaded.
mean_acc = learner.score(X_val, y_val)