In [None]:
import numpy as np
import pandas as pd
import scipy
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score

import sys
sys.path.append('../data/simulation_study/')
from knn_models import KNNModel


In [None]:
def my_load_npz(file):
    """
    Load a sparse matrix from a ``.npz`` file, allowing pickled (object) arrays.

    Mirrors the logic of ``scipy.sparse.load_npz`` but opens the archive with
    ``allow_pickle=True`` so that files whose members were stored as pickled
    object arrays can still be read.

    SECURITY NOTE: ``allow_pickle=True`` lets ``np.load`` unpickle arbitrary
    objects, which can execute code. Only call this on files you trust.

    Parameters
    ----------
    file : str, path-like or file-like
        Anything accepted by ``np.load``.

    Returns
    -------
    scipy.sparse matrix
        An instance of ``scipy.sparse.<format>_matrix`` for the format stored
        in the file (csc, csr, bsr, dia or coo).

    Raises
    ------
    ValueError
        If the archive has no ``'format'`` entry, or the stored format does
        not name a ``scipy.sparse`` matrix class.
    NotImplementedError
        If the stored format names a sparse class this loader cannot rebuild.
    """
    # Import the submodule explicitly. A bare ``import scipy`` does not
    # guarantee that ``scipy.sparse`` is loaded; if it is not, the resulting
    # AttributeError would be caught below and misreported as
    # 'Unknown matrix format'.
    from scipy import sparse as sparse_module

    with np.load(file, allow_pickle=True) as loaded:
        try:
            matrix_format = loaded['format']
        except KeyError as e:
            raise ValueError('The file {} does not contain a sparse matrix.'.format(file)) from e

        # The format is stored as a 0-d array; unwrap it to a Python scalar.
        matrix_format = matrix_format.item()

        if not isinstance(matrix_format, str):
            # Play safe with Python 2 vs 3 backward compatibility;
            # files saved with SciPy < 1.0.0 may contain unicode or bytes.
            matrix_format = matrix_format.decode('ascii')

        try:
            cls = getattr(sparse_module, '{}_matrix'.format(matrix_format))
        except AttributeError as e:
            raise ValueError('Unknown matrix format "{}"'.format(matrix_format)) from e

        # Each family of formats is rebuilt from a different set of stored arrays.
        if matrix_format in ('csc', 'csr', 'bsr'):
            return cls((loaded['data'], loaded['indices'], loaded['indptr']), shape=loaded['shape'])
        elif matrix_format == 'dia':
            return cls((loaded['data'], loaded['offsets']), shape=loaded['shape'])
        elif matrix_format == 'coo':
            return cls((loaded['data'], (loaded['row'], loaded['col'])), shape=loaded['shape'])
        else:
            raise NotImplementedError('Load is not implemented for '
                                      'sparse matrix of format {}.'.format(matrix_format))


In [None]:
# VAST Challenge Dataset

# Load the full VAST Challenge 2011 microblog dataset.
# NOTE(review): read_pickle unpickles arbitrary objects; only use it on trusted files.
all_data_microblogs = pd.read_pickle('../data/user_study/pickle/microblogs_dataset.pkl.gz', compression='gzip')

# List of keyword stems indicating a tweet is relevant.
keywords = ['sore', 'throat', 'fever', 'fatigu', 'cough', 'short', 'breath', 'chill', 'sick',
            'pain', 'diarrhea', 'stomach', 'sweat', 'pneumonia', 'flu', 'ach', 'nausea', 'vomit',
            'nauseou', 'declin', 'health', 'headach', 'nose', 'runni']

# Label data points as relevant (1) or irrelevant (0): a post is relevant when
# at least one of its stems is a keyword. Iterating over the single
# `porter_stems` column with a set lookup avoids building a Series per row
# (DataFrame.apply with axis=1) and a list scan per stem.
_keyword_lookup = frozenset(keywords)
all_data_microblogs['label'] = [
    int(any(stem in _keyword_lookup for stem in stems))
    for stems in all_data_microblogs['porter_stems']
]

# Row positions (used with .iloc on all_data_microblogs) of the simulation subset.
subset_data_ids_2 = np.load('../data/simulation_study/model_50000_1/ids.npy')

In [None]:
# Simulation settings. These values are also bound as the default arguments of
# run_experiments at the time that function is defined.
num_experiments = 50   # number of independent repetitions of the simulation
total_budget = 250     # number of points queried (labelled) per repetition

# Sparse weight matrix loaded from disk and handed to the kNN model.
# NOTE(review): presumably pairwise text cosine-similarity weights over the
# subset in ids.npy (inferred from the file name) — confirm.
knn_text_weights = my_load_npz('../data/simulation_study/model_50000_1/text_cos_weights_weighted.npz')
# NOTE(review): the meaning of [0.99, 0.01] is defined by KNNModel (not visible here) — confirm.
knn_text_model = KNNModel([0.99, 0.01], knn_text_weights)


In [None]:

_POLICIES = ('random', 'as_greedy', 'unc')


def _predict_probs(test_ind, train_ind, train_labels):
    """Return the kNN model's predictions for `test_ind` as a flat 1-D array."""
    # NOTE(review): assumes predict returns one score per entry of test_ind;
    # flattening guards against (n, 1)-shaped or matrix-typed outputs — confirm.
    return np.asarray(knn_text_model.predict(test_ind, train_ind, train_labels)).ravel()


def _select_query(policy, rng, test_ind, train_ind, train_labels):
    """Choose the next pool index to label from `test_ind` according to `policy`."""
    if policy == 'random':
        return rng.choice(test_ind, 1)[0]

    probs = _predict_probs(test_ind, train_ind, train_labels)
    if policy == 'as_greedy':
        # Most confident positive; ties are broken uniformly at random.
        candidates = np.flatnonzero(probs == probs.max())
    elif policy == 'unc':
        # Prediction closest to 0.5; ties are broken uniformly at random.
        margin = np.abs(probs - 0.5)
        candidates = np.flatnonzero(margin == margin.min())
    else:
        raise ValueError(f'Unknown policy {policy!r}; expected one of {_POLICIES}')
    return rng.choice(test_ind[candidates], 1)[0]


def run_experiments(policy, num_experiments=num_experiments, total_budget=total_budget, seed=None):
    """
    Simulate a labelling session with the given query policy and print summary metrics.

    Each experiment starts from one randomly chosen relevant (label 1) point of
    the subset, then queries `total_budget` further points one at a time. After
    the budget is spent, the kNN model is evaluated on all unqueried points.

    Parameters
    ----------
    policy : {'random', 'as_greedy', 'unc'}
        'random' queries a uniformly random unlabelled point; 'as_greedy'
        queries a point with the highest predicted probability; 'unc' queries
        a point whose predicted probability is closest to 0.5.
    num_experiments : int
        Number of independent repetitions.
    total_budget : int
        Number of queries per repetition (the seed point is not counted).
    seed : int or None
        If given, selections made by this function are drawn from
        ``np.random.RandomState(seed)`` so a run can be reproduced. If None
        (default), NumPy's global random state is used, as before.

    Returns
    -------
    None
        Results are printed: mean ± 1.98 * std / sqrt(n) for each metric.

    Raises
    ------
    ValueError
        If `policy` is not one of the supported names.
    """
    if policy not in _POLICIES:
        raise ValueError(f'Unknown policy {policy!r}; expected one of {_POLICIES}')

    rng = np.random if seed is None else np.random.RandomState(seed)

    # Loop-invariant views of the subset, extracted once instead of doing one
    # DataFrame row lookup per access.
    pool = all_data_microblogs.iloc[subset_data_ids_2]
    pool_labels = pool['label'].to_numpy()
    pool_stems = pool['porter_stems'].to_numpy()
    pool_size = len(pool_labels)
    positive_pool = np.flatnonzero(pool_labels == 1)
    keyword_set = set(keywords)

    num_discoveries = []
    num_keywords = []
    model_recall = []
    model_f1_score = []
    model_auc_score = []

    for _ in range(num_experiments):
        # Seed the session with a single known-relevant point.
        train_ind = rng.choice(positive_pool, 1)
        train_labels = np.array([1 for _ in train_ind])

        # Boolean mask of points that have not been queried yet.
        is_unlabelled = np.ones(pool_size, dtype=bool)
        is_unlabelled[train_ind] = False

        for _ in range(total_budget):
            test_ind = np.flatnonzero(is_unlabelled)
            query = _select_query(policy, rng, test_ind, train_ind, train_labels)
            train_ind = np.append(train_ind, query)
            train_labels = np.append(train_labels, pool_labels[query])
            is_unlabelled[query] = False

        print('-', end='')

        # The seed point (position 0) was given, not discovered, so it is excluded.
        queried = train_ind[1:]

        # Discovery: number of relevant points among the queried ones.
        num_discoveries.append(pool_labels[queried].sum())

        # Detection: number of distinct keywords appearing in the queried posts.
        stems_seen = set().union(*pool_stems[queried])
        num_keywords.append(len(keyword_set & stems_seen))

        # Training: evaluate the model fitted on all queried points against the
        # remaining pool. `train_labels` already holds the true labels of
        # `train_ind`, passed as an ndarray exactly as during querying.
        test_ind = np.flatnonzero(is_unlabelled)
        probs = _predict_probs(test_ind, train_ind, train_labels)
        inferred_labels = (probs >= 0.5).astype(int)
        true_label = pool_labels[test_ind]
        model_recall.append(recall_score(true_label, inferred_labels))
        model_f1_score.append(f1_score(true_label, inferred_labels))
        model_auc_score.append(roc_auc_score(true_label, probs))

    print(f'\n{policy}')
    summary = (
        ('Discovery', num_discoveries),
        ('Detection', num_keywords),
        ('Recall', model_recall),
        ('F1 Score', model_f1_score),
        ('ROC-AUC Score', model_auc_score),
    )
    for metric_name, samples in summary:
        samples = np.asarray(samples, dtype=float)
        # NOTE(review): 1.98 is kept from the original analysis; it looks like an
        # approximate 95% critical value — confirm. std() is NumPy's default (ddof=0).
        half_width = 1.98 * samples.std() / np.sqrt(len(samples))
        print(f'{metric_name}: {samples.mean():.2f} ± {half_width:.3f}')


In [None]:
# Baseline: every query is drawn uniformly at random from the unlabelled points.
run_experiments('random')

In [None]:
# Greedy policy: every query is a point with the highest predicted probability.
run_experiments('as_greedy')

In [None]:
# Uncertainty policy: every query is a point whose predicted probability is closest to 0.5.
run_experiments('unc')