In [1]:
import numpy as np
import pandas as pd
from scipy.stats import norm, laplace
import torch.nn as nn
import torch.optim as optim
import torch
import torch.nn.functional as F

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.lines import Line2D
%matplotlib inline
from IPython import display

import random
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from scipy.stats import gaussian_kde
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score, log_loss, mean_absolute_error, mean_squared_error, brier_score_loss
from sklearn.metrics import precision_score, recall_score, roc_auc_score, balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA

from scipy.stats import linregress, probplot, t, wilcoxon, ttest_rel
from scipy.optimize import minimize
from statsmodels.stats.multitest import multipletests

from algorithms import *
from utils import *
from KMPE import *
from NN_functions import *

from torchvision.datasets import MNIST

from tqdm import tqdm_notebook as tqdm
# from tqdm import tqdm as tqdm

import warnings
warnings.filterwarnings('ignore')

In [2]:
def normalize_by_max(x, y):
    """Return the element-wise difference of x and y scaled by their element-wise maximum.

    Computes ``(x - y) / max(x, y)`` per position. A position where both the
    difference and the maximum are zero (the 0 / 0 case, e.g. two zero
    errors) yields 0 instead of NaN, so tied zero pairs do not inject NaNs
    into whatever consumes the result.

    Parameters
    ----------
    x, y : scalars or array-likes
        Must support ``x - y`` and be stackable by ``np.max([x, y], axis=0)``.

    Returns
    -------
    The normalized differences, of whatever type ``(x - y) / ndarray`` yields.
    """
    diff = x - y
    denom = np.max([x, y], axis=0)
    # Only 0 / 0 is redefined; a non-zero difference over a zero maximum
    # still divides by zero exactly as before.
    undefined = np.asarray((denom == 0) & (diff == 0))
    safe_denom = np.where(undefined, 1, denom)
    return diff / safe_denom

def wilcoxon_n(x, y, **kwargs):
    """Run the Wilcoxon signed-rank test on max-normalized paired differences.

    The paired samples are first reduced to ``normalize_by_max(x, y)`` and
    that single sample of differences is handed to ``scipy.stats.wilcoxon``.

    Pairs whose normalized difference is NaN (for instance because one of
    the two samples has a missing value at that position) cannot be ranked,
    so for 1-D input they are dropped before testing. If the caller passes
    ``nan_policy`` explicitly, NaN handling is left entirely to scipy.

    Parameters
    ----------
    x, y : array-like
        Paired measurements of equal length.
    **kwargs
        Forwarded unchanged to ``scipy.stats.wilcoxon``.

    Returns
    -------
    The result object returned by ``scipy.stats.wilcoxon``.
    """
    diffs = np.asarray(normalize_by_max(x, y), dtype=float)
    if 'nan_policy' not in kwargs and diffs.ndim == 1:
        # Keep only complete pairs; a NaN difference carries no sign or rank.
        diffs = diffs[~np.isnan(diffs)]
    return wilcoxon(diffs, **kwargs)

In [33]:
def _read_results(path):
    """Load one raw results table (';'-separated, ',' as the decimal mark)."""
    return pd.read_csv(path, sep=';', decimal=',')


# Real-data rows for these datasets are excluded from every comparison below.
_EXCLUDED_REAL = {'mnist_2', 'mnist_3', 'housing'}

# --- 1) Mixture-proportion error on synthetic data ('laplace' only) ---
res = _read_results('alpha_synth_raw.csv').rename(columns={'distribution': 'dataset'})
# .copy() so the column added next is written to an independent frame rather
# than to a filtered slice of the loaded table (chained assignment).
res = res[res['dataset'] == 'laplace'].copy()
res['alpha_mae'] = (res['cons_alpha'] - res['est_alpha']).abs()
# pivot_table uses its default aggregation (mean) for rows that share the
# same index and estimator.
res_pivot_alpha = res.pivot_table(index=['dataset', 'ds', 'dmu', 'alpha', 'random_state'],
                                  columns=['estimator'],
                                  values='alpha_mae').rename(columns={'KM': 'KM_2'})

# --- 2) Mixture-proportion error on real data ---
res = _read_results('alpha_real_raw.csv').rename(columns={'distribution': 'dataset'})
res = res[~res['dataset'].isin(_EXCLUDED_REAL)].copy()
# 'mnist_1' is a value of the 'dataset' column, not a column label, so it has
# to be relabelled with replace(); rename(columns=...) would leave it as is.
res['dataset'] = res['dataset'].replace({'mnist_1': 'mnist'})
res['alpha_mae'] = (res['real_alpha'] - res['est_alpha']).abs()
res_pivot_alpha_uci = res.pivot_table(index=['dataset', 'alpha', 'random_state'],
                                      columns=['estimator'],
                                      values='alpha_mae').rename(columns={'KM': 'KM_2'})

# --- 3) 'mae' metric on synthetic data ('laplace' only) ---
res = _read_results('mae_synth_raw.csv').rename(columns={'distribution': 'dataset'})
res = res[res['dataset'] == 'laplace']
# Only the 'mae' column is pivoted here, so no alpha error is derived.
res_pivot_mae = res.pivot_table(index=['dataset', 'ds', 'dmu', 'alpha', 'random_state'],
                                columns=['estimator'],
                                values='mae')
res_pivot_mae = res_pivot_mae[['e3_en', 'sigmoid_nnre', 'brier_nnre', 'dedpul', 'random_dedpul']].\
    rename(columns={'e3_en': 'en'})

# --- 4) Classification error on real data ---
res = _read_results('acc_real_raw.csv')
res = res[~res['dataset'].isin(_EXCLUDED_REAL)].copy()
res['dataset'] = res['dataset'].replace({'mnist_1': 'mnist'})
res['alpha_mae'] = (res['real_alpha'] - res['est_alpha']).abs()
# NOTE: after this line the 'accuracy' column holds 1 - accuracy, i.e. an
# error rate, so that lower is better like the other metrics.
res['accuracy'] = 1 - res['accuracy']
res_pivot_acc = res.pivot_table(index=['dataset', 'alpha', 'random_state'],
                                columns=['estimator'],
                                values='accuracy')
res_pivot_acc = res_pivot_acc[['e1_en', 'sigmoid_nnre', 'brier_nnre', 'dedpul', 'random_dedpul']].\
    rename(columns={'e1_en': 'en'})

### DEDPUL (proposed) outperforms KM (state-of-the-art) and EN on Mixture Proportions Estimation

In [56]:
print('errors:')
# Mean of each estimator's column in res_pivot_alpha, printed one per line.
synth_fields = []
for position, estimator in enumerate(['dedpul', 'KM_2', 'e1_en', 'e3_en', 'em_en']):
    # Every label but the first starts on a new line.
    line_break = '' if position == 0 else '\n'
    synth_fields.append(line_break + estimator + ' mean:')
    synth_fields.append(res_pivot_alpha[estimator].mean())
print(*synth_fields)

errors:
dedpul mean: 0.03482985714285716 
KM_2 mean: 0.053359428571428606 
e1_en mean: 0.1617116285714286 
e3_en mean: 0.0534276285714286 
em_en mean: 0.0404159142857143


In [37]:
print('alpha star')
# Paired test of the 'dedpul' column of res_pivot_alpha against each other
# estimator column, in this order.
dedpul_alpha_synth = res_pivot_alpha['dedpul'].values
tuple(
    wilcoxon_n(dedpul_alpha_synth, res_pivot_alpha[baseline].values)
    for baseline in ('KM_2', 'e1_en', 'e3_en', 'em_en')
)

alpha star


(WilcoxonResult(statistic=18562.0, pvalue=1.4141772573369528e-10),
 WilcoxonResult(statistic=4265.0, pvalue=4.218922304845355e-44),
 WilcoxonResult(statistic=18975.0, pvalue=5.779437733435377e-10),
 WilcoxonResult(statistic=23753.5, pvalue=0.0002390460771696664))

In [45]:
# Mean of each estimator's column in res_pivot_alpha_uci, one per line.
uci_labels = ['dedpul mean:', '\nKM_2 mean:', '\ne1_en mean:', '\ne3_en mean:', '\nem_en mean:']
uci_columns = ['dedpul', 'KM_2', 'e1_en', 'e3_en', 'em_en']
uci_fields = []
for label, column in zip(uci_labels, uci_columns):
    uci_fields += [label, res_pivot_alpha_uci[column].mean()]
print(*uci_fields)

dedpul mean: 0.03185037777777778 
KM_2 mean: 0.098014606741573 
e1_en mean: 0.15292895555555533 
e3_en mean: 0.20536588888888885 
em_en mean: 0.2143315333333334


In [39]:
print('alpha')
# Same paired comparison as for the synthetic table, on res_pivot_alpha_uci.
uci_tests = []
for baseline in ['KM_2', 'e1_en', 'e3_en', 'em_en']:
    uci_tests.append(
        wilcoxon_n(res_pivot_alpha_uci['dedpul'].values,
                   res_pivot_alpha_uci[baseline].values)
    )
tuple(uci_tests)

alpha


(WilcoxonResult(statistic=16289.0, pvalue=9.583312300482657e-36),
 WilcoxonResult(statistic=1646.0, pvalue=1.37600392476376e-70),
 WilcoxonResult(statistic=3013.0, pvalue=5.6142276811016345e-67),
 WilcoxonResult(statistic=8787.0, pvalue=3.6469734327015804e-52))

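Conclusion: DEDPUL outperforms EN and KM on mixture proportion estimation. Its mean absolute error is the lowest on both the synthetic and the real data, and every paired Wilcoxon test above gives p < 0.001.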

### DEDPUL (proposed) outperforms EN and nnRE (state-of-the-art) on Positive-Unlabeled Classification

In [52]:
# Mean of each estimator's column in res_pivot_mae, one per line.
mae_fields = []
for position, estimator in enumerate(['dedpul', 'brier_nnre', 'sigmoid_nnre', 'en']):
    # Every label but the first starts on a new line.
    line_break = '\n' if position else ''
    mae_fields.extend([f'{line_break}{estimator} mean:', res_pivot_mae[estimator].mean()])
print(*mae_fields)

dedpul mean: 0.03177765714285712 
brier_nnre mean: 0.034691800000000016 
sigmoid_nnre mean: 0.08339682857142855 
en mean: 0.03516128571428574


In [50]:
print('mae')
# Paired test of the 'dedpul' column of res_pivot_mae against each competitor.
dedpul_mae = res_pivot_mae['dedpul'].values
tuple(
    wilcoxon_n(dedpul_mae, res_pivot_mae[competitor].values)
    for competitor in ('sigmoid_nnre', 'brier_nnre', 'en')
)

mae


(WilcoxonResult(statistic=4463.0, pvalue=1.1484220955502075e-43),
 WilcoxonResult(statistic=21248.0, pvalue=5.841215800473716e-07),
 WilcoxonResult(statistic=22330.0, pvalue=9.635438400329568e-06))

In [53]:
# Mean of each estimator's column in res_pivot_acc, one per line.
acc_labels = ['dedpul mean:', '\nbrier_nnre mean:', '\nsigmoid_nnre mean:', '\nen mean:']
acc_columns = ['dedpul', 'brier_nnre', 'sigmoid_nnre', 'en']
acc_fields = []
for label, column in zip(acc_labels, acc_columns):
    acc_fields += [label, res_pivot_acc[column].mean()]
print(*acc_fields)

dedpul mean: 0.06325473333333347 
brier_nnre mean: 0.06657617777777783 
sigmoid_nnre mean: 0.07851133333333332 
en mean: 0.0995056222222221


In [55]:
print('accuracy')
# The 'dedpul' column is always passed as the first sample, matching the
# other comparison cells. With wilcoxon's default two-sided alternative the
# result does not depend on the argument order, but a one-sided
# `alternative` would, so the order is kept uniform.
dedpul_acc = res_pivot_acc['dedpul'].values
tuple(
    wilcoxon_n(dedpul_acc, res_pivot_acc[competitor].values)
    for competitor in ('sigmoid_nnre', 'brier_nnre', 'en')
)

accuracy


(WilcoxonResult(statistic=21461.0, pvalue=1.68345603470731e-23),
 WilcoxonResult(statistic=25873.0, pvalue=1.2635178082095275e-14),
 WilcoxonResult(statistic=4375.0, pvalue=9.513622262822845e-63))

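Conclusion: DEDPUL outperforms nnRE and EN. It has the lowest mean MAE on the synthetic data and the lowest mean classification error (1 − accuracy) on the real data, and every paired Wilcoxon test above gives p < 1e-5.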

### Proposed modification of nnRE improves its performance on Positive-Unlabeled Classification

In [57]:
print('mae')
# Paired test between the two nnRE columns of res_pivot_mae.
brier_mae = res_pivot_mae['brier_nnre'].values
sigmoid_mae = res_pivot_mae['sigmoid_nnre'].values
wilcoxon_n(brier_mae, sigmoid_mae)

mae


WilcoxonResult(statistic=12923.0, pvalue=5.930000488931337e-21)

In [58]:
print('accuracy')
# Paired test between the two nnRE columns of res_pivot_acc.
brier_acc = res_pivot_acc['brier_nnre'].values
sigmoid_acc = res_pivot_acc['sigmoid_nnre'].values
wilcoxon_n(brier_acc, sigmoid_acc)

accuracy


WilcoxonResult(statistic=20647.0, pvalue=1.0191564741751708e-17)

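Conclusion: brier_nnre outperforms sigmoid_nnre. Its mean MAE and mean classification error are both lower, and both paired Wilcoxon tests give p < 1e-16.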