In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path

In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [3]:
%load_ext line_profiler

# Synthetic data


In [37]:
np.random.seed(0)
n_feat = 5
size1, size2 = 100, 100
s1 = pd.DataFrame(np.random.normal(size=(size1, n_feat)))
s2 = pd.DataFrame(s1.values[:size2] + np.random.normal(scale=1, size=(size2, n_feat)))

In [6]:
s1.head()

Unnamed: 0,0,1,2,3,4
0,1.764052,0.400157,0.978738,2.240893,1.867558
1,-0.977278,0.950088,-0.151357,-0.103219,0.410599
2,0.144044,1.454274,0.761038,0.121675,0.443863
3,0.333674,1.494079,-0.205158,0.313068,-0.854096
4,-2.55299,0.653619,0.864436,-0.742165,2.269755


In [7]:
s2.head()

Unnamed: 0,0,1,2,3,4
0,2.073776,-0.337299,-0.558182,1.678638,0.268047
1,-0.152888,3.441575,0.860498,-0.384457,0.427305
2,1.297963,0.853358,0.893975,-2.048299,-0.672696
3,-0.235694,2.130346,-0.972807,0.931547,-0.670935
4,-0.693639,-0.421896,-1.154784,-3.179629,2.121191


# Testing

In [4]:
import big_roc
import big_roc.metrics as metrics

In [5]:
sim_min = -1
sim_max = 1
eps = 1e-6
n_intervals = 100
intervals = np.append(np.linspace(sim_min - eps, sim_max + eps, n_intervals), [np.inf])

In [208]:
%%time
gen_hist, imp_hist = big_roc.calc_gen_imp_hist(s1, s2, intervals, batch_size=1000)

Wall time: 13.4 s


In [158]:
conf_mat = metrics.confusion_matrix(gen_hist, imp_hist)

In [160]:
conf_mat['fpr'] = metrics.false_positive_rate(conf_mat)
conf_mat['fnr'] = metrics.false_negative_rate(conf_mat)

In [166]:
metrics.equal_error_rate(conf_mat['fpr'], conf_mat['fnr'])

(0.1820244380169731, 0.1815, 0.1825143414341434, 0.0005244380169731055)

In [173]:
metrics.fnr_at_fpr(conf_mat['fpr'], conf_mat['fnr'], 0.01)

MetricEstimate(name='fnr@fpr0.01', value=0.7736798229822983, value_min=0.7672170617061707, value_max=0.7801425842584259, error=0.00646276127612766)

# Lee's data

In [5]:
from big_roc.run_analysis import run_analysis
from big_roc.utils import read_dataset

In [6]:
dataset_path = Path("data", "SendToVlad_II", "VladDataSet_Band7_N20000_NFeat_10.csv")

In [7]:
s1, s2 = read_dataset(dataset_path)

In [79]:
%%time
roc_metrics, gen_metrics, imp_metrics = run_analysis(s1, s2, Path("Band7_NSub020000_NFeat010_Repeat001"), safe_output=False)

Wall time: 3min 5s


In [80]:
roc_metrics

Unnamed: 0_level_0,Value,ValueMin,ValueMax,Error
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
EER,0.07015,0.07015,0.07015,0.0
FNR_AT_FPR_001,0.647683,0.647683,0.647683,0.0
FNR_AT_FPR_0001,0.80486,0.80486,0.80486,0.0
FNR_AT_FPR_00001,0.853291,0.816613,1.0,0.146709
FNR_AT_FPR_000001,0.820281,0.816613,1.0,0.179719
AUC,0.980111,,,
Rank1_IR,0.059,,,
GenMdn,0.885264,,,
GenIqr,0.095176,,,
GenSkew,-1.443386,,,


In [19]:
%%time
roc_metrics, gen_metrics, imp_metrics = run_analysis(s1, s2, Path("out4"), safe_output=False)

KeyboardInterrupt: 

In [72]:
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_01(x, y):
    return (cosine_similarity(x, y) + 1) / 2


def rank1_ir(s1: pd.DataFrame, s2: pd.DataFrame, similarity_measure=cosine_similarity_01) -> float:
    correctly_identified = 0.
    for subj_id, row in s1.iterrows():
        sims = similarity_measure(row.values.reshape(1, -1), s2).flatten()
        ind_max = np.argwhere(sims == np.amax(sims)).flatten()
        if subj_id in s1.index[ind_max]:
            correctly_identified += 1 / len(ind_max)
    return correctly_identified, correctly_identified / s1.shape[0]


In [65]:
s2.index[[1, 2]]

Int64Index([5, 10], dtype='int64', name='Sub')

In [32]:
s1.iloc[:1000].values.shape

(1000, 10)

In [73]:
%%time
rank1_ir(s1, s2)

Wall time: 50.1 s


(1180.0, 0.059)

In [43]:
%%time
indices = rank1_ir(s1, s2)

Wall time: 46.2 s


In [61]:
is_identified = np.array(indices) == np.array(range(len(indices)))

In [62]:
np.sum(is_identified) / len(is_identified)

0.059

In [11]:
roc_metrics

Unnamed: 0_level_0,Value,ValueMin,ValueMax,Error
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
EER,0.067,0.067,0.067,0.0
FNR_AT_FPR_001,0.591944,0.591944,0.591944,0.0
FNR_AT_FPR_0001,0.63275,0.591944,1.0,0.36725
FNR_AT_FPR_00001,0.596025,0.591944,1.0,0.403975
FNR_AT_FPR_000001,0.592352,0.591944,1.0,0.407648
AUC,0.98035,,,
Rank1_IR,0.0,,,
GenMdn,0.887754,,,
GenIqr,0.089421,,,
GenSkew,-1.645273,,,


In [63]:
roc = roc_metrics.copy()

In [64]:
roc = roc.set_index("name")

In [65]:
roc

Unnamed: 0_level_0,value,value_min,value_max,error
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
eer,0.07015,0.07015,0.07015,0.0
fnr@fpr0.001,0.647683,0.647683,0.647683,0.0
fnr@fpr0.0001,0.80486,0.80486,0.80486,0.0
fnr@fpr1e-05,0.853291,0.816613,1.0,0.146709


In [66]:
for key, value in gen_metrics._asdict().items():
    roc.loc["Gen" + key] = pd.Series({"value": value})
for key, value in imp_metrics._asdict().items():
    roc.loc["Imp" + key] = pd.Series({"value": value})


In [89]:
x = 0.0000001

print(x)
print(lreplace(f"{x:.20f}".rstrip("0"), "0."))

1e-07


NameError: name 'lreplace' is not defined

In [68]:
roc["value"]

name
eer              0.070150
fnr@fpr0.001     0.647683
fnr@fpr0.0001    0.804860
fnr@fpr1e-05     0.853291
Genmedian        0.885277
GenIQR           0.094978
Genskewness     -1.444710
Genkurtosis      3.091522
Impmedian        0.499672
ImpIQR           0.227921
Impskewness      0.001321
Impkurtosis     -0.497151
Name: value, dtype: float64

In [55]:
roc

Unnamed: 0,name,value,value_min,value_max,error
0,eer,0.07015,0.07015,0.07015,0.0
1,fnr@fpr0.001,0.647683,0.647683,0.647683,0.0
2,fnr@fpr0.0001,0.80486,0.80486,0.80486,0.0
3,fnr@fpr1e-05,0.853291,0.816613,1.0,0.146709
4,Genmedian,0.885277,,,
5,GenIQR,0.094978,,,
6,Genskewness,-1.44471,,,
7,Genkurtosis,3.091522,,,
8,Impmedian,0.499672,,,
9,ImpIQR,0.227921,,,


In [56]:
roc.set_index("name")

Unnamed: 0_level_0,value,value_min,value_max,error
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
eer,0.07015,0.07015,0.07015,0.0
fnr@fpr0.001,0.647683,0.647683,0.647683,0.0
fnr@fpr0.0001,0.80486,0.80486,0.80486,0.0
fnr@fpr1e-05,0.853291,0.816613,1.0,0.146709
Genmedian,0.885277,,,
GenIQR,0.094978,,,
Genskewness,-1.44471,,,
Genkurtosis,3.091522,,,
Impmedian,0.499672,,,
ImpIQR,0.227921,,,


In [46]:
roc[["name", "value"]] = list(zip(*gen_metrics._asdict().items()))

ValueError: Must have equal len keys and value when setting with an ndarray

In [41]:
roc

Unnamed: 0,name,value,value_min,value_max,error
0,eer,,0.07015,0.07015,0.0
1,fnr@fpr0.001,,0.647683,0.647683,0.0
2,fnr@fpr0.0001,,0.80486,0.80486,0.0
3,fnr@fpr1e-05,,0.816613,1.0,0.146709


In [38]:
pd.Series(gen_metrics._asdict())

median      0.885277
IQR         0.094978
skewness   -1.444710
kurtosis    3.091522
dtype: float64

In [37]:
pd.DataFrame(gen_metrics._asdict(), columns=["value"])

Unnamed: 0,value
0,


# Distribution stats approximation

In [1]:
from pathlib import Path
from scipy import stats
import pandas as pd
import numpy as np

from big_roc import HistogramSampler

In [6]:
def distribution_stats(sample):
    return {
        'median': np.median(sample),
        'IQR': stats.iqr(sample),
        'skewness': stats.skew(sample),
        'kurtosis': stats.kurtosis(sample)
    }

In [2]:
data_path = Path("out", "gen_imp_hist_roc.csv")

In [3]:
df = pd.read_csv(data_path)
df['bin_edges'] = (df['bin_edges'] + 1) / 2

In [4]:
df.head()

Unnamed: 0,bin_edges,gen_hist,imp_hist,fpr,fnr
0,-5e-07,0,0,1.0,0.0
1,5.00002e-07,0,0,1.0,0.0
2,1.500004e-06,0,0,1.0,0.0
3,2.500006e-06,0,0,1.0,0.0
4,3.500008e-06,0,0,1.0,0.0


In [5]:
gen_hist_sampler = HistogramSampler(df['gen_hist'], df['bin_edges'])
imp_hist_sampler = HistogramSampler(df['imp_hist'], df['bin_edges'])

### Genuine

In [7]:
distribution_stats(gen_hist_sampler.naive_sample())

{'median': 0.8851902703797703,
 'IQR': 0.09502369004719002,
 'skewness': -1.4487407967393933,
 'kurtosis': 3.119485198560535}

In [12]:
distribution_stats(gen_hist_sampler.probabilistic_sample(10**6))

{'median': 0.8851900433885185,
 'IQR': 0.09505442922146101,
 'skewness': -1.4489052083222218,
 'kurtosis': 3.1225499646015553}

### Impostor

In [13]:
distribution_stats(imp_hist_sampler.naive_sample())

{'median': 0.500003500007,
 'IQR': 0.22807745615445613,
 'skewness': -0.00014986564932556307,
 'kurtosis': -0.5001786946240618}

In [14]:
distribution_stats(imp_hist_sampler.probabilistic_sample(10**6))

{'median': 0.4997920470076376,
 'IQR': 0.22754663608605435,
 'skewness': 0.001621352506498595,
 'kurtosis': -0.49706580861378624}