In [1]:
import numpy as np

In [2]:
# Number of sessions per group; used as the default group size by makeGroups().
N_splits = 1000
# Number of groups. NOTE(review): 1000000 is presumably the number of sessions
# in each input file -- confirm.
steps = 1000000 // N_splits

# Threshold the bootstrap t-statistic is compared against in bootstrapping().
# NOTE(review): 1.96 looks like the two-sided 5% normal critical value -- confirm.
t0 = 1.96

In [3]:
# Input files handed to process(), one per split being compared.
name1 = 'c1.txt'
name2 = 'c2.txt'

In [4]:
def process(name):
    """Load click sessions from a whitespace-separated text file.

    Every non-blank line of the file is one session: a sequence of integers
    separated by whitespace.  Blank lines are skipped.

    Parameters
    ----------
    name : str
        Path of the file to read.  The file is decoded as ``utf-8-sig``, so
        a leading byte-order mark is not part of the first token.

    Returns
    -------
    numpy.ndarray
        One row per session, in file order.

    Raises
    ------
    ValueError
        If a token cannot be parsed as an integer.
    """
    split = []
    with open(name, 'r', encoding='utf-8-sig') as f:
        # Iterate the file lazily instead of materialising readlines().
        for session in f:
            tokens = session.split()
            # A blank line (typically a trailing newline at the end of the
            # file) would add an empty row and make the result ragged.
            if not tokens:
                continue
            split.append(np.array([int(token) for token in tokens]))
    return np.array(split)


def getTstat(split_A, split_B):
    """Return the t-statistic for the difference of the two sample means.

    The difference ``mean(A) - mean(B)`` is divided by
    ``sqrt(std(A)**2 / n_A + std(B)**2 / n_B)``, where ``std`` is NumPy's
    default (population) standard deviation and ``n`` is the length of the
    first axis of each array.
    """
    var_of_mean_A = split_A.std() ** 2 / split_A.shape[0]
    var_of_mean_B = split_B.std() ** 2 / split_B.shape[0]
    sigma = np.sqrt(var_of_mean_A + var_of_mean_B)
    mean_gap = split_A.mean() - split_B.mean()
    return mean_gap / sigma


# Drawing the elements one at a time in a Python loop is too slow, so all
# indices are drawn with a single vectorised call.
def sampling_sessions(split, N_splits=N_splits, steps=steps):
    """Draw ``steps`` groups of ``N_splits`` rows from ``split`` with replacement.

    Parameters
    ----------
    split : numpy.ndarray
        Array whose first axis is sampled.
    N_splits : int
        Number of rows drawn for each group.
    steps : int
        Number of groups to draw.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(steps, N_splits) + split.shape[1:]`` holding the
        sampled rows.

    Raises
    ------
    ValueError
        Raised by ``np.random.randint`` when ``split`` has no rows but
        samples are requested.
    """
    size = split.shape[0]
    # One index per drawn row, uniform over [0, size).
    idx = np.random.randint(size, size=(steps, N_splits))
    # Integer-array indexing gathers all sampled rows at once.
    return split[idx]


def makeGroups(split, steps=steps, N_splits=N_splits):
    """Cut ``split`` into ``steps`` consecutive groups of ``N_splits`` rows.

    Group ``k`` is the slice ``split[k * N_splits:(k + 1) * N_splits]``; rows
    past ``steps * N_splits`` are not used.  The groups are stacked into a
    single array.
    """
    starts = (index * N_splits for index in range(steps))
    return np.array([split[start:start + N_splits] for start in starts])


def sampling_metrics(split):
    """Return a bootstrap resample of ``split``.

    As many entries as ``split`` has along its first axis are drawn uniformly
    with replacement from that axis.

    Parameters
    ----------
    split : numpy.ndarray
        Array to resample along its first axis.

    Returns
    -------
    numpy.ndarray
        Resampled array with the same shape as ``split``; an empty 1-D array
        when ``split`` has no entries.
    """
    size = split.shape[0]
    if size == 0:
        # Nothing to draw from; keep the empty result an empty loop would give.
        return np.array([])
    # Draw all indices in one call instead of one Python-level call per entry;
    # this function runs inside the bootstrap loop.
    return split[np.random.randint(size, size=size)]


def bootstrapping(split_A, split_B, metric, N=1000, t0=t0):
    """Bootstrap the t-statistic of a per-group metric between two splits.

    Both splits are cut into groups with makeGroups() and ``metric`` is
    evaluated once on each set of groups.  The per-group metric values are
    then resampled ``N`` times with sampling_metrics(); for every resample
    the t-statistic from getTstat() is compared with ``t0``.  The comparison
    is one-sided: only ``t > t0`` is counted.

    Progress and the final value are printed, and the value is also returned
    so callers can use it programmatically.

    Parameters
    ----------
    split_A, split_B : numpy.ndarray
        Sessions of the two splits.
    metric : callable
        Maps the grouped sessions to an array of per-group metric values.
    N : int
        Number of bootstrap resamples.
    t0 : float
        Threshold the t-statistic must exceed to be counted.

    Returns
    -------
    float
        Share of resamples whose t-statistic exceeded ``t0``.

    Raises
    ------
    ZeroDivisionError
        If ``N`` is 0.
    """
    print(metric.__name__)
    groups_A = makeGroups(split_A)
    groups_B = makeGroups(split_B)
    # The metric is computed once per group; only these values are resampled.
    A_metric = metric(groups_A)
    B_metric = metric(groups_B)
    count = 0
    for i in range(N):
        # '\r' rewrites the same console line to act as a progress counter.
        print('\r{}'.format(i), end='')
        sample_A = sampling_metrics(A_metric)
        sample_B = sampling_metrics(B_metric)
        if getTstat(sample_A, sample_B) > t0:
            count += 1
    ASL = count / N
    print('\nASL:\t', ASL)
    return ASL

    
def getCPQ(split, N_splits=N_splits):
    """Return, for every group, the count of sessions with a nonzero entry
    divided by ``N_splits``.

    ``split`` is iterated group by group, and each group session by session.
    A session counts when ``session.any()`` is true.
    """
    def clicked_share(group):
        # Number of sessions in the group with at least one nonzero entry.
        clicked = sum(1 for session in group if session.any())
        return clicked / N_splits

    return np.array([clicked_share(group) for group in split])


def getFirstClick(split):
    """Return, for every group, the sum of the first entry of each session.

    ``split`` is iterated group by group, and each group session by session.
    Only ``session[0]`` contributes to the group's total.
    """
    return np.array([sum(session[0] for session in group) for group in split])


def getCTR(split, group_size=100):
    """Compute a click-through rate for consecutive blocks of rows.

    ``split`` is cut into ``split.shape[0] // group_size`` consecutive blocks
    of ``group_size`` rows; leftover rows at the end are ignored.  Within a
    block the entries are summed along axis 1, and the number of nonzero sums
    is divided by ``group_size``.  For a 2-D ``split`` this is the fraction
    of rows in the block whose entries sum to a nonzero value.

    NOTE(review): bootstrapping() passes the 3-D output of makeGroups() here;
    in that case each block consists of whole groups and the sum runs over
    the sessions of each group -- confirm that this is intended.

    Parameters
    ----------
    split : numpy.ndarray
        Array with at least two dimensions.
    group_size : int
        Number of rows per block.  Defaults to 100.

    Returns
    -------
    numpy.ndarray
        1-D float array with one value per block.

    Raises
    ------
    ValueError
        If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError('group_size must be a positive integer')
    split_ctr = np.empty(split.shape[0] // group_size)
    for i in range(split_ctr.size):
        # Both slice bounds are derived from group_size, so every block has
        # exactly group_size rows whatever size is requested.
        block = split[i * group_size: (i + 1) * group_size]
        click = np.sum(block, axis=1)
        split_ctr[i] = np.count_nonzero(click != 0) / group_size
    return split_ctr


In [5]:
# Load the sessions of both splits from their input files.
split_A = process(name1)
split_B = process(name2)

In [6]:
# Bootstrap comparison of the two splits on the getCPQ metric.
bootstrapping(split_A, split_B, getCPQ)

getCPQ
999
ASL:	 1.0


In [7]:
# Bootstrap comparison of the two splits on the getCTR metric.
bootstrapping(split_A, split_B, getCTR)

getCTR
999
ASL:	 1.0


In [8]:
# Bootstrap comparison of the two splits on the getFirstClick metric.
bootstrapping(split_A, split_B, getFirstClick)

getFirstClick
999
ASL:	 1.0


Так как во всех трёх случаях значение ASL $\approx$ 1.0, гипотезу H0 о том, что различие метрик (CPQ, CTR, Clicks@1) на двух сплитах можно объяснить шумами, нужно отвергнуть.