In [97]:
import numpy as np
from scipy.stats import norm, ttest_ind

In [98]:
# implementation for mapreducer iterable
def iter_group(queue):
    """Group adjacent ``(key, value)`` pairs that share the same key.

    Parameters
    ----------
    queue : iterable of (key, value) pairs
        Pairs with equal keys must be adjacent (e.g. pre-sorted by key) to
        end up in a single group.

    Yields
    ------
    (key, values) : tuple
        The key and the list of values of one run of adjacent equal keys,
        in input order. Nothing is yielded for an empty ``queue``.
    """
    # A private sentinel marks "no key seen yet" so that None stays usable as
    # a real key; testing against None would merge the key that follows a
    # leading None group into that group.
    no_key = object()
    prev_key = no_key
    buf = []

    for cur_key, cur_val in queue:
        if prev_key is not no_key and cur_key != prev_key:
            # Key changed: emit the finished group and start a new one.
            yield prev_key, buf
            buf = []
        buf.append(cur_val)
        prev_key = cur_key

    # Emit the last group, if at least one pair was consumed.
    if prev_key is not no_key:
        yield prev_key, buf



In [99]:
# mapreducer class - adding new elements to the map, reducing the map 
class MapReduce:
    """Collects (key, value) pairs and iterates over them grouped by key."""

    def __init__(self):
        # Flat list of (key, value) tuples in insertion order.
        self.queue = list()

    def add(self, a, b):
        """Record the pair ``(a, b)``; ``a`` acts as the key, ``b`` as the value."""
        pair = (a, b)
        self.queue.append(pair)

    def count(self):
        """Return how many pairs have been added so far."""
        queue_size = len(self.queue)
        return queue_size

    def __iter__(self):
        """Iterate over ``(key, values)`` groups built from the sorted pairs."""
        # Sorting a copy places equal keys next to each other for iter_group
        # and leaves self.queue untouched. Whole tuples are compared, so the
        # values inside each group come out sorted as well.
        ordered = list(self.queue)
        ordered.sort()
        return iter_group(ordered)


In [100]:
# Module-level MapReduce instance; mapper() and reducer() use it as their
# default map_reduce argument.
x = MapReduce()

Reading log.txt, adding each line's info into the mapreduce object

In [103]:
def mapper(filename='log.txt', map_reduce=x):
    """Load a comma-separated log file into ``map_reduce``.

    For every non-blank line, the second field is used as the key and the
    third field, converted to ``float``, as the value.

    Parameters
    ----------
    filename : str
        Path of the log file to read.
    map_reduce : object with an ``add(key, value)`` method
        Collector that receives one ``add`` call per record. The default is
        the module-level ``x``, bound when this function is defined.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than three comma-separated fields.
    ValueError
        If the third field cannot be parsed as a float.
    """
    with open(filename, 'r') as f:
        for line in f:
            record = line.strip()
            if not record:
                # Blank lines (e.g. a trailing newline at the end of the
                # file) carry no record and would otherwise raise IndexError.
                continue
            words = record.split(',')
            map_reduce.add(words[1], float(words[2]))

In [104]:
# Load log.txt into x. Every call appends to x.queue, so re-running this cell
# without re-creating x first adds each record a second time.
mapper(map_reduce=x)

Reducing the map by computing the mean frequency for each request type together with its confidence interval, then running a t-test on the means of the two independent samples for request types '/index' and '/test'

In [105]:
def reducer(map_reduce=x):
    """Summarise each key's values and compare '/index' against '/test'.

    Parameters
    ----------
    map_reduce : iterable of (key, values) pairs
        Grouped data, e.g. a ``MapReduce`` instance. The default is the
        module-level ``x``, bound when this function is defined.

    Returns
    -------
    means : list of (key, mean, (low, high))
        Per-key mean with its 95% confidence interval, computed from a normal
        distribution centred on the mean with the standard error as scale.
    p_value : float
        p-value of ``ttest_ind`` applied to the '/index' and '/test' values.
        If either key is absent, the corresponding sample is an empty list.
    """
    means = []
    X = []
    Y = []
    for word, freqs in map_reduce:
        mean = np.mean(freqs)
        # Standard error of the mean; np.std is left at its default ddof=0.
        # NOTE(review): ddof=1 is the usual choice for a sample standard error.
        std_err = np.std(freqs) / np.sqrt(len(freqs))
        # The confidence level is passed positionally: SciPy renamed this
        # keyword from `alpha` to `confidence`, and the positional form is
        # accepted under either name.
        interval = norm.interval(0.95, loc=mean, scale=std_err)
        means.append((word, mean, interval))
        if word == '/index':
            X = freqs
        elif word == '/test':
            Y = freqs
    # Assuming the two data sets share an equal, unknown variance, apply
    # Student's t-test to examine the hypothesis that the two independent
    # samples have identical means.
    _, p_value = ttest_ind(X, Y)
    return means, p_value

Printing result to 'output.txt'

In [106]:
# Run the analysis before opening the file: opening with 'w' truncates
# output.txt, so a failure inside reducer() would otherwise leave it empty.
means, p_value = reducer(x)
with open('output.txt', 'w') as f:
    print(means, file=f)
    # 0.05 is the threshold the p-value is compared against.
    if p_value < 0.05:
        print('reject the null hypothesis of equal averages as p-value {:05.3f} < 0.05 - confidence level'.format(p_value), file=f)
    else:
        print('cannot reject the null hypothesis of identical average scores as p-value {:05.3f} > 0.05 - confidence level'.format(p_value), file=f)