In [1]:
# simple implementations of the range and xrange

def range2(start, stop, step=1): # this version uses N*x more memory
    numbers = []
    while start < stop:
        numbers.append(start)
        start += step
        return numbers
    
def xrange2(start, stop, step=1):
    while start < stop: # StopIteration excpetion is thrown when the function reaches its end.
        yield start # generator is able to 'return' many values, emits value
        start += step
        
for i in range2(1,10000):
    pass

for i in xrange2(1,10000):
    pass

In [None]:
# Python for loop deconstructed
# The python loop
for i in object:
    do_work(i)
    
# Is equipvalent to
object_iteraor = iter(object)
while True:
    try: 
        i = object_iterator.next()
        do_work(i)
    except StopIteration:
        break;

In [2]:
def test_range():
    # range(100,000,000) would create a list 3.1 GB large
    for i in range2(1, 10000000):
        pass
    
def test_xrange():
    for i in xrange2(1, 10000000):
        pass
    
%timeit test_range()
%timeit test_xrange()

The slowest run took 5.65 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 548 ns per loop
1 loop, best of 3: 1.42 s per loop


In [3]:
# list comprehension vs generator comprehension
def list_comprehension(list_of_numbers):
    return len([n for n in list_of_numbers if n % 3 == 0])
  
    
def gen_comprehension(list_of_numbers):
    return sum((1 for n in list_of_numbers if n % 3 == 0))

list_of_numbers = range(0, 10000000)
%timeit list_comprehension(list_of_numbers)
%timeit gen_comprehension(list_of_numbers)

1 loop, best of 3: 1.26 s per loop
1 loop, best of 3: 1.41 s per loop


In [4]:
r = range(0, 1000)
l = [n*2 for n in r] # List comprehension
g = (n*2 for n in r) # Generator expression
 
print(type(l))  # <class 'list'>
print(type(g))  # <class 'generator'>

%timeit [n*2 for n in r] 
%timeit (n*2 for n in r)

import sys
print(sys.getsizeof(l))  # 9024
print(sys.getsizeof(g))  # 88, same performance but low memory

<class 'list'>
<class 'generator'>
10000 loops, best of 3: 83.4 µs per loop
The slowest run took 5.50 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 425 ns per loop
9024
88


In [6]:
# infinite series
def fibonacchi():
    i, j = 0, 1
    while True:
        yield j
        i, j = j, i + j
        
# count of fibonacci numbers below 5,000
def fibonacchi_native():
    i, j = 0, 1
    count = 0
    while j <= 5000:
        if j % 2:
            count += 1
        i, j = j, i + j
    return count

def fibonacci_transform():
    count = 0
    for f in fibonacchi():
        if f > 5000:
            break
        if f % 2:
            count += 1
    return count

# generating data and transforming data
from itertools import islice
def fibonacci_succinct():
    is_odd = lambda x : x % 2
    first_5000 = islice(fibonacci(), 0, 5000)
    return sum(1 for x in first_5000 if is_odd(x))

In [7]:
# Lazily reading data
import math
from random import normalvariate, random
from itertools import count, groupby, islice
from datetime import (date, datetime)


def read_data(filename):
    with open(filename) as fd:
        for line in fd:
            data = line.strip().split(',')
            yield map(int, data)


def read_fake_data(filename):
    for i in count():
        sigma = random() * 10
        yield (i, normalvariate(0, sigma))


# Grouping data
from datetime import date
from itertools import groupby
def day_grouper(iterable):
    key = lambda timestamp_value: date.fromtimestamp(timestamp_value[0])
    return groupby(iterable, key)


def check_anomaly(xxx_todo_changeme):
    # We find the mean, standard deviation and maximum values for the day.
    # Using a single pass mean/standard deviation algorithm allows us to only
    # read through the day's data once.
    (day, day_data) = xxx_todo_changeme
    n = 0
    mean = 0
    M2 = 0
    max_value = None
    for timestamp, value in day_data:
        n += 1
        delta = value - mean
        mean = mean + delta / n
        M2 += delta * (value - mean)
        max_value = max(max_value, value) if max_value else value
    variance = M2 / (n - 1)
    standard_deviation = math.sqrt(variance)

    # Here is the actual check of whether that day's data is anomalous.  If it
    # is, we return the value of the day, otherwise we return false
    if max_value > mean + 6 * standard_deviation:
        return day
    return False


def rolling_window_grouper(data, window_size):
    window = tuple(islice(data, 0, window_size))
    while True:
        current_datetime = datetime.fromtimestamp(window[0][0])
        yield (current_datetime, window)
        window = window[1:] + (next(data),)

        
data = read_fake_data("fake_filename")
data_day = day_grouper(data)
anomalous_dates = filter(None, map(check_anomaly, data_day))

first_anomalous_date = next(anomalous_dates)
print ("The first anomalous date is: ", first_anomalous_date)
next_10_dates = islice(anomalous_dates, 10)
print ("The next 10 anomalous dates are: ", list(next_10_dates))

print ("Using rolling_window_grouper:")
data = read_fake_data("fake_filename")
data_day = rolling_window_grouper(data, window_size=86400)
anomalous_dates = filter(None, map(check_anomaly, data_day))
first_anomalous_date = next(anomalous_dates)
print ("The first anomalous date is: ", first_anomalous_date)
next_10_dates = islice(anomalous_dates, 10)
print ("The next 10 anomalous dates are: ", list(next_10_dates))

The first anomalous date is:  1970-01-01
The next 10 anomalous dates are:  [datetime.date(1970, 1, 3), datetime.date(1970, 1, 5), datetime.date(1970, 1, 6), datetime.date(1970, 1, 8), datetime.date(1970, 1, 9), datetime.date(1970, 1, 11), datetime.date(1970, 1, 12), datetime.date(1970, 1, 13), datetime.date(1970, 1, 14), datetime.date(1970, 1, 15)]
Using rolling_window_grouper:
The first anomalous date is:  1970-01-01 09:00:00
The next 10 anomalous dates are:  [datetime.datetime(1970, 1, 1, 9, 0, 1), datetime.datetime(1970, 1, 1, 9, 0, 2), datetime.datetime(1970, 1, 1, 9, 0, 3), datetime.datetime(1970, 1, 1, 9, 0, 4), datetime.datetime(1970, 1, 1, 9, 0, 5), datetime.datetime(1970, 1, 1, 9, 0, 6), datetime.datetime(1970, 1, 1, 9, 0, 7), datetime.datetime(1970, 1, 1, 9, 0, 8), datetime.datetime(1970, 1, 1, 9, 0, 9), datetime.datetime(1970, 1, 1, 9, 0, 10)]


In [8]:
# Pipeline using Generator, https://brett.is/writing/about/generator-pipelines-in-python/

# Without Generators
def process(num):
    # filter out non-evens
    if num % 2 != 0:
        return
    num = num * 3
    num = 'The Number: %s' % num
    return num
nums = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for num in nums:
    print (process(num))

None
The Number: 6
None
The Number: 12
None
The Number: 18
None
The Number: 24
None
The Number: 30


In [10]:
# With Generators
def even_filter(nums):
    for num in nums:
        if num % 2 == 0:
            yield num
            
def multiply_by_three(nums):
    for num in nums:
        yield num * 3
        
def convert_to_string(nums):
    for num in nums:
        yield "The Number: %s" % num
    
nums = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
pipeline = convert_to_string(multiply_by_three(even_filter(nums)))
for num in pipeline:
    print (num)

The Number: 6
The Number: 12
The Number: 18
The Number: 24
The Number: 30
