# Lesson 35: Python Advanced - Generators

## Generators and yields

In [1]:
# Generators produce their values lazily: each value is computed only when it is asked for.

# We start with the following iterator class and will then replace it with an equivalent generator:

import datetime as dt

class MillionDays:
    """Iterator that hands out consecutive calendar days.

    The first value is the date year-month-day; every following value is
    one day later.  After ``maxdays`` values the iterator is exhausted.
    """

    def __init__(self, year, month, day, maxdays):
        # Next date to be returned and the number of dates still to return.
        self.date = dt.date(year, month, day)
        self.maxdays = maxdays

    def __iter__(self):
        # The object keeps its own position, so it serves as its own iterator.
        return self

    def __next__(self):
        if self.maxdays <= 0:
            raise StopIteration()
        current = self.date
        # Move on by one day and use up one of the remaining days.
        self.date = current + dt.timedelta(days=1)
        self.maxdays -= 1
        return current
    
# The "for" loop keeps asking the object for its next value until StopIteration is raised.
md = MillionDays(2000, 1, 1, 3)

for d in md:
    print(d)

2000-01-01
2000-01-02
2000-01-03


In [2]:
# A generator is a function that contains "yield":

def MillionDays(year, month, day, maxdays):
    """Yield ``maxdays`` consecutive dates, starting with year-month-day."""
    first_day = dt.date(year, month, day)

    # timedelta's first positional argument is a number of days, so this
    # lazily produces offsets of 0, 1, 2, ... days.
    offsets = map(dt.timedelta, range(maxdays))

    for offset in offsets:
        yield first_day + offset
            
# "yield" hands one value back to the caller and freezes the function at that point.
# The next value is produced only when the caller asks for it (with next() or in a "for" loop).

# Using the generator (note that the function is called only once; the "for" loop then resumes it for every value):

for d in MillionDays(2000, 1, 1, 3):
    print(d)

2000-01-01
2000-01-02
2000-01-03


In [3]:
# Another example of a generator with yield:

def GetMagicNumbers():
    """Yield the numbers 22, 3 and 17, one per request, in that order."""
    for number in (22, 3, 17):
        yield number
    
    
# Calling the function returns a generator object; next() then retrieves its values one by one:

r = GetMagicNumbers()
print(next(r))
print(next(r))

# The generator yields only three values, so a fourth call to next(r) would raise StopIteration.

# It is usually better to use a "for" loop, which stops by itself when the generator is exhausted:

# for m in r:
#     print(m)

# The loop would return only those values which are defined in the generator and not yet consumed.

# Generating values only when they are needed is called lazy evaluation.
# The main advantage of generators is memory saving (compared with building a full list);
# compared with a class that implements an iterator they are also much shorter to write.


22
3


## Example of generator - data stream

In [4]:
# We have a data file that lists, among other records, actions to be performed; we want to print those actions:

# Read the whole file into memory at once.
# "with" closes the file even if read() raises an error.
with open(r"data_file.txt") as file:
    data = file.read()

# Print only the lines that describe an action.
for line in data.splitlines():
    if line.startswith("ACTION"):
        print(line)

# But if the file were very large, reading all of it into memory at once with read() could exhaust the memory.

ACTION: GET STATUS
ACTION: GET STATUS 
ACTION: CALL OPERATOR


In [5]:
# Let us try it in a different way:

# Iterate over the file object directly, so that only one line is held in memory at a time.
# "with" closes the file even if the loop is interrupted by an error.
with open(r"data_file.txt") as file:
    for line in file:
        if line.startswith("ACTION"):
            # Drop the trailing newline, because print() adds its own.
            print(line.rstrip("\n"))

ACTION: GET STATUS
ACTION: GET STATUS 
ACTION: CALL OPERATOR


In [6]:
# Now we want to make the example more complex. We want to store the text before and after ":" separately,
# so that the record type (e.g. "ACTION") and its description are kept as two values.

records = []

# "with" guarantees that the file is closed even if parsing fails part-way through.
with open(r"data_file.txt") as file:
    for line in file:
        if ":" in line:
            # Split at the first ":" only, so the description may itself contain colons.
            # The name "kind" is used instead of "type" so that the built-in type() is not shadowed.
            kind, action = line.rstrip("\n").split(":", 1)
            records.append((kind, action))

print(records)

# But again, with a very large file the resulting list would take up a lot of memory.

[('ALARM', ' WAITING FOR JOB'), ('ALARM', ' WAITING FOR JOB'), ('ERROR', ' TIMEOUT'), ('ACTION', ' GET STATUS'), ('ALARM', ' WAITING FOR STATUS'), ('ALARM', ' WAITING FOR STATUS'), ('ERROR', ' TIMEOUT'), ('ACTION', ' GET STATUS '), ('ALARM', ' WAITING FOR STATUS'), ('ALARM', ' WAITING FOR STATUS'), ('ERROR', ' TIMEOUT'), ('ACTION', ' CALL OPERATOR')]


In [7]:
# Let us build a generator:

def GetRecords(filePath):
    """Lazily yield ``(type, action)`` tuples parsed from the file at ``filePath``.

    Every line that contains ":" is split at the first ":" into the text
    before it and the text after it (with the trailing newline removed).
    Lines without ":" are skipped.

    The file is opened when the first record is requested.  It is closed
    when the generator is exhausted, when it is closed early, and when an
    error interrupts the iteration.
    """
    print("-------- opening file ----------")
    file = open(filePath)

    try:
        for line in file:
            if ":" in line:
                # "kind" is used instead of "type" so that the built-in type() is not shadowed.
                kind, action = line.rstrip("\n").split(":", 1)

                # Execution freezes here until the caller asks for the next record.
                yield (kind, action)
    finally:
        # Runs on normal exhaustion, on generator.close() and on errors alike.
        print("-------- closing file ----------")
        file.close()
    
# Each record is a 2-element tuple: record[0] is the text before ":" and record[1] the text after it.
for record in GetRecords(r"data_file.txt"):
    print("The type of action is {}, and the action is {}".format(record[0],record[1]))

# Note that each record now exists only while the "for" loop is processing it; no list of all records is built.
    

-------- opening file ----------
The type of action is ALARM, and the action is  WAITING FOR JOB
The type of action is ALARM, and the action is  WAITING FOR JOB
The type of action is ERROR, and the action is  TIMEOUT
The type of action is ACTION, and the action is  GET STATUS
The type of action is ALARM, and the action is  WAITING FOR STATUS
The type of action is ALARM, and the action is  WAITING FOR STATUS
The type of action is ERROR, and the action is  TIMEOUT
The type of action is ACTION, and the action is  GET STATUS 
The type of action is ALARM, and the action is  WAITING FOR STATUS
The type of action is ALARM, and the action is  WAITING FOR STATUS
The type of action is ERROR, and the action is  TIMEOUT
The type of action is ACTION, and the action is  CALL OPERATOR
-------- closing file ----------


## Example of generator - grep() function

In [8]:
# grep() - searches files for a given string of characters

import os

path = r"/home/alina/Dokumenty/Data-Science-World/my_python"
search_string = "Ford"
file_extension = ".ipynb"

# To show all directories and files below a given path I can use:

#for dir_name, subdirs, filenames in os.walk(path):
#    print(dir_name, subdirs, filenames)

# To search for a given word in every file with the given extension
# (the file name is printed once for every matching line):

for dir_name, subdirs, filenames in os.walk(path):
    for filename in filenames:
        if filename.endswith(file_extension):
            fullFileName = os.path.join(dir_name, filename)
            # "with" closes each file as soon as it has been searched,
            # instead of leaving the handle open.
            with open(fullFileName) as text:
                for line in text:
                    if search_string in line:
                        print(filename)
                    

32_python_adv_class_extension.ipynb
32_python_adv_class_extension.ipynb
32_python_adv_class_extension.ipynb
32_python_adv_class_extension.ipynb
35_python_adv_generators.ipynb
35_python_adv_generators.ipynb
35_python_adv_generators.ipynb
35_python_adv_generators.ipynb
35_python_adv_generators.ipynb


In [9]:
# I create a generator which will be searching files of a given extension:

def generate_files(base_dir, file_extension):
    """Yield the path of every file below ``base_dir`` whose name ends with ``file_extension``.

    The directory tree is walked with os.walk(), so files in
    subdirectories are included.  Each path is ``base_dir``-based, built
    with os.path.join().
    """
    for folder, _subfolders, names in os.walk(base_dir):
        matching_names = (name for name in names if name.endswith(file_extension))
        for name in matching_names:
            yield os.path.join(folder, name)
                

# I create the next generator which will be searching a given word in files:

def grep_files(search_string, files):
    """Yield every path from ``files`` whose content contains ``search_string``.

    ``files`` can be any iterable of paths, for example another generator.
    Each file is opened, read and closed before its path is yielded.
    """
    for file in files:
        # Finish with the file before yielding, so that no handle stays open
        # while the generator is suspended or if the caller abandons it.
        with open(file) as text:
            found = search_string in text.read()
        if found:
            yield file

# Creating the first generator (nothing is searched yet; the directory walk starts only when values are requested):

files_generator = generate_files(path, file_extension)

# Passing it to the second generator; the "for" loop pulls the matching files one at a time:

for file in grep_files(search_string, files_generator):
    print(file)
    

/home/alina/Dokumenty/Data-Science-World/my_python/32_python_adv_class_extension.ipynb
/home/alina/Dokumenty/Data-Science-World/my_python/35_python_adv_generators.ipynb


## Itertools - permutations and combinations

In [10]:
# Generators can be nicely used in combinations and permutations which belong to the package itertools:

import itertools as it

myList = ["a", "b", "c", "d"]

for combination in it.combinations(myList, 3):
    print(combination)

# In combinations the order of elements does not matter, so ('a', 'b', 'c') and ('c', 'b', 'a')
# are the same selection and only one of them is produced.

('a', 'b', 'c')
('a', 'b', 'd')
('a', 'c', 'd')
('b', 'c', 'd')


In [11]:
# If the order is important, then I need permutations:

for permutation in it.permutations(myList, 3):
    print(permutation)

('a', 'b', 'c')
('a', 'b', 'd')
('a', 'c', 'b')
('a', 'c', 'd')
('a', 'd', 'b')
('a', 'd', 'c')
('b', 'a', 'c')
('b', 'a', 'd')
('b', 'c', 'a')
('b', 'c', 'd')
('b', 'd', 'a')
('b', 'd', 'c')
('c', 'a', 'b')
('c', 'a', 'd')
('c', 'b', 'a')
('c', 'b', 'd')
('c', 'd', 'a')
('c', 'd', 'b')
('d', 'a', 'b')
('d', 'a', 'c')
('d', 'b', 'a')
('d', 'b', 'c')
('d', 'c', 'a')
('d', 'c', 'b')


In [12]:
# If the same element may be chosen more than once, I need combinations with replacement:

for combination in it.combinations_with_replacement(myList, 3):
    print(combination)


('a', 'a', 'a')
('a', 'a', 'b')
('a', 'a', 'c')
('a', 'a', 'd')
('a', 'b', 'b')
('a', 'b', 'c')
('a', 'b', 'd')
('a', 'c', 'c')
('a', 'c', 'd')
('a', 'd', 'd')
('b', 'b', 'b')
('b', 'b', 'c')
('b', 'b', 'd')
('b', 'c', 'c')
('b', 'c', 'd')
('b', 'd', 'd')
('c', 'c', 'c')
('c', 'c', 'd')
('c', 'd', 'd')
('d', 'd', 'd')


In [13]:
# Example: I have a wallet with money - in how many ways can I pay exactly 100 PLN?

myMoney = [20, 20, 20, 20, 10, 10, 10, 5, 5, 1, 1, 1, 1, 1]

# The results are collected in a set, which has no ordering and stores equal tuples only once.
# This matters because selections built from different notes of the same value look identical.

results = set()

# A payment uses at least one note and at most the whole wallet, so every possible size is tried.

for i in range(1, len(myMoney) + 1):
    for combination in it.combinations(myMoney, i):
        if sum(combination) == 100:
            results.add(combination)

for result in results:
    print(result)

(20, 20, 20, 10, 10, 10, 5, 1, 1, 1, 1, 1)
(20, 20, 20, 20, 10, 10)
(20, 20, 20, 10, 10, 10, 5, 5)
(20, 20, 20, 20, 10, 5, 1, 1, 1, 1, 1)
(20, 20, 20, 20, 10, 5, 5)


In [14]:
# In how many ways can I pay 100 PLN with 50, 20 and 10 PLN banknotes, if I may use any number of each?

myMoney = [50, 20, 10]

# The longest possible payment uses only the smallest banknote, so longer combinations
# can never add up to 100 and do not need to be generated (all values must be positive).

max_notes = 100 // min(myMoney)

results = []

for i in range(1, max_notes + 1):
    for combination in it.combinations_with_replacement(myMoney, i):
        if sum(combination) == 100:
            results.append(combination)

results

# Note, however, that if I add more distinct values to myMoney, calculating all combinations
# can take a very long time. So for big data it is much more convenient to use generators and
# pick only those results which are needed.

[(50, 50),
 (50, 20, 20, 10),
 (50, 20, 10, 10, 10),
 (20, 20, 20, 20, 20),
 (50, 10, 10, 10, 10, 10),
 (20, 20, 20, 20, 10, 10),
 (20, 20, 20, 10, 10, 10, 10),
 (20, 20, 10, 10, 10, 10, 10, 10),
 (20, 10, 10, 10, 10, 10, 10, 10, 10),
 (10, 10, 10, 10, 10, 10, 10, 10, 10, 10)]

## Itertools - grouping data

In [15]:
# We want to group the data which are available in the file data_file.txt (it was used before):

import itertools as it

file_path = r"data_file.txt"

data = []

with open(file_path) as file:
    for line in file:
        # Lines without ":" (for example blank lines) are skipped; the split
        # result would have only one element and could not be unpacked.
        if ":" not in line:
            continue
        record_type, action = line.rstrip("\n").split(":", 1)
        data.append({"type": record_type, "action": action})

#print(data)


def by_type(record):
    """Return the value used both for sorting and for grouping a record."""
    return record["type"]


# groupby() only merges adjacent items with equal keys, so the data must first be sorted by the same key.

data = sorted(data, key=by_type)

for key, group in it.groupby(data, key=by_type):
    # The group is a one-pass iterator, so its items are counted while it is consumed.
    print("The key is {} and the group is {}".format(key, sum(1 for _ in group)))

The key is ACTION and the group is 3
The key is ALARM and the group is 6
The key is ERROR and the group is 3


## Itertools - overview of functions

In [16]:
# accumulate()

import itertools as it
import operator

data = [1, 2, 3, 4, 5]

# operator.mul makes accumulate() return running products: each value is the product
# of all elements up to and including the current one.

result = it.accumulate(data, operator.mul)
for each in result:
    print(each)

1
2
6
24
120


In [17]:
# Here it returns the maximum value found so far, up to and including the current element:

data = [1, 4, 6, 3, 2]

result = it.accumulate(data, max)
for each in result:
    print(each)

1
4
6
6
6


In [18]:
# count(start, step) - counts upwards from start in increments of step, without an end:

for i in it.count(10, 3):
    print(i)
    # count() never stops by itself, so the loop has to be left explicitly.
    if i > 20:
        break

10
13
16
19
22


In [19]:
# cycle(iterable):
# it goes through the elements cyclically, over and over again, and never stops

#for i in it.cycle(data):
#    print(i)

In [20]:
# chain() - iterates over several iterables one after another, as if they were a single one:

colors_1 = ["red", "blue", "black"]
colors_2 = ["green", "white", "yellow"]

result = it.chain(colors_1, colors_2)
for each in result:
    print(each)

red
blue
black
green
white
yellow


In [21]:
# compress(data, selectors) - keeps only those elements of data whose corresponding selector is true:

cars = ["Toyota", "Ford", "Opel"]
selections = [True, False, True]

result = it.compress(cars, selections)

for each in result:
    print(each)

Toyota
Opel


In [22]:
# dropwhile(condition, data) - drops values for as long as the condition is satisfied; from the first
# value that fails the condition onwards it returns all values (even those satisfying it again, like 3 below)

data = [1, 2, 4, 6, 8, 9, 3, 5]

result = it.dropwhile(lambda x: x < 4, data)
for each in result:
    print(each)

4
6
8
9
3
5


In [23]:
# filterfalse(condition, data) - drops every value that satisfies the condition and keeps the rest
# (compare with dropwhile, which stops checking once the condition has failed for the first time):

result = it.filterfalse(lambda x: x < 4, data)
for each in result:
    print(each)

4
6
8
9
5


In [24]:
# islice(data, start, stop) - selects values by position: from index start up to, but not including, index stop:

months = ["jan", "feb", "mar", "apr", "may"]

result = it.islice(months, 2, 4)
for each in result:
    print(each)

mar
apr


In [25]:
# product(data_1, data_2) - Cartesian product: every element of data_1 paired with every element of data_2

days = ["mon", "tue", "wed", "thu", "fri"]

result = it.product(months, days)
for each in result:
    print(each)

('jan', 'mon')
('jan', 'tue')
('jan', 'wed')
('jan', 'thu')
('jan', 'fri')
('feb', 'mon')
('feb', 'tue')
('feb', 'wed')
('feb', 'thu')
('feb', 'fri')
('mar', 'mon')
('mar', 'tue')
('mar', 'wed')
('mar', 'thu')
('mar', 'fri')
('apr', 'mon')
('apr', 'tue')
('apr', 'wed')
('apr', 'thu')
('apr', 'fri')
('may', 'mon')
('may', 'tue')
('may', 'wed')
('may', 'thu')
('may', 'fri')


In [26]:
# repeat(what, times) - returns the same value the given number of times:

what = "no"

result = it.repeat(what, 5)
for each in result:
    print(each)

no
no
no
no
no


In [27]:
# starmap(function, data) - calls the function once per element of data, unpacking the element into arguments:

data = [(1, 2), (3, 4), (5, 6)]

# operator.add takes exactly two arguments, so every tuple in the list must have exactly 2 elements

result = it.starmap(operator.add, data)
for each in result:
    print(each)

3
7
11


In [28]:
# takewhile(condition, data) - the counterpart of dropwhile(): it returns values for as long as the
# condition is satisfied and stops at the first value that fails it

data = [1, 2, 4, 6, 8, 9, 3, 5]

result = it.takewhile(lambda x: x < 4, data)
for each in result:
    print(each)

1
2


In [29]:
# tee(iterable, n) - creates n independent iterators over the same iterable:

cars = ["Toyota", "Ford", "Opel"]

cars1, cars2 = it.tee(cars, 2)

# Consuming the first iterator does not use up the second one.
for each in cars1:
    print(each)

print("*********")

for each in cars2:
    print(each)

Toyota
Ford
Opel
*********
Toyota
Ford
Opel


In [31]:
# zip_longest() - zips lists of different lengths; missing values of the shorter list are replaced by fillvalue

list_1 = ["ala", "ola", "jola", "ula", "viola"]
list_2 = ["mon", "tue", "wed"]

result = it.zip_longest(list_1, list_2, fillvalue = "unknown")

for each in result:
    print(each)

('ala', 'mon')
('ola', 'tue')
('jola', 'wed')
('ula', 'unknown')
('viola', 'unknown')
