In [None]:
!pip install --upgrade pip
!pip install joblib
!pip install scikit-learn

# Documentation
- [itertools](https://docs.python.org/3/library/itertools.html)
- [joblib](https://joblib.readthedocs.io)

# itertools

The itertools library in Python is a powerful tool for working with iterators and creating efficient, memory-friendly code.

## itertools.count ([start[, step]])
iterates over start, start+step, start+2*step, …


In [None]:
import itertools

# Countdown from 10 down to 1 using the built-in range
countdown = range(10, 0, -1)
for i in countdown:
    print(i)

# Infinite loop: itertools.count has no upper/lower bound and never stops by itself
# for i in itertools.count(start=10, step=-1):
#    print(i)

In [None]:
import itertools

# Reference countdown with range: prints 10, 9, ..., 1
for i in range(10, 0, -1):
    print(i)

# Same countdown driven by the unbounded itertools.count:
# values are pulled one at a time and we stop as soon as one drops below 1
counter = itertools.count(start=10, step=-1)
i = next(counter)
while i >= 1:
    print(i)
    i = next(counter)

In [None]:
import itertools
import time

# Benchmarking range: append the integers 10000000 down to 1 to a list
# (timed with wall-clock time.time() differences)
start_time = time.time()
lr = []
for i in range(10000000, 0, -1):
    lr.append(i)
range_time = time.time() - start_time

# Benchmarking itertools: same values produced by itertools.count.
# count never terminates on its own, so the loop needs an explicit bound test and break.
start_time = time.time()
li = []
for i in itertools.count(start=10000000, step=-1):
    if i<1:
        break
    li.append(i)
itertools_time = time.time() - start_time

# Display the results (elapsed seconds for each variant)
print(f"itertools time: {itertools_time:.6f} seconds")
print(f"range loops time: {range_time:.6f} seconds")

## itertools.islice (seq, [start,] stop [, step])
iterates over elements from seq[start:stop:step]

In [None]:
import itertools
import time

# Number of values every variant must produce, so that the timings are comparable
N = 10000000

# Benchmarking range: append N, N-1, ..., 1 to a list
start_time = time.time()
lr = []
for i in range(N, 0, -1):
    lr.append(i)
range_time = time.time() - start_time

# Benchmarking itertools: islice bounds the infinite counter.
# Its second argument is the stop index, i.e. the number of items to take,
# so it must be N (a stop of 1 would yield a single element only).
start_time = time.time()
li = []
for i in itertools.islice(itertools.count(start=N, step=-1), N):
    li.append(i)
itertools_time = time.time() - start_time

# Benchmarking itertools 2: same bounded iterator, consumed directly by list()
start_time = time.time()
li = list(itertools.islice(itertools.count(start=N, step=-1), N))
itertools_time2 = time.time() - start_time

# Display the results
print(f"itertools time: {itertools_time:.6f} seconds")
print(f"itertools time 2: {itertools_time2:.6f} seconds")
print(f"range loops time: {range_time:.6f} seconds")

## itertools.chain (p, q, …)
iterates over p0, p1, … plast, q0, q1, …

In [None]:
import itertools

# Three iterables of different kinds: a list, a tuple and a string
list1, tuple1, string1 = [1, 2, 3], (4, 5, 6), "789"

# chain walks them back to back as a single stream;
# the string contributes its individual characters
chained = itertools.chain(list1, tuple1, string1)
for item in chained:
    print(item)

## itertools.cycle (p)
enumerates p0, p1, … plast, p0, p1, …
## itertools.repeat (e,n)
enumerate element e n times
## itertools.chain.from_iterable (p)
enumerates the elements of every iterable contained in p (p itself is an iterable of iterables)

In [None]:
import itertools

l = list(range(1, 10))

# itertools.cycle takes only the iterable (no repeat count) and loops forever:
# for item in itertools.cycle(l):
#     print(item)

# Bounded alternative: repeat the whole list 10 times, then flatten the copies
ten_copies = itertools.repeat(l, 10)
for item in itertools.chain.from_iterable(ten_copies):
    print(item)

## itertools.filterfalse (predicate, seq)
enumerates the elements of seq for which predicate is false

In [None]:
import itertools

l = list(range(1, 10))

# Finite input stream: ten back-to-back copies of l
seq = itertools.chain.from_iterable(itertools.repeat(l, 10))

# filterfalse keeps the elements for which the predicate is false,
# so only the values that are NOT below 5 are printed
kept = itertools.filterfalse(lambda x: x < 5, seq)
for item in kept:
    print(item)

In [None]:
import itertools
import time

# Base list; every variant below walks it 1000000 times and keeps the values above 4
l = [1, 2, 3, 4, 5, 6, 7, 8, 9]

# Benchmarking plain nested loops with an explicit test (j > 4)
start_time = time.time()
lr = []
for i in range(1000000):
    for j in l:
        if j > 4:
            lr.append(j)
range_time = time.time() - start_time

# Benchmarking itertools: repeat + chain.from_iterable build the stream,
# filterfalse drops the elements for which x < 5 holds
start_time = time.time()
li = []
for i in itertools.filterfalse(lambda x: x < 5, itertools.chain.from_iterable(itertools.repeat(l,1000000))):
    li.append(i)
itertools_time = time.time() - start_time

# Benchmarking itertools 2: the stream is an endless cycle over l,
# cut by islice after 1000000*len(l) elements
start_time = time.time()
li = []
for i in itertools.filterfalse(lambda x: x < 5, itertools.islice(itertools.cycle(l),1000000*len(l))):
    li.append(i)
itertools_time2 = time.time() - start_time

# Benchmarking itertools 3: same pipeline as variant 2, consumed by list()
# instead of an explicit append loop
start_time = time.time()
li = list(itertools.filterfalse(lambda x: x < 5, itertools.islice(itertools.cycle(l),1000000*len(l))))
itertools_time3 = time.time() - start_time

# Display the results (elapsed seconds for each variant)
print(f"itertools time: {itertools_time:.6f} seconds")
print(f"itertools time 2: {itertools_time2:.6f} seconds")
print(f"itertools time 3: {itertools_time3:.6f} seconds")
print(f"range loops time: {range_time:.6f} seconds")

## itertools.groupby(iterable[, key])
returns consecutive (key, sub-iterator) pairs, grouping adjacent elements that share the same value of key(v)

In [None]:
import itertools

# Equal values are already adjacent in this list
data = [1, 1] + [2, 2] + [3, 3, 3]

# Without a key function, elements are grouped by their own value
grouped = itertools.groupby(data, key=None)

# Each step yields the shared value and a lazy iterator over the run of equal elements
for key, group in grouped:
    print(f"Key: {key}, Group: {list(group)}")

In [None]:
import itertools

data = [
    'apple',
    'banana',
    'cherry',
    'date',
    'fig',
    'grape',
]

# Group neighbouring words that have the same number of characters
word_length = len
grouped = itertools.groupby(data, key=word_length)

for key, group in grouped:
    print(f"Length: {key}, Group: {list(group)}")

In [None]:
import itertools

# Words sharing an initial are already next to each other here
data = [
    'apple', 'apricot',
    'banana', 'blueberry',
    'cherry',
    'date',
]

# Key function: the first character of each word
grouped = itertools.groupby(data, key=lambda word: word[0])

for key, group in grouped:
    print(f"First Letter: {key}, Group: {list(group)}")

In [None]:
import itertools

data = [1, 2, 3, 4, 5, 6, 7, 8]
# x % 2 is truthy for odd numbers, so the truthy branch must carry the 'Odd' label
grouped = itertools.groupby(data, key=lambda x: 'Odd' if x % 2 else "Even")

# groupby is lazy: printing it shows the iterator object, not the groups
print(grouped)

# The data is not sorted by parity, so odd and even values alternate
# and every element ends up in a group of its own
for key, group in grouped:
    print(f"{key}, Group: {list(group)}")

In [None]:
import itertools

# Same words as before, but 'banana' now separates the two words starting with 'a'
data = [
    'apple', 'banana', 'apricot',
    'blueberry', 'cherry', 'date',
]

# groupby only merges adjacent elements, so the letters 'a' and 'b' each show up twice
grouped = itertools.groupby(data, key=lambda word: word[0])

for key, group in grouped:
    print(f"First Letter: {key}, Group: {list(group)}")

Data should be sorted (according to the same key) before grouping!

In [None]:
import itertools

def parity(x):
    """Label an integer as 'Odd' or 'Even'."""
    return 'Odd' if x % 2 else "Even"


def first_letter(word):
    """Return the first character of a word."""
    return word[0]


# Sort and group with the very same key, so elements with equal keys are adjacent
data = [1, 2, 3, 4, 5, 6, 7, 8]
sorted_data = sorted(data, key=parity)
grouped = itertools.groupby(sorted_data, key=parity)

for key, group in grouped:
    print(f"{key}, Group: {list(group)}")

# Same recipe for the words: sort by initial, then group by initial
data = ['apple', 'banana', 'apricot', 'blueberry', 'cherry', 'date']
sorted_data = sorted(data, key=first_letter)
grouped = itertools.groupby(sorted_data, key=first_letter)

for key, group in grouped:
    print(f"First Letter: {key}, Group: {list(group)}")

## itertools.combinations(items,n)
generates all subsets of n elements of items

In [None]:
import itertools

# Items to pick from
items = list('ABCD')

# Every unordered pair of distinct items, materialised as a list of tuples
pair_iterator = itertools.combinations(items, 2)
pairs = [*pair_iterator]

print("All possible pairs:", pairs)

In [None]:
import itertools
import time

# Define the elements: the 26 uppercase letters
elements = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

# Benchmarking itertools.combinations: all 3-letter combinations, materialised as a list
start_time = time.time()
itertools_combinations = list(itertools.combinations(elements, 3))
itertools_time = time.time() - start_time

# Benchmarking nested loops: three index loops with i < j < k,
# so each set of three positions is produced exactly once
start_time = time.time()
nested_combinations = []
for i in range(len(elements)):
    for j in range(i + 1, len(elements)):
        for k in range(j + 1, len(elements)):
            nested_combinations.append((elements[i], elements[j], elements[k]))
nested_time = time.time() - start_time

# Display the results (elapsed seconds for each variant)
print(f"itertools.combinations time: {itertools_time:.6f} seconds")
print(f"nested loops time: {nested_time:.6f} seconds")

In [None]:
import itertools
import time

# Define the elements: the 26 uppercase letters
elements = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

# Benchmarking itertools.combinations: all 6-letter combinations, materialised as a list
start_time = time.time()
itertools_combinations = list(itertools.combinations(elements, 6))
itertools_time = time.time() - start_time

# Benchmarking nested loops: six index loops with i < j < k < l < m < n,
# one nesting level per chosen position
start_time = time.time()
nested_combinations = []
for i in range(len(elements)):
    for j in range(i + 1, len(elements)):
        for k in range(j + 1, len(elements)):
            for l in range(k + 1, len(elements)):
                for m in range(l + 1, len(elements)):
                    for n in range(m + 1, len(elements)):
                        nested_combinations.append((elements[i], elements[j], elements[k], elements[l], elements[m], elements[n]))
nested_time = time.time() - start_time

# Display the results (elapsed seconds for each variant)
print(f"itertools.combinations time: {itertools_time:.6f} seconds")
print(f"nested loops time: {nested_time:.6f} seconds")

## itertools.permutations (p[, r])
generates all possible orderings of subsets of size r

In [None]:
import itertools

# Items to arrange
items = list('XYZ')

# With no length given, permutations yields every ordering of the full list
ordering_iterator = itertools.permutations(items)
orderings = [*ordering_iterator]

print("All possible orderings:", orderings)

In [None]:
import itertools
import time

# Define the elements: the 26 uppercase letters
elements = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

# Benchmarking itertools.permutations: all ordered triples of distinct letters
start_time = time.time()
itertools_permutations = list(itertools.permutations(elements, 3))
itertools_time = time.time() - start_time

# Benchmarking nested loops: three loops over the letters, with explicit
# inequality tests to skip triples that reuse a letter
start_time = time.time()
nested_permutations = []
for a in elements:
    for b in elements:
        if b != a:
            for c in elements:
                if c != a and c != b:
                    nested_permutations.append((a, b, c))
nested_time = time.time() - start_time

# Display the results (elapsed seconds for each variant)
print(f"itertools.permutations time: {itertools_time:.6f} seconds")
print(f"nested loops time: {nested_time:.6f} seconds")

## Warning : may take 1:30 min

## itertools.product(p, q, … [repeat=1])
Generates the cartesian product of lists p, q,... 
repeat permits to duplicate input lists

In [None]:
import itertools

# The two input lists
list1, list2 = ['A', 'B'], [1, 2]

# Every (letter, number) pair: one element taken from each list
pair_iterator = itertools.product(list1, list2)
cartesian_product = [*pair_iterator]

print("Cartesian product:", cartesian_product)

In [None]:
import itertools
import time

# Define the elements: the 26 uppercase letters
elements = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

# Benchmarking itertools.product: repeat=3 multiplies the letters with themselves
# three times, i.e. all triples with repetition allowed
# (the result variables keep the "combinations" name but hold product triples)
start_time = time.time()
itertools_combinations = list(itertools.product(elements, repeat=3))
itertools_time = time.time() - start_time

# Benchmarking nested loops: three unconstrained loops over the letters
start_time = time.time()
nested_combinations = []
for a in elements:
    for b in elements:
        for c in elements:
            nested_combinations.append((a, b, c))
nested_time = time.time() - start_time

# Display the results (elapsed seconds for each variant)
print(f"itertools time: {itertools_time:.6f} seconds")
print(f"nested loops time: {nested_time:.6f} seconds")

## itertools strengths
- Memory Efficiency: works with iterators, so it doesn't load all data into memory at once
- Code Simplicity: it provides concise and readable solutions for complex iteration tasks
- Performance: often faster than handmade code

## Exercise
A company requests users to create passwords with:
- at least one uppercase letter
- at least 4 lowercase letters, all different
- at least 2 digits
- at least one special character among "+" "-" "=" "_" "@" "#"

Write a program that enumerates all such passwords, assuming the user satisfies requests as quickly as possible.

Is it better or worse than a password with no constraints?

# joblib
The joblib library in Python is a powerful tool for parallel computing, caching, and efficient serialization of large data objects.

In [None]:
import joblib
import math

# One delayed sqrt call per perfect square i**2, for i = 0..9 (built lazily)
sqrt_tasks = (joblib.delayed(math.sqrt)(i**2) for i in range(10))
# Run the tasks in parallel; the resulting list is the cell's displayed value
joblib.Parallel(n_jobs=-1)(sqrt_tasks)

In [None]:
from joblib import Parallel, delayed
import time

def process_item(item):
    """Simulate a slow task: wait one second, then return ``item`` squared."""
    time.sleep(1)
    squared = item ** 2
    return squared


# Inputs: the integers 0..9
items = [*range(10)]

# One delayed call per item, dispatched over all available CPU cores
worker_pool = Parallel(n_jobs=-1)
results = worker_pool(delayed(process_item)(item) for item in items)

print("Processed results:", results)

## joblib Memory Caching
Caching Expensive Function Calls with Memory

In [None]:
from joblib import Memory

# On-disk cache location used by joblib.Memory
cachedir = './cache'
memory = Memory(cachedir, verbose=0)


@memory.cache
def expensive_computation(x):
    """Return ``x`` squared; the message is printed only when the body really runs."""
    print(f"Computing {x}...")
    return x ** 2


# 5 is requested twice: the second request is answered from the cache,
# then 10 triggers a new computation
for value in (5, 5, 10):
    print(expensive_computation(value))

In [None]:
from joblib import Memory

def fibonacci_iterative(n):
    """Return the n-th Fibonacci number (F(0) = 0, F(1) = 1) by stepping a pair n times."""
    previous, current = 0, 1
    for _step in range(n):
        previous, current = current, previous + current
    return previous


def fibonacci_recursive(n):
    """Return the n-th Fibonacci number using the plain recursive definition."""
    # Base cases: values up to 1 are returned unchanged
    if n <= 1:
        return n
    smaller = fibonacci_recursive(n-1)
    smallest = fibonacci_recursive(n-2)
    return smaller + smallest


for fibonacci in (fibonacci_iterative, fibonacci_recursive):
    print(fibonacci(10))

In [None]:
from joblib import Memory

# On-disk cache location used by joblib.Memory
cachedir = './cache'
memory = Memory(cachedir, verbose=0)


def fibonacci_iterative(n):
    """Return the n-th Fibonacci number (F(0) = 0, F(1) = 1) by stepping a pair n times."""
    previous, current = 0, 1
    for _step in range(n):
        previous, current = current, previous + current
    return previous


@memory.cache
def fibonacci_recursive(n):
    """Recursive Fibonacci whose results are stored by joblib.Memory.

    The recursive calls go through the decorated name as well,
    so the sub-results are cached too.
    """
    if n <= 1:
        return n
    smaller = fibonacci_recursive(n-1)
    smallest = fibonacci_recursive(n-2)
    return smaller + smallest


print(fibonacci_iterative(10))
print(fibonacci_recursive(10))

## joblib Serialization of Large Data Objects

In [None]:
import numpy as np
from joblib import dump, load

# 1000 x 1000 array of random floats
large_array = np.random.rand(1000, 1000)

# Round trip through a joblib file: write the array, then read it back
array_path = 'large_array.joblib'
dump(large_array, array_path)
loaded_array = load(array_path)

# The reloaded array must be element-wise identical to the original
arrays_match = np.array_equal(large_array, loaded_array)
print("Array matches:", arrays_match)

In [None]:
import time
import pickle
import joblib
import numpy as np

# Create a large numpy array as the sample object
large_array = np.random.random((1000, 1000))

# Function to measure the time taken to dump and load using pickle
def benchmark_pickle(obj):
    """Time one pickle round trip of ``obj`` through the file 'large_array.pkl'.

    Args:
        obj: any picklable object.

    Returns:
        tuple: ``(dump_time, load_time)`` in seconds.
    """
    # perf_counter is the clock meant for measuring short durations:
    # it is monotonic and uses the highest available resolution,
    # unlike the wall-clock time.time()
    start_time = time.perf_counter()
    with open('large_array.pkl', 'wb') as file:
        pickle.dump(obj, file)
    dump_time = time.perf_counter() - start_time

    start_time = time.perf_counter()
    with open('large_array.pkl', 'rb') as file:
        # Only the duration matters, so the loaded object is discarded.
        # The file was written just above; never unpickle untrusted files.
        pickle.load(file)
    load_time = time.perf_counter() - start_time

    return dump_time, load_time

# Function to measure the time taken to dump and load using joblib
def benchmark_joblib(obj):
    """Time one joblib round trip of ``obj`` through the file 'large_array.joblib'.

    Args:
        obj: the object to serialize with joblib.dump.

    Returns:
        tuple: ``(dump_time, load_time)`` in seconds.
    """
    # perf_counter is the clock meant for measuring short durations:
    # it is monotonic and uses the highest available resolution,
    # unlike the wall-clock time.time()
    start_time = time.perf_counter()
    joblib.dump(obj, 'large_array.joblib')
    dump_time = time.perf_counter() - start_time

    start_time = time.perf_counter()
    # Only the duration matters, so the loaded object is discarded
    joblib.load('large_array.joblib')
    load_time = time.perf_counter() - start_time

    return dump_time, load_time

# Benchmarking: each call returns a (dump_time, load_time) pair
pickle_times = benchmark_pickle(large_array)
joblib_times = benchmark_joblib(large_array)

print(f"Pickle - Dump time: {pickle_times[0]:.4f}s, Load time: {pickle_times[1]:.4f}s")
print(f"Joblib - Dump time: {joblib_times[0]:.4f}s, Load time: {joblib_times[1]:.4f}s")

## joblib strengths
- Parallel Computing: Simplifies parallel execution of tasks, leveraging multiple CPU cores.
- Caching: Avoids redundant computations by caching function results.
- Serialization: Optimized for large data objects.
- Simple and intuitive API

# Combining itertools (for efficient iteration) with joblib (for parallel computing and caching) 

## Use Case 1
Generate all possible pairs from a large dataset and process them in parallel.
- itertools generates combinations.
- joblib parallelizes the processing.

In [None]:
from itertools import combinations
from joblib import Parallel, delayed

def process_pair(pair):
    """Return the sum of the elements of ``pair``."""
    total = sum(pair)
    return total

# Dataset: the integers 0..999
data = [*range(1000)]

# Lazy iterator over every unordered pair of data elements
pairs = combinations(data, 2)

# One delayed call per pair, spread over all CPU cores
pair_tasks = (delayed(process_pair)(pair) for pair in pairs)
results = Parallel(n_jobs=-1)(pair_tasks)

print(f"Processed {len(results)} pairs in parallel.")

## Use Case 2
Cache results of expensive permutation-based calculations (e.g., simulations or brute-force searches).
- itertools generates permutations.
- joblib caches results to avoid recomputation.

In [None]:
from itertools import permutations
from joblib import Memory

# On-disk cache used to memoize the evaluations
memory = Memory("./cache_dir", verbose=0)


@memory.cache
def evaluate_permutation(perm):
    """Stand-in for an expensive evaluation: the squared sum of the permutation."""
    total = sum(perm)
    return total ** 2


# All ordered pairs of distinct elements of the sequence
sequence = [1, 2, 3, 4]
perms = permutations(sequence, 2)

# Evaluate each permutation in turn; results are cached by joblib
results = []
for p in perms:
    results.append(evaluate_permutation(p))

print(f"Evaluated permutations: {results}")

## Use case 3
Process an infinite sequence in batches (e.g., streaming data).
- itertools.islice chunks the sequence into batches.
- joblib processes batches in parallel.

In [None]:
from itertools import islice, count
from joblib import Parallel, delayed

def process_batch(batch):
    """Simulate batch processing by returning the sum of the batch."""
    batch_total = sum(batch)
    return batch_total

# Endless source 0, 1, 2, ... standing in for a data stream
infinite_stream = count(start=0, step=1)

# Number of stream elements per batch
BATCH_SIZE = 1000

# Each task receives the next BATCH_SIZE elements of the shared stream;
# islice advances the stream, so consecutive batches do not overlap.
# The generator is lazy: a batch is cut only when its task is requested.
batch_tasks = (
    delayed(process_batch)(list(islice(infinite_stream, BATCH_SIZE)))
    for _ in range(10)  # 10 batches in total
)
results = Parallel(n_jobs=-1)(batch_tasks)

print(f"Batch results: {results}")

## Use case 4

Use Case: Grid search over hyperparameters using Cartesian products.
- itertools.product generates all hyperparameter combinations.
- joblib evaluates models in parallel.

In [None]:
from itertools import product
from joblib import Parallel, delayed
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Fixed seed so that data, split and trees are identical on every run
SEED = 42

# Generate synthetic data
X, y = make_classification(n_samples=1000, n_features=20, random_state=SEED)

# Hold out a test set: scoring a model on the data it was trained on
# always favours the deepest tree and says nothing about generalization
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=SEED, stratify=y
)

# Define hyperparameter grid
param_grid = {
    "max_depth": [3, 5, 7],
    "min_samples_split": [2, 5, 10]
}

# Generate all parameter combinations
param_combinations = product(param_grid["max_depth"], param_grid["min_samples_split"])

def train_model(max_depth, min_samples_split):
    """Train one decision tree and score it on the held-out test set.

    Args:
        max_depth: maximum depth of the tree.
        min_samples_split: minimum number of samples required to split a node.

    Returns:
        list: ``[(max_depth, min_samples_split), accuracy]`` where accuracy is
        measured on ``X_test`` / ``y_test``.
    """
    model = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=SEED,
    )
    model.fit(X_train, y_train)
    return [(max_depth, min_samples_split), model.score(X_test, y_test)]

# Parallelize model training: one task per hyperparameter combination
scores = Parallel(n_jobs=-1)(
    delayed(train_model)(depth, split) for depth, split in param_combinations
)

# Ascending sort by score: the best combination is printed last
sorted_scores = sorted(scores,key=lambda x: x[1])
print(f"Model scores: {sorted_scores}")