Python Collections

In [55]:
import collections

### collections.defaultdict

In [58]:
# Group pets by owner. The first lookup of an unseen owner makes the
# defaultdict insert that key with a fresh empty list, so append() never
# raises KeyError.
owner_pet_list = [
    ('John', 'Dog'),
    ('Mary', 'Cat'),
    ('Will', 'Horse'),
    ('John', 'Horse'),
    ('Will', 'Duck'),
]
ddict = collections.defaultdict(list)
for owner, pet in owner_pet_list:
    ddict[owner].append(pet)

sorted(ddict.items())

[('John', ['Dog', 'Horse']), ('Mary', ['Cat']), ('Will', ['Horse', 'Duck'])]

In [59]:
def constant_factory(value):
    """Return a zero-argument callable that always returns ``value``.

    The result can be used as the ``default_factory`` of a
    ``collections.defaultdict`` so that every missing key maps to ``value``.
    """
    def constant():
        return value
    return constant
# Any key the format string asks for but the dict lacks resolves to
# '<missing>' instead of raising KeyError.
d = collections.defaultdict(constant_factory('<missing>'))
d.update({'name': 'John', 'action': 'ran'})
'%(name)s %(action)s to %(object)s' % d

'John ran to <missing>'

In [46]:
# Assigning into a nested dictionary normally raises KeyError when an
# intermediate key is missing. `tree` is a factory that builds a defaultdict
# whose own default factory is `tree` again, so a missing key at any depth is
# created on first access as another nested defaultdict.

tree = lambda: collections.defaultdict(tree)
some_dict = tree()
# The intermediate levels 'colours' and 'favourite' are created automatically by this assignment.
some_dict['colours']['favourite']['test'] = "yellow"
print(some_dict)

defaultdict(<function <lambda> at 0x00000000059426A8>, {'colours': defaultdict(<function <lambda> at 0x00000000059426A8>, {'favourite': defaultdict(<function <lambda> at 0x00000000059426A8>, {'test': 'yellow'})})})


### collections.OrderedDict
OrderedDicts act just like regular dictionaries except they remember the order that items were added. This matters primarily when you are iterating over the OrderedDict as the order will reflect the order in which the keys were added.
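Historically a regular dictionary made no guarantee about order, but since Python 3.7 the built-in dict also preserves insertion order. OrderedDict is still useful for its order-aware extras, such as move_to_end() and order-sensitive equality.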

In [33]:
# Iterating an OrderedDict yields its keys in the order they were inserted.
d = collections.OrderedDict([('a', 1), ('b', 10), ('c', 8)])
for letter in d.keys():
    print(letter)

a
b
c


In [61]:
d = dict(banana=3, apple=4, pear=1, orange=2)

# Ordered alphabetically by key.
print(collections.OrderedDict(sorted(d.items(), key=lambda pair: pair[0])))
# Ordered by ascending value.
print(collections.OrderedDict(sorted(d.items(), key=lambda pair: pair[1])))
# Ordered by key length; sorted() is stable, so keys of equal length keep
# their original relative order.
print(collections.OrderedDict(sorted(d.items(), key=lambda pair: len(pair[0]))))


OrderedDict([('apple', 4), ('banana', 3), ('orange', 2), ('pear', 1)])
OrderedDict([('pear', 1), ('orange', 2), ('banana', 3), ('apple', 4)])
OrderedDict([('pear', 1), ('apple', 4), ('banana', 3), ('orange', 2)])


An ordered dictionary variant that remembers the order in which keys were last inserted. If a new entry overwrites an existing entry, the key's original insertion position is discarded and the key is moved to the end:

In [64]:
class LastUpdatedOrderedDict(collections.OrderedDict):
    """Store items in the order the keys were last added."""

    def __setitem__(self, key, value):
        # Store the value, then push the key to the end so that overwriting
        # an existing key also refreshes its position.
        super().__setitem__(key, value)
        self.move_to_end(key)

# Seed with the fruit counts in alphabetical key order, then overwrite one
# existing entry to show it move to the end.
ld = LastUpdatedOrderedDict(sorted(d.items(), key=lambda item: item[0]))
print(ld)
ld['banana'] = 5  # existing key: its value is replaced and it moves last
print(ld)


LastUpdatedOrderedDict([('apple', 4), ('banana', 3), ('orange', 2), ('pear', 1)])
LastUpdatedOrderedDict([('apple', 4), ('orange', 2), ('pear', 1), ('banana', 5)])


### collections.Counter

In [7]:
import re
sentence = '''Counter is a dict subclass which helps to count hashable objects.
Inside which elements are stored as dictionary keys and counts are stored as values which can be zero or negative'''
# The pattern is a raw string: '\w' in a plain string literal is an invalid
# escape sequence, which Python reports with a warning.
words = re.findall(r'\w+', sentence)
# The five most frequent words; words with equal counts keep first-seen order.
collections.Counter(words).most_common(5)

[('which', 3), ('are', 2), ('stored', 2), ('as', 2), ('Counter', 1)]

In [8]:
# The three most frequent letters; the tie between 'b' and 'r' keeps
# first-seen order.
letter_counts = collections.Counter('abracadabra')
letter_counts.most_common(3)

[('a', 5), ('b', 2), ('r', 2)]

In [10]:
# Counter.elements() returns an iterator that repeats each element as many
# times as its count. Elements are yielded in the order first encountered
# (Python 3.7+), and elements whose count is less than one are skipped.

c = collections.Counter({'a': 4, 'b': 2, 'c': 0, 'd': -2})
list(c.elements())

['a', 'a', 'a', 'a', 'b', 'b']

Counter objects can be combined with one another using operators.
For instance, adding two counters (+) sums the counts for each key.
You can also take the intersection (&) or union (|) of two counters.
These compare the counts for each key across the two counters: intersection keeps the minimum count and union keeps the maximum.

In [17]:
# A student took each of 4 quizzes twice and may keep the higher score for
# each quiz. Counter union (|) keeps the maximum count per key.

first_attempt = collections.Counter({
    1: 90,
    2: 65,
    3: 78,
    4: 88,
})
second_attempt = collections.Counter({
    1: 88,
    2: 84,
    3: 95,
    4: 92,
})
final = first_attempt | second_attempt
final

Counter({1: 90, 2: 84, 3: 95, 4: 92})

### collections.deque
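deque stands for "double-ended queue" and can be used as a stack or a queue. Lists offer many of the same operations, but they are optimized for fast fixed-length operations: list.pop(0) and list.insert(0, v) cost O(n) because every other element has to shift, whereas a deque appends and pops at either end in approximately O(1).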

In [27]:
# Start with three items, then grow and shrink the deque from both ends.
queue = collections.deque([2, 3, 4])
queue.append(5)      # add on the right
queue.appendleft(1)  # add on the left
print(queue)
queue.pop()          # remove from the right (5)
print(queue)
queue.popleft()      # remove from the left (1)
print(queue)


deque([1, 2, 3, 4, 5])
deque([1, 2, 3, 4])
deque([2, 3, 4])


In [43]:
# We can also limit the number of items a deque can hold with maxlen.
# Once the deque is full, adding a new item discards one item from the opposite end.
lst = [1,2,3,4,5,6,7,8,9,10]
d = collections.deque(lst, maxlen=10)
print(d)
d.append(11)      # deque is full: 1 is discarded from the left
print(d)
d.appendleft(12)  # deque is full: 11 is discarded from the right
print(d)


deque([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], maxlen=10)
deque([2, 3, 4, 5, 6, 7, 8, 9, 10, 11], maxlen=10)
deque([12, 2, 3, 4, 5, 6, 7, 8, 9, 10], maxlen=10)


In [57]:
# Moving average generator
import itertools
# itertools.islice('ABCDEFG', 2) --> A B
# itertools.islice('ABCDEFG', 2, 4) --> C D
# itertools.islice('ABCDEFG', 2, None) --> C D E F G
# itertools.islice('ABCDEFG', 0, None, 2) --> A C E G

def moving_average(iterable, n=3):
    """Yield the mean of every run of ``n`` consecutive items of ``iterable``.

    Example: moving_average([40, 30, 50, 46, 39, 44]) --> 40.0 42.0 45.0 43.0

    Nothing is yielded when ``iterable`` holds fewer than ``n`` items.
    """
    stream = iter(iterable)
    # Seed the window with a placeholder 0 followed by the first n-1 items.
    # The placeholder is what the first loop pass evicts, so every pass can
    # apply the same "add the newest, drop the oldest" update.
    window = collections.deque([0])
    window.extend(itertools.islice(stream, n - 1))
    running_total = sum(window)
    for newest in stream:
        oldest = window.popleft()
        window.append(newest)
        running_total += newest - oldest
        yield running_total / n

# Window of three items, which is also the default.
list(moving_average([40, 30, 50, 46, 39, 44], n=3))

[40.0, 42.0, 45.0, 43.0]

### namedtuple
Named tuples give a meaning to each position in a tuple, which makes code more readable and self-documenting. You can use them anywhere you would use a regular tuple.

In [49]:
# Field names may be given as a sequence of strings or as one
# whitespace-separated string; both spellings define the same fields.
coord = collections.namedtuple('Coordinate', ['x', 'y'])
coord2 = collections.namedtuple('Coordinate2', 'x y')

# Positional and keyword construction build equal tuples, and fields can be
# read by index or by name.
c1, c2 = coord(10, 20), coord(x=10, y=20)
print(c1[0] == c2[0])
print(c1.y == c2.y)
print(c1.x, c1.y)

# Build an instance from an existing sequence with _make(), or from a mapping
# with ** unpacking.
lst = [30, 45]
print(coord2._make(lst))
dct = dict(y=45, x=30)
print(coord(**dct))

True
True
10 20
Coordinate2(x=30, y=45)
Coordinate(x=30, y=45)


### enum.Enum (Python 3.4+)

In [51]:
from enum import Enum
class Species(Enum):
    """Animal species; some names share a value and act as aliases."""
    cat = 1
    dog = 2
    horse = 3
    dragon = 4
    unicorn = 5
    # But we don't really care about age, so we can use an alias.
    # Reusing an existing value makes the later name an alias of the member
    # first defined with that value: kitten is cat, puppy is dog.
    kitten = 1
    puppy = 2

# Each record stores its species as an enum member in the `type` field.
Animal = collections.namedtuple('Animal', 'name age type')
perry = Animal("Perry", 31, Species.cat)
drogon = Animal("Drogon", 4, Species.dragon)
tom = Animal("Tom", 75, Species.cat)
charlie = Animal("Charlie", 2, Species.kitten)

# kitten is an alias of cat, so both records hold the same member.
print(charlie.type == tom.type)
print(charlie.type)

True
Species.cat
