In [1]:
# 3.1 Data Structures and Sequences

In [2]:
# Tuple
# A tuple is a fixed-length, immutable sequence of Python objects.
# The easiest way to create one is with a comma-separated sequence of values

In [3]:
tup = 4,5,6

In [4]:
tup

(4, 5, 6)

In [5]:
nested_tup = (4,5,6),(7,8)

In [6]:
nested_tup

((4, 5, 6), (7, 8))

In [7]:
# Can convert any sequence or iterator to a tuple by invoking tuple

In [8]:
tuple([4, 0, 2])

(4, 0, 2)

In [9]:
tup = tuple('string')

In [10]:
tup

('s', 't', 'r', 'i', 'n', 'g')

In [11]:
tup[0]

's'

In [12]:
# Once the tuple is created it's not possible to modified which object is stored
# in each slot

In [13]:
tup = tuple(['foo', [1,2], True])

In [15]:
tup[2] = False

TypeError: 'tuple' object does not support item assignment

In [16]:
# can concatenate tuples using the + operator to produce longer tuples
(4, None, 'foo') + (6, 0) + ('bar',)

(4, None, 'foo', 6, 0, 'bar')

In [17]:
# Multiplying a tuple by an integer, as with lists, has the effect of concatenating together
# that many copies of the tuple
('foo', 'bar')*4
# Note that the objects themselves are not copied, only the references to them

('foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'bar')

In [18]:
# Unpacking tuples
# Assign a tuple-like expression of variables, unpack the value on the righthand side of the 
# equals sign

In [19]:
tup = (4, 5, 6)

In [20]:
a, b, c = tup

In [21]:
b

5

In [22]:
# Each sequences with nested tuples can be unpacked
tup = 4, 5, (6, 7)

In [23]:
a, b, (c, d) = tup

In [24]:
d

7

In [25]:
# Swap variable names
tmp = a

In [26]:
a = b

In [27]:
b = tmp

In [28]:
# !!! Swap can be done
a, b = 1, 2

In [29]:
a

1

In [30]:
b

2

In [31]:
b, a = a, b

In [32]:
a

2

In [33]:
b

1

In [34]:
# A common use of variable unpacking is iterating over sequences of tuples
# or lists
seq = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]

In [35]:
for a, b, c in seq:
    print('a = {0}, b = {1}, c = {2}'.format(a, b, c))

a = 1, b = 2, c = 3
a = 4, b = 5, c = 6
a = 7, b = 8, c = 9


In [36]:
# Advanced tuple unpacking to help with situations where you may want to "pluck"
# a few elements from the beginning of a tuple
# This uses the special syntax *rest, which is also used in function signatures
# to capture an arbitrarily long list of positional arguments
values = 1, 2, 3, 4, 5

In [37]:
a, b, *rest = values

In [38]:
a, b

(1, 2)

In [39]:
rest

[3, 4, 5]

In [40]:
# As a matter of convention, can use underscore(_)
a, b, *_ = values

In [41]:
# Tuple methods
# Since the size and contents of a tuple cannot be modified, it is very light on instance
# methods
a = (1, 2, 2, 2, 3, 4, 2)

In [42]:
a.count(2)

4

In [43]:
# List
# In contrast with tuples, lists are variable-length and their contents can be modified
# in-place
a_list = [2, 3, 7, None]

In [44]:
tup = ('foo', 'bar', 'baz')

In [45]:
b_list = list(tup)

In [46]:
b_list

['foo', 'bar', 'baz']

In [47]:
b_list[1] = 'peekaboo'

In [48]:
b_list

['foo', 'peekaboo', 'baz']

In [49]:
# Lists and tuples are semantically similar (though tuples cannot be modified) and can 
# be used interchangeably in many functions
gen = range(10)

In [50]:
gen

range(0, 10)

In [51]:
list(gen)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [52]:
# Adding and removing elements
b_list.append('dwarf')

In [53]:
b_list

['foo', 'peekaboo', 'baz', 'dwarf']

In [54]:
# Using insert you can insert an element at a specific location in the list
b_list.insert(1, 'red')

In [55]:
b_list

['foo', 'red', 'peekaboo', 'baz', 'dwarf']

In [56]:
# insert is computationally expensive. If you need to insert elements at both the beginning
# and end of a sequence, you may wish to explore collections.deque, a double-ended queue, 
# for this purpose

In [57]:
# The inverse operation to insert is pop, which removes and returns an element at a particular
# index
b_list.pop(2)

'peekaboo'

In [58]:
b_list

['foo', 'red', 'baz', 'dwarf']

In [59]:
# Elements can be removed by value with remove, which locates the first such value and removes
# it from the last
b_list.append('foo')

In [60]:
b_list

['foo', 'red', 'baz', 'dwarf', 'foo']

In [61]:
b_list.remove('foo')

In [62]:
b_list

['red', 'baz', 'dwarf', 'foo']

In [63]:
# check if a list contains a value using the keyword
'dwarf' in b_list

True

In [64]:
'dwarf' not in b_list

False

In [65]:
# Concatenating and combining lists
[4, None, 'foo'] + [7, 8, (2, 3)]

[4, None, 'foo', 7, 8, (2, 3)]

In [66]:
# If you have a list already defined, you can append multiple elements to it using 
# the extend method
x = [4, None, 'foo']

In [67]:
x.extend([7, 8, (2, 3)])

In [68]:
x

[4, None, 'foo', 7, 8, (2, 3)]

In [69]:
# list concatenation by addition is a comparatively expensive operation since a new list must
# be created and the objects copied over
# Using extend to append elements to an existing list, especially if you are building up a
# a large list, is usually preferable

# everything = []
# for chunk in list_of_lists:
    #everything.extend(chunk)

# is faster than the concatenative alternative
# everything = []
# for chunk in list_of_lists:
#    everything = everything + chunk

NameError: name 'list_of_lists' is not defined

In [70]:
# Sorting 
# can sort a list in-place (without creating a new object)
a = [7, 2, 5, 1, 3]

In [71]:
a.sort()

In [72]:
a

[1, 2, 3, 5, 7]

In [73]:
# sort has a few options that will occasionally come in handy
# One is the ability to pass a secondary sort key - a function that produces a value to use to
# sort the objects
b = ['saw', 'small', 'He', 'foxes', 'six']

In [74]:
b.sort(key = len)

In [75]:
b

['He', 'saw', 'six', 'small', 'foxes']

In [76]:
# Binary search and maintaining a sorted list
# The built-in bisect module implements binary search and insertion into a sorted list
# bisect.bisect finds the location where an element should be inserted to keep it sorted
# bisect.insort actually insertes the element into that location
import bisect

In [77]:
c = [1, 2, 2, 2, 3, 4, 7]

In [78]:
bisect.bisect(c, 2)

4

In [79]:
bisect.bisect(c, 5)

6

In [80]:
bisect.insort(c, 6)

In [81]:
c

[1, 2, 2, 2, 3, 4, 6, 7]

In [82]:
# !!!The bisect module functions do not check whether the list is sorted,
# as doing so would be computationally expensive. Thus, using them with an unsorted list
# will succeed without error but may lead to incorrect results

In [83]:
# Sliding
# select sections of most sequence types by using slice notation
seq = [7, 2, 3, 7, 5, 6, 0, 1]

In [84]:
seq[1:5]

[2, 3, 7, 5]

In [85]:
seq[3:4] = [6,3]

In [86]:
seq

[7, 2, 3, 6, 3, 5, 6, 0, 1]

In [87]:
# While the element at the start index is included, the stop index is not included
# Either the start or stop can be omitted, in which case they default to the start
# of the sequence and the end of the sequence, respectively

In [88]:
seq[:5]

[7, 2, 3, 6, 3]

In [89]:
seq[3:]

[6, 3, 5, 6, 0, 1]

In [90]:
# Negative indices slice the sequence relative to the end
seq[-4:]

[5, 6, 0, 1]

In [91]:
seq[-6:-2]

[6, 3, 5, 6]

In [92]:
# A step can also be used after a second colon to, say, take every other elements
seq[::2]

[7, 3, 3, 6, 1]

In [93]:
#!!A clever use of this is to pass -1, which has the useful effect of reversing a list or tuple
seq[::-1]

[1, 0, 6, 5, 3, 6, 3, 2, 7]

In [94]:
# Built-in Sequence Functions

In [95]:
# Enumerate
# It's common when iterating over a sequence to want to keep track of the index of the current
# item
i = 0
for value in collection:
    # do something with value
    i += 1

NameError: name 'collection' is not defined

In [96]:
# Enumerate: returns a sequence of (i, value) tuples
#for i, value in enumerate(collection):
    # do something with value

In [97]:
some_list = ['foo', 'bar', 'baz']

In [98]:
mapping = {}

In [100]:
for i, v in enumerate(some_list):
    mapping[v] = i

In [101]:
mapping

{'foo': 0, 'bar': 1, 'baz': 2}

In [102]:
sorted([7, 1, 2, 6, 0, 3, 2])

[0, 1, 2, 2, 3, 6, 7]

In [103]:
sorted('horse race')

[' ', 'a', 'c', 'e', 'e', 'h', 'o', 'r', 'r', 's']

In [104]:
# zip
# zip "pairs" up the elements of a number of lists, tuples, or other sequences to create a list
# of tuples

In [105]:
seq1 = ['foo', 'bar', 'baz']

In [106]:
seq2 = ['one', 'two', 'three']

In [107]:
zipped = zip(seq1, seq2)

In [108]:
list(zipped)

[('foo', 'one'), ('bar', 'two'), ('baz', 'three')]

In [109]:
seq3 = [False, True]

In [110]:
list(zip(seq1, seq2, seq3))

[('foo', 'one', False), ('bar', 'two', True)]

In [111]:
# A very common use of zip: simultaneously iterating over multiple sequences,
# possibly also combined with enumerate:
for i, (a, b) in enumerate(zip(seq1, seq2)):
    print('{0}:{1}, {2}'.format(i, a, b))

0:foo, one
1:bar, two
2:baz, three


In [112]:
# Given a "zipped" sequence, zip can be applied in a clever way to "unzip" the sequence.
# Another way to think about this is converting a list of rows into a list of columns.
pitchers = [('Nolan', 'Ryan'), ('Roger', 'Clemens'), ('Schilling', 'Curt')]

In [114]:
first_names, last_names = zip(*pitchers)

In [115]:
first_names

('Nolan', 'Roger', 'Schilling')

In [116]:
last_names

('Ryan', 'Clemens', 'Curt')

In [117]:
# Reversed
list(reversed(range(10)))
# reverse is a generator, do it does not create the reversed sequence until materialized
# (e.g. with list or a for loop)

[9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

In [118]:
# Dict
# Most important built-in Python data structure: common name: hash map or associative array
# key-value pairs where key and value are Python objects.
# One approach for creating one is to use curly braces {} and colons to seperate keys and 
# values

In [119]:
empty_dict = {}

In [120]:
d1 = {'a': 'some value', 'b':[1, 2, 3, 4]}

In [121]:
d1

{'a': 'some value', 'b': [1, 2, 3, 4]}

In [122]:
# You can access, insert, or set elements using the same syntax as for accessing elements
# of a list or tuple
d1[7] = 'an integer'

In [123]:
d1

{'a': 'some value', 'b': [1, 2, 3, 4], 7: 'an integer'}

In [124]:
d1['b']

[1, 2, 3, 4]

In [125]:
# Can check if a dict contains a key using the same syntax used for checking whether a list or
# tuple contains a value
'b' in d1

True

In [126]:
# can delete values either using the del keyword or pop method

In [127]:
d1[5] = 'some value'

In [128]:
d1

{'a': 'some value', 'b': [1, 2, 3, 4], 7: 'an integer', 5: 'some value'}

In [129]:
d1['dummy'] = 'another value'

In [130]:
d1

{'a': 'some value',
 'b': [1, 2, 3, 4],
 7: 'an integer',
 5: 'some value',
 'dummy': 'another value'}

In [131]:
del d1[5]

In [134]:
d1

{'a': 'some value',
 'b': [1, 2, 3, 4],
 7: 'an integer',
 'dummy': 'another value'}

In [135]:
ret = d1.pop('dummy')

In [137]:
ret

'another value'

In [138]:
d1

{'a': 'some value', 'b': [1, 2, 3, 4], 7: 'an integer'}

In [139]:
# The keys and values method give you iterators of the dict's keys and values, respectively.
# While the key-value pairs are not in any particular order, these functions output the keys
# and values in the same order
list(d1.keys())

['a', 'b', 7]

In [140]:
list(d1.values())

['some value', [1, 2, 3, 4], 'an integer']

In [141]:
# merge one dict into another using the update method
d1.update({'b': 'foo', 'c':12})

In [142]:
d1

{'a': 'some value', 'b': 'foo', 7: 'an integer', 'c': 12}

In [143]:
# The update method changes dicts in-place, so any existing keys in the data passed to
# update will have their old values discarded.

In [144]:
# creating dicts from sequences
# it's common to occasionally end up with two sequences that you want to pair up element-wise
# in dict. 

# mapping = {}
# for key, value in zip(key_list, value_list):
#     mapping[key] = value

In [145]:
# Since a dict is essentially a collection fo 2-tuples, the dict function accepts a list of
# 2-tuples:
mapping = dict(zip(range(5), reversed(range(5))))

In [146]:
mapping

{0: 4, 1: 3, 2: 2, 3: 1, 4: 0}

In [147]:
# Default values
# It's very common to have logic like:

# if key in some_dict:
#    value = some_dict[key]
# else:
#    value = default_value

# Thus, the dict methods get and pop can take a default value to be returned, so that the 
# above if-else block can be written simply as:
# value = some_dict.get(key, default_value)

# get by default will return None if the key is not present, while pop will raise an
# exception. With setting values, a common case is for the values in a dict to be other 
# collections, like lists. 

In [148]:
# Categorizing a list of words by their first letters as a dict of lists

In [149]:
words = ['apple', 'bat', 'bar', 'atom', 'book']

In [150]:
by_letter = {}

In [151]:
for word in words:
    letter = word[0]
    if letter not in by_letter:
        by_letter[letter] = [word]
    else:
        by_letter[letter].append(word)

In [152]:
by_letter

{'a': ['apple', 'atom'], 'b': ['bat', 'bar', 'book']}

In [155]:
# The built-in collections module has a useful class, defaultdict, which makes this even
# easier. 
from collections import defaultdict
by_letter = defaultdict(list)
for word in words:
    by_letter[word[0]].append(word)

In [156]:
# Valid dict key types
# The keys generally have to be immutable objects like scalar types (int, float, string) or
# tuples (all the objects in the tuple need to be immutable, too)
# Hashability: can check whether an object is hashable (can be used as a key in a dict) with
# the hash function
hash('string')

2084312460925833788

In [157]:
hash((1, 2, (2, 3)))

1097636502276347782

In [158]:
hash((1, 2, [2, 3])) # fails because lists are mutable

TypeError: unhashable type: 'list'

In [159]:
# To use a list as a key, one option is to convert it to a tuple
# which can be hashed as long as its elements also can
d = {}
d[tuple([1, 2, 3])] = 5

In [160]:
d

{(1, 2, 3): 5}

In [161]:
# Set
# A set is an unordered collection of unique elements.
# It is like dictionary: with keys only and no values
set([2, 2, 2, 1, 3, 3])

{1, 2, 3}

In [162]:
{2, 2, 2, 1, 3, 3}

{1, 2, 3}

In [163]:
# sets support mathematical set operations like union, intersection, difference, and
# symmetric difference
a = {1, 2, 3, 4, 5}

In [164]:
b = {3, 4, 5, 6, 7, 8}

In [165]:
# Union can be computed with either the union method or the | binary operator
a.union(b)

{1, 2, 3, 4, 5, 6, 7, 8}

In [166]:
a | b

{1, 2, 3, 4, 5, 6, 7, 8}

In [167]:
a.intersection(b)

{3, 4, 5}

In [168]:
a & b

{3, 4, 5}

In [169]:
# set operation
# a.add(x)
# a.clear()

# a.remove(x)
# a.pop()

# a.union(b)
# a.update(b)

# a.intersection(b)
# a.intersection_update(b)

# a.difference(b)
# a.difference_update(b)
# a.symmetric_difference(b)
# a.symmetric_difference_update(b)

# a.issubset(b) True if the elements of a are all contained in b
# a.issuperset(b) True if the elements of b are all contained in a
# a.isdisjoint(b) True if a and b have no elemenets in common

In [170]:
# All of the logical set operations have in-place counterparts, which enable replacing the
# contents of the set on the left side of the operation with the result
c = a.copy()

In [171]:
c |= b

In [172]:
c

{1, 2, 3, 4, 5, 6, 7, 8}

In [173]:
d = a.copy()

In [174]:
d &= b # set the contents of d to be the intersection of the elements in a and b

In [175]:
d

{3, 4, 5}

In [176]:
# Like dicts, set elements generally must be immutable. To have list-like elements, need
# to convert it to a tuple
my_data = [1, 2, 3, 4]

In [177]:
my_set = {tuple(my_data)}

In [178]:
my_set

{(1, 2, 3, 4)}

In [179]:
# Check if a set is a subset of (is contained in) or a superset of (contains all elements of)
# another set
a_set = {1, 2, 3, 4, 5}

In [180]:
{1, 2, 3}.issubset(a_set)

True

In [181]:
a_set.issuperset({1, 2, 3})

True

In [182]:
# Sets are equal if and only if their contents are equal
{1, 2, 3} == {3, 2, 1}

True

In [183]:
# List Set and Dict Comprehensions
# List Comprehensions: allow you to concisely form a new list by filtering the elements of a
# collection, transforming the elements passing teh filter in one concise expression

# Basic form
# [expr for val in collection if condition]

In [184]:
# Equivalent to following for loop
# result = []
# for val in collection:
#     if condiction:
#         result.append(expr)

In [185]:
# Given a list of strings, could filter out strings with length 2 or less and also 
# convert them to uppercase
strings = ['a', 'as', 'bat', 'car', 'dove', 'python']

In [186]:
[x.upper() for x in strings if len(x) > 2]

['BAT', 'CAR', 'DOVE', 'PYTHON']

In [187]:
# Set and dict comprehensions are a natural extension
# dict comprehension
# dict_comp = {key-expr: value-expr for value in collection if condiction} 

In [188]:
# set comprehension looks like the equivalent list comprehension except with curly braces 
# instead of square brackets
# set_comp = {expr for value in collection if condiction}

In [189]:
# Wanted a set containing just the lengths of the strings contained in the collection;
# we could easily compute this using a set comprehension
unique_lengths = {len(x) for x in strings}

In [190]:
unique_lengths

{1, 2, 3, 4, 6}

In [191]:
# Express this more functionally using the map function
set(map(len, strings))

{1, 2, 3, 4, 6}

In [195]:
# Create a lookup map of these strings to their locations in the list
loc_mapping = {val: index for index, val in enumerate(strings)}

In [196]:
loc_mapping

{'a': 0, 'as': 1, 'bat': 2, 'car': 3, 'dove': 4, 'python': 5}

In [197]:
# Nested list comprehensions

# A list of lists conataining some English and Spanish names
all_data = [['John', 'Emily', 'Michael', 'Mary', 'Steven'],
           ['Maria', 'Juan', 'Javier', 'Natalis', 'Pilar']]

# Gotten these names from a couple of files and decided to organise them by language
# Wanted to get a single list containing all names with two or more e's in them

In [198]:
# Do with a simple for loop
names_of_interest = []
for names in all_data:
    enough_es = [name for name in names if name.count('e') >= 2]
    names_of_interest.extend(enough_es)

In [200]:
# can wrap this whole operation up in a single nested list comprehension
result = [name for names in all_data for name in names
         if name.count('e') >= 2]

In [201]:
result

['Steven']

In [202]:
some_tuples = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]

In [203]:
# "Flatten" a list of tuples of integers into a simple list of integers
some_tuples = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]

In [204]:
flattened = [x for tup in some_tuples for x in tup]

In [206]:
flattened

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [207]:
# The order of the for expressions would be the same if you wrote a nested for loop instead
# of a list comprehension

In [208]:
flattened = []

In [209]:
for tup in some_tuples:
    for x in tup:
        flattened.append(x)

In [210]:
# can have arbitrarily many levels of nesting, though if you have more than two or three 
# levels of nesting you should probably start to question whether this makes sense
# from a code readability standapoint

In [211]:
# A list comprehension inside a list comprehension
[[x for x in tup] for tup in some_tuples]
# This produces a list of lists, rather than a flattened list f all of the inner elements

[[1, 2, 3], [4, 5, 6], [7, 8, 9]]

In [212]:
# 3.2 Functions
# If anticipate needing to repeat the same or very similar code more than once, it may be
# worth writing a reusable function. 
# Functons can also help make your code mroe readable by giving a name to a group of
# Python statements

In [213]:
# If Python reaches the end of a function without encountering a return statement
# None is returned automatically

In [215]:
# Each function can have positional arguments and keyword argument.
# Keyword: most commonly used to specify default values or optional arguments

# x and y are positiona, arguments while z is a keyword argument. 
# This means that the function can be called in any of these ways

# my_function(5, 6, z = 0.7)
# my_function(3.14, 7, 3.5)
# my_function(10, 20)

# The main restriction on function arguments is that the keyword arguments must follow
# the positional arguments (if any).

In [216]:
# Namespaces, Scope, and Local Functions

# Access variables in two different scopes: global and local namespace

# Any variables that are assigned within a function by default are assigned to the local 
# namespace

In [218]:
# Assigning variables outside of the function's scope
# those variables must be declared as global via the global keyword
a = None

def bind_a_variable():
    global a
    a =[]
    bind_a_variable()

print(a)

None


In [220]:
# Return Multiple Values
def f():
    a = 5
    b = 6
    c = 7
    return a, b, c
# The function is actually just returning one object, namely a tuple, which is then being
# unpacked into the result variables.

a, b, c = f()

In [221]:
# Can do this instead
return_value = f()
# In this case, return_value would be a 3-tuple with the three returned variables

In [222]:
# Instead of returning 3-tuple returned variables, can return multiple values by returning 
# a dictionary instead
def f():
    a = 5
    b = 6
    c = 7
    return {'a': a, 'b': b, 'c': c}

In [223]:
# Functions Are Objects
# When doing some data cleaning and needed to app;y a bunch of transformations to the following
# list of strings
states = [' Alabama ', 'Georgia!', 'Georgia', 'georgia', 'FlOrIda',
          'south carolina##', 'West virginia?']
# make this list of strings uniform and ready for analysis: stripping whitespace, removing 
# punctuation symbold, and standardizing on proper capitalization.

In [224]:
# Use built-in string methods along with the re standard library module for regular expressions

In [225]:
import re

In [226]:
def clean_strings(strings):
    result = []
    for value in strings:
        value = value.strip()
        value = re.sub('[!#?]', '', value)
        value = value.title()
        result.append(value)
    return result

In [227]:
clean_strings(states)

['Alabama',
 'Georgia',
 'Georgia',
 'Georgia',
 'Florida',
 'South Carolina',
 'West Virginia']

In [228]:
# Alternative approach: make a list of the operations
def remove_punctuation(value):
    return re.sub('[!#?]', '', value)

In [229]:
clean_ops = [str.strip, remove_punctuation, str.title]

In [230]:
def clean_strings(strings, ops):
    result = []
    for value in strings:
        for function in ops:
            value = function(value)
        result.append(value)
    return result

In [231]:
clean_strings(states, clean_ops)

['Alabama',
 'Georgia',
 'Georgia',
 'Georgia',
 'Florida',
 'South Carolina',
 'West Virginia']

In [232]:
# use functions as arguments to other functions like the built-in map function,
# which applies a function to a sequence of some kind:
for x in map(remove_punctuation, states):
    print(x)

 Alabama 
Georgia
Georgia
georgia
FlOrIda
south carolina
West virginia


In [233]:
# Anonymous (Lambda) Functions
# Writing functions consisting of a single statement, the result of which is the return value

# They are defined with the lambda keyword, which has no meaning other than "we are declaring
# an anonymous function":
def short_function(x):
    return x * 2

equiv_anon = lambda x: x * 2

In [234]:
# less typing (and clearer) to pass a lambda function as opposed to writing a full-out
# function declaration or even assigning the lambda function to a local variable

# A silly example:
def apply_to_list(some_list, f):
    return [f(x) for x in some_list]

ints = [4, 0, 1, 5, 6]
apply_to_list(ints, lambda x: x * 2)

[8, 0, 2, 10, 12]

In [235]:
# sort a collection of strings by the number of distinct letter in each string:
strings = ['foo', 'card', 'car', 'aaaa', 'abab']

In [236]:
# Pass a lambda function to the list's sort method
strings.sort(key = lambda x: len(set(list(x))))

In [237]:
strings

['aaaa', 'foo', 'abab', 'car', 'card']

In [238]:
# One reason lambda functions are called anonymous functions is that, unlike functions
# declared with the keyword, the function object itself is never given an explicit
# _name_attribute

In [239]:
# Currying: Partial Argument Application
# Deriving new functions from existing ones by partial argument application

# Suppose we had a trivial function that adds two numbers together
def add_numbers(x, y):
    return x + y

In [240]:
# Using this function, we could derive a new function of one variable, add_five,
# that adds 5 to its argument:
add_five = lambda y: add_numbers(5, y)

In [241]:
# The second argument to add_numbers is said to be curried:
# define a new function that calls an existing function

# The built-in functools module can simplify this process using the partial function
from functools import partial
add_five = partial(add_numbers, 5)

In [242]:
# Iterate over sequences: objects in a list or lines in a file

In [243]:
some_dict = {'a': 1, 'b': 2, 'c': 3}

In [244]:
for key in some_dict:
    print(key)

a
b
c


In [245]:
# When write for key in some_dict, the Python interpreter first attempts to create an
# iterator out of some_dict:
dict_iterator = iter(some_dict)

In [247]:
dict_iterator

<dict_keyiterator at 0x7fe393a08890>

In [248]:
# An iterator is any ovject that will yield objects to the Python interpreter when used in
# a context like a for loop. 
# Most methods expecting a list or a list-like object will also accept any iterable object
# This includes built0in methods such as min,max, and sum, and type constructors like list
# and tuple
list(dict_iterator)

['a', 'b', 'c']

In [253]:
# A generator is a concise way to construct a new iterable object.
# Normal functions execute and return a single result at a time, generators return
# a sequence of multiple results lazily, pausing after each one until the next one is requested

# !!To create a generator, use the yield keyword instead of return in a function
def squares(n = 10):
    print('Generating squares from 1 to {0}'.format(n ** 2))
    for i in range(1, n + 1):
        yield i ** 2

In [255]:
# When you actually call the generator, no code is immediately executed
gen = squares()

In [256]:
gen

<generator object squares at 0x7fe393a13a50>

In [257]:
# Not until you request elements from the generator that it begins executing its code
for x in gen:
    print(x, end = '')

Generating squares from 1 to 100
149162536496481100

In [258]:
# Another even more concise way to make a generator is by using a generator expression.
# This is a generator analogue to list, dict, and set comprehensions;

# To create one, enclose what would otherwise be a list comprehension within parenthese
# instead of brackets
gen = (x ** 2 for x in range(100))

In [259]:
gen

<generator object <genexpr> at 0x7fe393a13c50>

In [261]:
# This is completely equivalent to the following more verbose generator:
def _make_gen():
    for x in range(100):
        yield x ** 2

gen = _make_gen()

In [262]:
# Generator expressions can be used instead of list comprehensions as function arguments in
# many cases
sum(x ** 2 for x in range(100))

328350

In [263]:
dict((i, i ** 2) for i in range(5))

{0: 0, 1: 1, 2: 4, 3: 9, 4: 16}

In [264]:
# itertools module
# The standard library itertools module has a collection of generators for many
# common data algorithms

# e.g. groupby takes any sequence and a function, grouping consecutive elements
# in the sequence by return value of the function.

In [265]:
import itertools

first_letter = lambda x : x[0]

In [266]:
names = ['Alan', 'Adam', 'Wes', 'Will', 'Albert', 'Steven']

In [267]:
for letter, names in itertools.groupby(names, first_letter):
    print(letter, list(names)) # names is a generator

A ['Alan', 'Adam']
W ['Wes', 'Will']
A ['Albert']
S ['Steven']


In [268]:
# Some useful itertools functions

# combinations(iterable, k): generates a sequence of all possible k-tuples of elements
# in the iterable, ignoring order and without replacement (see also the compaion function 
# companion function combinations_with_replacement)

# permutations(iterable, k): generates a sequence of all possible k-tuples of elements
# in the iterable, respecting order.

# groupby(iterable[, keyfunc]): generates (key, sub-iterator) for each unique key

# product(*iterables, repeat = 1): generates the cartesian product of the input iterables
# as tuples, similar to a nested for loop

In [269]:
# Errors and Exception Handling
# Example: Python's float function is capable of casting a string to a floating-point
# number, but fails with ValueError on improper inputs
float('1.2345')

1.2345

In [270]:
float('something')

ValueError: could not convert string to float: 'something'

In [271]:
# Suppose we wanted a version of float that fails gracefully,
# returning the input argument. Can do this by writing a function that enclose that call to 
# float in a try/except block:
def attempt_float(x):
    try:
        return float(x)
    except:
        return x

In [273]:
attempt_float('1.2345')

1.2345

In [274]:
attempt_float('something')

'something'

In [275]:
# Float can raise exceptions other than ValueError:
float((1, 2))

TypeError: float() argument must be a string or a number, not 'tuple'

In [276]:
# Might want to only suppress ValueError, since a TypeError (the input was not a string or 
# numeric value) might indixate a legitimate bug in your program.
def attempt_float(x):
    try:
        return float(x)
    except ValueError:
        return x

In [277]:
attempt_float((1, 2))

TypeError: float() argument must be a string or a number, not 'tuple'

In [278]:
# can catch multiple exception types by writing a tuple of exception types instead
# (the parentheses are required):
def attempt_float(x):
    try:
        return float(x)
    except (TypeError, ValueError):
        return x

In [281]:
# In some cases, you may not want to suppress an exception, but you want some code to be 
# executed regardless of whether the code in the block succeeds or not.
# To do this, use finally:
f = open(path, 'w')

try:
    write_to_file(f)
finally:
    f.close()

NameError: name 'path' is not defined

In [282]:
# Can have code that executes only if the try: block succeeds using else:
f = open(path, 'w')

try:
    write_to_file(f)
except:
    print('Failed')
else:
    print('Succeeded')
finally:
    f.close()

NameError: name 'path' is not defined

In [283]:
# To open a file for reading or writing, use the built-in open function with either a relative
# or absolute file path:
path = 'examples/segismundo.txt'

In [284]:
f = open(path)

NameError: name 'pathe' is not defined

In [285]:
# By default, the file is opened in read-only mode 'r'.
# Can then treat the file handle f like a list and iterate over the lines like so:
for line in f:
    pass

TypeError: 'function' object is not iterable

In [286]:
# lines = [x.rstrip() for x in open(path)]
# lines

In [287]:
# When you use open to create file objects, it is important to explicitly close the file 
# when you are finished with it.

# One of the ways to make it easier to clean up open files is to use the with statement
# with open(path) as f:
#       lines = [x.rstrip() for x in f]
# This will automatically close the file f when exiting the with block

In [288]:
# If we had typed f = open(path, 'w'), a new file at examples/segismundo.txt would have
# been created (be careful!), overwriting any one in its plcae. 

# There is also the 'x' file mode, which creates a writable file but fails if the file 
# path already exists

In [289]:
# readable files: some of the most commonly used methods are read, seek and tell
# read: returns a certain number of characters from the file: what constitues a
# "character" is determined by the file's encoding (e.g. UTF-8) or simply raw bytes if the 
# file is ipned in binary mode:

# f = open(path)
# f.read(10)
# 'Sueña el r'

# f2 = open(path, 'rb') # Binary mode

# f2.read(10)
# b'Sue\xc3\xb1a el '

In [290]:
# The read method advances the file handle's position ny the number of bytes read.
# tell gives you the current position

# f.tell()
# 11

# f2.tell()
# 10

In [291]:
# Even though we read 10 characters from the file, the position is 11 because it took
# that many bytes to decode 10 characters using the default encoding. You can check
# the deafult encoding in the sys module:
import sys

In [292]:
sys.getdefaultencoding()

'utf-8'

In [293]:
# seek changes the file position to the indicated byte in the file:
# f.seek(3)

# f.read(1)
# 'ñ'

In [294]:
# close the files
# f.close()
# f2.close()

In [295]:
# r  Read-only mode
# w  Write-only mode; creates a new file (erasing the data for any file with the same name)
# x  Write-only mode; creates a new file, but fails if the file path already exists
# a  Append to existing file (create the file if does not already exist)
# r+ Read and write
# b  Add to mode for binary files (i.e., 'rb' or 'wb')
# t  Text mode for files (automatically decoding bytes to Unicode). This is the default
# if not specified. Add to other modes to use this (i.e. 'rt' or 'xt')

In [296]:
# write text to a file, you can use the file's write or writelines methods

In [297]:
# Create a version of prof_mod.py with no blank lines

# with open('tmp.txt', 'w') as handle:
#    handle.writelines(x for x in open(path) if len(x) > 1)

# with open('tmp.txt') as f:
#    lines = f.readlines()

# lines
# ['Sueña el rico en su riqueza,\n',
#  'que más cuidados le ofrece;\n',
#  'sueña el pobre que padece\n',
#  'su miseria y su pobreza;\n',
#  'sueña el que a medrar empieza,\n',
#  'sueña el que afana y pretende,\n',
#  'sueña el que agravia y ofende,\n',
#  'y en el mundo, en conclusión,\n',
#  'todos sueñan lo que son,\n',
#  'aunque ninguno lo entiende.\n']

In [298]:
# Important Python file methods or attributes

# read([size]): Return data from file as a string, with optional size argument indicating the
# number of bytes to read
# readlines([size]): Return list of lines in the file, with optional size argument
# write(str): Write passed string to file
# writelines(strings): Write passed sequence of strings to the file
# close()
# flush()
# seek(pos)
# tell()
# closed

In [299]:
# Bytes and Unicode with Files
# The deafult behavior for Python files (wehther readable or writable) is text mode, means
# you intend to work with Python strings (i.e. Unicode). 

# This contrasts with binary mode, which you can obtain by appending b onto the file mode

In [300]:
# File which contains non-ASCII characters with UTF-8 encoding

# with open(path) as f:
#    chars = f.read(10)

# chars
# 'Sueña el r'

# UTF-8 is a variable-length Unicode encoding, so when I requested some number of characters
# characters from the file, Python reads enough bytes (which could be as few as 10 or as 
# many as 40 bytes) from the file to decode that many character

In [301]:
# If I open the file in 'rb' mode instead, read requests exact numbers of bytes
# with open(path, 'rb') as f:
#     data = f.read(10)

# data
# b'Sue\xc3\xb1a el '

In [302]:
# Depending on the text encoding, you may be able to decode the bytes to a str object 
# yourself, but only if each of the encoded Unicolde characters is fully formed

# data.decode('utf8')
# 'Sueña el '

# data[:4].decode('utf8')
# UnicodeDecodeError Traceback (most recent call last)
# <ipython-input-235-300e0af10bb7> in <module>()
# ----> 1 data[:4].decode('utf8')
# UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 3: unexpecte
# d end of data

In [303]:
# Text mode, combined with the encoding option of opern, provides a convenient way to
# convert from one Unicode encoding to another

# sink_path = 'sink.txt'
# with open(path) as source:
#    with open(sink_path, 'xt', encoding = 'iso-8859-1') as sink:
#       sink.write(source.read())

# with open(sink_path, encoding = 'iso-8859-1') as f:
#     print(f.read(10))

# Sueña el r

In [None]:
# using seek when opening files in any mode other than binary.
# If the file position falls in the middle of the bytes defining a Unicode character, then 
# subsequent reads will result in an error

# f = open(path)
# f.read(5)
# 'Sueña'

# f.seek(4)
# 4

# f.read(1)
# UnicodeDecodeError Traceback (most recent call last)
# ...

# f.close()