# 1) Whitespace Formatting

In [1]:
for i in [1, 2, 3, 4, 5]:
    print i # first line in "for i" block
    for j in [1, 2, 3, 4, 5]:
        print j # first line in "for j" block
        print i + j # last line in "for j" block
    print i # last line in "for i" block
print "done looping"

1
1
2
2
3
3
4
4
5
5
6
1
2
1
3
2
4
3
5
4
6
5
7
2
3
1
4
2
5
3
6
4
7
5
8
3
4
1
5
2
6
3
7
4
8
5
9
4
5
1
6
2
7
3
8
4
9
5
10
5
done looping


In [2]:
long_winded_computation = (1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + 11 + 12 +
                            13 + 14 + 15 + 16 + 17 + 18 + 19 + 20)
print long_winded_computation

210


In [3]:
list_of_lists = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
print list_of_lists

[[1, 2, 3], [4, 5, 6], [7, 8, 9]]


In [4]:
easier_to_read_list_of_lists = [ [1, 2, 3],
                                [4, 5, 6],
                                [7, 8, 9] ]
print easier_to_read_list_of_lists

[[1, 2, 3], [4, 5, 6], [7, 8, 9]]


In [5]:
two_plus_three = 2 + \
                3

print two_plus_three

5


# 2) Modules

In [6]:
import re
# pattern matches one or more digits; re.I requests case-insensitive matching
my_regex = re.compile("[0-9]+", re.I)

# 3) Arithmetic

Python 2.7 uses integer division by default, so that 5 / 2 equals 2. Almost always this
is not what we want, so we will always start our files with:

In [7]:
from __future__ import division

# 4) Functions

A function is a rule for taking zero or more inputs and returning a corresponding
output. In Python, we typically define functions using def:

In [8]:
def double(x):
    """Return the input multiplied by 2.

    (This string is the optional docstring: the place where you
    explain what the function does.)
    """
    doubled = x * 2
    return doubled

Python functions are first-class, which means that we can assign them to variables
and pass them into functions just like any other arguments:

In [9]:
def apply_to_one(f):
    """Invoke the callable f with the single argument 1 and return its result."""
    result = f(1)
    return result
my_double = double # refers to the previously defined function
x = apply_to_one(my_double) # equals 2
print x

2


It is also easy to create short anonymous functions, or lambdas:

In [10]:
y = apply_to_one(lambda x: x + 4) # equals 5
print y

5


You can assign lambdas to variables, although most people will tell you that you
should just use def instead:

In [11]:
another_double = lambda x: 2 * x # don't do this
def another_double(x): # do this instead
    """Return twice the value of x."""
    return 2 * x

Function parameters can also be given default arguments, which only need to be
specified when you want a value other than the default:

In [74]:
def my_print(message="my default message"):
    """Print message; when called with no argument, print the default text."""
    print message

my_print("hello") # prints 'hello'
my_print() # prints 'my default message'

hello
my default message


It is sometimes useful to specify arguments by name:

In [75]:
def subtract(a=0, b=0):
    """Return a minus b; each argument defaults to 0 and may be passed by name."""
    difference = a - b
    return difference

subtract(10, 5) # returns 5
subtract(0, 5) # returns -5
subtract(b=5) # same as previous

-5

# 5) Strings

Strings can be delimited by single or double quotation marks (but the quotes have to
match):

In [76]:
single_quoted_string = 'data science'
double_quoted_string = "data science"

Python uses backslashes to encode special characters. For example:

In [77]:
tab_string = "\t" # represents the tab character
len(tab_string) # is 1

1

If you want backslashes as backslashes (which you might in Windows directory
names or in regular expressions), you can create raw strings using r"":

In [78]:
not_tab_string = r"\t" # represents the characters '\' and 't'
len(not_tab_string) # is 2

2

You can create multiline strings using triple quotes (three double-quote characters here):

In [79]:
multi_line_string = """This is the first line.
and this is the second line
and this is the third line"""

# 6) Exceptions

When something goes wrong, Python raises an exception. Unhandled, these will cause
your program to crash. You can handle them using try and except:

In [80]:
try:
    print 0 / 0
except ZeroDivisionError:
    print "cannot divide by zero"

cannot divide by zero


Although in many languages exceptions are considered bad, in Python there is no
shame in using them to make your code cleaner, and we will occasionally do so.

# 7) Lists

Probably the most fundamental data structure in Python is the list. A list is simply
an ordered collection. (It is similar to what in other languages might be called an
array, but with some added functionality.)

In [81]:
integer_list = [1, 2, 3]
heterogeneous_list = ["string", 0.1, True]
list_of_lists = [ integer_list, heterogeneous_list, [] ]

list_length = len(integer_list) # equals 3
print list_length
list_sum = sum(integer_list) # equals 6
print list_sum

3
6


You can get or set the nth element of a list with square brackets:

In [82]:
x = range(10) # is the list [0, 1, ..., 9]
x

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [83]:
zero = x[0] # equals 0, lists are 0-indexed
zero

0

In [84]:
one = x[1] # equals 1
one

1

In [85]:
nine = x[-1] # equals 9, 'Pythonic' for last element
nine

9

In [86]:
eight = x[-2] # equals 8, 'Pythonic' for next-to-last element
eight

8

In [87]:
x[0] = -1 # now x is [-1, 1, 2, 3, ..., 9]
x

[-1, 1, 2, 3, 4, 5, 6, 7, 8, 9]

You can also use square brackets to “slice” lists:

In [88]:
first_three = x[:3] # [-1, 1, 2]
print first_three

three_to_end = x[3:] # [3, 4, ..., 9]
print three_to_end

one_to_four = x[1:5] # [1, 2, 3, 4]
print one_to_four

last_three = x[-3:] # [7, 8, 9]
print last_three

without_first_and_last = x[1:-1] # [1, 2, ..., 8]
print without_first_and_last

copy_of_x = x[:] # [-1, 1, 2, ..., 9]
print copy_of_x

[-1, 1, 2]
[3, 4, 5, 6, 7, 8, 9]
[1, 2, 3, 4]
[7, 8, 9]
[1, 2, 3, 4, 5, 6, 7, 8]
[-1, 1, 2, 3, 4, 5, 6, 7, 8, 9]


Python has an in operator to check for list membership:

In [89]:
1 in [1, 2, 3] # True

True

In [90]:
0 in [1, 2, 3] # False

False

This check involves examining the elements of the list one at a time, which means
that you probably shouldn’t use it unless you know your list is pretty small (or unless
you don’t care how long the check takes).

It is easy to concatenate lists together:

In [91]:
x = [1, 2, 3]
x.extend([4, 5, 6]) # x is now [1,2,3,4,5,6]
x

[1, 2, 3, 4, 5, 6]

If you don’t want to modify x you can use list addition:

In [92]:
x = [1, 2, 3]
y = x + [4, 5, 6] # y is [1, 2, 3, 4, 5, 6]; x is unchanged
y

[1, 2, 3, 4, 5, 6]

More frequently we will append to lists one item at a time:

In [93]:
x = [1, 2, 3]
x.append(0) # x is now [1, 2, 3, 0]
x

[1, 2, 3, 0]

In [94]:
y = x[-1] # equals 0
print y

z = len(x) # equals 4
print z

0
4


It is often convenient to unpack lists if you know how many elements they contain:

In [95]:
x, y = [1, 2] # now x is 1, y is 2
print x, y

1 2


although you will get a ValueError if you don’t have the same number of elements
on both sides.

It’s common to use an underscore for a value you’re going to throw away:

In [96]:
_, y = [1, 2] # now y == 2, didn't care about the first element
y

2

# 8) Tuples

Tuples are lists’ immutable cousins. Pretty much anything you can do to a list that
doesn’t involve modifying it, you can do to a tuple. You specify a tuple by using
parentheses (or nothing) instead of square brackets:

In [97]:
my_list = [1, 2]
my_tuple = (1, 2)
other_tuple = 3, 4

my_list[1] = 3 # my_list is now [1, 3]
print my_list

[1, 3]


In [98]:
try:
    my_tuple[1] = 3
except TypeError:
    print "cannot modify a tuple"

cannot modify a tuple


Tuples are a convenient way to return multiple values from functions:

In [99]:
def sum_and_product(x, y):
    """Return the tuple (x + y, x * y)."""
    total = x + y
    product = x * y
    return total, product

sp = sum_and_product(2, 3) # equals (5, 6)
print sp

s, p = sum_and_product(5, 10) # s is 15, p is 50
print s, p

(5, 6)
15 50


Tuples (and lists) can also be used for multiple assignment:

In [100]:
x, y = 1, 2 # now x is 1, y is 2
x, y = y, x # Pythonic way to swap variables; now x is 2, y is 1

# 9) Dictionaries

Another fundamental data structure is a dictionary, which associates values with keys
and allows you to quickly retrieve the value corresponding to a given key:

In [101]:
empty_dict = {} # Pythonic
empty_dict2 = dict() # less Pythonic

grades = { "Joel" : 80, "Tim" : 95 } # dictionary literal
grades

{'Joel': 80, 'Tim': 95}

You can look up the value for a key using square brackets:

In [102]:
joels_grade = grades["Joel"] # equals 80
joels_grade

80

But you’ll get a KeyError if you ask for a key that’s not in the dictionary:

In [103]:
try:
    kates_grade = grades["Kate"]
except KeyError:
    print "no grade for Kate!"

no grade for Kate!


You can check for the existence of a key using in:

In [104]:
joel_has_grade = "Joel" in grades # True
joel_has_grade

True

In [105]:
kate_has_grade = "Kate" in grades # False
kate_has_grade

False

Dictionaries have a get method that returns a default value (instead of raising an
exception) when you look up a key that’s not in the dictionary:

In [106]:
joels_grade = grades.get("Joel", 0) # equals 80
print joels_grade

kates_grade = grades.get("Kate", 0) # equals 0
print kates_grade

no_ones_grade = grades.get("No One") # default default is None
print no_ones_grade

80
0
None


You assign key-value pairs using the same square brackets:

In [107]:
grades["Tim"] = 99 # replaces the old value
grades["Kate"] = 100 # adds a third entry
print grades

num_students = len(grades) # equals 3
print num_students

{'Tim': 99, 'Joel': 80, 'Kate': 100}
3


We will frequently use dictionaries as a simple way to represent structured data:

In [108]:
tweet = {
"user" : "joelgrus",
"text" : "Data Science is Awesome",
"retweet_count" : 100,
"hashtags" : ["#data", "#science", "#datascience", "#awesome", "#yolo"]
}

Besides looking for specific keys we can look at all of them:

In [109]:
tweet_keys = tweet.keys() # list of keys
print tweet_keys

tweet_values = tweet.values() # list of values
print tweet_values

tweet_items = tweet.items() # list of (key, value) tuples
print tweet_items

['text', 'retweet_count', 'hashtags', 'user']
['Data Science is Awesome', 100, ['#data', '#science', '#datascience', '#awesome', '#yolo'], 'joelgrus']
[('text', 'Data Science is Awesome'), ('retweet_count', 100), ('hashtags', ['#data', '#science', '#datascience', '#awesome', '#yolo']), ('user', 'joelgrus')]


In [110]:
"user" in tweet_keys # True, but uses a slow list in

True

In [111]:
"user" in tweet # more Pythonic, uses faster dict in

True

In [112]:
"joelgrus" in tweet_values # True

True

Dictionary keys must be immutable; in particular, you cannot use lists as keys. If
you need a multipart key, you should use a tuple or figure out a way to turn the key
into a string.

## defaultdict

Imagine that you’re trying to count the words in a document. An obvious approach is to create a dictionary in which the keys are words and the values are counts. As you check each word, you can increment its count if it’s already in the dictionary and add
it to the dictionary if it’s not:

In [114]:
# NOTE(review): `document` is not defined in any cell shown here;
# presumably it is an iterable of words -- confirm before running
word_counts = {}
for word in document:
    if word in word_counts:
        # seen before: bump the existing count
        word_counts[word] += 1
    else:
        # first occurrence: start the count at 1
        word_counts[word] = 1

You could also use the “forgiveness is better than permission” approach and just handle
the exception from trying to look up a missing key:

In [None]:
word_counts = {}
for word in document:
    try:
        word_counts[word] += 1
    except KeyError:
        word_counts[word] = 1

A third approach is to use **get**, which behaves gracefully for missing keys:

In [None]:
word_counts = {}
for word in document:
    previous_count = word_counts.get(word, 0)
    word_counts[word] = previous_count + 1

Every one of these is slightly unwieldy, which is why defaultdict is useful. A
defaultdict is like a regular dictionary, except that when you try to look up a key it
doesn’t contain, it first adds a value for it using a zero-argument function you provided
when you created it. In order to use defaultdicts, you have to import them
from collections:

In [115]:
from collections import defaultdict

word_counts = defaultdict(int) # int() produces 0
for word in document:
    word_counts[word] += 1

They can also be useful with list or dict or even your own functions:

In [117]:
dd_list = defaultdict(list) # list() produces an empty list
dd_list[2].append(1) # now dd_list contains {2: [1]}

dd_list

defaultdict(list, {2: [1]})

In [118]:
dd_dict = defaultdict(dict) # dict() produces an empty dict
dd_dict["Joel"]["City"] = "Seattle" # { "Joel" : { "City" : "Seattle" } }

dd_dict

defaultdict(dict, {'Joel': {'City': 'Seattle'}})

In [119]:
dd_pair = defaultdict(lambda: [0, 0])
dd_pair[2][1] = 1

dd_pair

defaultdict(<function __main__.<lambda>>, {2: [0, 1]})

These will be useful when we’re using dictionaries to “collect” results by some key and
don’t want to have to check every time to see if the key exists yet.

## Counter

A Counter turns a sequence of values into a defaultdict(int)-like object mapping
keys to counts. We will primarily use it to create histograms:

In [120]:
from collections import Counter
c = Counter([0, 1, 2, 0]) # c is (basically) { 0 : 2, 1 : 1, 2 : 1 }
c

Counter({0: 2, 1: 1, 2: 1})

This gives us a very simple way to solve our word_counts problem:

In [None]:
word_counts = Counter(document)

A Counter instance has a most_common method that is frequently useful:

In [None]:
# print the 10 most common words and their counts
for word, count in word_counts.most_common(10):
    print word, count

# 10) Sets

Another data structure is set, which represents a collection of distinct elements:

In [121]:
s = set()
s.add(1) # s is now { 1 }
s.add(2) # s is now { 1, 2 }
s.add(2) # s is still { 1, 2 }
s

{1, 2}

In [122]:
x = len(s) # equals 2
print x

y = 2 in s # equals True
print y

z = 3 in s # equals False
print z

2
True
False


We’ll use sets for two main reasons. The first is that in is a very fast operation on sets.
If we have a large collection of items that we want to use for a membership test, a set
is more appropriate than a list:

In [None]:
stopwords_list = ["a","an","at"] + hundreds_of_other_words + ["yet", "you"]
"zip" in stopwords_list # False, but have to check every element

In [None]:
stopwords_set = set(stopwords_list)
"zip" in stopwords_set # very fast to check

The second reason is to find the distinct items in a collection:

In [125]:
item_list = [1, 2, 3, 1, 2, 3]

num_items = len(item_list) # 6
item_set = set(item_list) # {1, 2, 3}
print item_set

num_distinct_items = len(item_set) # 3
distinct_item_list = list(item_set) # [1, 2, 3]
print distinct_item_list

set([1, 2, 3])
[1, 2, 3]


We’ll use sets much less frequently than dicts and lists.

# 11) Control Flow

As in most programming languages, you can perform an action conditionally using
if:

In [12]:
if 1 > 2:
    message = "if only 1 were greater than two..."
elif 1 > 3:
    message = "elif stands for 'else if'"
else:
    message = "when all else fails use else (if you want to)"

You can also write a ternary if-then-else on one line, which we will do occasionally:

In [13]:
parity = "even" if x % 2 == 0 else "odd"

Python has a while loop:

In [14]:
x = 0
while x < 10:
    print x, "is less than 10"
    x += 1

0 is less than 10
1 is less than 10
2 is less than 10
3 is less than 10
4 is less than 10
5 is less than 10
6 is less than 10
7 is less than 10
8 is less than 10
9 is less than 10


although more often we’ll use for and in:

In [15]:
for x in range(10):
    print x, "is less than 10"

0 is less than 10
1 is less than 10
2 is less than 10
3 is less than 10
4 is less than 10
5 is less than 10
6 is less than 10
7 is less than 10
8 is less than 10
9 is less than 10


If you need more-complex logic, you can use continue and break:

In [16]:
for x in range(10):
    if x == 3:
        continue # go immediately to the next iteration
    if x == 5:
        break # quit the loop entirely
    print x

0
1
2
4


# 12) Truthiness

Booleans in Python work as in most other languages, except that they’re capitalized:

In [1]:
one_is_less_than_two = 1 < 2 # equals True
true_equals_false = True == False # equals False

Python uses the value None to indicate a nonexistent value. It is similar to other languages’
null:

In [2]:
x = None
print x == None # prints True, but is not Pythonic
print x is None # prints True, and is Pythonic

True
True


Python lets you use any value where it expects a Boolean. The following are all
“Falsy”:
- False
- None
- [] (an empty list)
- {} (an empty dict)
- ""
- set()
- 0
- 0.0

Pretty much anything else gets treated as True. This allows you to easily use if statements
to test for empty lists or empty strings or empty dictionaries or so on. It also
sometimes causes tricky bugs if you’re not expecting this behavior:

In [None]:
s = some_function_that_returns_a_string()
if s:
    first_char = s[0]
else:
    first_char = ""

A simpler way of doing the same is:

In [None]:
first_char = s and s[0]

since the and operator returns its second value when the first is “truthy,” and the first value when it’s
not. Similarly, if x is either a number or possibly None, then safe_x below:

In [6]:
safe_x = x or 0
print safe_x

0


is definitely a number.

Python has an all function, which takes a list and returns True precisely when every
element is truthy, and an any function, which returns True when at least one element
is truthy:

In [7]:
all([True, 1, { 3 }]) # True
all([True, 1, {}]) # False, {} is falsy
any([True, 1, {}]) # True, True is truthy
all([]) # True, no falsy elements in the list
any([]) # False, no truthy elements in the list

False

# 13) Sorting

Every Python list has a sort method that sorts it in place. If you don’t want to mess
up your list, you can use the sorted function, which returns a new list:

In [8]:
x = [4,1,2,3]
y = sorted(x) # is [1,2,3,4], x is unchanged
print y

x.sort() # now x is [1,2,3,4]
print x

[1, 2, 3, 4]
[1, 2, 3, 4]


By default, sort (and sorted) sort a list from smallest to largest based on naively
comparing the elements to one another.

If you want elements sorted from largest to smallest, you can specify a reverse=True
parameter. And instead of comparing the elements themselves, you can compare the
results of a function that you specify with key:

In [9]:
# sort the list by absolute value from largest to smallest
x = sorted([-4,1,-2,3], key=abs, reverse=True) # is [-4,3,-2,1]
print x

[-4, 3, -2, 1]


In [None]:
# sort the words and counts from highest count to lowest
# (each item is a (word, count) pair; index it instead of unpacking it in the
# lambda's parameter list, which is a syntax error on Python 3)
wc = sorted(word_counts.items(),
            key=lambda word_and_count: word_and_count[1],
            reverse=True)