In [1]:
# Widen the notebook container so wide outputs are not clipped.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Learning objectives

 - Pandas helps deal with tabular (tables) data
 - A list of lists is not an adequate alternative to Excel
 - Series: new data structure
     - hybrid of a dict and a list
     - Python dict "key" equivalent to "index" in pandas
     - Python list "index" equivalent to "integer position" in pandas
     - supports complicated expressions within lookup [...]
     - element-wise operation
     - boolean indexing
 - DataFrames aka tables (next lecture)
     - built from series
     - each series will be a column in the table

# Run the following command in terminal or powershell
## pip install pandas

In [2]:
import pandas

In [3]:
pandas.Series

pandas.core.series.Series

## Module naming abbreviation

In [4]:
import pandas as pd

In [5]:
pd.Series

pandas.core.series.Series

## Create a series from a dict

In [6]:
#create a series from a dict
d = {"one":7, "two":8, "three":9}
d

{'one': 7, 'two': 8, 'three': 9}

In [7]:
s = pd.Series({"one":7, "two":8, "three":9})
s

one      7
two      8
three    9
dtype: int64

In [8]:
# IP  index    value
# 0   one      7
# 1   two      8
# 2   three    9

# dtype: int64

## Accessing values with index (.loc[...])

In [9]:
# dict access with key
d["one"]

7

In [10]:
s.loc["one"]

7

In [11]:
s.loc["two"]

8

## Accessing values with integer position (.iloc[...])

In [12]:
s.iloc[0]

7

In [13]:
s.iloc[1]

8

In [14]:
s.iloc[-1]

9

In [15]:
s["one"]

7

In [16]:
# A bare integer key on a label-indexed Series (s[0]) relied on a positional
# fallback that is deprecated in pandas 2.x and raises KeyError in pandas 3.x.
# Use .iloc for integer positions.
s.iloc[0]

7

## Accessing multiple values with a list of integer positions

In [17]:
# A list of integer positions must go through .iloc; on a label-indexed Series
# s[[0, 2]] depends on the deprecated positional fallback of [] indexing.
s.iloc[[0, 2]]

one      7
three    9
dtype: int64

In [18]:
#series access with a list of indexes
s[["one", "three"]]

one      7
three    9
dtype: int64

## Create a series from a list

In [19]:
# Series created from a list: with no explicit index, pandas assigns the
# default integer index 0..n-1, so index and integer position coincide.
num_list = [100, 200, 300]
s = pd.Series(num_list)
s

0    100
1    200
2    300
dtype: int64

In [20]:
# IP  index value
# 0   0      100
# 1   1      200
# 2   2      300
# dtype: int64

In [21]:
print(s.loc[1])
print(s.iloc[1])

200
200


In [22]:
letters_list = ["A", "B", "C", "D"]
letters = pd.Series(letters_list)
# letters[-1] #Avoid negative indexes, unless we use .iloc

## Slicing series using integer positions

In [23]:
letters_list = ["A", "B", "C", "D"]
letters = pd.Series(letters_list)
letters

0    A
1    B
2    C
3    D
dtype: object

In [24]:
#list slicing review
letters_list

['A', 'B', 'C', 'D']

In [25]:
sliced_letter_list = letters_list[2:]
sliced_letter_list

['C', 'D']

In [26]:
sliced_letter_list[0]

'C'

In [27]:
#series slicing
letters

0    A
1    B
2    C
3    D
dtype: object

In [28]:
sliced_letters = letters[2:]
sliced_letters

2    C
3    D
dtype: object

In [29]:
sliced_letters.loc[2]

'C'

In [30]:
sliced_letters.iloc[0]

'C'

In [31]:
# sliced_letters.loc[0] # index 0 doesn't exist in the sliced series!

In [32]:
sliced_letters[2]

'C'

In [33]:
# Note: integer positions get renumbered, whereas indexes do not.

# IP  Index  values
# 0   2       C
# 1   3       D
# dtype: object

## Slicing series using index

In [34]:
s = pd.Series({"one":7, "two":8, "three":9})
s

one      7
two      8
three    9
dtype: int64

In [35]:
#slicing with indexes
s["two":]

two      8
three    9
dtype: int64

## Element-wise operations
1. SERIES op SCALAR
2. SERIES op SERIES

In [36]:
#list recap
nums = [1, 2, 3]
nums * 3

[1, 2, 3, 1, 2, 3, 1, 2, 3]

In [37]:
snum = pd.Series(nums)
snum

0    1
1    2
2    3
dtype: int64

In [38]:
snum * 3

0    3
1    6
2    9
dtype: int64

In [39]:
snum + 3

0    4
1    5
2    6
dtype: int64

In [40]:
snum / 3

0    0.333333
1    0.666667
2    1.000000
dtype: float64

In [41]:
nums

[1, 2, 3]

In [42]:
# nums / 3 # doesn't work with lists

In [43]:
snum

0    1
1    2
2    3
dtype: int64

In [44]:
snum += 2
snum

0    3
1    4
2    5
dtype: int64

In [45]:
#list recap
l1 = [1, 2, 3]
l2 = [4, 5, 6]
l1 + l2

[1, 2, 3, 4, 5, 6]

In [46]:
s1 = pd.Series(l1)
s2 = pd.Series(l2)
print(s1)
print(s2)
s1 + s2

0    1
1    2
2    3
dtype: int64
0    4
1    5
2    6
dtype: int64


0    5
1    7
2    9
dtype: int64

In [47]:
print(s1)
print(s2)
s1 * s2

0    1
1    2
2    3
dtype: int64
0    4
1    5
2    6
dtype: int64


0     4
1    10
2    18
dtype: int64

In [48]:
print(s1)
print(s2)
s1 / s2

0    1
1    2
2    3
dtype: int64
0    4
1    5
2    6
dtype: int64


0    0.25
1    0.40
2    0.50
dtype: float64

In [49]:
print(s1)
print(s2)
s2 ** s1

0    1
1    2
2    3
dtype: int64
0    4
1    5
2    6
dtype: int64


0      4
1     25
2    216
dtype: int64

In [50]:
print(s1)
print(s2)
s1 < s2

0    1
1    2
2    3
dtype: int64
0    4
1    5
2    6
dtype: int64


0    True
1    True
2    True
dtype: bool

## What happens to element-wise operation if we have two series with different sizes?

In [51]:
pd.Series([1,2,3]) + pd.Series([4,5])

0    5.0
1    7.0
2    NaN
dtype: float64

## Series with different types

In [52]:
pd.Series(["a", "Alice", True, 1, 4.5, [1,2], {"a":"Alice"}])

0                 a
1             Alice
2              True
3                 1
4               4.5
5            [1, 2]
6    {'a': 'Alice'}
dtype: object

## How do you merge two series?

In [53]:
s1 = pd.Series([1,2,3]) 
s2 = pd.Series([4,5])
print(s1)
print(s2)

0    1
1    2
2    3
dtype: int64
0    4
1    5
dtype: int64


In [54]:
s = pd.concat( [s1, s2] )
s

0    1
1    2
2    3
0    4
1    5
dtype: int64

In [55]:
s.loc[0]

0    1
0    4
dtype: int64

## Element-wise Ambiguity

In [56]:
s1 = pd.Series({"A":10, "B": 20 })
s2 = pd.Series({"B":1, "A": 2 })
print(s1)
print(s2)

A    10
B    20
dtype: int64
B    1
A    2
dtype: int64


In [57]:
# INDEX ALIGNMENT
s1 + s2

A    12
B    21
dtype: int64

## How to insert an index-value pair?

In [58]:
s = pd.Series({"A":10, "B": 20 })
print(s)
s["Z"] = 100
s

A    10
B    20
dtype: int64


A     10
B     20
Z    100
dtype: int64

## Boolean indexing

In [59]:
s = pd.Series([10, 2, 3, 15])
s

0    10
1     2
2     3
3    15
dtype: int64

## How to extract numbers > 8?

In [60]:
b = pd.Series([True, False, False, True])
b

0     True
1    False
2    False
3     True
dtype: bool

In [61]:
s[b]

0    10
3    15
dtype: int64

In [62]:
s

0    10
1     2
2     3
3    15
dtype: int64

In [63]:
b = s > 8
b

0     True
1    False
2    False
3     True
dtype: bool

In [64]:
s[b]

0    10
3    15
dtype: int64

In [65]:
s[s > 8]

0    10
3    15
dtype: int64

In [66]:
s[pd.Series([True, False, False, True])]

0    10
3    15
dtype: int64

## Element-wise String operations

In [67]:
words = pd.Series(["APPLE", "boy", "CAT", "dog"])
words

0    APPLE
1      boy
2      CAT
3      dog
dtype: object

In [68]:
# words.upper()  # can't call string functions on Series

In [69]:
words.str.upper()

0    APPLE
1      BOY
2      CAT
3      DOG
dtype: object

In [70]:
#words[BOOLEAN SERIES]
#How do we get BOOLEAN SERIES?
b = words == words.str.upper()
b

0     True
1    False
2     True
3    False
dtype: bool

In [71]:
words[b]

0    APPLE
2      CAT
dtype: object

In [72]:
words[words == words.str.upper()]

0    APPLE
2      CAT
dtype: object

## How to get the odd numbers from a series?

In [73]:
s = pd.Series([10, 19, 11, 30, 35])
s

0    10
1    19
2    11
3    30
4    35
dtype: int64

In [74]:
s % 2

0    0
1    1
2    1
3    0
4    1
dtype: int64

In [75]:
b = s % 2 == 1
b

0    False
1     True
2     True
3    False
4     True
dtype: bool

In [76]:
s

0    10
1    19
2    11
3    30
4    35
dtype: int64

In [77]:
s[b]

1    19
2    11
4    35
dtype: int64

## BOOLEAN OPERATORS on series: and, or, not 

## How to get numbers < 12 or numbers > 33?

In [78]:
s

0    10
1    19
2    11
3    30
4    35
dtype: int64

In [79]:
# s[s < 12 or s > 33] # doesn't work with or, and, not

In [80]:
# use | instead of or
# Without parentheses this fails: | binds more tightly than < and >, so Python
# parses the expression as the chained comparison
#     s < (12 | s) > 33
# and a chained comparison needs the truth value of a Series, which is ambiguous.
# The error is the point of this cell, so catch and print it instead of letting
# it stop "Restart & Run All".
try:
    s[ s < 12 |  s > 33]
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [81]:
# Use lots of parentheses
s[ (s < 12) | (s > 33)]

0    10
2    11
4    35
dtype: int64

In [82]:
# AND is &
s[ (s > 12) & (s < 33)]

1    19
3    30
dtype: int64

In [83]:
# NOT is ~
s[ ~((s > 12) & (s < 33))]

0    10
2    11
4    35
dtype: int64