# Pandas 1 worksheet

- Observe the syntax, predict the output, and run the cell to confirm your prediction

In [None]:
# Widen the notebook container so wide outputs are not clipped.
# `display` and `HTML` belong to the public IPython.display API; importing
# `display` from IPython.core.display is deprecated (since IPython 7.14).
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Learning objectives

 - Pandas helps deal with tabular (tables) data
 - A list of lists is not an adequate alternative to Excel
 - Series: new data structure
     - hybrid of a dict and a list
     - Python dict "key" equivalent to "index" in pandas
     - Python list "index" equivalent to "integer position" in pandas
     - supports complicated expressions within lookup [...]
     - element-wise operation
     - boolean indexing
 - DataFrames aka tables (next lecture)
     - built from series
     - each series will be a column in the table

# pandas comes with Anaconda installation
If for some reason you don't have pandas installed, run the following command in a terminal or PowerShell
<pre> pip install pandas </pre>

In [None]:
import pandas

In [None]:
pandas.Series

## Module naming abbreviation

In [None]:
import pandas as pd

In [None]:
pd.Series

## Create a series from a dict

In [None]:
# start with a plain Python dict (a Series is built from the same data in the following cell)
d = {"one":7, "two":8, "three":9}
d

In [None]:
# create a series from a dict: each dict key becomes an index label, each dict value a data element
s = pd.Series({"one":7, "two":8, "three":9})
s

In [None]:
# IP  index    value
# 0   one      7
# 1   two      8
# 2   three    9

# dtype: int64

## Accessing values with index (.loc[...])

In [None]:
# dict access with key
d["one"]

In [None]:
s.loc["one"]

In [None]:
s.loc["two"]

## Accessing values with integer position (.iloc[...])

In [None]:
s.iloc[0]

In [None]:
s.iloc[1]

In [None]:
s.iloc[-1]

In [None]:
s["one"]

In [None]:
# Plain s[0] on a string-labelled series relied on a positional fallback that
# is deprecated since pandas 2.1 (integer keys are to be treated as labels).
# Use .iloc whenever an integer position is meant.
s.iloc[0]

## Accessing multiple values with a list of integer positions

In [None]:
# series access with a list of integer positions
# (plain s[[0, 2]] on a string-labelled series uses the positional fallback
# deprecated since pandas 2.1, so go through .iloc explicitly)
s.iloc[[0, 2]]

In [None]:
#series access with a list of indexes
s[["one", "three"]]

## Create a series from a list

In [None]:
# Series created from a list
# Build the series from num_list itself rather than repeating the literal,
# so the list and the series cannot drift apart.
num_list = [100, 200, 300]
s = pd.Series(num_list)
s

In [None]:
# IP  index value
# 0   0      100
# 1   1      200
# 2   2      300
# dtype: int64

In [None]:
print(s.loc[1])
print(s.iloc[1])

In [None]:
letters_list = ["A", "B", "C", "D"]
letters = pd.Series(letters_list)
# letters[-1]  # Avoid negative keys with plain [...]: on the default integer index,
#              # -1 is looked up as a label, not a position. Use letters.iloc[-1] instead.

## Slicing series using integer positions

In [None]:
# the four letters as a plain list, then the same data wrapped in a Series
letters_list = list("ABCD")
letters = pd.Series(data=letters_list)
letters

In [None]:
# list slicing review
letters_list

In [None]:
sliced_letter_list = letters_list[2:]
sliced_letter_list

In [None]:
sliced_letter_list[0]

In [None]:
#series slicing
letters

In [None]:
sliced_letters = letters[2:]
sliced_letters

In [None]:
sliced_letters.loc[2]

In [None]:
sliced_letters.iloc[0]

In [None]:
# sliced_letters.loc[0] # index 0 doesn't exist in the sliced series!

In [None]:
sliced_letters[2]

In [None]:
# Note: integer positions get renumbered, whereas indexes do not.

# IP  Index  values
# 0   2       C
# 1   3       D
# dtype: object

## Slicing series using index

In [None]:
s = pd.Series({"one":7, "two":8, "three":9})
s

In [None]:
#slicing with indexes
s["two":]

## Element-wise operations
1. SERIES op SCALAR
2. SERIES op SERIES

In [None]:
#list recap
nums = [1, 2, 3]
nums * 3

In [None]:
snum = pd.Series(nums)
snum

In [None]:
snum * 3

In [None]:
snum + 3

In [None]:
snum / 3

In [None]:
nums

In [None]:
# nums / 3 # doesn't work with lists

In [None]:
snum

In [None]:
snum += 2
snum

In [None]:
#list recap
l1 = [1, 2, 3]
l2 = [4, 5, 6]
l1 + l2

In [None]:
s1 = pd.Series(l1)
s2 = pd.Series(l2)
print(s1)
print(s2)
s1 + s2

In [None]:
print(s1)
print(s2)
s1 * s2

In [None]:
print(s1)
print(s2)
s1 / s2

In [None]:
print(s1)
print(s2)
s2 ** s1

In [None]:
print(s1)
print(s2)
s1 < s2

## What happens to element-wise operation if we have two series with different sizes?

In [None]:
pd.Series([1,2,3]) + pd.Series([4,5])

## Series with different types

In [None]:
pd.Series(["a", "Alice", True, 1, 4.5, [1,2], {"a":"Alice"}])

## How do you merge two series?

In [None]:
# two integer series of unequal length, printed one after the other
s1, s2 = pd.Series([1, 2, 3]), pd.Series([4, 5])
print(s1, s2, sep="\n")

In [None]:
s = pd.concat( [s1, s2] )
s

In [None]:
s.loc[0]

## Element-wise Ambiguity

In [None]:
# same two labels in both series, listed in opposite order
s1 = pd.Series([10, 20], index=["A", "B"])
s2 = pd.Series([1, 2], index=["B", "A"])
print(s1, s2, sep="\n")

In [None]:
# INDEX ALIGNMENT
s1 + s2

## How to insert an index-value pair?

In [None]:
# start from a two-element series, show it, then assign to a label that does not exist yet
s = pd.Series([10, 20], index=["A", "B"])
print(s)
s.loc["Z"] = 100
s

## Boolean indexing

In [None]:
s = pd.Series([10, 2, 3, 15])
s

## How to extract numbers > 8?

In [None]:
b = pd.Series([True, False, False, True])
b

In [None]:
s[b]

In [None]:
s

In [None]:
b = s > 8
b

In [None]:
s[b]

In [None]:
s[s > 8]

In [None]:
s[pd.Series([True, False, False, True])]

## Element-wise String operations

In [None]:
words = pd.Series(["APPLE", "boy", "CAT", "dog"])
words

In [None]:
# words.upper()  # can't call string functions on Series

In [None]:
words.str.upper()

In [None]:
#words[BOOLEAN SERIES]
#How do we get BOOLEAN SERIES?
b = words == words.str.upper()
b

In [None]:
words[b]

In [None]:
words[words == words.str.upper()]

## How to get the odd numbers from a series?

In [None]:
s = pd.Series([10, 19, 11, 30, 35])
s

In [None]:
s % 2

In [None]:
b = s % 2 == 1
b

In [None]:
s

In [None]:
s[b]

## BOOLEAN OPERATORS on series: and, or, not 

## How to get numbers < 12 or numbers > 33?

In [None]:
s

In [None]:
# s[s < 12 or s > 33] # doesn't work with or, and, not

In [None]:
# use | instead of or
# Without parentheses this still fails: | binds tighter than < and >, so
# Python reads the expression as the chained comparison
#     s[ s < (12 |  s) > 33]
# which needs the truth value of a whole Series and raises ValueError.
# The error is caught and shown so the notebook still runs top to bottom.
try:
    s[ s < 12 |  s > 33]
except ValueError as err:
    print("ValueError:", err)

In [None]:
# Use lots of parenthesis
s[ (s < 12) | (s > 33)]

In [None]:
# AND is &
s[ (s > 12) & (s < 33)]

In [None]:
# NOT is ~
s[ ~((s > 12) & (s < 33))]