In [1]:
import numpy as np
import pandas as pd 

from pandas import Series, DataFrame

### 5.1 Introduction to pandas Data Structures

#### Series

In [2]:
obj = pd.Series([1, 3, 4, 7])
obj

0    1
1    3
2    4
3    7
dtype: int64

In [5]:
obj.array

<NumpyExtensionArray>
[1, 3, 4, 7]
Length: 4, dtype: int64

In [6]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
obj = pd.Series([2, 4, 6, 7], index=["d", "b", "a", "c"])
obj

d    2
b    4
a    6
c    7
dtype: int64

In [8]:
obj.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [9]:
obj["a"]

6

In [12]:
obj[["a", "c"]]

a    6
c    7
dtype: int64

In [14]:
obj[obj > 5]

a    6
c    7
dtype: int64

In [15]:
obj * 2

d     4
b     8
a    12
c    14
dtype: int64

In [16]:
np.exp(obj)

d       7.389056
b      54.598150
a     403.428793
c    1096.633158
dtype: float64

In [18]:
"b" in obj

True

In [19]:
data = {"A": 10, "B": 20, "C": 20}
obj = pd.Series(data)
obj

A    10
B    20
C    20
dtype: int64

In [20]:
obj.to_dict()

{'A': 10, 'B': 20, 'C': 20}

In [21]:
letters = ["A", "B", "C", "D", "E"]
obj = pd.Series(data, index=letters)
obj

A    10.0
B    20.0
C    20.0
D     NaN
E     NaN
dtype: float64

In [23]:
pd.isna(obj)

A    False
B    False
C    False
D     True
E     True
dtype: bool

In [24]:
pd.notna(obj)

A     True
B     True
C     True
D    False
E    False
dtype: bool

In [25]:
obj.isna()

A    False
B    False
C    False
D     True
E     True
dtype: bool

In [28]:
obj = pd.Series(data)
obj2 = pd.Series(data, index=letters)

print(obj, obj2, sep="\n")

A    10
B    20
C    20
dtype: int64
A    10.0
B    20.0
C    20.0
D     NaN
E     NaN
dtype: float64


In [29]:
obj + obj2

A    20.0
B    40.0
C    40.0
D     NaN
E     NaN
dtype: float64

In [30]:
obj2.name = "value"
obj2.index.name = "letter"
obj2

letter
A    10.0
B    20.0
C    20.0
D     NaN
E     NaN
Name: value, dtype: float64

In [31]:
obj2.index = ["F", "G", "H", "I", "J"]
obj2

F    10.0
G    20.0
H    20.0
I     NaN
J     NaN
Name: value, dtype: float64

#### DataFrame

In [32]:
data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
"year": [2000, 2001, 2002, 2001, 2002, 2003],
"pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

frame = pd.DataFrame(data)

frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [33]:
# Display first five rows
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [34]:
# Display last five rows
frame.tail()

Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [35]:
pd.DataFrame(data, columns=["year", "state", "pop"])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [66]:
frame2 = pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [38]:
frame2["year"]

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [39]:
frame2.year

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [40]:
frame2.loc[1]

year     2001
state    Ohio
pop       1.7
debt      NaN
Name: 1, dtype: object

In [43]:
frame2.iloc[1]

year     2001
state    Ohio
pop       1.7
debt      NaN
Name: 1, dtype: object

In [44]:
frame2["debt"] = 99
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,99
1,2001,Ohio,1.7,99
2,2002,Ohio,3.6,99
3,2001,Nevada,2.4,99
4,2002,Nevada,2.9,99
5,2003,Nevada,3.2,99


In [46]:
frame2["debt"] = np.arange(6.)
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,0.0
1,2001,Ohio,1.7,1.0
2,2002,Ohio,3.6,2.0
3,2001,Nevada,2.4,3.0
4,2002,Nevada,2.9,4.0
5,2003,Nevada,3.2,5.0


In [47]:
frame2["eastern"] = frame2["state"] == "Ohio"
frame2

Unnamed: 0,year,state,pop,debt,eastern
0,2000,Ohio,1.5,0.0,True
1,2001,Ohio,1.7,1.0,True
2,2002,Ohio,3.6,2.0,True
3,2001,Nevada,2.4,3.0,False
4,2002,Nevada,2.9,4.0,False
5,2003,Nevada,3.2,5.0,False


In [48]:
del frame2["eastern"]
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [49]:
populations = {"Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
               "Nevada": {2001: 2.4, 2002: 2.9}}

frame3 = pd.DataFrame(populations)
frame3

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [50]:
frame3.T

Unnamed: 0,2000,2001,2002
Ohio,1.5,1.7,3.6
Nevada,,2.4,2.9


In [51]:
pdata = {"Ohio": frame3["Ohio"][:-1],
         "Nevada": frame3["Nevada"][:2]}

pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4


In [53]:
frame3.index.name = "year"
frame3.columns.name = "state"
frame3

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [54]:
frame3.to_numpy()

array([[1.5, nan],
       [1.7, 2.4],
       [3.6, 2.9]])

In [69]:
frame2.to_numpy()

array([[2000, 'Ohio', 1.5, 16.0],
       [2001, 'Ohio', 1.7, 16.0],
       [2002, 'Ohio', 3.6, 16.0],
       [2001, 'Nevada', 2.4, nan],
       [2002, 'Nevada', 2.9, nan],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

#### Index Objects

In [None]:
obj = pd.Series(np.arange(3), index=["a", "b", "c"])
index = obj.index
index
# Index objects are immutable

Index(['a', 'b', 'c'], dtype='object')

In [71]:
labels = pd.Index(np.arange(3))
labels

Index([0, 1, 2], dtype='int32')

In [72]:
obj2 = pd.Series([1.2, -2.3, 3.5], index = labels)
obj2

0    1.2
1   -2.3
2    3.5
dtype: float64

In [73]:
obj2.index is labels

True

In [74]:
frame3

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [75]:
frame3.columns

Index(['Ohio', 'Nevada'], dtype='object', name='state')

In [76]:
# Unlike Python sets, a pandas Index can contain duplicates
pd.Index(["foo", "foo"])

Index(['foo', 'foo'], dtype='object')

### 5.2 Essential Functionality

#### Reindexing

In [77]:
obj = pd.Series([2.2, 3.4, 1.4, 5.6], index=["d","b","a","c"])
obj

d    2.2
b    3.4
a    1.4
c    5.6
dtype: float64

In [78]:
obj2 = obj.reindex(["a","b","c","d","e"])
obj2

a    1.4
b    3.4
c    5.6
d    2.2
e    NaN
dtype: float64

In [79]:
obj3 = pd.Series(["blue", "purple", "yellow"], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [80]:
# Forward Fill Method
obj3.reindex(np.arange(6), method="ffill")

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [83]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=["a", "c", "d"], columns=["Ohio", "Texas", "California"])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [84]:
frame2 = frame.reindex(index=["a", "b", "c", "d"])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [85]:
states = ["Texas", "Utah", "California"]
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [86]:
# another way
frame.reindex(states, axis="columns")

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [87]:
frame.loc[["a","d","c"], ["California", "Texas"]]

Unnamed: 0,California,Texas
a,2,1
d,8,7
c,5,4


#### Dropping Entries from an Axis

In [88]:
obj = pd.Series(np.arange(5.), index=["a","b","c","d","e"])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [89]:
new_obj = obj.drop("c")
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [90]:
obj.drop(["d","c"])

a    0.0
b    1.0
e    4.0
dtype: float64

In [91]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"])

data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [92]:
data.drop(index=["Colorado", "Ohio"])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [93]:
data.drop(columns="two")

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [94]:
data.drop("two", axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [95]:
data.drop(["two", "four"], axis="columns")

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


#### Indexing, Selection, and Filtering

In [96]:
obj = pd.Series(np.arange(4.), index=["a","b","c","d"])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [97]:
obj["b"]

1.0

In [99]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [100]:
obj[["b","a","c"]]

b    1.0
a    0.0
c    2.0
dtype: float64

In [102]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

In [103]:
obj.loc[["b","a","c"]]

b    1.0
a    0.0
c    2.0
dtype: float64

In [105]:
obj.iloc[[0, 1, 2]]

a    0.0
b    1.0
c    2.0
dtype: float64

In [106]:
# Endpoint is inclusive
obj.loc["b":"c"]

b    1.0
c    2.0
dtype: float64

In [109]:
obj.loc["b":"c"] = 7
obj

a    0.0
b    7.0
c    7.0
d    3.0
dtype: float64

In [110]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"])

data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [111]:
data["two"]

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [112]:
data[["two","three"]]

Unnamed: 0,two,three
Ohio,1,2
Colorado,5,6
Utah,9,10
New York,13,14


In [113]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [114]:
data[data["three"] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [120]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [121]:
data[data < 5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


#### Selection on DataFrame with loc and iloc

In [122]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [124]:
data.loc["Colorado"]

one      0
two      5
three    6
four     7
Name: Colorado, dtype: int32

In [125]:
data.loc[["Colorado", "New York"]]

Unnamed: 0,one,two,three,four
Colorado,0,5,6,7
New York,12,13,14,15


In [126]:
data.loc["Colorado", ["two", "four"]]

two     5
four    7
Name: Colorado, dtype: int32

In [127]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [128]:
data.iloc[[2, 1]]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
Colorado,0,5,6,7


In [129]:
data.iloc[2, [3, 0, 2]]

four     11
one       8
three    10
Name: Utah, dtype: int32

In [130]:
data.iloc[[1, 2], [3, 0, 2]]

Unnamed: 0,four,one,three
Colorado,7,0,6
Utah,11,8,10


In [131]:
data.loc[:"Utah", "two"]

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32

In [132]:
data.iloc[:, :3][data.three > 5]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


In [133]:
# Only for loc(boolean array usage)
data.loc[data.three >= 2]

Unnamed: 0,one,two,three,four
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15



#### Integer indexing pitfalls

In [134]:
ser = pd.Series(np.arange(3.))
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [145]:
try:
    ser[-1]
except:
    print("Error occured!")

Error occured!


In [None]:
# for non-integer index, there is no such problem
ser2 = pd.Series(np.arange(3.), index=["a","b","c"])
ser2[-1]   # FutureWarning

  ser2[-1]


2.0