# Chapter 5. Getting Started with pandas

We will spend much of the remaining time on the *pandas* library. pandas offers many of the same idioms as NumPy, but is designed for tabular data. Although newer tools such as *polars* and *DuckDB* have shown faster processing times and more efficient memory management, pandas remains the dominant tool for data analysis. Therefore, it is critical to master pandas before moving on to other tools.

In [2]:
# Setting up for the chapter
import numpy as np 
import pandas as pd 
from pandas import Series, DataFrame

## Series

A pandas *Series* is a one-dimensional array with an associated array of data labels, called its *index*. A Series has an `array` attribute and an `index` attribute.

In [None]:
# Without an explicit index, pandas labels the values with a default RangeIndex
my_series = pd.Series([4, 7, -3.5, 3])
print(my_series.array)
print(my_series.index)

# Same values, this time labelled with the strings "a".."d"
my_series2 = pd.Series([4, 7, -3.5, 3], index=list("abcd"))

# Label-based selection: one label, then a list of labels
my_series2["b"]
my_series2[["b", "d"]]

# Boolean selection: keep the entries whose absolute value exceeds 3
my_series2[np.abs(my_series2) > 3]

# NumPy functions can be applied to a Series directly
np.exp(my_series2)

# Membership test against the index labels
"b" in my_series2  # True


<NumpyExtensionArray>
[4.0, 7.0, -3.5, 3.0]
Length: 4, dtype: float64
RangeIndex(start=0, stop=4, step=1)


True

In [29]:
# A Series can be built from a dict (keys become the index) and converted back
pop_dict = {
    'Ohio': 35_000,
    'New York': 1_300_000,
    'California': 23_000_000,
    'Oregon': 12_000,
}
pop = pd.Series(pop_dict)
pop_todict = pop.to_dict()

# Passing index= picks the labels and their order; 'Utah' has no entry in
# pop_dict, so its value is missing (NaN)
states = ['Ohio', 'New York', 'California', 'Utah']
pop2 = pd.Series(pop_dict, index=states)
pd.isna(pop2)
pop2['Utah'] = np.nan
pd.notna(pop2)
# Arithmetic aligns on index labels; a label missing on either side gives NaN
pop + pop2

California    46000000.0
New York       2600000.0
Ohio             70000.0
Oregon               NaN
Utah                 NaN
dtype: float64

In [31]:
# Both the Series and its index have a `name` attribute; both names appear in
# the displayed output (index name as a header, Series name in the footer)
pop.name = "Population"
pop.index.name = "State"
pop

State
Ohio             35000
New York       1300000
California    23000000
Oregon           12000
Name: Population, dtype: int64

In [38]:
# NOTE(review): to_string is referenced without parentheses, so this prints the
# bound method's repr (see the output) rather than the formatted text; call
# pop2.to_string() to get the string itself.
print(pop2.to_string)

<bound method Series.to_string of Ohio             35000.0
New York       1300000.0
California    23000000.0
Utah                 NaN
dtype: float64>


In [39]:
# Display pop2: 'Utah' is in the index but holds a missing value (NaN)
pop2

Ohio             35000.0
New York       1300000.0
California    23000000.0
Utah                 NaN
dtype: float64

## DataFrame

A DataFrame is a rectangular table of data, containing an ordered, named collection of columns. Each column can have a different data type. A DataFrame has both a row index and a column index.

One common way to construct a DataFrame is from a dictionary of equal-length lists or NumPy arrays.

In [42]:
# Column-oriented input: each key becomes a column, each list holds its values
data = {
    "state": ["Ohio"] * 3 + ["Nevada"] * 3,
    "year": [2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
df = pd.DataFrame(data)
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [45]:
# Quick inspection helpers; in a notebook only the last expression is displayed
df.head(3)
df.tail(3)
# Summary statistics; the output covers the numeric columns (year, pop) only
df.describe()

Unnamed: 0,year,pop
count,6.0,6.0
mean,2001.5,2.55
std,1.048809,0.836062
min,2000.0,1.5
25%,2001.0,1.875
50%,2001.5,2.65
75%,2002.0,3.125
max,2003.0,3.6


In [49]:
# columns= sets the column order; 'debt' is not a key of data, so that column
# is created and shows up filled with NaN
df2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'])
df2.head(2)
df2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [None]:
# Bracket access returns the column
df2['state']
# Attribute access does not work for 'pop': the name collides with the
# DataFrame.pop method, so this evaluates to the bound method (see the output).
# Use df2['pop'] to get the column.
df2.pop # returns the method, not the column

<bound method DataFrame.pop of    year   state  pop debt
0  2000    Ohio  1.5  NaN
1  2001    Ohio  1.7  NaN
2  2002    Ohio  3.6  NaN
3  2001  Nevada  2.4  NaN
4  2002  Nevada  2.9  NaN
5  2003  Nevada  3.2  NaN>

In [54]:
# label-based (.loc) versus integer-position-based (.iloc) indexing
# .loc[1] selects the row whose index *label* is 1
df2.loc[1]

year     2001
state    Ohio
pop       1.7
debt      NaN
Name: 1, dtype: object

In [56]:
# .iloc[1] selects the row at integer *position* 1; with the default 0..n-1
# index this is the same row as label 1
df2.iloc[1]

year     2001
state    Ohio
pop       1.7
debt      NaN
Name: 1, dtype: object

In [67]:
# Fill the 'debt' column with random values in [0, 100).
# Assign through [] rather than attribute access: attribute assignment only
# updates a column that already exists, and would silently set a plain Python
# attribute (not a column) for a new name.
df2['debt'] = 100 * np.random.uniform(size=len(df2))
df2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,3.603237
1,2001,Ohio,1.7,19.775919
2,2002,Ohio,3.6,2.289917
3,2001,Nevada,2.4,70.73792
4,2002,Nevada,2.9,2.324624
5,2003,Nevada,3.2,86.809201


In [72]:
# Create a Boolean column from an element-wise comparison ...
df2['new col'] = df2['state'] == "California"
# ... and remove it again with del
del df2['new col']
# The column list no longer contains 'new col'
df2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

> Another common way to construct a DataFrame is from a nested dictionary of dictionaries. In this case, the outer dictionary keys become the columns, while the inner keys become the row index.

You can also transpose a DataFrame. However, transposing discards the column data types if the columns don't all have the same data type. So transposing and then transposing again may not give you back the same data types.

In [75]:
# Nested dict: outer keys become columns, inner keys become the row index.
# Nevada has no entry for 2000, so that cell is missing.
populations = {
    "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
    "Nevada": {2001: 2.4, 2002: 2.9},
}
pop3 = pd.DataFrame(populations)
# Transposed view: states as rows, years as columns
pop3.T

Unnamed: 0,2000,2001,2002
Ohio,1.5,1.7,3.6
Nevada,,2.4,2.9


In [None]:
# Name the row index and the column index; both names appear in the display
pop3.index.name = 'year'
pop3.columns.name = 'state'
# Display the frame so the cell produces the output recorded below it
pop3

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [None]:
# to_numpy() returns the DataFrame's values as a 2-D NumPy array
arr = pop3.to_numpy()
# Display the array so the cell produces the output recorded below it
arr

array([[1.5, nan],
       [1.7, 2.4],
       [3.6, 2.9]])

In [None]:
# extract index object
row_ind = pop3.index

# Replace the index: work on a copy so that pop3 keeps its own index
pop4 = pop3.copy()
new_idx = pd.Index(np.arange(3)) # create an index object
pop4.index = new_idx # assign it as the new row index
pop4.index is new_idx # True (see output): the very same Index object is stored

True

In [96]:
# pop3 still carries its original index, including the name 'year'
pop3.index

Index([2000, 2001, 2002], dtype='int64', name='year')

In [108]:
# Reindexing with forward fill
df = pd.DataFrame(
    {'color': ['blue', 'purple', 'red'], 'size': [12, 20, 30]},
    index=[0, 2, 4],
)
# The new labels 1, 3 and 5 take the values of the preceding existing label
df2 = df.reindex(index=np.arange(6), method='ffill')
type(df2)
df2


Unnamed: 0,color,size
0,blue,12
1,blue,12
2,purple,20
3,purple,20
4,red,30
5,red,30


In [118]:
df = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=['a', 'c', 'd'],
    columns=['OH', 'NJ', 'CA'],
)
# Reindex the rows: 'b' does not exist in df, so that row is all NaN
df2 = df.reindex(index=['a', 'b', 'c'])

# Reindex the columns: 'NY' does not exist in df, so that column is all NaN
df3 = df.reindex(columns=['OH', 'NJ', 'NY'])

# .loc can also select and reorder, but every requested row and column label
# must already exist
df4 = df.loc[['a', 'd', 'c'], ['OH', 'NJ']]
df4

Unnamed: 0,OH,NJ
a,0,1
d,6,7
c,3,4


In [128]:
# Drop entries from an axis; drop returns a new object and leaves df as it was
df = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['OH', 'NJ', 'NY', 'CA'],
    columns=['v1', 'v2', 'v3', 'v4'],
)
# Labels are looked up in the row index by default
df.drop(['NJ', 'NY'])
# Pass axis='columns' to drop columns instead
df.drop(['v2', 'v1'], axis='columns')

Unnamed: 0,v3,v4
OH,2,3
NJ,6,7
NY,10,11
CA,14,15


In [140]:
# Indexing, selection, and filtering
# A slice inside plain [] selects rows
df[0:2]
# .loc with a label slice
df.loc['OH':'NJ']
# .loc with a list of labels returns exactly those rows (displayed below)
df.loc[['OH','NY']]

Unnamed: 0,v1,v2,v3,v4
OH,0,1,2,3
NY,8,9,10,11


In [None]:
df.iloc[[1, 2, 3]] # the inner [] is a list of integer positions
df.iloc[1:3] # positional slicing
df.loc['OH':'NY'] # label slicing is inclusive: the 'NY' row is part of the result

Unnamed: 0,v1,v2,v3,v4
OH,0,1,2,3
NJ,4,5,6,7
NY,8,9,10,11


In [152]:
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"],
)
# Boolean row filter: keep the rows whose 'three' value is greater than 5
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [154]:
# chained operation: take the first three columns by position, then keep the
# rows where the Boolean mask data['three'] > 5 is True
data.iloc[:, :3][data['three'] > 5]

Unnamed: 0,one,two,three
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


In [None]:
# to select a single cell
# .at takes a row label and a column label
data.at['Colorado','two']
# .iat takes integer positions: row 0, column 2 (the displayed result, 2)
data.iat[0,2]

np.int64(2)

In [162]:
# Integer-position access on a Series via .iloc
ser = pd.Series(np.arange(5))
# A negative position counts from the end
ser.iloc[-1]
# Positional slice: the first two elements
ser.iloc[:2]

0    0
1    1
dtype: int64

In [168]:
# Boolean mask + column label in .loc: overwrite 'three' only where it equals 6
# NOTE(review): the recorded output shows Colorado's 'three' as 10, not 100 —
# the cell may have been edited after it last ran; re-run it or confirm the
# intended value.
data.loc[data['three'] == 6, 'three'] = 100
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,10,7
Utah,8,9,10,11
New York,12,13,14,15


In [178]:
df1 = pd.DataFrame(
    np.arange(9.0).reshape(3, 3),
    columns=["b", "c", "d"],
    index=["Ohio", "Texas", "Colorado"],
)
df2 = pd.DataFrame(
    np.arange(12.0).reshape(4, 3),
    columns=["b", "d", "e"],
    index=["Utah", "Ohio", "Texas", "Oregon"],
)

# Conform df1 to df2's row and column labels, using 0 for entries df1 lacks
df1.reindex(index=df2.index, columns=df2.columns, fill_value=0)

# Add df1 and df2; a value missing on one side only is treated as 0, while a
# cell missing on both sides stays NaN
df1.add(df2, fill_value=0)

Unnamed: 0,b,c,d,e
Colorado,6.0,7.0,8.0,
Ohio,3.0,1.0,6.0,5.0
Oregon,9.0,,10.0,11.0
Texas,9.0,4.0,12.0,8.0
Utah,0.0,,1.0,2.0


In [180]:
# Conform df1 to df2's row and column labels; labels that df1 lacks
# (Utah, Oregon, column 'e') come back as NaN
df1.reindex_like(df2)

Unnamed: 0,b,d,e
Utah,,,
Ohio,0.0,2.0,
Texas,3.0,5.0,
Oregon,,,


In [181]:
# df1 still holds its original values: the reindex/add calls did not modify it
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [182]:
# Pull out column 'c' as a Series; it keeps df1's row labels as its index
ser_c = df1['c']
ser_c

Ohio        1.0
Texas       4.0
Colorado    7.0
Name: c, dtype: float64

In [None]:
# By default, DataFrame-Series arithmetic matches the Series index against the
# frame's columns. ser_c is labelled by state, so nothing matches and the
# result is all NaN over the union of the labels.
df1 - ser_c

Unnamed: 0,Colorado,Ohio,Texas,b,c,d
Ohio,,,,,,
Texas,,,,,,
Colorado,,,,,,


In [None]:
# To match on the row labels instead, use the method form and pass axis='index'
df1.sub(ser_c, axis='index')

Unnamed: 0,b,c,d
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Colorado,-1.0,0.0,1.0
