In [1]:
import numpy as np
import pandas as pd

In [2]:
from pandas import Series, DataFrame

In [3]:
# Notebook-wide setup: seed NumPy's global RNG and set plotting/display defaults.
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc("figure", figsize=(10, 6))  # default figure size for all plots
PREVIOUS_MAX_ROWS = pd.options.display.max_rows  # remember the prior setting (default: 60) before overriding it
pd.options.display.max_rows = 20
pd.options.display.max_columns = 20
pd.options.display.max_colwidth = 80
np.set_printoptions(precision=4, suppress=True)  # NumPy array printing options

In [4]:
PREVIOUS_MAX_ROWS

60

## Introduction to pandas Data Structures

### Series 

`pd.Series()` is a constructor function in pandas used to create a Series object. A Series is a one-dimensional labeled array that can hold data of various types (integers, floats, strings, etc.).

In [5]:
Series([4, 7, -5, 3])

0    4
1    7
2   -5
3    3
dtype: int64

In [6]:
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [7]:
obj.array  # array attribute: the values held by the Series


<NumpyExtensionArray>
[4, 7, -5, 3]
Length: 4, dtype: int64

In [8]:
obj.index  # index attribute: the labels of the Series

RangeIndex(start=0, stop=4, step=1)

In [9]:
obj2 = pd.Series([4, 7, -5, 3], index=["d", "b", "c", "a"])
# Index labels do not have to be unique; e.g. index=["d", "b", "a", "a"] is also accepted.

obj2


d    4
b    7
c   -5
a    3
dtype: int64

In [10]:
obj2.index

Index(['d', 'b', 'c', 'a'], dtype='object')

In [11]:
obj2["a"]


3

In [12]:
obj2["d"] = 6


In [13]:
obj2

d    6
b    7
c   -5
a    3
dtype: int64

In [14]:
obj2[["c", "a", "d"]] #subsetting using a list of indices 
# obj2[[ "a", "d"]]


c   -5
a    3
d    6
dtype: int64

In [15]:
obj2>0

d     True
b     True
c    False
a     True
dtype: bool

In [16]:
obj2[obj2 > 0] #filtering, maintain the index-value mapping


d    6
b    7
a    3
dtype: int64

In [17]:
obj2 * 2


d    12
b    14
c   -10
a     6
dtype: int64

In [18]:
# NumPy is already imported in the first cell; applying a NumPy function to a
# Series returns a Series that keeps the same index labels.
np.exp(obj2)

d     403.428793
b    1096.633158
c       0.006738
a      20.085537
dtype: float64

Another way to think about a Series is as a fixed-length, ordered dictionary, as it is a mapping of index values to data values. It can be used in many contexts where you might use a dictionary:

In [19]:
"b" in obj2


True

In [20]:
"e" in obj2

False

In [21]:
sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [22]:
obj3.to_dict()

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

When you are only passing a dictionary, the index in the resulting Series will respect the order of the keys according to the dictionary's `keys` method, which depends on the key insertion order. You can override this by passing an index with the dictionary keys in the order you want them to appear in the resulting Series:

In [23]:
states = ["California", "Ohio", "Oregon", "Texas"]
obj4 = pd.Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

`NaN` is used in pandas to mark missing or `NA` values. 

In [24]:
pd.isna(obj4)  # isna() detects missing values (True where the value is NaN)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [25]:
pd.notna(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [26]:
obj4.isna()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [27]:
obj3


Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [28]:
obj4


California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [29]:
obj3 + obj4 #automatically aligns data by labels in arithmetic ops. 

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [30]:
obj4.name = "population"  # the Series itself has a `name` attribute
obj4.index.name = "state"  # the index has its own `name` attribute
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [31]:
obj


0    4
1    7
2   -5
3    3
dtype: int64

In [32]:
obj.index = ["Bob", "Steve", "Jeff", "Ryan"]
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

### DataFrame

In [33]:
data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
        "year": [2000, 2001, 2002, 2001, 2002, 2003],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)

In [34]:
frame # Jupyter will display it as a HTML table

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [35]:
frame.head(20)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [36]:
frame.tail()

Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [37]:
pd.DataFrame(data, columns=["year", "state", "pop"]) # specify order of  columns

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [38]:
frame2 = pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
frame2


Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [39]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [40]:
frame2["state"] #result is a pandas Series


0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

`frame2[column]` works for any column name, but `frame2.column` works only when the column name is a valid Python variable name and does not conflict with any of the method names in DataFrame. For example, if a column's name contains whitespace or symbols other than underscores, it cannot be accessed with the dot attribute method.

In [41]:
frame2.year # only works when the col name is a valid Python variable name (without space etc,)
#  and does not conflict with any dataFrame method name. 

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [42]:
frame2['year']

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [43]:
frame2[['year']] # result is a dataframe

Unnamed: 0,year
0,2000
1,2001
2,2002
3,2001
4,2002
5,2003


Table 5.1: Possible data inputs to the DataFrame constructor
Type|	Notes
|:-------------------|:---------------------------------------------------------|
2D ndarray|	A matrix of data, passing optional row and column labels
Dictionary of arrays, lists, or tuples |	Each sequence becomes a column in the DataFrame; all sequences must be the same length
NumPy structured/record array	|Treated as the “dictionary of arrays” case
Dictionary of Series|	Each value becomes a column; indexes from each Series are unioned together to form the result’s row index if no explicit index is passed
Dictionary of dictionaries	|Each inner dictionary becomes a column; keys are unioned to form the row index as in the “dictionary of Series” case
List of dictionaries or Series|	Each item becomes a row in the DataFrame; unions of dictionary keys or Series indexes become the DataFrame’s column labels
List of lists or tuples	|Treated as the “2D ndarray” case
Another DataFrame	|The DataFrame’s indexes are used unless different ones are passed
NumPy MaskedArray|	Like the “2D ndarray” case except masked values are missing in the DataFrame result

`pd.DataFrame.loc[]` and `pd.DataFrame.iloc[]` are two different ways to access and manipulate data in a pandas DataFrame in Python. They are used for selecting rows and columns, but they differ in how they reference elements within the DataFrame:

1. **`pd.DataFrame.loc[]` (Label-based Indexing)**:
   - `loc[]` is primarily label-based indexing. It uses labels or row/column names to access elements.
   - You can use row and column labels to access data using `loc[]`. It includes the start and end labels in the range when slicing.
   - The syntax for `loc[]` is `df.loc[row_label, column_label]` or `df.loc[row_label]` for selecting rows.
   - Example:
     ```python
     import pandas as pd

     df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=['X', 'Y', 'Z'])

     # Selecting a single element by label
     element = df.loc['Y', 'B']  # This selects the value at row 'Y' and column 'B'

     # Slicing using labels (inclusive of both 'X' and 'Y' rows, and 'A' and 'B' columns)
     sliced_df = df.loc['X':'Y', 'A':'B']
     ```

2. **`pd.DataFrame.iloc[]` (Integer-based Indexing)**:
   - `iloc[]` is primarily integer-based indexing. It uses integer positions to access elements, similar to NumPy indexing.
   - You can use integer positions to access data using `iloc[]`. It includes the start position but excludes the end position when slicing.
   - The syntax for `iloc[]` is `df.iloc[row_position, column_position]` or `df.iloc[row_position]` for selecting rows.
   - Example:
     ```python
     import pandas as pd

     df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

     # Selecting a single element by integer position
     element = df.iloc[1, 0]  # This selects the value at row 1 and column 0

     # Slicing using integer positions: rows 0 and 1 (the end position 2 is excluded), column 0 only
     sliced_df = df.iloc[0:2, 0]
     ```

In summary, the main difference between `loc[]` and `iloc[]` is in how they reference data within a DataFrame:

- `loc[]` uses labels or names for both rows and columns and is more human-readable. It is inclusive of both the start and end labels when slicing.
- `iloc[]` uses integer positions and is more similar to traditional programming indexing. It includes the start position but excludes the end position when slicing.

Your choice between `loc[]` and `iloc[]` depends on whether you want to use labels or integer positions and whether you want to include or exclude the end when slicing.

In [44]:
# import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=['X', 'Y', 'Z'])
df

Unnamed: 0,A,B
X,1,4
Y,2,5
Z,3,6


In [45]:
# Selecting a single element by label
df.loc['Y', 'B']  # This selects the value at row 'Y' and column 'B'

5

In [46]:

# Slicing using labels (inclusive of both 'X' and 'Y' rows, and 'A' and 'B' columns)
df.loc['X':'Y', 'A':'B']


Unnamed: 0,A,B
X,1,4
Y,2,5


In [47]:
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [48]:
frame2.loc[1] #row with index 1: label-based indexing


year     2001
state    Ohio
pop       1.7
debt      NaN
Name: 1, dtype: object

In [49]:
# import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [50]:
# Selecting a single element by integer position
df.iloc[1, 0]  # This selects the value at row 1 and column 0

2

In [51]:

# Slicing using integer positions: rows 0 and 1 (the end position 2 is excluded)
# of the single column at position 0, so the result is a Series named "A".
df.iloc[0:2, 0]

0    1
1    2
Name: A, dtype: int64

In [52]:
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [53]:
frame2.iloc[2] #row with integer index 2.

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: 2, dtype: object

In [55]:
# frame2[2] is a column lookup by label. frame2's columns are
# 'year', 'state', 'pop', 'debt', so there is no column labelled 2 and pandas
# raises KeyError. Catch only that error: a bare `except:` would also hide
# unrelated failures (including KeyboardInterrupt).
try:
    frame2[2]
except KeyError:
    print("not able to read frame[2]")

not able to read frame[2]


In [54]:
frame2["debt"] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,16.5
1,2001,Ohio,1.7,16.5
2,2002,Ohio,3.6,16.5
3,2001,Nevada,2.4,16.5
4,2002,Nevada,2.9,16.5
5,2003,Nevada,3.2,16.5


In [55]:
frame2["debt"] = np.arange(6.)
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,0.0
1,2001,Ohio,1.7,1.0
2,2002,Ohio,3.6,2.0
3,2001,Nevada,2.4,3.0
4,2002,Nevada,2.9,4.0
5,2003,Nevada,3.2,5.0


If you assign a Series, its labels will be **realigned** exactly to the DataFrame’s index, inserting missing values in any index values not present:

In [56]:
val = pd.Series([-1.2, -1.5, -1.7], index=[2, 4, 5])
val

2   -1.2
4   -1.5
5   -1.7
dtype: float64

In [57]:
frame2["debt"] = val
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,-1.2
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,-1.5
5,2003,Nevada,3.2,-1.7


In [58]:
val = pd.Series([-1.2, -1.5, -1.7], index=["two", "four", "five"])
val

two    -1.2
four   -1.5
five   -1.7
dtype: float64

In [59]:
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,-1.2
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,-1.5
5,2003,Nevada,3.2,-1.7


In [60]:
frame2["debt"] = val
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [61]:
frame2["eastern"] = frame2["state"] == "Ohio" #create a new col
# New columns cannot be created with the frame2.eastern dot attribute notation.
frame2

Unnamed: 0,year,state,pop,debt,eastern
0,2000,Ohio,1.5,,True
1,2001,Ohio,1.7,,True
2,2002,Ohio,3.6,,True
3,2001,Nevada,2.4,,False
4,2002,Nevada,2.9,,False
5,2003,Nevada,3.2,,False


In [62]:
del frame2["eastern"]

In [63]:
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [64]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

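The column returned from indexing a DataFrame can be a `view` on the underlying data rather than a copy, in which case in-place modifications to the Series are reflected in the DataFrame, as the cells below show. This behavior is version-dependent: the assignment below already triggers a `SettingWithCopyWarning`, and with Copy-on-Write enabled (the default starting with pandas 3.0) the Series behaves like a copy and the DataFrame is left unchanged. To modify the DataFrame reliably, assign through it directly, e.g. `frame2.loc[0, "debt"] = 100`. The column can be explicitly copied with the Series’s `copy` method.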

In [65]:
frame2_col = frame2['debt']
frame2_col

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
Name: debt, dtype: float64

In [66]:
frame2_col[0]= 100
frame2_col

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frame2_col[0]= 100


0    100.0
1      NaN
2      NaN
3      NaN
4      NaN
5      NaN
Name: debt, dtype: float64

In [67]:
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,100.0
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


If a nested dictionary is passed to the DataFrame constructor, pandas will interpret the outer dictionary keys as the columns and the inner keys as the row indices:

In [69]:
# nested dictionary
populations = {"Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
               "Nevada": {2001: 2.4, 2002: 2.9}}

populations

{'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}, 'Nevada': {2001: 2.4, 2002: 2.9}}

In [71]:
frame3 = pd.DataFrame(populations)
frame3

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [72]:
transpose_frame3 =frame3.T
transpose_frame3

Unnamed: 0,2000,2001,2002
Ohio,1.5,1.7,3.6
Nevada,,2.4,2.9


In [73]:
frame3

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [74]:
transpose_frame3.T

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


### Warning
Note that transposing discards the column data types if the columns do not all have the same data type, so transposing and then transposing back may lose the previous type information. The columns become arrays of pure Python objects, essentially a generic data type that can store any type of data. 

In [75]:
df = pd.DataFrame({"A":["a","b","c"],"B":[1,2,3]})
df

Unnamed: 0,A,B
0,a,1
1,b,2
2,c,3


In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A       3 non-null      object
 1   B       3 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 180.0+ bytes


In [77]:
dfT=df.T
dfT

Unnamed: 0,0,1,2
A,a,b,c
B,1,2,3


In [78]:
dfT.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, A to B
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       2 non-null      object
 1   1       2 non-null      object
 2   2       2 non-null      object
dtypes: object(3)
memory usage: 172.0+ bytes


In [79]:
dfT_transfer_back = dfT.T
dfT_transfer_back

Unnamed: 0,A,B
0,a,1
1,b,2
2,c,3


In [80]:
dfT_transfer_back.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A       3 non-null      object
 1   B       3 non-null      object
dtypes: object(2)
memory usage: 180.0+ bytes


The keys in the inner dictionaries are combined to form the index in the result. This isn’t true if an explicit index is specified:

In [81]:
populations

{'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}, 'Nevada': {2001: 2.4, 2002: 2.9}}

In [82]:
pd.DataFrame(populations, index=[2001, 2002, 2003])
# Note the row 2003 has all values NaN, as populations do not have a row of 2003. 
# note how the data are aligned by index. 

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9
2003,,


In [83]:
frame3

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [84]:
frame3["Ohio"][:-1]  # slice [:-1] of the Series frame3["Ohio"]: everything except the last row

2000    1.5
2001    1.7
Name: Ohio, dtype: float64

In [86]:
# manually create a new dictionary
pdata = {"Ohio": frame3["Ohio"][:-1],
         "Nevada": frame3["Nevada"][:2]}
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4


In [87]:
frame3.index.name = "year"
frame3.columns.name = "state"
frame3

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [88]:
frame3.index

Index([2000, 2001, 2002], dtype='int64', name='year')

Unlike Series, DataFrame does not have a name attribute.

In [89]:
frame3.to_numpy()

array([[1.5, nan],
       [1.7, 2.4],
       [3.6, 2.9]])

In [90]:
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,100.0
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


If the DataFrame’s columns are different data types, the data type of the returned array will be chosen to accommodate all of the columns:

In [91]:
frame2.to_numpy()

array([[2000, 'Ohio', 1.5, 100.0],
       [2001, 'Ohio', 1.7, nan],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, nan],
       [2002, 'Nevada', 2.9, nan],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

### Index Objects

In [93]:
obj = pd.Series(np.arange(3), index=["a", "b", "c"])
obj

a    0
b    1
c    2
dtype: int32

In [94]:
# obj = pd.Series(np.arange(3), index=["a", "b", "c"])
index = obj.index
index


Index(['a', 'b', 'c'], dtype='object')

In [95]:
index[1:]

Index(['b', 'c'], dtype='object')

In [96]:
labels = pd.Index(np.arange(3))
labels

Index([0, 1, 2], dtype='int32')

pandas’s Index objects are responsible for holding the axis labels (including a DataFrame's column names) and other metadata (like the axis name or names). Any array or other sequence of labels you use when constructing a Series or DataFrame is internally converted to an Index:

In [99]:
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
obj2


0    1.5
1   -2.5
2    0.0
dtype: float64

In [100]:
obj2.index

Index([0, 1, 2], dtype='int32')

In [101]:
0 == obj2.index[0]

True

In [102]:
obj2.index is labels

True

In [103]:
# Index objects are immutable: item assignment raises TypeError.
# Catch the error and print it so the demonstration is still visible and the
# notebook keeps running under Restart & Run All.
try:
    index[1] = "d"
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: Index does not support mutable operations

In [104]:
frame3

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [105]:

frame3.columns

Index(['Ohio', 'Nevada'], dtype='object', name='state')

In addition to being array-like, an Index also behaves like a fixed-size set:

In [106]:
"Ohio" in frame3.columns

True

In [107]:
2003 in frame3.index

False

Unlike Python sets, a pandas Index can contain duplicate labels. Selections with duplicate labels will select all occurrences of that label.

In [108]:
pd.Index(["foo", "foo", "bar", "bar"])

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

Table 5.2: Some Index methods and properties

|Method/Property|	Description|
|:---------------|:-----------------------------------------|
|append()	|Concatenate with additional Index objects, producing a new Index|
|difference()|	Compute set difference as an Index|
|intersection()|	Compute set intersection|
|union()	|Compute set union|
|isin()|	Compute Boolean array indicating whether each value is contained in the passed collection|
|delete()	|Compute new Index with element at Index i deleted|
|drop()|	Compute new Index by deleting passed values|
|insert()|	Compute new Index by inserting element at Index i|
|is_monotonic_increasing|	Returns True if each element is greater than or equal to the previous element (the older `is_monotonic` alias was removed in pandas 2.0)|
|is_unique|	Returns True if the Index has no duplicate values|
|unique()|	Compute the array of unique values in the Index|

## Essential Functionality

### Reindexing

An important method on pandas objects is reindex, which means to create a new object with the values rearranged to align with the new index. Consider an example:

In [109]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=["d", "b", "a", "c"])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

Calling reindex on this Series rearranges the data according to the new index, introducing missing values if any index values were not already present:

In [110]:
obj2 = obj.reindex(["a", "b", "c", "d", "e"]) 
obj2 #new object

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [111]:
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [112]:
obj3 = pd.Series(["blue", "purple", "yellow"], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

For ordered data like time series, you may want to do some interpolation or filling of values when reindexing. The method option allows us to do this, using a method such as ffill, which forward-fills the values:

In [123]:
 # new object


0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [113]:
obj3.reindex(np.arange(6))

0      blue
1       NaN
2    purple
3       NaN
4    yellow
5       NaN
dtype: object

In [114]:
obj3

0      blue
2    purple
4    yellow
dtype: object

In [116]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=["a", "c", "d"],
                     columns=["Ohio", "Texas", "California"])
frame


Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


With DataFrame, reindex can alter the (row) index, columns, or both. When passed only a sequence, it reindexes the rows in the result:

In [117]:
frame2 = frame.reindex(index=["a", "b", "c", "d"])#reindex rows
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


The columns can be reindexed with the columns keyword:

In [119]:
states = ["Texas", "Utah", "California"]
frame.reindex(columns=states) # reindex columns

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


Because "Ohio" was not in states, the data for that column is dropped from the result.

Another way to reindex a particular axis is to pass the new axis labels as a positional argument and then specify the axis to reindex with the axis keyword:

In [120]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [132]:
frame.reindex(states, axis="columns")

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


Table 5.3: reindex function arguments
|Argument	|Description|
|:-------------|:-----------------------------------------------|
|labels|	New sequence to use as an index. Can be Index instance or any other sequence-like Python data structure. An Index will be used exactly as is without any copying.|
|index|	Use the passed sequence as the new index labels.|
|columns	|Use the passed sequence as the new column labels.|
|axis|	The axis to reindex, whether "index" (rows) or "columns". The default is "index". You can alternately do reindex(index=new_labels) or reindex(columns=new_labels).|
|method	|Interpolation (fill) method; "ffill" fills forward, while "bfill" fills backward.|
|fill_value|	Substitute value to use when introducing missing data by reindexing. The default is `NaN`, so leave it unset when you want absent labels to have null values in the result.|
|limit|	When forward filling or backfilling, the maximum size gap (in number of elements) to fill.|
|tolerance|	When forward filling or backfilling, the maximum size gap (in absolute numeric distance) to fill for inexact matches.|
|level	|Match simple Index on level of MultiIndex; otherwise select subset of.|
|copy|	If True, always copy underlying data even if the new index is equivalent to the old index; if False, do not copy the data when the indexes are equivalent.|


You can also reindex by using the `loc` operator. This works only if all of the new index labels already exist in the DataFrame (whereas `reindex` will insert missing data for new labels):

In [123]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [122]:
frame.loc[["a", "d", "c"], ["California", "Texas"]] # reordering rows and/or cols

Unnamed: 0,California,Texas
a,2,1
d,8,7
c,5,4


### Dropping Entries from an Axis

In [124]:
obj = pd.Series(np.arange(5.), index=["a", "b", "c", "d", "e"])
obj


a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

Dropping one or more entries from an axis is simple if you already have an index array or list without those entries, since you can use the reindex method or .loc-based indexing. As that can require a bit of munging and set logic, the drop method will return a new object with the indicated value or values deleted from an axis:

In [125]:
new_obj = obj.drop("c")
new_obj


a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [126]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [127]:
obj.drop(["d", "c"])

a    0.0
b    1.0
e    4.0
dtype: float64

In [128]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=["Ohio", "Colorado", "Utah", "New York"],
                    columns=["one", "two", "three", "four"])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


With DataFrame, index values can be deleted from either axis.

In [129]:
data.drop(index=["Colorado", "Ohio"]) # new object

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [130]:
data.drop(["Colorado", "Ohio"])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [131]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [132]:
data.drop(columns=["two"])

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [134]:
# data.drop(["two"])

In [142]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [136]:
data.drop("two", axis=1) # axis: 0 (row); 1(col)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [139]:

data.drop(["two", "four"], axis="columns")

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


### Indexing, Selection, and Filtering

In [141]:
obj = pd.Series(np.arange(4.), index=["a", "b", "c", "d"])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

Series indexing (`obj[...]`) works analogously to NumPy array indexing, except you can use the Series’s index values instead of only integers.

In [142]:
obj["b"]

1.0

In [144]:
# An integer key on this string-labelled Series is treated as a position,
# so this returns the same value as obj["b"].
# NOTE(review): newer pandas releases appear to deprecate this positional
# fallback; obj.iloc[1] is the explicit form — confirm against the installed version.
obj[1]

1.0

In [146]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [148]:
obj[["b", "a", "d"]] # note the reordering. Similar to Numpy fancy indexing

b    1.0
a    0.0
d    3.0
dtype: float64

In [149]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [150]:
obj[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [152]:
obj < 2

a     True
b     True
c    False
d    False
dtype: bool

In [153]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

In [155]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [154]:
obj.loc[["b", "a", "d"]] #note reordering

b    1.0
a    0.0
d    3.0
dtype: float64

In [156]:
obj1 = pd.Series([1, 2, 3], index=[2, 0, 1])
obj2 = pd.Series([1, 2, 3], index=["a", "b", "c"])

In [157]:
obj1

2    1
0    2
1    3
dtype: int64

In [158]:
obj2

a    1
b    2
c    3
dtype: int64

The reason to prefer loc is because of the different treatment of integers when indexing with []. Regular []-based indexing will treat integers as labels if the index contains integers, so the behavior differs depending on the data type of the index. For example:

In [159]:
obj1[[0, 1, 2]] # note reordering, similar to numpy fancy indexing

0    2
1    3
2    1
dtype: int64

In [160]:
obj2

a    1
b    2
c    3
dtype: int64

In [161]:
obj2[[0, 1, 2]]

a    1
b    2
c    3
dtype: int64

When using loc, the expression obj.loc[[0, 1, 2]] will fail when the index does not contain integers:

In [163]:
# .loc is strictly label-based: 0 and 1 are looked up as index labels, but the
# labels of obj2 are "a", "b", "c", so the lookup raises KeyError.
# Catch the error and print it so the notebook keeps running under
# Restart & Run All.
try:
    obj2.loc[[0, 1]]
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: "None of [Index([0, 1], dtype='int32')] are in the [index]"

In [164]:
obj1

2    1
0    2
1    3
dtype: int64

In [166]:
obj1.iloc[[0, 1, 2]]

2    1
0    2
1    3
dtype: int64

In [168]:
obj2

a    1
b    2
c    3
dtype: int64

In [167]:

obj2.iloc[[0, 1, 2]]

a    1
b    2
c    3
dtype: int64

In [169]:
obj2.loc["b":"c"]

b    2
c    3
dtype: int64

In [164]:
obj2.loc["b":"c"] = 5
obj2

a    1
b    5
c    5
dtype: int64

### Selection on DataFrame with loc and iloc

In [2]:
import pandas as pd
import numpy as np

In [165]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=["Ohio", "Colorado", "Utah", "New York"],
                    columns=["one", "two", "three", "four"])
data


Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [166]:
data["two"]


Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [167]:
data[["three", "one"]] # dataframe result

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


Indexing like this has a few special cases. The first is slicing or selecting data with a Boolean array. The row selection syntax `data[:2]` is provided as a convenience.

Passing a single element or a list to the `[]` operator selects columns.

In [168]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [169]:

data[data["three"] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [170]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [171]:
data[data < 5] = 0 #indexing with a Boolean DataFrame
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [148]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


## Selection on DataFrame with loc and iloc

In [149]:
# select a row
data.loc["Colorado"]

one      0
two      5
three    6
four     7
Name: Colorado, dtype: int64

In [150]:
data.loc[["Colorado", "New York"]]

Unnamed: 0,one,two,three,four
Colorado,0,5,6,7
New York,12,13,14,15


In [172]:
data.loc["Colorado", ["two", "three"]]

two      5
three    6
Name: Colorado, dtype: int64

In [173]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [174]:
data.iloc[2] #select a row

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [175]:
data.iloc[[2, 1]] # note the rows are reordered. 

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
Colorado,0,5,6,7


In [177]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [176]:
data.iloc[2, [3, 0, 1]] # the columns are reordered in the result

four    11
one      8
two      9
Name: Utah, dtype: int64

In [178]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [179]:

data.iloc[[1, 2], [3, 0, 1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [12]:
data.loc[:"Utah", "two"]

Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int32

In [18]:
data.iloc[:, :3]

Unnamed: 0,one,two,three
Ohio,0,1,2
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


In [14]:

data.iloc[:, :3][data.three > 5]# the second indexing selects rows

Unnamed: 0,one,two,three
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


Boolean arrays can be used with `loc` but not `iloc`:

In [180]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [181]:
data.loc[data.three >= 4]

Unnamed: 0,one,two,three,four
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


Table 5.4: Indexing options with DataFrame
|Type|	Notes|
|:-----------------------|:-----------------------------------------------------------|
|df[column]|	Select single column or sequence of columns from the DataFrame; special case conveniences: Boolean array (filter rows), slice (slice rows), or Boolean DataFrame (set values based on some criterion)|
|df.loc[rows]|	Select single row or subset of rows from the DataFrame by label|
|df.loc[:, cols]|	Select single column or subset of columns by label|
|df.loc[rows, cols]|	Select both row(s) and column(s) by label|
|df.iloc[rows]|	Select single row or subset of rows from the DataFrame by integer position|
|df.iloc[:, cols]|	Select single column or subset of columns by integer position|
|df.iloc[rows, cols]|	Select both row(s) and column(s) by integer position|
|df.at[row, col]|	Select a single scalar value by row and column label|
|df.iat[row, col]|	Select a single scalar value by row and column position (integers)|
|reindex method|	Select either rows or columns by labels|

### Integer indexing pitfalls

pandas objects indexed by integers work differently from built-in Python data structures like lists and tuples. For example, you might not expect the following code to generate an error:

In [182]:
ser = pd.Series(np.arange(3.))
ser


0    0.0
1    1.0
2    2.0
dtype: float64

In [183]:
try:
    # With an integer index, [] is label-based and there is no label -1, so pandas
    # raises KeyError instead of guessing that positional access was meant.
    ser[-1]
except KeyError:  # catch only the expected lookup failure, not every possible exception
    print("failed")

failed


In [184]:
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [185]:
ser2 = pd.Series(np.arange(3.), index=["a", "b", "c"])
ser2


a    0.0
b    1.0
c    2.0
dtype: float64

In [186]:
ser2[-1] #with a noninteger index, there is no such ambiguity (positional [] access is deprecated since pandas 2.1; prefer ser2.iloc[-1])

2.0

If you have an axis index containing integers, data selection with `[]` will always be label oriented. If you use `loc` (for labels) or `iloc` (for integer positions), you will get exactly what you want:

In [187]:
ser.iloc[-1]

2.0

In [188]:
ser[:2] #slicing with integers is always integer oriented

0    0.0
1    1.0
dtype: float64

As a result of these pitfalls, it is best to always prefer indexing with `loc` and `iloc` to avoid ambiguity.

### Pitfalls with chained indexing

In [189]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [190]:
data.loc[:, "one"] = 1 #inplace assignment
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,1,5,6,7
Utah,1,9,10,11
New York,1,13,14,15


In [191]:
data.iloc[2] = 5
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,1,5,6,7
Utah,5,5,5,5
New York,1,13,14,15


In [198]:


data.loc[data["four"] > 5] = 3
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,3,3,3,3
Utah,5,5,6,5
New York,3,3,3,3


In [204]:
x = data.loc[data.three == 6]["three"] # boolean-mask selection returns a copy, so x is NOT a view of data
x

Utah    6
Name: three, dtype: int64

In [205]:
x=100 # only rebinds the name x to an int; nothing is written into data

In [206]:
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,3,3,3,3
Utah,5,5,6,5
New York,3,3,3,3


In [193]:
data.loc[data.three == 5]["three"] = 6 #chained indexing

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[data.three == 5]["three"] = 6 #chained indexing


You are trying to modify a temporary value (the nonempty result of `data.loc[data.three == 5]`) instead of the original DataFrame `data`, which is probably what you intended to modify. Here, `data` was unmodified:

In [194]:
data #unchanged

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,3,3,3,3
Utah,5,5,5,5
New York,3,3,3,3


In [195]:
# A good rule of thumb is to avoid chained indexing when doing assignments.
data.loc[data.three == 5, "three"] = 6
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,3,3,3,3
Utah,5,5,6,5
New York,3,3,3,3


## Arithmetic and Data Alignment
pandas can make it much simpler to work with objects that have different indexes. For example, when you add objects, if any index pairs are not the same, the respective index in the result will be the union of the index pairs.

In [2]:
import pandas as pd

In [3]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],
               index=["a", "c", "e", "f", "g"])
s1


a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [4]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [5]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In the case of DataFrame, alignment is performed on both rows and columns:

In [7]:
import numpy as np

In [8]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list("bcd"),
                   index=["Ohio", "Texas", "Colorado"])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list("bde"),
                   index=["Utah", "Ohio", "Texas", "Oregon"])
df1


Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [9]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [10]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


Adding these returns a DataFrame with index and columns that are the unions of the ones in each DataFrame:

In [11]:
df1 = pd.DataFrame({"A": [1, 2]})
df2 = pd.DataFrame({"B": [3, 4]})
df1


Unnamed: 0,A
0,1
1,2


In [12]:
df2


Unnamed: 0,B
0,3
1,4


In [13]:
df1 + df2

Unnamed: 0,A,B
0,,
1,,


In [14]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                   columns=list("abcd"))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                   columns=list("abcde"))
df2.loc[1, "b"] = np.nan
df1


Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [15]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [16]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


 ## Arithmetic methods with fill values
In arithmetic operations between differently indexed objects, you might want to fill with a special value, like 0, when an axis label is found in one object but not the other. 

In [17]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                    columns=list("abcd"))

df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                    columns=list("abcde"))

df2.loc[1, "b"] = np.nan

df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [18]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [19]:
df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [20]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


See Table 5.5 for a listing of Series and DataFrame methods for arithmetic. Each has a counterpart, starting with the letter `r`, that has arguments reversed. So these two statements are equivalent:

Table 5.5: Flexible arithmetic methods
|Method|	Description|
|:---------|:----------------------------------------|
|add, radd|	Methods for addition (+)|
|sub, rsub|	Methods for subtraction (-)|
|div, rdiv|	Methods for division (/)|
|floordiv, rfloordiv|	Methods for floor division (//)|
|mul, rmul|	Methods for multiplication (*)|
|pow, rpow|	Methods for exponentiation (**)|

In [21]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [22]:
1 / df1


Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [24]:
df1.rdiv(1)

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [32]:
# Relatedly, when reindexing a Series or DataFrame, you can also specify a different fill value:
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


## Operations between DataFrame and Series

In [25]:
arr = np.arange(12.).reshape((3, 4))
arr


array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [26]:
arr[0]


array([0., 1., 2., 3.])

In [27]:
arr - arr[0] # broadcasting

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [31]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list("bde"),
                     index=["Utah", "Ohio", "Texas", "Oregon"])
series = frame.iloc[0]
frame


Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [32]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [33]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


By default, arithmetic between DataFrame and Series matches the index of the Series on the columns of the DataFrame, broadcasting down the rows:

In [34]:
series2 = pd.Series(np.arange(3), index=["b", "e", "f"])
series2


b    0
e    1
f    2
dtype: int64

In [35]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


If an index value is not found in either the DataFrame’s columns or the Series’s index, the objects will be reindexed to form the union:

In [36]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [37]:
series3 = frame["d"]
series3


Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [38]:
frame


Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [44]:
frame.sub(series3, axis="index")

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


The axis that you pass is the axis to match on. In this case we mean to match on the DataFrame’s row index (axis="index") and broadcast across the columns.

In [40]:
frame = pd.DataFrame(np.random.standard_normal((4, 3)),
                     columns=list("bde"),
                     index=["Utah", "Ohio", "Texas", "Oregon"])
frame


Unnamed: 0,b,d,e
Utah,1.139693,1.177776,1.367161
Ohio,-1.12065,-0.760538,-1.657985
Texas,-0.689233,0.002346,-0.529166
Oregon,2.394679,0.249256,-0.005759


## Function Application and Mapping
NumPy ufuncs (element-wise array methods) also work with pandas objects:

In [41]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,1.139693,1.177776,1.367161
Ohio,1.12065,0.760538,1.657985
Texas,0.689233,0.002346,0.529166
Oregon,2.394679,0.249256,0.005759


Another frequent operation is applying a function on one-dimensional arrays to each column or row. DataFrame’s apply method does exactly this:

In [42]:
frame

Unnamed: 0,b,d,e
Utah,1.139693,1.177776,1.367161
Ohio,-1.12065,-0.760538,-1.657985
Texas,-0.689233,0.002346,-0.529166
Oregon,2.394679,0.249256,-0.005759


In [43]:
def f1(x):
    """Return the spread of `x`: its largest value minus its smallest value."""
    highest = x.max()
    lowest = x.min()
    return highest - lowest

frame.apply(f1)

b    3.515329
d    1.938314
e    3.025146
dtype: float64

Here the function `f1`, which computes the difference between the maximum and minimum of a Series, is invoked once on each column in `frame`. The result is a Series having the columns of `frame` as its index.


If you pass `axis="columns"` to `apply`, the function will be invoked once per row instead. A helpful way to think about this is as "apply across the columns".

In [49]:
frame.apply(f1, axis="columns")

Utah      2.187740
Ohio      1.224283
Texas     0.670096
Oregon    0.637906
dtype: float64

The function passed to apply need not return a scalar value; it can also return a Series with multiple values:

In [50]:
def f2(x):
    """Return the extremes of `x` as a two-element Series labelled "min" and "max"."""
    extremes = [x.min(), x.max()]
    return pd.Series(extremes, index=["min", "max"])
frame.apply(f2)

Unnamed: 0,b,d,e
min,-0.887477,-1.411358,-0.395316
max,0.992303,1.300263,0.354397


Element-wise Python functions can be used, too. Suppose you wanted to compute a formatted string from each floating-point value in `frame`. You can do this with `applymap` (renamed to `DataFrame.map` in pandas 2.1). The reason for the name `applymap` is that Series has a `map` method for applying an element-wise function:

In [45]:
frame

Unnamed: 0,b,d,e
Utah,1.139693,1.177776,1.367161
Ohio,-1.12065,-0.760538,-1.657985
Texas,-0.689233,0.002346,-0.529166
Oregon,2.394679,0.249256,-0.005759


In [46]:
def my_format(x):
    """Render the number `x` as a string with exactly two decimal places."""
    return format(x, ".2f")

# Apply my_format to every element. DataFrame.map is the pandas >= 2.1 name for
# DataFrame.applymap, which now emits a FutureWarning.
frame.map(my_format)

Unnamed: 0,b,d,e
Utah,1.14,1.18,1.37
Ohio,-1.12,-0.76,-1.66
Texas,-0.69,0.0,-0.53
Oregon,2.39,0.25,-0.01


In [48]:
frame["e"]

Utah      1.367161
Ohio     -1.657985
Texas    -0.529166
Oregon   -0.005759
Name: e, dtype: float64

In [47]:
frame["e"].map(my_format)

Utah       1.37
Ohio      -1.66
Texas     -0.53
Oregon    -0.01
Name: e, dtype: object

### Sorting and Ranking

In [49]:
obj = pd.Series(np.arange(4), index=["d", "a", "b", "c"])
obj


d    0
a    1
b    2
c    3
dtype: int64

To sort lexicographically by row or column label, use the `sort_index` method, which returns a new, sorted object:

In [50]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [2]:
import pandas as pd
import numpy as np

In [51]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=["three", "one"],
                     columns=["d", "a", "b", "c"])
frame


Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [52]:
frame.sort_index() # sort by rows


Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [53]:
frame.sort_index(axis="columns")

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [54]:
frame.sort_index(axis="columns", ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


To sort a Series by its values, use its `sort_values` method.
Any missing values are sorted to the end of the Series by default:

In [55]:
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()
# 

2   -3
3    2
0    4
1    7
dtype: int64

In [56]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [57]:
obj.sort_values(na_position="first")

1    NaN
3    NaN
4   -3.0
5    2.0
0    4.0
2    7.0
dtype: float64

In [58]:
frame = pd.DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [59]:

frame.sort_values("b") # use the values of column "b" as keys

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [60]:
frame.sort_values(["a", "b"]) #by multiple columns: first by a, then by b

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [61]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

Ranking assigns ranks from one through the number of valid data points in an array, starting from the lowest value (the smallest value gets rank 1). By default, `rank` breaks ties by assigning each group the mean rank:

In [62]:
# obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

Ranks can also be assigned according to the order in which they’re observed in the data. Here, instead of using the average rank 6.5 for the entries 0 and 2, they have been set to 6 and 7 because label 0 precedes label 2 in the data:

In [63]:
# obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank(method="first")

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

Table 5.6: Tie-breaking methods with rank
|Method|	Description|
|:--------|:------------------------------------------|
|"average"|	Default: assign the average rank to each entry in the equal group|
|"min"|	Use the minimum rank for the whole group|
|"max"|	Use the maximum rank for the whole group|
|"first"|	Assign ranks in the order the values appear in the data|
|"dense"|	Like method="min", but ranks always increase by 1 between groups rather than the number of equal elements in a group|

In [20]:
# obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank(ascending=False)

0    1.5
1    7.0
2    1.5
3    3.5
4    5.0
5    6.0
6    3.5
dtype: float64

In [64]:
frame = pd.DataFrame({"b": [4.3, 7, -3, 2], "a": [0, 1, 0, 1],
                      "c": [-2, 5, 8, -2.5]})
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [22]:
frame.rank(axis="columns") # horizontal

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


## Axis Indexes with Duplicate Labels
While many pandas functions (like reindex) require that the labels be unique, it’s not mandatory. 

In [65]:
obj = pd.Series(np.arange(5), index=["a", "a", "b", "b", "c"])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [66]:
obj.index.is_unique

False

Data selection is one of the main things that behaves differently with duplicates. Indexing a label with multiple entries returns a Series, while single entries return a scalar value:

In [67]:
obj["a"] # return a pd series


a    0
a    1
dtype: int64

In [68]:
obj["c"]

4

In [69]:
df = pd.DataFrame(np.random.standard_normal((5, 3)),
                  index=["a", "a", "b", "b", "c"])
df


Unnamed: 0,0,1,2
a,0.263901,1.231513,1.648928
a,0.243121,-0.280879,-0.374046
b,-1.085312,1.159446,0.727659
b,-0.810697,-1.703102,-0.46827
c,-0.237549,-1.171255,-0.8168


In [70]:
df.loc["b"]


Unnamed: 0,0,1,2
b,-1.085312,1.159446,0.727659
b,-0.810697,-1.703102,-0.46827


In [71]:
df.loc["c"]

0   -0.237549
1   -1.171255
2   -0.816800
Name: c, dtype: float64

## 5.3 Summarizing and Computing Descriptive Statistics

In [72]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=["a", "b", "c", "d"],
                  columns=["one", "two"])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


Calling DataFrame’s sum method returns a Series containing column sums:

In [73]:
df.sum() # one sum per column (adds down the rows, axis="index"); NaN values are skipped

one    9.25
two   -5.80
dtype: float64

Passing axis="columns" or axis=1 sums across the columns instead:

In [74]:
df.sum(axis="columns")# ignoring nan

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

When an entire row or column contains all NA values, the sum is 0, whereas if any value is not NA, the NA values are skipped and the remaining values are summed. This skipping can be disabled with the `skipna` option, in which case any NA value in a row or column makes the corresponding result NA:

In [75]:
df.sum(axis="index", skipna=False)


one   NaN
two   NaN
dtype: float64

In [76]:
df.sum(axis="columns", skipna=False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

Some aggregations, like mean, require at least one non-NA value to yield a value result, so here we have:

In [77]:
df.mean(axis="columns")

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

Table 5.7: Options for reduction methods
|Method|	Description|
|:------------|:----------------------------------------------|
|axis|	Axis to reduce over; "index" for DataFrame’s rows and "columns" for columns|
|skipna|	Exclude missing values; True by default|
|level|	Reduce grouped by level if the axis is hierarchically indexed (MultiIndex)|

Some methods, like `idxmin` and `idxmax`, return indirect statistics, like the index value where the minimum or maximum values are attained:

In [78]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [79]:
df.idxmax() # compute row index label

one    b
two    d
dtype: object

In [80]:
df.cumsum() # along rows

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [81]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [82]:
obj = pd.Series(["a", "a", "b", "c"] * 4)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [83]:

obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

Table 5.8: Descriptive and summary statistics
|Method|	Description|
|:------------|:--------------------------------------------|
|count|	Number of non-NA values|
|describe|	Compute set of summary statistics|
|min, max|	Compute minimum and maximum values|
|argmin, argmax|	Compute index locations (integers) at which minimum or maximum value is obtained, respectively; not available on DataFrame objects|
|idxmin, idxmax|	Compute index labels at which minimum or maximum value is obtained, respectively|
|quantile|	Compute sample quantile ranging from 0 to 1 (default: 0.5)|
|sum|	Sum of values|
|mean|	Mean of values|
|median|	Arithmetic median (50% quantile) of values|
|mad|	Mean absolute deviation from mean value (removed in pandas 2.0)|
|prod|	Product of all values|
|var|	Sample variance of values|
|std|	Sample standard deviation of values|
|skew|	Sample skewness (third moment) of values|
|kurt|	Sample kurtosis (fourth moment) of values|
|cumsum|	Cumulative sum of values|
|cummin, cummax|	Cumulative minimum or maximum of values, respectively|
|cumprod|	Cumulative product of values|
|diff|	Compute first arithmetic difference (useful for time series)|
|pct_change|	Compute percent changes|

### Correlation and Covariance

In [84]:
# Load the price and volume tables from pickle files (paths are relative to the working directory).
# NOTE: unpickling can execute arbitrary code -- only read pickle files from a trusted source.
price = pd.read_pickle("examples/yahoo_price.pkl")
volume = pd.read_pickle("examples/yahoo_volume.pkl")

In [85]:
price

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,27.990226,313.062468,113.304536,25.884104
2010-01-05,28.038618,311.683844,111.935822,25.892466
2010-01-06,27.592626,303.826685,111.208683,25.733566
2010-01-07,27.541619,296.753749,110.823732,25.465944
2010-01-08,27.724725,300.709808,111.935822,25.641571
...,...,...,...,...
2016-10-17,117.550003,779.960022,154.770004,57.220001
2016-10-18,117.470001,795.260010,150.720001,57.660000
2016-10-19,117.120003,801.500000,151.259995,57.529999
2016-10-20,117.059998,796.969971,151.520004,57.250000


In [86]:
volume

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,123432400,3927000,6155300,38409100
2010-01-05,150476200,6031900,6841400,49749600
2010-01-06,138040000,7987100,5605300,58182400
2010-01-07,119282800,12876600,5840600,50559700
2010-01-08,111902700,9483900,4197200,51197400
...,...,...,...,...
2016-10-17,23624900,1089500,5890400,23830000
2016-10-18,24553500,1995600,12770600,19149500
2016-10-19,20034600,116600,4632900,22878400
2016-10-20,24125800,1734200,4023100,49455600


In [87]:
returns = price.pct_change()
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-10-17,-0.00068,0.001837,0.002072,-0.003483
2016-10-18,-0.000681,0.019616,-0.026168,0.00769
2016-10-19,-0.002979,0.007846,0.003583,-0.002255
2016-10-20,-0.000512,-0.005652,0.001719,-0.004867
2016-10-21,-0.00393,0.003011,-0.012474,0.042096


In [88]:
returns.head()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,,,,
2010-01-05,0.001729,-0.004404,-0.01208,0.000323
2010-01-06,-0.015906,-0.025209,-0.006496,-0.006137
2010-01-07,-0.001849,-0.02328,-0.003462,-0.0104
2010-01-08,0.006648,0.013331,0.010035,0.006897


In [89]:
price.head()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,27.990226,313.062468,113.304536,25.884104
2010-01-05,28.038618,311.683844,111.935822,25.892466
2010-01-06,27.592626,303.826685,111.208683,25.733566
2010-01-07,27.541619,296.753749,110.823732,25.465944
2010-01-08,27.724725,300.709808,111.935822,25.641571


In [90]:
price.iloc[1:,0] #-price.iloc[0:-1,0]

Date
2010-01-05     28.038618
2010-01-06     27.592626
2010-01-07     27.541619
2010-01-08     27.724725
2010-01-11     27.480148
                 ...    
2016-10-17    117.550003
2016-10-18    117.470001
2016-10-19    117.120003
2016-10-20    117.059998
2016-10-21    116.599998
Name: AAPL, Length: 1713, dtype: float64

In [91]:
price.iloc[0:-1,0]

Date
2010-01-04     27.990226
2010-01-05     28.038618
2010-01-06     27.592626
2010-01-07     27.541619
2010-01-08     27.724725
                 ...    
2016-10-14    117.629997
2016-10-17    117.550003
2016-10-18    117.470001
2016-10-19    117.120003
2016-10-20    117.059998
Name: AAPL, Length: 1713, dtype: float64

In [92]:
(price.iloc[1:,0]-price.iloc[0:-1,0])
# note the labels aligned to calculate difference

Date
2010-01-04    NaN
2010-01-05    0.0
2010-01-06    0.0
2010-01-07    0.0
2010-01-08    0.0
             ... 
2016-10-17    0.0
2016-10-18    0.0
2016-10-19    0.0
2016-10-20    0.0
2016-10-21    NaN
Name: AAPL, Length: 1714, dtype: float64

In [93]:
(price.iloc[1:,0].to_numpy()-price.iloc[0:-1,0].to_numpy())

array([ 0.048392, -0.445992, -0.051007, ..., -0.349998, -0.060005,
       -0.46    ])

In [94]:
(price.iloc[1:,0].to_numpy()-price.iloc[0:-1,0].to_numpy())\
/price.iloc[0:-1,0].to_numpy()

array([ 0.00172889, -0.01590635, -0.00184857, ..., -0.00297947,
       -0.00051234, -0.00392961])

The `corr` method of Series computes the correlation of the overlapping, non-NA, aligned-by-index values in two Series. Relatedly, `cov` computes the covariance:

In [95]:
returns["MSFT"].corr(returns["IBM"])


0.4997636114415114

In [96]:
returns["MSFT"].cov(returns["IBM"])

8.870655479703546e-05

In [97]:
returns.corr()


Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.407919,0.386817,0.389695
GOOG,0.407919,1.0,0.405099,0.465919
IBM,0.386817,0.405099,1.0,0.499764
MSFT,0.389695,0.465919,0.499764,1.0


In [98]:
returns.cov()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.000277,0.000107,7.8e-05,9.5e-05
GOOG,0.000107,0.000251,7.8e-05,0.000108
IBM,7.8e-05,7.8e-05,0.000146,8.9e-05
MSFT,9.5e-05,0.000108,8.9e-05,0.000215


Using DataFrame’s `corrwith` method, you can compute pair-wise correlations between a DataFrame’s columns or rows with another Series or DataFrame. Passing a Series returns a Series with the correlation value computed for each column:

In [99]:
returns.corrwith(returns["IBM"])

AAPL    0.386817
GOOG    0.405099
IBM     1.000000
MSFT    0.499764
dtype: float64

In [100]:
returns.corrwith(volume)
# Passing a DataFrame computes the correlations of matching column names.

AAPL   -0.075565
GOOG   -0.007067
IBM    -0.204849
MSFT   -0.092950
dtype: float64

### Unique Values, Value Counts, and Membership

In [101]:
obj = pd.Series(["c", "a", "d", "a", "a", "b", "b", "c", "c"])
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

The unique values are returned in the order in which they first appear, not in sorted order, but they can be sorted after the fact if needed (`uniques.sort()`). Relatedly, `value_counts` computes a Series containing value frequencies:

In [102]:
uniques = obj.unique()
uniques # a numpy array

array(['c', 'a', 'd', 'b'], dtype=object)

In [103]:
obj.value_counts()

c    3
a    3
b    2
d    1
Name: count, dtype: int64

Note: the resulting Series is sorted by count in descending order as a convenience.

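`value_counts` is also available as a top-level pandas method that can be used with NumPy arrays or other Python sequences (the top-level function is deprecated since pandas 2.1; prefer `pd.Series(values).value_counts()` in new code):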

In [104]:
pd.value_counts(obj.to_numpy(), sort=False)

c    3
a    3
d    1
b    2
Name: count, dtype: int64

In [105]:
pd.value_counts(obj.to_numpy(), sort=True)

c    3
a    3
b    2
d    1
Name: count, dtype: int64

In [106]:
pd.value_counts(obj, sort=False)

c    3
a    3
d    1
b    2
Name: count, dtype: int64

In [74]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

`isin` performs a vectorized set membership check and can be useful in filtering a dataset down to a subset of values in a Series or column in a DataFrame:

In [107]:

mask = obj.isin(["b", "c"])
mask


0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [108]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [109]:
to_match = pd.Series(["c", "a", "b", "b", "c", "a"])
to_match

0    c
1    a
2    b
3    b
4    c
5    a
dtype: object

In [110]:
unique_vals = pd.Series(["c", "b", "a"])
unique_vals

0    c
1    b
2    a
dtype: object

Related to `isin` is the `Index.get_indexer` method, which gives you an index array from an array of possibly nondistinct values into another array of distinct values:

In [111]:
pd.Index(unique_vals)

Index(['c', 'b', 'a'], dtype='object')

In [79]:
# to_match = pd.Series(["c", "a", "b", "b", "c", "a"])
indices = pd.Index(unique_vals).get_indexer(to_match)
indices

array([0, 2, 1, 1, 0, 2])

Table 5.9: Unique, value counts, and set membership methods
|Method|	Description|
|:--------|:--------------------------------------|
|isin|	Compute a Boolean array indicating whether each Series or DataFrame value is contained in the passed sequence of values|
|get_indexer|	Compute integer indices for each value in an array into another array of distinct values; helpful for data alignment and join-type operations|
|unique|	Compute an array of unique values in a Series, returned in the order observed|
|value_counts|	Return a Series containing unique values as its index and frequencies as its values, ordered by count in descending order|

In [112]:
data = pd.DataFrame({"Qu1": [1, 3, 4, 3, 4],
                     "Qu2": [2, 3, 1, 2, 3],
                     "Qu3": [1, 5, 2, 4, 4]})
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [113]:
data["Qu1"].value_counts()

Qu1
3    2
4    2
1    1
Name: count, dtype: int64

In [114]:
data["Qu1"].value_counts().sort_index()

Qu1
1    1
3    2
4    2
Name: count, dtype: int64

To compute this for all columns, pass `pandas.value_counts` to the DataFrame’s `apply` method (or pass `pd.Series.value_counts`, since the top-level function is deprecated as of pandas 2.1):



In [115]:
# Count the occurrences of each value column by column. Series.value_counts is used
# instead of the top-level pd.value_counts, which is deprecated since pandas 2.1.
# A value that never occurs in a column comes back as NaN, so fill those with 0.
result = data.apply(pd.Series.value_counts).fillna(0)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


Here, the row labels in the result are the distinct values occurring in all of the columns. The values are the respective counts of these values in each column.

In [116]:
data = pd.DataFrame({"a": [1, 1, 1, 2, 2], "b": [0, 0, 1, 0, 0]})
data


Unnamed: 0,a,b
0,1,0
1,1,0
2,1,1
3,2,0
4,2,0


There is also a `DataFrame.value_counts` method, but it computes counts considering each row of the DataFrame as a tuple to determine the number of occurrences of each distinct row.

In [117]:
data

Unnamed: 0,a,b
0,1,0
1,1,0
2,1,1
3,2,0
4,2,0


In [118]:
data.value_counts() # counts each row as a tuple

a  b
1  0    2
2  0    2
1  1    1
Name: count, dtype: int64

In [142]:
pd.options.display.max_rows = PREVIOUS_MAX_ROWS