In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

Creating a Series by passing a list of values, letting pandas create a default integer index

In [2]:
seri = pd.Series([1,2,3,np.nan,45])
seri

0     1.0
1     2.0
2     3.0
3     NaN
4    45.0
dtype: float64

Creating a DataFrameby passing a numpy array, with a datetime index and labeled columns:

In [3]:
dates = pd.date_range("20210101", periods=10)
dates

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08',
               '2021-01-09', '2021-01-10'],
              dtype='datetime64[ns]', freq='D')

In [4]:
df = pd.DataFrame(np.random.randn(10,4), index=dates, columns=["a","b","c","d"]) #list('abcd') possible
df

Unnamed: 0,a,b,c,d
2021-01-01,0.482631,-0.917418,0.652312,-0.675364
2021-01-02,-0.601569,0.888478,0.875262,-1.645763
2021-01-03,0.971676,0.16708,-1.057748,-1.223418
2021-01-04,-0.424823,1.512963,-1.044807,0.357849
2021-01-05,0.425621,-0.049688,-0.433479,0.12093
2021-01-06,1.357304,0.178053,-1.265542,-0.643415
2021-01-07,0.031349,-0.749075,-0.018996,0.386057
2021-01-08,-0.882429,-0.817032,-1.165577,0.179282
2021-01-09,1.548553,0.545712,-0.744566,1.493493
2021-01-10,0.733131,0.771151,-0.131902,-0.619149


Creating a DataFrame by passing a dict of objects that can be converted to series like

In [5]:
df2 = pd.DataFrame({
    "a" : 12,
    "b" : pd.Timestamp("20201201"),
    "c" : pd.Series(1, index=list(range(4)), dtype=float ),
    "d" : np.array([3] * 4, dtype="int32"),
    "e" : pd.Categorical(["test", "train"]*2),
    "f" : "foo"
})
df2

Unnamed: 0,a,b,c,d,e,f
0,12,2020-12-01,1.0,3,test,foo
1,12,2020-12-01,1.0,3,train,foo
2,12,2020-12-01,1.0,3,test,foo
3,12,2020-12-01,1.0,3,train,foo


In [6]:
df2.dtypes

a             int64
b    datetime64[ns]
c           float64
d             int32
e          category
f            object
dtype: object

In [7]:
df2.b

0   2020-12-01
1   2020-12-01
2   2020-12-01
3   2020-12-01
Name: b, dtype: datetime64[ns]

Display the index, columns, and the underlying numpy data

In [8]:
df.index

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08',
               '2021-01-09', '2021-01-10'],
              dtype='datetime64[ns]', freq='D')

In [9]:
df.columns

Index(['a', 'b', 'c', 'd'], dtype='object')

In [10]:
df.values

array([[ 0.48263137, -0.91741811,  0.65231161, -0.67536433],
       [-0.60156895,  0.88847752,  0.87526232, -1.64576326],
       [ 0.97167591,  0.16707971, -1.05774817, -1.22341839],
       [-0.42482296,  1.5129627 , -1.04480731,  0.35784925],
       [ 0.42562102, -0.04968797, -0.43347932,  0.12092959],
       [ 1.35730385,  0.17805294, -1.26554199, -0.6434153 ],
       [ 0.03134921, -0.74907544, -0.01899605,  0.38605705],
       [-0.88242852, -0.817032  , -1.16557685,  0.17928164],
       [ 1.54855298,  0.54571236, -0.74456577,  1.49349345],
       [ 0.7331305 ,  0.77115105, -0.13190184, -0.61914928]])

Describe shows a quick statistic summary of your data

In [11]:
df.describe()

Unnamed: 0,a,b,c,d
count,10.0,10.0,10.0,10.0
mean,0.364144,0.153022,-0.433504,-0.22695
std,0.825431,0.807545,0.763253,0.912753
min,-0.882429,-0.917418,-1.265542,-1.645763
25%,-0.31078,-0.574229,-1.054513,-0.667377
50%,0.454126,0.172566,-0.589023,-0.24911
75%,0.91204,0.714791,-0.047222,0.313207
max,1.548553,1.512963,0.875262,1.493493


In [12]:
df.T #transposing the data

Unnamed: 0,2021-01-01 00:00:00,2021-01-02 00:00:00,2021-01-03 00:00:00,2021-01-04 00:00:00,2021-01-05 00:00:00,2021-01-06 00:00:00,2021-01-07 00:00:00,2021-01-08 00:00:00,2021-01-09 00:00:00,2021-01-10 00:00:00
a,0.482631,-0.601569,0.971676,-0.424823,0.425621,1.357304,0.031349,-0.882429,1.548553,0.733131
b,-0.917418,0.888478,0.16708,1.512963,-0.049688,0.178053,-0.749075,-0.817032,0.545712,0.771151
c,0.652312,0.875262,-1.057748,-1.044807,-0.433479,-1.265542,-0.018996,-1.165577,-0.744566,-0.131902
d,-0.675364,-1.645763,-1.223418,0.357849,0.12093,-0.643415,0.386057,0.179282,1.493493,-0.619149


Sorting by an axis

In [13]:
df.sort_index(axis=1, ascending=False)
df

Unnamed: 0,a,b,c,d
2021-01-01,0.482631,-0.917418,0.652312,-0.675364
2021-01-02,-0.601569,0.888478,0.875262,-1.645763
2021-01-03,0.971676,0.16708,-1.057748,-1.223418
2021-01-04,-0.424823,1.512963,-1.044807,0.357849
2021-01-05,0.425621,-0.049688,-0.433479,0.12093
2021-01-06,1.357304,0.178053,-1.265542,-0.643415
2021-01-07,0.031349,-0.749075,-0.018996,0.386057
2021-01-08,-0.882429,-0.817032,-1.165577,0.179282
2021-01-09,1.548553,0.545712,-0.744566,1.493493
2021-01-10,0.733131,0.771151,-0.131902,-0.619149


Sorting by value

In [14]:
df.sort_values(by="c")

Unnamed: 0,a,b,c,d
2021-01-06,1.357304,0.178053,-1.265542,-0.643415
2021-01-08,-0.882429,-0.817032,-1.165577,0.179282
2021-01-03,0.971676,0.16708,-1.057748,-1.223418
2021-01-04,-0.424823,1.512963,-1.044807,0.357849
2021-01-09,1.548553,0.545712,-0.744566,1.493493
2021-01-05,0.425621,-0.049688,-0.433479,0.12093
2021-01-10,0.733131,0.771151,-0.131902,-0.619149
2021-01-07,0.031349,-0.749075,-0.018996,0.386057
2021-01-01,0.482631,-0.917418,0.652312,-0.675364
2021-01-02,-0.601569,0.888478,0.875262,-1.645763


### Selection
Getting

In [15]:
df["a"]

2021-01-01    0.482631
2021-01-02   -0.601569
2021-01-03    0.971676
2021-01-04   -0.424823
2021-01-05    0.425621
2021-01-06    1.357304
2021-01-07    0.031349
2021-01-08   -0.882429
2021-01-09    1.548553
2021-01-10    0.733131
Freq: D, Name: a, dtype: float64

In [16]:
df.a

2021-01-01    0.482631
2021-01-02   -0.601569
2021-01-03    0.971676
2021-01-04   -0.424823
2021-01-05    0.425621
2021-01-06    1.357304
2021-01-07    0.031349
2021-01-08   -0.882429
2021-01-09    1.548553
2021-01-10    0.733131
Freq: D, Name: a, dtype: float64

Selecting via [], which slices the rows

In [17]:
df[0:4]

Unnamed: 0,a,b,c,d
2021-01-01,0.482631,-0.917418,0.652312,-0.675364
2021-01-02,-0.601569,0.888478,0.875262,-1.645763
2021-01-03,0.971676,0.16708,-1.057748,-1.223418
2021-01-04,-0.424823,1.512963,-1.044807,0.357849


In [18]:
df["20210102":"20210107"]

Unnamed: 0,a,b,c,d
2021-01-02,-0.601569,0.888478,0.875262,-1.645763
2021-01-03,0.971676,0.16708,-1.057748,-1.223418
2021-01-04,-0.424823,1.512963,-1.044807,0.357849
2021-01-05,0.425621,-0.049688,-0.433479,0.12093
2021-01-06,1.357304,0.178053,-1.265542,-0.643415
2021-01-07,0.031349,-0.749075,-0.018996,0.386057


Selection by Label

In [19]:
df.loc[dates[0]]

a    0.482631
b   -0.917418
c    0.652312
d   -0.675364
Name: 2021-01-01 00:00:00, dtype: float64

Selecting on a multi-axis by label

In [20]:
df.loc[:,["b","c"]]

Unnamed: 0,b,c
2021-01-01,-0.917418,0.652312
2021-01-02,0.888478,0.875262
2021-01-03,0.16708,-1.057748
2021-01-04,1.512963,-1.044807
2021-01-05,-0.049688,-0.433479
2021-01-06,0.178053,-1.265542
2021-01-07,-0.749075,-0.018996
2021-01-08,-0.817032,-1.165577
2021-01-09,0.545712,-0.744566
2021-01-10,0.771151,-0.131902


Showing label slicing, both endpoints are included

In [21]:
df.loc["20210103":"20210106",["a","b"]]

Unnamed: 0,a,b
2021-01-03,0.971676,0.16708
2021-01-04,-0.424823,1.512963
2021-01-05,0.425621,-0.049688
2021-01-06,1.357304,0.178053


Reduction in the dimensions of the returned object

In [22]:
df.loc["20210103", ["c","d"]]

c   -1.057748
d   -1.223418
Name: 2021-01-03 00:00:00, dtype: float64

For getting a scalar value

In [23]:
df.loc[dates[1],"a"]

-0.6015689504982479

In [24]:
df.at[dates[1],"a"] #same with the previous method

-0.6015689504982479

Selection by Position

In [25]:
df.iloc[3]

a   -0.424823
b    1.512963
c   -1.044807
d    0.357849
Name: 2021-01-04 00:00:00, dtype: float64

By integer slices, acting similar to numpy/python

In [26]:
df.iloc[3:5, 2:4]

Unnamed: 0,c,d
2021-01-04,-1.044807,0.357849
2021-01-05,-0.433479,0.12093


By lists of integer position locations, similar to the numpy/python style

In [27]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,a,c
2021-01-02,-0.601569,0.875262
2021-01-03,0.971676,-1.057748
2021-01-05,0.425621,-0.433479


For slicing rows explicitly

In [28]:
df.iloc[1:3,:]

Unnamed: 0,a,b,c,d
2021-01-02,-0.601569,0.888478,0.875262,-1.645763
2021-01-03,0.971676,0.16708,-1.057748,-1.223418


For slicing columns explicitly

In [29]:
df.iloc[:,1:3]

Unnamed: 0,b,c
2021-01-01,-0.917418,0.652312
2021-01-02,0.888478,0.875262
2021-01-03,0.16708,-1.057748
2021-01-04,1.512963,-1.044807
2021-01-05,-0.049688,-0.433479
2021-01-06,0.178053,-1.265542
2021-01-07,-0.749075,-0.018996
2021-01-08,-0.817032,-1.165577
2021-01-09,0.545712,-0.744566
2021-01-10,0.771151,-0.131902


For getting a value explicitly

In [30]:
df.iloc[1,1]

0.888477515439966

In [31]:
df.iat[1,1] # same with the previous method

0.888477515439966

Boolean Indexing

Using a single column’s values to select data

In [32]:
df[df.a > 0]

Unnamed: 0,a,b,c,d
2021-01-01,0.482631,-0.917418,0.652312,-0.675364
2021-01-03,0.971676,0.16708,-1.057748,-1.223418
2021-01-05,0.425621,-0.049688,-0.433479,0.12093
2021-01-06,1.357304,0.178053,-1.265542,-0.643415
2021-01-07,0.031349,-0.749075,-0.018996,0.386057
2021-01-09,1.548553,0.545712,-0.744566,1.493493
2021-01-10,0.733131,0.771151,-0.131902,-0.619149


A where operation for getting

In [33]:
df[df > 0]

Unnamed: 0,a,b,c,d
2021-01-01,0.482631,,0.652312,
2021-01-02,,0.888478,0.875262,
2021-01-03,0.971676,0.16708,,
2021-01-04,,1.512963,,0.357849
2021-01-05,0.425621,,,0.12093
2021-01-06,1.357304,0.178053,,
2021-01-07,0.031349,,,0.386057
2021-01-08,,,,0.179282
2021-01-09,1.548553,0.545712,,1.493493
2021-01-10,0.733131,0.771151,,


Using the isin() method for filtering

In [34]:
df2 = df.copy()
df2

Unnamed: 0,a,b,c,d
2021-01-01,0.482631,-0.917418,0.652312,-0.675364
2021-01-02,-0.601569,0.888478,0.875262,-1.645763
2021-01-03,0.971676,0.16708,-1.057748,-1.223418
2021-01-04,-0.424823,1.512963,-1.044807,0.357849
2021-01-05,0.425621,-0.049688,-0.433479,0.12093
2021-01-06,1.357304,0.178053,-1.265542,-0.643415
2021-01-07,0.031349,-0.749075,-0.018996,0.386057
2021-01-08,-0.882429,-0.817032,-1.165577,0.179282
2021-01-09,1.548553,0.545712,-0.744566,1.493493
2021-01-10,0.733131,0.771151,-0.131902,-0.619149


In [35]:
df2["e"] = ["one", "two", "three", "four", "five"]*2
df2

Unnamed: 0,a,b,c,d,e
2021-01-01,0.482631,-0.917418,0.652312,-0.675364,one
2021-01-02,-0.601569,0.888478,0.875262,-1.645763,two
2021-01-03,0.971676,0.16708,-1.057748,-1.223418,three
2021-01-04,-0.424823,1.512963,-1.044807,0.357849,four
2021-01-05,0.425621,-0.049688,-0.433479,0.12093,five
2021-01-06,1.357304,0.178053,-1.265542,-0.643415,one
2021-01-07,0.031349,-0.749075,-0.018996,0.386057,two
2021-01-08,-0.882429,-0.817032,-1.165577,0.179282,three
2021-01-09,1.548553,0.545712,-0.744566,1.493493,four
2021-01-10,0.733131,0.771151,-0.131902,-0.619149,five


In [36]:
df2[df2["e"].isin(["one", "two"])]

Unnamed: 0,a,b,c,d,e
2021-01-01,0.482631,-0.917418,0.652312,-0.675364,one
2021-01-02,-0.601569,0.888478,0.875262,-1.645763,two
2021-01-06,1.357304,0.178053,-1.265542,-0.643415,one
2021-01-07,0.031349,-0.749075,-0.018996,0.386057,two


### Setting
Setting a new column automatically aligns the data by the indexes

In [37]:
s1 = pd.Series(np.arange(1,7), index=pd.date_range("20210101", periods=6))
s1

2021-01-01    1
2021-01-02    2
2021-01-03    3
2021-01-04    4
2021-01-05    5
2021-01-06    6
Freq: D, dtype: int32

In [38]:
df["f"] = s1
df

Unnamed: 0,a,b,c,d,f
2021-01-01,0.482631,-0.917418,0.652312,-0.675364,1.0
2021-01-02,-0.601569,0.888478,0.875262,-1.645763,2.0
2021-01-03,0.971676,0.16708,-1.057748,-1.223418,3.0
2021-01-04,-0.424823,1.512963,-1.044807,0.357849,4.0
2021-01-05,0.425621,-0.049688,-0.433479,0.12093,5.0
2021-01-06,1.357304,0.178053,-1.265542,-0.643415,6.0
2021-01-07,0.031349,-0.749075,-0.018996,0.386057,
2021-01-08,-0.882429,-0.817032,-1.165577,0.179282,
2021-01-09,1.548553,0.545712,-0.744566,1.493493,
2021-01-10,0.733131,0.771151,-0.131902,-0.619149,


Setting values by label

In [39]:
df.at[dates[0],"a"] = 0

Setting values by position

In [40]:
df.iat[0,1] = 0

Setting by assigning with a numpy array

In [41]:
df.loc[:,"d"] = np.array([5] * len(df))
df

Unnamed: 0,a,b,c,d,f
2021-01-01,0.0,0.0,0.652312,5,1.0
2021-01-02,-0.601569,0.888478,0.875262,5,2.0
2021-01-03,0.971676,0.16708,-1.057748,5,3.0
2021-01-04,-0.424823,1.512963,-1.044807,5,4.0
2021-01-05,0.425621,-0.049688,-0.433479,5,5.0
2021-01-06,1.357304,0.178053,-1.265542,5,6.0
2021-01-07,0.031349,-0.749075,-0.018996,5,
2021-01-08,-0.882429,-0.817032,-1.165577,5,
2021-01-09,1.548553,0.545712,-0.744566,5,
2021-01-10,0.733131,0.771151,-0.131902,5,


A where operation with setting.


In [43]:
df3 = df.copy()
df3[df3 > 0] = -df3
df3

Unnamed: 0,a,b,c,d,f
2021-01-01,0.0,0.0,-0.652312,-5,-1.0
2021-01-02,-0.601569,-0.888478,-0.875262,-5,-2.0
2021-01-03,-0.971676,-0.16708,-1.057748,-5,-3.0
2021-01-04,-0.424823,-1.512963,-1.044807,-5,-4.0
2021-01-05,-0.425621,-0.049688,-0.433479,-5,-5.0
2021-01-06,-1.357304,-0.178053,-1.265542,-5,-6.0
2021-01-07,-0.031349,-0.749075,-0.018996,-5,
2021-01-08,-0.882429,-0.817032,-1.165577,-5,
2021-01-09,-1.548553,-0.545712,-0.744566,-5,
2021-01-10,-0.733131,-0.771151,-0.131902,-5,


### Missing Data

pandas primarily uses the value np.nanto represent missing data. It is by default not included in
computations. 

Reindexing allows you to change/add/delete the index on a specified axis. This returns a copy of
the data.

In [50]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["e"])
df1

Unnamed: 0,a,b,c,d,f,e
2021-01-01,0.0,0.0,0.652312,5,1.0,
2021-01-02,-0.601569,0.888478,0.875262,5,2.0,
2021-01-03,0.971676,0.16708,-1.057748,5,3.0,
2021-01-04,-0.424823,1.512963,-1.044807,5,4.0,


In [51]:
df1.loc[dates[0]:dates[1], "e"] = 1

In [52]:
df1

Unnamed: 0,a,b,c,d,f,e
2021-01-01,0.0,0.0,0.652312,5,1.0,1.0
2021-01-02,-0.601569,0.888478,0.875262,5,2.0,1.0
2021-01-03,0.971676,0.16708,-1.057748,5,3.0,
2021-01-04,-0.424823,1.512963,-1.044807,5,4.0,


To drop any rows that have missing data.

In [53]:
df1.dropna(how="any", inplace=True)
df1

Unnamed: 0,a,b,c,d,f,e
2021-01-01,0.0,0.0,0.652312,5,1.0,1.0
2021-01-02,-0.601569,0.888478,0.875262,5,2.0,1.0


In [54]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["e"])
df1

Unnamed: 0,a,b,c,d,f,e
2021-01-01,0.0,0.0,0.652312,5,1.0,
2021-01-02,-0.601569,0.888478,0.875262,5,2.0,
2021-01-03,0.971676,0.16708,-1.057748,5,3.0,
2021-01-04,-0.424823,1.512963,-1.044807,5,4.0,


In [57]:
df1.fillna(value=4, inplace=True)
df1

Unnamed: 0,a,b,c,d,f,e
2021-01-01,0.0,0.0,0.652312,5,1.0,4.0
2021-01-02,-0.601569,0.888478,0.875262,5,2.0,4.0
2021-01-03,0.971676,0.16708,-1.057748,5,3.0,4.0
2021-01-04,-0.424823,1.512963,-1.044807,5,4.0,4.0


To get the boolean mask where values are nan

In [58]:
pd.isnull(df1)

Unnamed: 0,a,b,c,d,f,e
2021-01-01,False,False,False,False,False,False
2021-01-02,False,False,False,False,False,False
2021-01-03,False,False,False,False,False,False
2021-01-04,False,False,False,False,False,False
