In [38]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

Creating a Series by passing a list of values, letting pandas create a default integer index

In [39]:
seri = pd.Series([1,2,3,np.nan,45])
seri

0     1.0
1     2.0
2     3.0
3     NaN
4    45.0
dtype: float64

Creating a DataFrameby passing a numpy array, with a datetime index and labeled columns:

In [40]:
dates = pd.date_range("20210101", periods=10)
dates

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08',
               '2021-01-09', '2021-01-10'],
              dtype='datetime64[ns]', freq='D')

In [41]:
df = pd.DataFrame(np.random.randn(10,4), index=dates, columns=["a","b","c","d"]) #list('abcd') possible
df

Unnamed: 0,a,b,c,d
2021-01-01,0.480883,-0.168973,1.49548,-0.738783
2021-01-02,-0.731,1.349302,1.059063,0.710393
2021-01-03,-1.902861,0.33702,-0.344063,0.09966
2021-01-04,0.769069,-1.190182,-0.289454,-0.147524
2021-01-05,1.318712,0.509324,0.663939,-0.471609
2021-01-06,0.50153,-0.585483,-1.450538,1.048853
2021-01-07,-2.22421,-1.439403,-1.494122,0.974578
2021-01-08,-0.761851,-0.448527,-0.440605,-0.297513
2021-01-09,0.783171,0.27096,-1.189345,-2.237883
2021-01-10,0.210608,1.971006,-0.692841,0.344552


Creating a DataFrame by passing a dict of objects that can be converted to series like

In [42]:
df2 = pd.DataFrame({
    "a" : 12,
    "b" : pd.Timestamp("20201201"),
    "c" : pd.Series(1, index=list(range(4)), dtype=float ),
    "d" : np.array([3] * 4, dtype="int32"),
    "e" : pd.Categorical(["test", "train"]*2),
    "f" : "foo"
})
df2

Unnamed: 0,a,b,c,d,e,f
0,12,2020-12-01,1.0,3,test,foo
1,12,2020-12-01,1.0,3,train,foo
2,12,2020-12-01,1.0,3,test,foo
3,12,2020-12-01,1.0,3,train,foo


In [43]:
df2.dtypes

a             int64
b    datetime64[ns]
c           float64
d             int32
e          category
f            object
dtype: object

In [44]:
df2.b

0   2020-12-01
1   2020-12-01
2   2020-12-01
3   2020-12-01
Name: b, dtype: datetime64[ns]

Display the index, columns, and the underlying numpy data

In [45]:
df.index

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08',
               '2021-01-09', '2021-01-10'],
              dtype='datetime64[ns]', freq='D')

In [46]:
df.columns

Index(['a', 'b', 'c', 'd'], dtype='object')

In [47]:
df.values

array([[ 0.48088295, -0.16897317,  1.49547953, -0.73878348],
       [-0.73099987,  1.3493024 ,  1.05906305,  0.71039304],
       [-1.90286104,  0.33701962, -0.34406281,  0.09966005],
       [ 0.76906861, -1.19018158, -0.28945403, -0.14752402],
       [ 1.318712  ,  0.50932405,  0.66393919, -0.47160912],
       [ 0.50152981, -0.58548257, -1.45053834,  1.04885328],
       [-2.22421041, -1.43940346, -1.49412225,  0.97457768],
       [-0.76185132, -0.44852658, -0.44060479, -0.29751317],
       [ 0.78317143,  0.27096016, -1.1893455 , -2.23788252],
       [ 0.21060819,  1.97100559, -0.69284132,  0.34455183]])

Describe shows a quick statistic summary of your data

In [48]:
df.describe()

Unnamed: 0,a,b,c,d
count,10.0,10.0,10.0,10.0
mean,-0.155595,0.060504,-0.268249,-0.071528
std,1.197759,1.063339,1.038516,0.972891
min,-2.22421,-1.439403,-1.494122,-2.237883
25%,-0.754138,-0.551244,-1.065219,-0.428085
50%,0.345746,0.050993,-0.392334,-0.023932
75%,0.702184,0.466248,0.425591,0.618933
max,1.318712,1.971006,1.49548,1.048853


In [49]:
df.T #transposing the data

Unnamed: 0,2021-01-01 00:00:00,2021-01-02 00:00:00,2021-01-03 00:00:00,2021-01-04 00:00:00,2021-01-05 00:00:00,2021-01-06 00:00:00,2021-01-07 00:00:00,2021-01-08 00:00:00,2021-01-09 00:00:00,2021-01-10 00:00:00
a,0.480883,-0.731,-1.902861,0.769069,1.318712,0.50153,-2.22421,-0.761851,0.783171,0.210608
b,-0.168973,1.349302,0.33702,-1.190182,0.509324,-0.585483,-1.439403,-0.448527,0.27096,1.971006
c,1.49548,1.059063,-0.344063,-0.289454,0.663939,-1.450538,-1.494122,-0.440605,-1.189345,-0.692841
d,-0.738783,0.710393,0.09966,-0.147524,-0.471609,1.048853,0.974578,-0.297513,-2.237883,0.344552


Sorting by an axis

In [51]:
df.sort_index(axis=1, ascending=False)
df

Unnamed: 0,a,b,c,d
2021-01-01,0.480883,-0.168973,1.49548,-0.738783
2021-01-02,-0.731,1.349302,1.059063,0.710393
2021-01-03,-1.902861,0.33702,-0.344063,0.09966
2021-01-04,0.769069,-1.190182,-0.289454,-0.147524
2021-01-05,1.318712,0.509324,0.663939,-0.471609
2021-01-06,0.50153,-0.585483,-1.450538,1.048853
2021-01-07,-2.22421,-1.439403,-1.494122,0.974578
2021-01-08,-0.761851,-0.448527,-0.440605,-0.297513
2021-01-09,0.783171,0.27096,-1.189345,-2.237883
2021-01-10,0.210608,1.971006,-0.692841,0.344552


Sorting by value

In [52]:
df.sort_values(by="c")

Unnamed: 0,a,b,c,d
2021-01-07,-2.22421,-1.439403,-1.494122,0.974578
2021-01-06,0.50153,-0.585483,-1.450538,1.048853
2021-01-09,0.783171,0.27096,-1.189345,-2.237883
2021-01-10,0.210608,1.971006,-0.692841,0.344552
2021-01-08,-0.761851,-0.448527,-0.440605,-0.297513
2021-01-03,-1.902861,0.33702,-0.344063,0.09966
2021-01-04,0.769069,-1.190182,-0.289454,-0.147524
2021-01-05,1.318712,0.509324,0.663939,-0.471609
2021-01-02,-0.731,1.349302,1.059063,0.710393
2021-01-01,0.480883,-0.168973,1.49548,-0.738783


### Selection
Getting

In [53]:
df["a"]

2021-01-01    0.480883
2021-01-02   -0.731000
2021-01-03   -1.902861
2021-01-04    0.769069
2021-01-05    1.318712
2021-01-06    0.501530
2021-01-07   -2.224210
2021-01-08   -0.761851
2021-01-09    0.783171
2021-01-10    0.210608
Freq: D, Name: a, dtype: float64

In [54]:
df.a

2021-01-01    0.480883
2021-01-02   -0.731000
2021-01-03   -1.902861
2021-01-04    0.769069
2021-01-05    1.318712
2021-01-06    0.501530
2021-01-07   -2.224210
2021-01-08   -0.761851
2021-01-09    0.783171
2021-01-10    0.210608
Freq: D, Name: a, dtype: float64

Selecting via [], which slices the rows

In [55]:
df[0:4]

Unnamed: 0,a,b,c,d
2021-01-01,0.480883,-0.168973,1.49548,-0.738783
2021-01-02,-0.731,1.349302,1.059063,0.710393
2021-01-03,-1.902861,0.33702,-0.344063,0.09966
2021-01-04,0.769069,-1.190182,-0.289454,-0.147524


In [40]:
df["20210102":"20210107"]

Unnamed: 0,a,b,c,d
2021-01-02,-1.047581,1.161637,-0.803006,-2.093864
2021-01-03,-1.151396,0.575674,-0.723401,-0.487148
2021-01-04,-0.628977,1.176879,-0.412652,2.204962
2021-01-05,1.115872,1.940716,0.584026,0.259215
2021-01-06,0.944919,-1.035663,-0.510705,-1.078126
2021-01-07,-0.581449,1.246238,-0.510814,-1.517736


Selection by Label

In [56]:
df.loc[dates[0]]

a    0.480883
b   -0.168973
c    1.495480
d   -0.738783
Name: 2021-01-01 00:00:00, dtype: float64

Selecting on a multi-axis by label

In [57]:
df.loc[:,["b","c"]]

Unnamed: 0,b,c
2021-01-01,-0.168973,1.49548
2021-01-02,1.349302,1.059063
2021-01-03,0.33702,-0.344063
2021-01-04,-1.190182,-0.289454
2021-01-05,0.509324,0.663939
2021-01-06,-0.585483,-1.450538
2021-01-07,-1.439403,-1.494122
2021-01-08,-0.448527,-0.440605
2021-01-09,0.27096,-1.189345
2021-01-10,1.971006,-0.692841


Showing label slicing, both endpoints are included

In [43]:
df.loc["20210103":"20210106",["a","b"]]

Unnamed: 0,a,b
2021-01-03,-1.151396,0.575674
2021-01-04,-0.628977,1.176879
2021-01-05,1.115872,1.940716
2021-01-06,0.944919,-1.035663


Reduction in the dimensions of the returned object

In [58]:
df.loc["20210103", ["c","d"]]

c   -0.344063
d    0.099660
Name: 2021-01-03 00:00:00, dtype: float64

For getting a scalar value

In [45]:
df.loc[dates[1],"a"]

-1.0475811442402923

In [47]:
df.at[dates[1],"a"] #same with the previous method

-1.0475811442402923

Selection by Position

In [61]:
df.iloc[3]

a    0.769069
b   -1.190182
c   -0.289454
d   -0.147524
Name: 2021-01-04 00:00:00, dtype: float64

By integer slices, acting similar to numpy/python

In [60]:
df.iloc[3:5, 2:4]

Unnamed: 0,c,d
2021-01-04,-0.289454,-0.147524
2021-01-05,0.663939,-0.471609


By lists of integer position locations, similar to the numpy/python style

In [59]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,a,c
2021-01-02,-0.731,1.059063
2021-01-03,-1.902861,-0.344063
2021-01-05,1.318712,0.663939


For slicing rows explicitly

In [63]:
df.iloc[1:3,:]

Unnamed: 0,a,b,c,d
2021-01-02,-0.731,1.349302,1.059063,0.710393
2021-01-03,-1.902861,0.33702,-0.344063,0.09966


For slicing columns explicitly

In [62]:
df.iloc[:,1:3]

Unnamed: 0,b,c
2021-01-01,-0.168973,1.49548
2021-01-02,1.349302,1.059063
2021-01-03,0.33702,-0.344063
2021-01-04,-1.190182,-0.289454
2021-01-05,0.509324,0.663939
2021-01-06,-0.585483,-1.450538
2021-01-07,-1.439403,-1.494122
2021-01-08,-0.448527,-0.440605
2021-01-09,0.27096,-1.189345
2021-01-10,1.971006,-0.692841


For getting a value explicitly

In [65]:
df.iloc[1,1]

1.3493024012593835

In [66]:
df.iat[1,1] # same with the previous method

1.3493024012593835

Boolean Indexing

Using a single column’s values to select data

In [67]:
df[df.a > 0]

Unnamed: 0,a,b,c,d
2021-01-01,0.480883,-0.168973,1.49548,-0.738783
2021-01-04,0.769069,-1.190182,-0.289454,-0.147524
2021-01-05,1.318712,0.509324,0.663939,-0.471609
2021-01-06,0.50153,-0.585483,-1.450538,1.048853
2021-01-09,0.783171,0.27096,-1.189345,-2.237883
2021-01-10,0.210608,1.971006,-0.692841,0.344552


A where operation for getting

In [68]:
df[df > 0]

Unnamed: 0,a,b,c,d
2021-01-01,0.480883,,1.49548,
2021-01-02,,1.349302,1.059063,0.710393
2021-01-03,,0.33702,,0.09966
2021-01-04,0.769069,,,
2021-01-05,1.318712,0.509324,0.663939,
2021-01-06,0.50153,,,1.048853
2021-01-07,,,,0.974578
2021-01-08,,,,
2021-01-09,0.783171,0.27096,,
2021-01-10,0.210608,1.971006,,0.344552


Using the isin() method for filtering

In [69]:
df2 = df.copy()
df2

Unnamed: 0,a,b,c,d
2021-01-01,0.480883,-0.168973,1.49548,-0.738783
2021-01-02,-0.731,1.349302,1.059063,0.710393
2021-01-03,-1.902861,0.33702,-0.344063,0.09966
2021-01-04,0.769069,-1.190182,-0.289454,-0.147524
2021-01-05,1.318712,0.509324,0.663939,-0.471609
2021-01-06,0.50153,-0.585483,-1.450538,1.048853
2021-01-07,-2.22421,-1.439403,-1.494122,0.974578
2021-01-08,-0.761851,-0.448527,-0.440605,-0.297513
2021-01-09,0.783171,0.27096,-1.189345,-2.237883
2021-01-10,0.210608,1.971006,-0.692841,0.344552


In [70]:
df2["e"] = ["one", "two", "three", "four", "five"]*2
df2

Unnamed: 0,a,b,c,d,e
2021-01-01,0.480883,-0.168973,1.49548,-0.738783,one
2021-01-02,-0.731,1.349302,1.059063,0.710393,two
2021-01-03,-1.902861,0.33702,-0.344063,0.09966,three
2021-01-04,0.769069,-1.190182,-0.289454,-0.147524,four
2021-01-05,1.318712,0.509324,0.663939,-0.471609,five
2021-01-06,0.50153,-0.585483,-1.450538,1.048853,one
2021-01-07,-2.22421,-1.439403,-1.494122,0.974578,two
2021-01-08,-0.761851,-0.448527,-0.440605,-0.297513,three
2021-01-09,0.783171,0.27096,-1.189345,-2.237883,four
2021-01-10,0.210608,1.971006,-0.692841,0.344552,five


In [71]:
df2[df2["e"].isin(["one", "two"])]

Unnamed: 0,a,b,c,d,e
2021-01-01,0.480883,-0.168973,1.49548,-0.738783,one
2021-01-02,-0.731,1.349302,1.059063,0.710393,two
2021-01-06,0.50153,-0.585483,-1.450538,1.048853,one
2021-01-07,-2.22421,-1.439403,-1.494122,0.974578,two


### Setting
Setting a new column automatically aligns the data by the indexes

In [72]:
s1 = pd.Series(np.arange(1,7), index=pd.date_range("20210101", periods=6))
s1

2021-01-01    1
2021-01-02    2
2021-01-03    3
2021-01-04    4
2021-01-05    5
2021-01-06    6
Freq: D, dtype: int32

In [73]:
df["f"] = s1
df

Unnamed: 0,a,b,c,d,f
2021-01-01,0.480883,-0.168973,1.49548,-0.738783,1.0
2021-01-02,-0.731,1.349302,1.059063,0.710393,2.0
2021-01-03,-1.902861,0.33702,-0.344063,0.09966,3.0
2021-01-04,0.769069,-1.190182,-0.289454,-0.147524,4.0
2021-01-05,1.318712,0.509324,0.663939,-0.471609,5.0
2021-01-06,0.50153,-0.585483,-1.450538,1.048853,6.0
2021-01-07,-2.22421,-1.439403,-1.494122,0.974578,
2021-01-08,-0.761851,-0.448527,-0.440605,-0.297513,
2021-01-09,0.783171,0.27096,-1.189345,-2.237883,
2021-01-10,0.210608,1.971006,-0.692841,0.344552,


Setting values by label

In [74]:
df.at[dates[0],"a"] = 0

Setting values by position

In [75]:
df.iat[0,1] = 0

Setting by assigning with a numpy array

In [76]:
df.loc[:,"d"] = np.array([5] * len(df))
df

Unnamed: 0,a,b,c,d,f
2021-01-01,0.0,0.0,1.49548,5,1.0
2021-01-02,-0.731,1.349302,1.059063,5,2.0
2021-01-03,-1.902861,0.33702,-0.344063,5,3.0
2021-01-04,0.769069,-1.190182,-0.289454,5,4.0
2021-01-05,1.318712,0.509324,0.663939,5,5.0
2021-01-06,0.50153,-0.585483,-1.450538,5,6.0
2021-01-07,-2.22421,-1.439403,-1.494122,5,
2021-01-08,-0.761851,-0.448527,-0.440605,5,
2021-01-09,0.783171,0.27096,-1.189345,5,
2021-01-10,0.210608,1.971006,-0.692841,5,


A where operation with setting.


In [78]:
df3 = df.copy()
df3[df3 > 0] = -df3
df3

Unnamed: 0,a,b,c,d,f
2021-01-01,0.0,0.0,-1.49548,-5,-1.0
2021-01-02,-0.731,-1.349302,-1.059063,-5,-2.0
2021-01-03,-1.902861,-0.33702,-0.344063,-5,-3.0
2021-01-04,-0.769069,-1.190182,-0.289454,-5,-4.0
2021-01-05,-1.318712,-0.509324,-0.663939,-5,-5.0
2021-01-06,-0.50153,-0.585483,-1.450538,-5,-6.0
2021-01-07,-2.22421,-1.439403,-1.494122,-5,
2021-01-08,-0.761851,-0.448527,-0.440605,-5,
2021-01-09,-0.783171,-0.27096,-1.189345,-5,
2021-01-10,-0.210608,-1.971006,-0.692841,-5,
