# 10 minutes to pandas

In [105]:
import numpy as np
import pandas as pd

# Object creation

In [106]:
# A Series built from a plain list gets a default integer index; the np.nan
# entry marks a missing value and the result is stored as float64.
s = pd.Series([1, 3, 4, np.nan, 6, 8])
s

0    1.0
1    3.0
2    4.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [107]:
# Six consecutive daily timestamps starting at 2013-01-01.
dates = pd.date_range(start="20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [108]:
# Six rows by four columns of random draws, with `dates` as the row index
# and the letters A-D as column labels.
# NOTE(review): no seed is set in the visible cells, so re-running this cell
# produces different numbers than the recorded outputs below.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=list("ABCD"),
)
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.532694,-0.011502,0.413009,-1.057982
2013-01-02,0.540142,-0.675289,-0.242844,-1.204546
2013-01-03,-0.720458,-0.990545,-1.852175,0.142439
2013-01-04,0.905927,0.171357,1.04947,-1.280824
2013-01-05,-0.785988,-1.305083,0.129368,-1.927478
2013-01-06,-1.350556,-0.424405,-0.90877,0.532062


In [109]:
# Build a frame from a dict of column name -> values. The scalar entries
# ("A", "B", "F") are repeated on every row; "C", "D" and "E" each supply
# four values and set the frame's length.
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20210324"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2021-03-24,1.0,3,test,foo
1,1.0,2021-03-24,1.0,3,train,foo
2,1.0,2021-03-24,1.0,3,test,foo
3,1.0,2021-03-24,1.0,3,train,foo


In [110]:
# Each column of df2 carries its own dtype.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

# Viewing data

In [111]:
# Show the first rows of the frame (five when no count is given).
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-1.532694,-0.011502,0.413009,-1.057982
2013-01-02,0.540142,-0.675289,-0.242844,-1.204546
2013-01-03,-0.720458,-0.990545,-1.852175,0.142439
2013-01-04,0.905927,0.171357,1.04947,-1.280824
2013-01-05,-0.785988,-1.305083,0.129368,-1.927478


In [112]:
# Show the last three rows.
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,0.905927,0.171357,1.04947,-1.280824
2013-01-05,-0.785988,-1.305083,0.129368,-1.927478
2013-01-06,-1.350556,-0.424405,-0.90877,0.532062


In [113]:
# Row labels: the DatetimeIndex that was passed in as `dates`.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [114]:
# Column labels.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [115]:
# NumPy representation of the values; the index and column labels are dropped.
df.to_numpy()

array([[-1.53269407, -0.0115019 ,  0.41300886, -1.05798187],
       [ 0.54014216, -0.67528907, -0.24284428, -1.20454639],
       [-0.72045774, -0.99054461, -1.85217501,  0.14243946],
       [ 0.9059269 ,  0.1713571 ,  1.04947002, -1.280824  ],
       [-0.78598757, -1.30508273,  0.12936846, -1.92747819],
       [-1.35055606, -0.42440458, -0.9087699 ,  0.53206151]])

In [116]:
# df2 mixes column dtypes, so the result is a single array with dtype=object.
df2.to_numpy()

array([[1.0, Timestamp('2021-03-24 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2021-03-24 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2021-03-24 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2021-03-24 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [117]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.490604,-0.539244,-0.235324,-0.799388
std,0.997827,0.566629,1.026823,0.93757
min,-1.532694,-1.305083,-1.852175,-1.927478
25%,-1.209414,-0.911731,-0.742288,-1.261755
50%,-0.753223,-0.549847,-0.056738,-1.131264
75%,0.224992,-0.114728,0.342099,-0.157666
max,0.905927,0.171357,1.04947,0.532062


In [118]:
# Transpose: the columns A-D become rows and the dates become column labels.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-1.532694,0.540142,-0.720458,0.905927,-0.785988,-1.350556
B,-0.011502,-0.675289,-0.990545,0.171357,-1.305083,-0.424405
C,0.413009,-0.242844,-1.852175,1.04947,0.129368,-0.90877
D,-1.057982,-1.204546,0.142439,-1.280824,-1.927478,0.532062


In [119]:
# Sort along axis 1 (the column labels) in descending order: D, C, B, A.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-1.057982,0.413009,-0.011502,-1.532694
2013-01-02,-1.204546,-0.242844,-0.675289,0.540142
2013-01-03,0.142439,-1.852175,-0.990545,-0.720458
2013-01-04,-1.280824,1.04947,0.171357,0.905927
2013-01-05,-1.927478,0.129368,-1.305083,-0.785988
2013-01-06,0.532062,-0.90877,-0.424405,-1.350556


In [120]:
# Reorder the rows by the values in column B, largest first.
df.sort_values(by="B", ascending=False)

Unnamed: 0,A,B,C,D
2013-01-04,0.905927,0.171357,1.04947,-1.280824
2013-01-01,-1.532694,-0.011502,0.413009,-1.057982
2013-01-06,-1.350556,-0.424405,-0.90877,0.532062
2013-01-02,0.540142,-0.675289,-0.242844,-1.204546
2013-01-03,-0.720458,-0.990545,-1.852175,0.142439
2013-01-05,-0.785988,-1.305083,0.129368,-1.927478


# Selection

# Getting

In [161]:
# Selecting a single column label returns that column as a Series.
df["A"]

2013-01-01   -1.532694
2013-01-02    0.540142
2013-01-03   -0.720458
2013-01-04    0.905927
2013-01-05   -0.785988
2013-01-06   -1.350556
Freq: D, Name: A, dtype: float64

In [122]:
# An integer slice inside [] selects rows by position: here the first three.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-1.532694,-0.011502,0.413009,-1.057982
2013-01-02,0.540142,-0.675289,-0.242844,-1.204546
2013-01-03,-0.720458,-0.990545,-1.852175,0.142439


In [123]:
# A slice of date strings inside [] selects rows by label. The end label
# "2013-01-07" lies past the last index entry, so the result simply runs
# through the final row (2013-01-06).
df["2013-01-05":"2013-01-07"]

Unnamed: 0,A,B,C,D
2013-01-05,-0.785988,-1.305083,0.129368,-1.927478
2013-01-06,-1.350556,-0.424405,-0.90877,0.532062


# Selection by label

In [124]:
# Display the whole frame again as a reference for the label-based selections.
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.532694,-0.011502,0.413009,-1.057982
2013-01-02,0.540142,-0.675289,-0.242844,-1.204546
2013-01-03,-0.720458,-0.990545,-1.852175,0.142439
2013-01-04,0.905927,0.171357,1.04947,-1.280824
2013-01-05,-0.785988,-1.305083,0.129368,-1.927478
2013-01-06,-1.350556,-0.424405,-0.90877,0.532062


In [158]:
# Select the row labelled with the first date; the result is a Series
# indexed by column name.
df.loc[dates[0]]

A   -1.532694
B   -0.011502
C    0.413009
D   -1.057982
Name: 2013-01-01 00:00:00, dtype: float64

In [126]:
# All rows, restricted to columns A and B.
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2013-01-01,-1.532694,-0.011502
2013-01-02,0.540142,-0.675289
2013-01-03,-0.720458,-0.990545
2013-01-04,0.905927,0.171357
2013-01-05,-0.785988,-1.305083
2013-01-06,-1.350556,-0.424405


In [127]:
# Label slice on the rows (the end label is included) plus a column list.
# NOTE(review): the start label "20120102" predates the whole index, so the
# slice begins at the first row (2013-01-01). This looks like a typo for
# "20130102" - confirm before changing, since the recorded output matches
# the code as written.
df.loc["20120102":"20130104", ["A", "B"]]

Unnamed: 0,A,B
2013-01-01,-1.532694,-0.011502
2013-01-02,0.540142,-0.675289
2013-01-03,-0.720458,-0.990545
2013-01-04,0.905927,0.171357


In [128]:
# A single row label reduces the result to a Series holding columns A and B.
df.loc["20130102", ["A", "B"]]

A    0.540142
B   -0.675289
Name: 2013-01-02 00:00:00, dtype: float64

In [129]:
# A row label together with a column label yields a single scalar.
df.loc[dates[0], "A"]

-1.5326940701734946

In [130]:
# .at looks up one scalar by row and column label; it returns the same value
# as the equivalent .loc lookup.
df.at[dates[0], "A"]

-1.5326940701734946

# Selection by position

In [131]:
# Row at integer position 3 (the fourth row), returned as a Series.
df.iloc[3]

A    0.905927
B    0.171357
C    1.049470
D   -1.280824
Name: 2013-01-04 00:00:00, dtype: float64

In [132]:
# Positional slices exclude the end position: rows 3-4 and columns 0-1.
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,0.905927,0.171357
2013-01-05,-0.785988,-1.305083


In [135]:
# Lists of integer positions pick specific rows (1, 2, 4) and columns (0, 2).
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,0.540142,-0.242844
2013-01-03,-0.720458,-1.852175
2013-01-05,-0.785988,0.129368


In [136]:
# Rows at positions 1-2 with every column kept.
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,0.540142,-0.675289,-0.242844,-1.204546
2013-01-03,-0.720458,-0.990545,-1.852175,0.142439


In [138]:
# Scalar at row position 1, column position 1.
df.iloc[1, 1]

-0.675289074148737

In [139]:
# .iat looks up one scalar by integer position; it returns the same value as
# the equivalent .iloc lookup.
df.iat[1, 1]

-0.675289074148737

# Boolean indexing

In [154]:
# Element-wise comparison yields a boolean Series that shares df's index.
df["A"] > 0

2013-01-01    False
2013-01-02     True
2013-01-03    False
2013-01-04     True
2013-01-05    False
2013-01-06    False
Freq: D, Name: A, dtype: bool

In [155]:
# A boolean Series inside [] keeps only the rows where it is True.
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2013-01-02,0.540142,-0.675289,-0.242844,-1.204546
2013-01-04,0.905927,0.171357,1.04947,-1.280824


In [156]:
# Comparing the whole frame yields a boolean DataFrame of the same shape.
df > 0

Unnamed: 0,A,B,C,D
2013-01-01,False,False,True,False
2013-01-02,True,False,False,False
2013-01-03,False,False,False,True
2013-01-04,True,True,True,False
2013-01-05,False,False,True,False
2013-01-06,False,False,False,True


In [145]:
# A boolean DataFrame used as a mask keeps the frame's shape; entries where
# the mask is False come back as missing values.
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,,0.413009,
2013-01-02,0.540142,,,
2013-01-03,,,,0.142439
2013-01-04,0.905927,0.171357,1.04947,
2013-01-05,,,0.129368,
2013-01-06,,,,0.532062


In [153]:
# Work on a copy so the extra column does not leak into `df`.
# NOTE: this rebinds `df2`, replacing the six-column frame built earlier in
# the notebook; cells that inspect that earlier frame must run before this one.
df2 = df.copy()
df2["E"] = ["one", "one", "two", "three", "four", "three"]
# Keep only the rows whose "E" value appears in the given list. This is the
# cell's last expression and therefore the only value that is displayed; the
# intermediate bare expressions that used to sit above it were evaluated and
# discarded, so they have been removed.
df2[df2["E"].isin(["two", "four"])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-0.720458,-0.990545,-1.852175,0.142439,two
2013-01-05,-0.785988,-1.305083,0.129368,-1.927478,four
