In [1]:
import numpy as np

In [2]:
import pandas as pd

# Object creation

In [3]:
# Build a Series from a plain list. No index is passed, so pandas supplies the
# default integer index (0..5 in the output below). The np.nan entry marks a
# missing value, and the Series is stored as float64.
s = pd.Series([1, 3, 5, np.nan, 6, 8])

In [4]:
# Display the Series: index labels on the left, values on the right, dtype last.
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [5]:
# Six consecutive daily timestamps starting at 2013-01-01; used as the row
# index of `df` further down.
dates = pd.date_range("20130101", periods=6)

In [6]:
# Show the generated DatetimeIndex (note freq='D' in the output).
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [7]:
# 6x4 frame of standard-normal samples: rows are labelled by `dates`,
# columns are named A, B, C, D.
# NOTE(review): no random seed is set, so re-running this cell produces
# different numbers from the outputs recorded throughout this notebook.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))

In [8]:
# Display the full frame (only six rows, so showing all of it is fine here).
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.556988,-0.085105,-1.686347,-0.191298
2013-01-02,-0.246014,0.401341,1.964883,-1.204096
2013-01-03,-0.475919,0.333546,1.163193,1.268625
2013-01-04,0.055626,0.568156,0.779179,-0.271035
2013-01-05,1.668773,-0.736226,-0.296121,-0.861025
2013-01-06,0.084356,-1.284141,0.214441,-0.452666


In [9]:
# Build a frame from a dict of column name -> values. Scalars are broadcast to
# the common length (4 rows) and every column keeps its own dtype.
df2 = pd.DataFrame(
    data={
        # scalar float, repeated on every row
        "A": 1.0,
        # scalar timestamp, repeated on every row
        "B": pd.Timestamp("20130102"),
        # float32 Series whose index 0..3 becomes the frame's row index
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        # int32 array holding four 3s
        "D": np.full(4, 3, dtype="int32"),
        # categorical column alternating test/train
        "E": pd.Categorical(["test", "train"] * 2),
        # scalar string, repeated on every row
        "F": "foo",
    },
)

In [10]:
# Display the mixed-dtype frame; scalar inputs appear repeated on every row.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [11]:
# One dtype per column: each column keeps the type it was created with.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

# Viewing data

In [12]:
# First rows of the frame; called without an argument, head() shows five rows.
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.556988,-0.085105,-1.686347,-0.191298
2013-01-02,-0.246014,0.401341,1.964883,-1.204096
2013-01-03,-0.475919,0.333546,1.163193,1.268625
2013-01-04,0.055626,0.568156,0.779179,-0.271035
2013-01-05,1.668773,-0.736226,-0.296121,-0.861025


In [13]:
# Last three rows of the frame.
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,0.055626,0.568156,0.779179,-0.271035
2013-01-05,1.668773,-0.736226,-0.296121,-0.861025
2013-01-06,0.084356,-1.284141,0.214441,-0.452666


In [14]:
# Row labels: the DatetimeIndex passed in when `df` was constructed.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [15]:
# Column labels.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [16]:
# Values as a NumPy array. Every column of `df` is float64, so the result is a
# plain float array; the index and column labels are not part of it.
df.to_numpy()

array([[-0.55698772, -0.08510546, -1.68634671, -0.19129837],
       [-0.24601364,  0.40134101,  1.96488274, -1.20409626],
       [-0.47591946,  0.33354647,  1.1631926 ,  1.26862454],
       [ 0.0556257 ,  0.56815579,  0.77917893, -0.27103466],
       [ 1.6687726 , -0.73622555, -0.29612138, -0.86102533],
       [ 0.08435554, -1.28414054,  0.21444149, -0.45266574]])

In [17]:
# `df2` mixes floats, timestamps, ints, categoricals and strings, so the only
# dtype that can hold all of them is `object` (see dtype=object in the output).
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

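NumPy 数组整体只有一种数据类型（dtype），而 pandas 的 DataFrame 每一列各有自己的数据类型。因此调用 `to_numpy()` 时，各列会被转换成一个能容纳所有值的公共类型（例如上面 `df2` 的结果是 `object`）。转换后的数组只包含数据本身，不包含索引（index）和列标签。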

In [18]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.088306,-0.133738,0.356538,-0.285249
std,0.817983,0.732953,1.267724,0.851749
min,-0.556988,-1.284141,-1.686347,-1.204096
25%,-0.418443,-0.573446,-0.168481,-0.758935
50%,-0.095194,0.124221,0.49681,-0.36185
75%,0.077173,0.384392,1.067189,-0.211232
max,1.668773,0.568156,1.964883,1.268625


In [19]:
# Transpose: columns A-D become the rows and the dates become the columns.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.556988,-0.246014,-0.475919,0.055626,1.668773,0.084356
B,-0.085105,0.401341,0.333546,0.568156,-0.736226,-1.284141
C,-1.686347,1.964883,1.163193,0.779179,-0.296121,0.214441
D,-0.191298,-1.204096,1.268625,-0.271035,-0.861025,-0.452666


In [20]:
# Sort along axis=1, i.e. by column label, in descending order (D, C, B, A).
# Returns a new frame; `df` itself is not reordered.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.191298,-1.686347,-0.085105,-0.556988
2013-01-02,-1.204096,1.964883,0.401341,-0.246014
2013-01-03,1.268625,1.163193,0.333546,-0.475919
2013-01-04,-0.271035,0.779179,0.568156,0.055626
2013-01-05,-0.861025,-0.296121,-0.736226,1.668773
2013-01-06,-0.452666,0.214441,-1.284141,0.084356


In [21]:
# Sort rows by the values in column B (ascending, as the output shows).
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-06,0.084356,-1.284141,0.214441,-0.452666
2013-01-05,1.668773,-0.736226,-0.296121,-0.861025
2013-01-01,-0.556988,-0.085105,-1.686347,-0.191298
2013-01-03,-0.475919,0.333546,1.163193,1.268625
2013-01-02,-0.246014,0.401341,1.964883,-1.204096
2013-01-04,0.055626,0.568156,0.779179,-0.271035


# Selection

## Getting

In [22]:
# Selecting a single column by name returns a Series named after that column.
df["A"]

2013-01-01   -0.556988
2013-01-02   -0.246014
2013-01-03   -0.475919
2013-01-04    0.055626
2013-01-05    1.668773
2013-01-06    0.084356
Freq: D, Name: A, dtype: float64

In [23]:
# Attribute-style access; gives the same Series as df["A"].
df.A

2013-01-01   -0.556988
2013-01-02   -0.246014
2013-01-03   -0.475919
2013-01-04    0.055626
2013-01-05    1.668773
2013-01-06    0.084356
Freq: D, Name: A, dtype: float64

In [24]:
# An integer slice inside [] selects rows by position: rows 0, 1 and 2
# (the end of the slice is exclusive).
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.556988,-0.085105,-1.686347,-0.191298
2013-01-02,-0.246014,0.401341,1.964883,-1.204096
2013-01-03,-0.475919,0.333546,1.163193,1.268625


In [25]:
# A slice of date strings inside [] selects rows by label; unlike positional
# slicing, both endpoints are included (2013-01-02 through 2013-01-04).
df["20130102":"20130104"]

Unnamed: 0,A,B,C,D
2013-01-02,-0.246014,0.401341,1.964883,-1.204096
2013-01-03,-0.475919,0.333546,1.163193,1.268625
2013-01-04,0.055626,0.568156,0.779179,-0.271035


## Selection by label

In [26]:
# Row whose label is the first date, returned as a Series indexed by column name.
df.loc[dates[0]]

A   -0.556988
B   -0.085105
C   -1.686347
D   -0.191298
Name: 2013-01-01 00:00:00, dtype: float64

In [27]:
# All rows (:), restricted to columns A and B selected by label.
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2013-01-01,-0.556988,-0.085105
2013-01-02,-0.246014,0.401341
2013-01-03,-0.475919,0.333546
2013-01-04,0.055626,0.568156
2013-01-05,1.668773,-0.736226
2013-01-06,0.084356,-1.284141


In [28]:
# Label slice on rows (both endpoints included) combined with a column list.
df.loc["20130102":"20130104", ["A", "B"]]

Unnamed: 0,A,B
2013-01-02,-0.246014,0.401341
2013-01-03,-0.475919,0.333546
2013-01-04,0.055626,0.568156


In [29]:
# A single row label instead of a slice drops one dimension: the result is a
# Series holding that row's A and B values.
df.loc["20130102", ["A", "B"]]

A   -0.246014
B    0.401341
Name: 2013-01-02 00:00:00, dtype: float64

In [30]:
# One row label plus one column label returns a single scalar value.
df.loc[dates[0], "A"]

-0.5569877247988242

In [31]:
# `.at` is the scalar-only label accessor; it returns the same value as the
# equivalent .loc[row, column] lookup.
df.at[dates[0], "A"]

-0.5569877247988242

## Selection by position

In [32]:
# Row at integer position 3 (the fourth row, 2013-01-04), returned as a Series.
df.iloc[3]

A    0.055626
B    0.568156
C    0.779179
D   -0.271035
Name: 2013-01-04 00:00:00, dtype: float64

In [33]:
# Positional slices, end-exclusive like Python lists: rows 3-4, columns 0-1.
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,0.055626,0.568156
2013-01-05,1.668773,-0.736226


In [34]:
# Explicit lists of integer positions: rows 1, 2, 4 and columns 0, 2 (A and C).
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,-0.246014,1.964883
2013-01-03,-0.475919,1.163193
2013-01-05,1.668773,-0.296121


In [35]:
# Rows at positions 1 and 2, all columns.
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,-0.246014,0.401341,1.964883,-1.204096
2013-01-03,-0.475919,0.333546,1.163193,1.268625


In [36]:
# All rows, columns at positions 1 and 2 (B and C).
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,-0.085105,-1.686347
2013-01-02,0.401341,1.964883
2013-01-03,0.333546,1.163193
2013-01-04,0.568156,0.779179
2013-01-05,-0.736226,-0.296121
2013-01-06,-1.284141,0.214441


In [37]:
# Scalar at row position 1, column position 1 (2013-01-02, column B).
df.iloc[1, 1]

0.4013410076845108

In [38]:
# `.iat` is the scalar-only positional accessor; same value as .iloc[1, 1].
df.iat[1, 1]

0.4013410076845108

## Boolean indexing

In [39]:
# Boolean mask built from one column: keep only the rows where A is positive.
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2013-01-04,0.055626,0.568156,0.779179,-0.271035
2013-01-05,1.668773,-0.736226,-0.296121,-0.861025
2013-01-06,0.084356,-1.284141,0.214441,-0.452666


In [40]:
# Element-wise mask over the whole frame: the shape is kept, and every value
# that fails the condition is replaced by NaN (shown as blanks in the output).
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,,,
2013-01-02,,0.401341,1.964883,
2013-01-03,,0.333546,1.163193,1.268625
2013-01-04,0.055626,0.568156,0.779179,
2013-01-05,1.668773,,,
2013-01-06,0.084356,,0.214441,


In [41]:
# Work on a copy so that adding a column below does not modify `df`.
# Note: this rebinds `df2`; the mixed-dtype frame created earlier under that
# name is replaced from here on.
df2 = df.copy()

In [42]:
# Add a string column E, one value per row (six values for six rows).
df2["E"] = ["one", "one", "two", "three", "four", "three"]

In [43]:
# Display the copy with its new E column.
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.556988,-0.085105,-1.686347,-0.191298,one
2013-01-02,-0.246014,0.401341,1.964883,-1.204096,one
2013-01-03,-0.475919,0.333546,1.163193,1.268625,two
2013-01-04,0.055626,0.568156,0.779179,-0.271035,three
2013-01-05,1.668773,-0.736226,-0.296121,-0.861025,four
2013-01-06,0.084356,-1.284141,0.214441,-0.452666,three


In [44]:
# isin() builds a boolean mask that is True where E is one of the listed
# values; indexing with it keeps only those rows.
df2[df2["E"].isin(["two", "four"])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-0.475919,0.333546,1.163193,1.268625,two
2013-01-05,1.668773,-0.736226,-0.296121,-0.861025,four


## Setting

In [45]:
# Integers 1..6 indexed by six daily dates starting at 2013-01-02, i.e. shifted
# one day later than the `dates` index created earlier in this notebook.
s1 = pd.Series(
    data=list(range(1, 7)),
    index=pd.date_range(start="20130102", periods=6),
)


In [46]:
# Display the Series; its dates run from 2013-01-02 to 2013-01-07.
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [47]:
# Assigning a Series as a new column aligns it on the index. `s1` has no entry
# for 2013-01-01, so that row gets NaN, and s1's 2013-01-07 value has no
# matching row in `df` and is dropped. This modifies `df` in place.
df["F"] = s1

In [48]:
# Set a single value by label: row 2013-01-01, column A.
df.at[dates[0], "A"] = 0

In [49]:
# Set a single value by position: first row, second column (B).
df.iat[0, 1] = 0

In [50]:
# Overwrite every value in column D with 5 by assigning a NumPy array.
# Plain column assignment replaces the column outright, so D takes the array's
# integer dtype on every pandas version, matching the integer D column in the
# recorded output below. The earlier `df.loc[:, "D"] = ...` spelling is
# version-dependent: on pandas >= 2.0 it writes into the existing float64
# column in place, and D would display as 5.0 instead.
df["D"] = np.array([5] * len(df))

In [51]:
# Display `df` after the in-place edits above: new column F, zeros in the first
# row of A and B, and column D set to 5.
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.686347,5,
2013-01-02,-0.246014,0.401341,1.964883,5,1.0
2013-01-03,-0.475919,0.333546,1.163193,5,2.0
2013-01-04,0.055626,0.568156,0.779179,5,3.0
2013-01-05,1.668773,-0.736226,-0.296121,5,4.0
2013-01-06,0.084356,-1.284141,0.214441,5,5.0


In [52]:
# Fresh copy of the modified `df`, so the masked assignment below leaves `df`
# untouched. This rebinds `df2` again, replacing the copy that had column E.
df2 = df.copy()

In [53]:
# Masked assignment: wherever a value is > 0, replace it with its negation.
# Values that are already <= 0 are left alone, and NaN stays NaN because the
# comparison NaN > 0 is False. Afterwards no value in df2 is positive.
df2[df2 > 0] = -df2

In [54]:
# Display the result: every formerly positive entry now appears negated.
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.686347,-5,
2013-01-02,-0.246014,-0.401341,-1.964883,-5,-1.0
2013-01-03,-0.475919,-0.333546,-1.163193,-5,-2.0
2013-01-04,-0.055626,-0.568156,-0.779179,-5,-3.0
2013-01-05,-1.668773,-0.736226,-0.296121,-5,-4.0
2013-01-06,-0.084356,-1.284141,-0.214441,-5,-5.0
