In [1]:
import pandas as pd
import numpy as np

# Build a 1-D Series. The NaN entry marks a missing value, which is why the
# integers are stored as float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [2]:
# Seven consecutive daily timestamps starting on 2017-01-01.
dates = pd.date_range(start='20170101', periods=7)
dates

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06', '2017-01-07'],
              dtype='datetime64[ns]', freq='D')

In [3]:
# Build a 2-D DataFrame: 7 rows indexed by `dates`, 4 columns labelled A-D,
# filled with standard-normal samples.
# A seeded generator keeps the data identical across Restart & Run All. The
# outputs saved in this notebook came from an unseeded run, so the displayed
# numbers will differ until the notebook is re-executed.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.standard_normal((7, 4)), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2017-01-01,1.389669,-0.28975,-0.482087,0.117522
2017-01-02,0.906099,1.147294,-1.11966,0.365747
2017-01-03,-1.552837,-0.7251,-0.010343,-2.252705
2017-01-04,-1.034722,2.837132,0.77069,0.008662
2017-01-05,-0.591056,-1.569515,0.205568,-0.388648
2017-01-06,-0.232486,-0.136549,-0.078237,0.664034
2017-01-07,0.025262,0.706775,-0.907876,1.777096


In [4]:
# Build a DataFrame from a dict of columns. Scalar entries (A, B, F) are
# repeated for every row, while the array-like entries (C, D, E) supply the
# four rows and keep their own dtypes.
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(["test", "train"] * 2),
        'F': 'foo',
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [5]:
# Inspect the dtype pandas assigned to each column of df2.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [6]:
# Attribute-style column access: returns column A as a Series.
df2.A

0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64

In [7]:
# head() shows the first rows (5 by default); df2 has only 4 rows, so all of them appear.
df2.head()

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [8]:
# tail(3) shows the last three rows.
df2.tail(3)

Unnamed: 0,A,B,C,D,E,F
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [9]:
# Row labels of df: the DatetimeIndex it was built with.
df.index

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06', '2017-01-07'],
              dtype='datetime64[ns]', freq='D')

In [10]:
# Column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [11]:
# Convert df to a NumPy array; every column is float, so the result is a plain float array.
df.to_numpy()

array([[ 1.38966855, -0.28975047, -0.4820873 ,  0.1175215 ],
       [ 0.90609908,  1.14729356, -1.1196598 ,  0.36574672],
       [-1.55283709, -0.7251003 , -0.0103427 , -2.25270475],
       [-1.03472196,  2.83713219,  0.77068982,  0.00866165],
       [-0.59105648, -1.56951498,  0.20556772, -0.38864819],
       [-0.23248598, -0.13654933, -0.07823718,  0.66403432],
       [ 0.02526238,  0.70677523, -0.90787581,  1.77709562]])

In [12]:
# df2 mixes several column dtypes, so DataFrame.to_numpy() must fall back to a
# single object-dtype array, which makes the conversion comparatively expensive.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [13]:
# describe() gives a quick statistical summary (count, mean, std, min,
# quartiles, max) of each column.
df.describe()

Unnamed: 0,A,B,C,D
count,7.0,7.0,7.0,7.0
mean,-0.155725,0.281469,-0.231706,0.041672
std,1.038302,1.437724,0.655352,1.222036
min,-1.552837,-1.569515,-1.11966,-2.252705
25%,-0.812889,-0.507425,-0.694982,-0.189993
50%,-0.232486,-0.136549,-0.078237,0.117522
75%,0.465681,0.927034,0.097613,0.514891
max,1.389669,2.837132,0.77069,1.777096


In [14]:
df.T # transpose the data: rows become columns and vice versa

Unnamed: 0,2017-01-01,2017-01-02,2017-01-03,2017-01-04,2017-01-05,2017-01-06,2017-01-07
A,1.389669,0.906099,-1.552837,-1.034722,-0.591056,-0.232486,0.025262
B,-0.28975,1.147294,-0.7251,2.837132,-1.569515,-0.136549,0.706775
C,-0.482087,-1.11966,-0.010343,0.77069,0.205568,-0.078237,-0.907876
D,0.117522,0.365747,-2.252705,0.008662,-0.388648,0.664034,1.777096


In [15]:
# Sort along an axis: axis=1 orders the column labels, here in descending
# order (D, C, B, A). Returns a new frame; df itself is unchanged.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2017-01-01,0.117522,-0.482087,-0.28975,1.389669
2017-01-02,0.365747,-1.11966,1.147294,0.906099
2017-01-03,-2.252705,-0.010343,-0.7251,-1.552837
2017-01-04,0.008662,0.77069,2.837132,-1.034722
2017-01-05,-0.388648,0.205568,-1.569515,-0.591056
2017-01-06,0.664034,-0.078237,-0.136549,-0.232486
2017-01-07,1.777096,-0.907876,0.706775,0.025262


In [16]:
# Sort rows by the values in column B (ascending by default).
# Returns a new frame; df itself is unchanged.
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2017-01-05,-0.591056,-1.569515,0.205568,-0.388648
2017-01-03,-1.552837,-0.7251,-0.010343,-2.252705
2017-01-01,1.389669,-0.28975,-0.482087,0.117522
2017-01-06,-0.232486,-0.136549,-0.078237,0.664034
2017-01-07,0.025262,0.706775,-0.907876,1.777096
2017-01-02,0.906099,1.147294,-1.11966,0.365747
2017-01-04,-1.034722,2.837132,0.77069,0.008662
