In [2]:
import pandas as pd
import numpy as np

# 构造一个 Series（一维数据）

In [3]:
# Build a Series from a list containing a missing value; the output below
# shows the integers displayed as floats with dtype float64 alongside NaN.
pd.Series([1,2,np.nan])

0    1.0
1    2.0
2    NaN
dtype: float64

# 构造一组带行列索引的数据

In [4]:
# Six consecutive calendar days starting on 2020-07-31.
dates = pd.date_range(start='20200731', periods=6, freq='D')

In [5]:
# Build a 6x4 frame of standard-normal samples with explicit row and column labels.
# NOTE(review): no random seed is set in the visible cells, so these values
# will differ each time the notebook is re-run.
df = pd.DataFrame(np.random.randn(6,4),
                  index = dates, # row index (one label per row)
                  columns = ['a', 'b','c', 'd']) # column index (one label per column)

In [6]:
# Display the frame: rows are labelled by date, columns by 'a'..'d'.
df

Unnamed: 0,a,b,c,d
2020-07-31,0.350104,-0.657049,1.091479,0.537234
2020-08-01,0.238727,0.385829,-0.763607,0.570765
2020-08-02,2.600317,0.38045,0.340633,-1.074143
2020-08-03,0.353885,-0.717597,-1.492258,-0.142132
2020-08-04,-1.783086,-0.722723,-0.542189,1.432078
2020-08-05,0.037807,0.391603,-0.210115,2.227556


# 构造一组不带行列索引的数据

In [7]:
pd.DataFrame(np.arange(12).reshape(3,4)) # no index/columns given, so integer labels are generated automatically

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


# 构造数据并详细定义每一列的内容

In [8]:
# Build a frame column by column, one entry per column label.
# Scalar entries (A, B, F) are repeated for every row; the four-element
# entries (C, D, E) determine the row count.
df = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp('20200731'),
        C=pd.Series(1, index=list(range(4)), dtype='float32'),
        D=np.full(4, 3, dtype='int32'),
        E=pd.Categorical(['test', 'train'] * 2),
        F='foo',
    )
)

In [9]:
# Display the frame built from the per-column dictionary.
df

Unnamed: 0,A,B,C,D,E,F
0,1.0,2020-07-31,1.0,3,test,foo
1,1.0,2020-07-31,1.0,3,train,foo
2,1.0,2020-07-31,1.0,3,test,foo
3,1.0,2020-07-31,1.0,3,train,foo


In [10]:
# Inspect the dtype of each column.
df.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [11]:
# Row labels of the frame.
df.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [12]:
# Column labels of the frame.
df.columns

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [13]:
# Underlying data as a NumPy array; because the columns have different
# types, the resulting array has dtype=object (see the output below).
df.values

array([[1.0, Timestamp('2020-07-31 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2020-07-31 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2020-07-31 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2020-07-31 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [14]:
# Summary statistics of the frame. describe is a method and must be called:
# without the parentheses the cell only echoes the bound method's repr
# instead of computing anything. Re-run this cell to refresh its stored output.
df.describe()

<bound method NDFrame.describe of      A          B    C  D      E    F
0  1.0 2020-07-31  1.0  3   test  foo
1  1.0 2020-07-31  1.0  3  train  foo
2  1.0 2020-07-31  1.0  3   test  foo
3  1.0 2020-07-31  1.0  3  train  foo>

In [15]:
df.T  # transpose: swap rows and columns

Unnamed: 0,0,1,2,3
A,1,1,1,1
B,2020-07-31 00:00:00,2020-07-31 00:00:00,2020-07-31 00:00:00,2020-07-31 00:00:00
C,1,1,1,1
D,3,3,3,3
E,test,train,test,train
F,foo,foo,foo,foo


In [16]:
# Sort by the column labels, in descending order.
df.sort_index(axis='columns', ascending=False)

Unnamed: 0,F,E,D,C,B,A
0,foo,test,3,1.0,2020-07-31,1.0
1,foo,train,3,1.0,2020-07-31,1.0
2,foo,test,3,1.0,2020-07-31,1.0
3,foo,train,3,1.0,2020-07-31,1.0


In [17]:
# Sort by the row labels, in descending order.
df.sort_index(axis='index', ascending=False)

Unnamed: 0,A,B,C,D,E,F
3,1.0,2020-07-31,1.0,3,train,foo
2,1.0,2020-07-31,1.0,3,test,foo
1,1.0,2020-07-31,1.0,3,train,foo
0,1.0,2020-07-31,1.0,3,test,foo


In [18]:
# Sort the rows by the values in column 'E', in ascending order.
df.sort_values(by='E', ascending=True)

Unnamed: 0,A,B,C,D,E,F
0,1.0,2020-07-31,1.0,3,test,foo
2,1.0,2020-07-31,1.0,3,test,foo
1,1.0,2020-07-31,1.0,3,train,foo
3,1.0,2020-07-31,1.0,3,train,foo
