pandas像是一个字典形式的numpy

In [93]:
import pandas as pd
import numpy as np

+ 创建序列（Series），非常像 numpy 的一维数组

In [94]:
# A Series built from a plain list; the np.nan entry makes the dtype float64.
raw_values = [1, 3, 6, np.nan, 44, 1]
s = pd.Series(raw_values)
s

0     1.0
1     3.0
2     6.0
3     NaN
4    44.0
5     1.0
dtype: float64

+ 创建日期序列

In [95]:
# Six consecutive calendar days beginning on 2021-07-01.
dates = pd.date_range(start='20210701', periods=6)
dates

DatetimeIndex(['2021-07-01', '2021-07-02', '2021-07-03', '2021-07-04',
               '2021-07-05', '2021-07-06'],
              dtype='datetime64[ns]', freq='D')

## 用 numpy 数组来创建 pandas 的 DataFrame，顺便定义行名、列名

In [96]:
# Draw a 6x4 array of random samples. No seed is set in this cell, so the
# values (and every table derived from A below) change on each run.
A=np.random.randn(6,4)
A

array([[-0.92347201,  0.1426074 ,  0.44120991, -0.15442801],
       [-0.57796599,  1.72977313, -0.7070984 ,  1.32113698],
       [ 0.46714575, -0.08248246,  0.25800077, -0.31002977],
       [ 0.60906566, -0.01889174, -1.97834218, -0.97700432],
       [ 0.5171865 , -1.33315371,  0.46842158, -1.3895784 ],
       [-1.76163776,  0.20309684, -0.58224352, -0.17229411]])

+ 未添加 行名、列名

In [97]:
# Wrap A in a DataFrame without supplying labels; as the output shows,
# rows and columns get default integer labels starting at 0.
df=pd.DataFrame(A)
df

Unnamed: 0,0,1,2,3
0,-0.923472,0.142607,0.44121,-0.154428
1,-0.577966,1.729773,-0.707098,1.321137
2,0.467146,-0.082482,0.258001,-0.31003
3,0.609066,-0.018892,-1.978342,-0.977004
4,0.517187,-1.333154,0.468422,-1.389578
5,-1.761638,0.203097,-0.582244,-0.172294


+ 添加了 行名、列名

In [98]:
# Rebuild the frame from A, labelling rows with the dates and columns a-d.
column_names = list('abcd')
df = pd.DataFrame(data=A, index=dates, columns=column_names)
df

Unnamed: 0,a,b,c,d
2021-07-01,-0.923472,0.142607,0.44121,-0.154428
2021-07-02,-0.577966,1.729773,-0.707098,1.321137
2021-07-03,0.467146,-0.082482,0.258001,-0.31003
2021-07-04,0.609066,-0.018892,-1.978342,-0.977004
2021-07-05,0.517187,-1.333154,0.468422,-1.389578
2021-07-06,-1.761638,0.203097,-0.582244,-0.172294


## 用字典{} 的方式创建 dataframe

In [99]:
# A dict whose values differ in type and length; it is displayed here and
# used to build a DataFrame in the next cell.
B = {
    # a single float
    'A': 1.,
    # a single timestamp
    'B': pd.Timestamp('20130102'),
    # a float32 Series of four ones, labelled 0..3
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    # a length-4 int32 array filled with 3
    'D': np.array([3] * 4, dtype='int32'),
    # a categorical holding four strings
    'E': pd.Categorical(["test", "train", "test", "train"]),
    # a single string
    'F': 'foo',
}
B

{'A': 1.0,
 'B': Timestamp('2013-01-02 00:00:00'),
 'C': 0    1.0
 1    1.0
 2    1.0
 3    1.0
 dtype: float32,
 'D': array([3, 3, 3, 3]),
 'E': ['test', 'train', 'test', 'train']
 Categories (2, object): ['test', 'train'],
 'F': 'foo'}

+ 字典的每一个键值对成为 DataFrame 的一列

In [100]:
# Each dict key becomes a column. The scalar entries (A, B, F) are repeated
# on every row, as the output shows.
df2=pd.DataFrame(B)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


# DataFrame的属性

+ 显示数据类型dtypes

In [101]:
# Per-column dtypes; the output shows a different dtype for each of the six columns.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

+ 显示行坐标

In [102]:
# Row labels of df2 (0..3 here; presumably inherited from the Series in column C).
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

+ 显示列名（columns）——注意这里查看的是 df，而不是 df2

In [103]:
# Column labels. Note: this cell inspects df (the date-indexed frame), not df2
# like the neighbouring cells.
df.columns

Index(['a', 'b', 'c', 'd'], dtype='object')

+ 显示所有的值

In [104]:
# All cell values as a NumPy array; the result here has dtype=object,
# since the columns hold mixed types.
df2.values

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

+ 显示数值列的统计摘要（describe 只统计数值类型的列）

In [105]:
# Summary statistics; only the numeric columns (A, C, D) appear in the output.
df2.describe()

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


+ DataFrame的转置

In [106]:
# Transpose: the original column names become the row labels and vice versa.
df2.T

Unnamed: 0,0,1,2,3
A,1,1,1,1
B,2013-01-02 00:00:00,2013-01-02 00:00:00,2013-01-02 00:00:00,2013-01-02 00:00:00
C,1,1,1,1
D,3,3,3,3
E,test,train,test,train
F,foo,foo,foo,foo


+ index排序(注意：对行对列都是df.sort_index-----不存在df.sort_column)

In [107]:
# Sort by row labels, descending.
df2.sort_index(axis='index', ascending=False)

Unnamed: 0,A,B,C,D,E,F
3,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
0,1.0,2013-01-02,1.0,3,test,foo


In [108]:
# Sort by column labels, descending.
df2.sort_index(axis='columns', ascending=False)

Unnamed: 0,F,E,D,C,B,A
0,foo,test,3,1.0,2013-01-02,1.0
1,foo,train,3,1.0,2013-01-02,1.0
2,foo,test,3,1.0,2013-01-02,1.0
3,foo,train,3,1.0,2013-01-02,1.0


+ values排序

In [109]:
# Order rows by the values in column E, descending ('train' rows come before 'test').
df2.sort_values(by='E',ascending=False)

Unnamed: 0,A,B,C,D,E,F
1,1.0,2013-01-02,1.0,3,train,foo
3,1.0,2013-01-02,1.0,3,train,foo
0,1.0,2013-01-02,1.0,3,test,foo
2,1.0,2013-01-02,1.0,3,test,foo
