# Pandas

In [1]:
import pandas as pd
import numpy as np

## 创建数据

1. 通过传递值列表来创建一个系列，让Pandas创建一个默认的整数索引 

In [2]:
s = pd.Series([1,3,5,np.nan,6,8])

In [3]:
print(s)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


2. 通过传递numpy数组，使用datetime索引和标记列来创建DataFrame

In [4]:
dates = pd.date_range('20170101', periods=7)

In [5]:
print(dates)

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06', '2017-01-07'],
              dtype='datetime64[ns]', freq='D')


In [6]:
df = pd.DataFrame(np.random.randn(7,4), index=dates, columns=list('ABCD'))

In [7]:
df

Unnamed: 0,A,B,C,D
2017-01-01,0.729698,0.044011,0.261206,-1.527255
2017-01-02,0.109631,1.657526,0.383637,-1.549447
2017-01-03,-0.030893,0.862774,-0.177088,-0.271816
2017-01-04,-0.746675,-1.947104,-0.648412,-0.950424
2017-01-05,-0.232225,0.455546,1.049164,-0.693351
2017-01-06,0.254631,0.419872,1.417201,0.965971
2017-01-07,0.186613,-0.319605,-0.676643,-0.017391


2. 通过传递可以转换为类似系列的对象的字典来创建DataFrame。参考以下示例代码

In [8]:
df2 = pd.DataFrame({ 'A' : 1.,
                     'B' : pd.Timestamp('20170102'),
                     'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                     'D' : np.array([3] * 4,dtype='int32'),
                     'E' : pd.Categorical(["test","train","test","train"]),
                     'F' : 'foo' })

In [9]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2017-01-02,1.0,3,test,foo
1,1.0,2017-01-02,1.0,3,train,foo
2,1.0,2017-01-02,1.0,3,test,foo
3,1.0,2017-01-02,1.0,3,train,foo


## 查看数据

In [10]:
dates = pd.date_range('20170101', periods=7)
df = pd.DataFrame(np.random.randn(7,4), index=dates, columns=list('ABCD'))

In [11]:
print("index is :" )
print(df.index)

index is :
DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06', '2017-01-07'],
              dtype='datetime64[ns]', freq='D')


In [12]:
print("columns is :" )
print(df.columns)

columns is :
Index(['A', 'B', 'C', 'D'], dtype='object')


In [13]:
print("values is :" )
print(df.values)

values is :
[[ 0.54603322  0.2847513  -2.85829589 -1.01937589]
 [-2.20437172  0.32522085  1.53482712  0.13570044]
 [-1.5148804   0.82330776 -0.4617352   1.52613377]
 [ 0.96446165  1.77887667  1.47354868  2.1567146 ]
 [ 0.47093723 -1.80546641 -0.17841994  1.78291227]
 [-0.4417899  -1.65670112 -3.39577544  0.08950297]
 [ 1.50438206  0.84433418  0.01310636  0.79961203]]


In [14]:
df.describe()

Unnamed: 0,A,B,C,D
count,7.0,7.0,7.0,7.0
mean,-0.096461,0.084903,-0.553249,0.7816
std,1.353849,1.335159,1.929197,1.124568
min,-2.204372,-1.805466,-3.395775,-1.019376
25%,-0.978335,-0.685975,-1.660016,0.112602
50%,0.470937,0.325221,-0.17842,0.799612
75%,0.755247,0.833821,0.743328,1.654523
max,1.504382,1.778877,1.534827,2.156715


In [15]:
df.T

Unnamed: 0,2017-01-01 00:00:00,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-07 00:00:00
A,0.546033,-2.204372,-1.51488,0.964462,0.470937,-0.44179,1.504382
B,0.284751,0.325221,0.823308,1.778877,-1.805466,-1.656701,0.844334
C,-2.858296,1.534827,-0.461735,1.473549,-0.17842,-3.395775,0.013106
D,-1.019376,0.1357,1.526134,2.156715,1.782912,0.089503,0.799612


In [16]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2017-01-01,-1.019376,-2.858296,0.284751,0.546033
2017-01-02,0.1357,1.534827,0.325221,-2.204372
2017-01-03,1.526134,-0.461735,0.823308,-1.51488
2017-01-04,2.156715,1.473549,1.778877,0.964462
2017-01-05,1.782912,-0.17842,-1.805466,0.470937
2017-01-06,0.089503,-3.395775,-1.656701,-0.44179
2017-01-07,0.799612,0.013106,0.844334,1.504382


In [17]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2017-01-05,0.470937,-1.805466,-0.17842,1.782912
2017-01-06,-0.44179,-1.656701,-3.395775,0.089503
2017-01-01,0.546033,0.284751,-2.858296,-1.019376
2017-01-02,-2.204372,0.325221,1.534827,0.1357
2017-01-03,-1.51488,0.823308,-0.461735,1.526134
2017-01-07,1.504382,0.844334,0.013106,0.799612
2017-01-04,0.964462,1.778877,1.473549,2.156715


## 选择区块

注意虽然用于选择和设置的标准Python/Numpy表达式是直观的，可用于交互式工作，但对于生产代码，但建议使用优化的Pandas数据访问方法.at，.iat，.loc，.iloc和.ix。

In [18]:
dates = pd.date_range('20170101', periods=7)
df = pd.DataFrame(np.random.randn(7,4), index=dates, columns=list('ABCD'))

1. 获取

In [19]:
df['A']

2017-01-01    0.307134
2017-01-02    0.778095
2017-01-03   -1.628621
2017-01-04    0.875299
2017-01-05   -0.016105
2017-01-06    0.301839
2017-01-07   -1.139688
Freq: D, Name: A, dtype: float64

In [20]:
df[0:3]

Unnamed: 0,A,B,C,D
2017-01-01,0.307134,-0.441571,0.753072,0.512239
2017-01-02,0.778095,0.437527,1.375217,1.071458
2017-01-03,-1.628621,0.299026,0.794208,1.839665


In [21]:
df['20170102':'20170103']

Unnamed: 0,A,B,C,D
2017-01-02,0.778095,0.437527,1.375217,1.071458
2017-01-03,-1.628621,0.299026,0.794208,1.839665


2. 按标签选择

In [24]:
df.loc[dates[0]]

A    0.307134
B   -0.441571
C    0.753072
D    0.512239
Name: 2017-01-01 00:00:00, dtype: float64

In [25]:
df.loc[:,['A','B']]

Unnamed: 0,A,B
2017-01-01,0.307134,-0.441571
2017-01-02,0.778095,0.437527
2017-01-03,-1.628621,0.299026
2017-01-04,0.875299,-0.968723
2017-01-05,-0.016105,-0.184765
2017-01-06,0.301839,-0.394164
2017-01-07,-1.139688,-1.247736


In [26]:
df.loc['20170102':'20170104',['A','B']]

Unnamed: 0,A,B
2017-01-02,0.778095,0.437527
2017-01-03,-1.628621,0.299026
2017-01-04,0.875299,-0.968723


In [27]:
df.loc['20170102',['A','B']]

A    0.778095
B    0.437527
Name: 2017-01-02 00:00:00, dtype: float64

In [28]:
df.loc[dates[0],'A']

0.30713351881735834

In [29]:
df.at[dates[0],'A']

0.30713351881735834

3. 通过位置选择

In [30]:
df.iloc[3]

A    0.875299
B   -0.968723
C    1.190858
D   -1.297871
Name: 2017-01-04 00:00:00, dtype: float64

In [31]:
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2017-01-04,0.875299,-0.968723
2017-01-05,-0.016105,-0.184765


In [32]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2017-01-02,0.778095,1.375217
2017-01-03,-1.628621,0.794208
2017-01-05,-0.016105,-1.372532


In [33]:
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2017-01-02,0.778095,0.437527,1.375217,1.071458
2017-01-03,-1.628621,0.299026,0.794208,1.839665


In [34]:
df.iloc[:,1:3]

Unnamed: 0,B,C
2017-01-01,-0.441571,0.753072
2017-01-02,0.437527,1.375217
2017-01-03,0.299026,0.794208
2017-01-04,-0.968723,1.190858
2017-01-05,-0.184765,-1.372532
2017-01-06,-0.394164,-0.63596
2017-01-07,-1.247736,0.490039


In [35]:
df.iloc[1,1]

0.43752671150853895

In [36]:
df.iat[1,1]

0.43752671150853895

4. 布尔索引

In [37]:
dates = pd.date_range('20170101', periods=6)
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))

In [38]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2017-01-01,0.752037,0.648224,-0.01887,-0.004859
2017-01-02,0.667376,0.213684,0.471995,-0.050146
2017-01-03,0.656514,-0.022863,-1.015044,0.409723
2017-01-04,1.929655,-1.046293,0.37983,-1.5724
2017-01-05,0.747939,0.486642,-0.125356,-0.016793


In [39]:
df[df > 0]

Unnamed: 0,A,B,C,D
2017-01-01,0.752037,0.648224,,
2017-01-02,0.667376,0.213684,0.471995,
2017-01-03,0.656514,,,0.409723
2017-01-04,1.929655,,0.37983,
2017-01-05,0.747939,0.486642,,
2017-01-06,,,1.678511,


In [40]:
df2 = df.copy()
df2['E'] = ['one', 'one','two','three','four','three']

In [41]:
df2

Unnamed: 0,A,B,C,D,E
2017-01-01,0.752037,0.648224,-0.01887,-0.004859,one
2017-01-02,0.667376,0.213684,0.471995,-0.050146,one
2017-01-03,0.656514,-0.022863,-1.015044,0.409723,two
2017-01-04,1.929655,-1.046293,0.37983,-1.5724,three
2017-01-05,0.747939,0.486642,-0.125356,-0.016793,four
2017-01-06,-0.033908,-0.533736,1.678511,-1.393022,three


In [42]:
df2[df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2017-01-03,0.656514,-0.022863,-1.015044,0.409723,two
2017-01-05,0.747939,0.486642,-0.125356,-0.016793,four


# pandas.Series

创建Series对象

```python3
pandas.Series( data, index, dtype, copy)
```

## 创建一个空的系列

In [1]:
import pandas as pd

In [2]:
s = pd.Series()

In [3]:
s

Series([], dtype: float64)

## 从ndarray创建一个系列

In [4]:
import numpy as np

In [5]:
data = np.array(['a','b','c','d'])

In [6]:
s = pd.Series(data)

In [7]:
s

0    a
1    b
2    c
3    d
dtype: object

In [8]:
s = pd.Series(data,index=[100,101,102,103])

In [9]:
s

100    a
101    b
102    c
103    d
dtype: object

## 从字典创建一个系列

In [10]:
data = {'a' : 0., 'b' : 1., 'c' : 2.}

In [11]:
s = pd.Series(data)

In [12]:
s

a    0.0
b    1.0
c    2.0
dtype: float64

In [13]:
s = pd.Series(data,index=['b','c','d','a'])

In [14]:
s

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

## 从标量创建一个系列

In [16]:
s = pd.Series(5, index=[0, 1, 2, 3])

In [17]:
s

0    5
1    5
2    5
3    5
dtype: int64

## 从具有位置的系列中访问数据

In [18]:
s = pd.Series([1,2,3,4,5],index = ['a','b','c','d','e'])

In [20]:
s[0]

1

In [22]:
s[:3]

a    1
b    2
c    3
dtype: int64

In [24]:
s[-3:]

c    3
d    4
e    5
dtype: int64

## 使用标签检索数据(索引)

In [25]:
s = pd.Series([1,2,3,4,5],index = ['a','b','c','d','e'])

In [26]:
s['a']

1

In [27]:
s[['a','c','d']]

a    1
c    3
d    4
dtype: int64

In [None]:
s['f']

# pandas.DataFrame

创建DataFrame

```python
pandas.DataFrame( data, index, columns, dtype, copy)
```

## 创建一个空的DataFrame

In [29]:
import pandas as pd

In [30]:
df = pd.DataFrame()

In [31]:
df

## 从列表创建DataFrame

In [32]:
data = [1,2,3,4,5]

In [34]:
df = pd.DataFrame(data)

In [35]:
df

Unnamed: 0,0
0,1
1,2
2,3
3,4
4,5


In [36]:
data = [['Alex',10],['Bob',12],['Clarke',13]]

In [37]:
df = pd.DataFrame(data,columns=['Name','Age'])

In [38]:
df

Unnamed: 0,Name,Age
0,Alex,10
1,Bob,12
2,Clarke,13


In [39]:
data = [['Alex',10],['Bob',12],['Clarke',13]]

In [40]:
df = pd.DataFrame(data,columns=['Name','Age'],dtype=float)

In [41]:
df

Unnamed: 0,Name,Age
0,Alex,10.0
1,Bob,12.0
2,Clarke,13.0


## 从ndarrays/Lists的字典来创建DataFrame

In [42]:
data = {'Name':['Tom', 'Jack', 'Steve', 'Ricky'],'Age':[28,34,29,42]}

In [43]:
df = pd.DataFrame(data)

In [44]:
df

Unnamed: 0,Name,Age
0,Tom,28
1,Jack,34
2,Steve,29
3,Ricky,42


In [45]:
df = pd.DataFrame(data, index=['rank1','rank2','rank3','rank4'])

In [46]:
df

Unnamed: 0,Name,Age
rank1,Tom,28
rank2,Jack,34
rank3,Steve,29
rank4,Ricky,42


## 从列表创建数据帧DataFrame

In [47]:
data = [{'a': 1, 'b': 2},{'a': 5, 'b': 10, 'c': 20}]

In [48]:
df = pd.DataFrame(data)

In [49]:
df

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [50]:
df = pd.DataFrame(data, index=['first', 'second'])

In [51]:
df

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [53]:
df1 = pd.DataFrame(data, index=['first', 'second'], columns=['a', 'b'])

In [55]:
df1

Unnamed: 0,a,b
first,1,2
second,5,10


In [54]:
df2 = pd.DataFrame(data, index=['first', 'second'], columns=['a', 'b1'])

In [56]:
df2

Unnamed: 0,a,b1
first,1,
second,5,


## 从系列的字典来创建DataFrame

In [57]:
d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']),
      'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

In [58]:
df = pd.DataFrame(d)

In [59]:
df

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


## 列选择

In [60]:
d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']),
      'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

In [61]:
df = pd.DataFrame(d)

In [63]:
df['one']

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

## 列添加

In [None]:
d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']),
      'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

In [65]:
df = pd.DataFrame(d)

In [66]:
df

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [67]:
df['three']=pd.Series([10,20,30],index=['a','b','c'])

In [68]:
df

Unnamed: 0,one,two,three
a,1.0,1,10.0
b,2.0,2,20.0
c,3.0,3,30.0
d,,4,


In [69]:
df['four']=df['one']+df['three']

In [70]:
df

Unnamed: 0,one,two,three,four
a,1.0,1,10.0,11.0
b,2.0,2,20.0,22.0
c,3.0,3,30.0,33.0
d,,4,,


##  列删除

In [71]:
d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']), 
     'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']), 
     'three' : pd.Series([10,20,30], index=['a','b','c'])}

In [72]:
df = pd.DataFrame(d)

In [73]:
df

Unnamed: 0,one,two,three
a,1.0,1,10.0
b,2.0,2,20.0
c,3.0,3,30.0
d,,4,


In [74]:
del df['one']

In [75]:
df

Unnamed: 0,two,three
a,1,10.0
b,2,20.0
c,3,30.0
d,4,


In [76]:
df.pop('two')

a    1
b    2
c    3
d    4
Name: two, dtype: int64

In [77]:
df

Unnamed: 0,three
a,10.0
b,20.0
c,30.0
d,


## 行选择，添加和删除

In [78]:
d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']), 
     'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

In [79]:
df = pd.DataFrame(d)

1. 标签选择

In [80]:
df

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [81]:
df.loc['b']

one    2.0
two    2.0
Name: b, dtype: float64

2. 按整数位置选择

In [82]:
df.iloc[2]

one    3.0
two    3.0
Name: c, dtype: float64