The fundamental behaviors of data types, indexing, and axis labeling/alignment apply across all of the pandas objects.

In [69]:
import numpy as np
import pandas as pd
from sklearn import datasets

# Data Structure

## Series

In [9]:
# Five standard-normal draws with explicit string labels; the label 'c'
# appears twice, so this index is not unique.
s = pd.Series(data=np.random.randn(5), index=list('accde'))

In [10]:
# Display the Series: index labels on the left, values on the right.
s

a    2.198734
c   -1.116958
c    0.821596
d   -0.714139
e   -0.778944
dtype: float64

In [11]:
# The index keeps the labels in insertion order, including the duplicate 'c'.
s.index

Index(['a', 'c', 'c', 'd', 'e'], dtype='object')

In [12]:
# Rebind `s` to five fresh standard-normal draws; with no `index` argument
# pandas supplies the default integer labels.
s = pd.Series(data=np.random.randn(5))

In [13]:
# Display the Series; the labels are now the integers 0..4.
s

0   -0.508352
1   -0.687531
2   -0.584319
3   -0.754704
4   -0.043011
dtype: float64

In [14]:
# Without an explicit index the Series carries a RangeIndex.
s.index

RangeIndex(start=0, stop=5, step=1)

In [15]:
# From a dict: the keys become the index labels, the values become the data.
pd.Series({'a':1,'b':2,'c':3})

a    1
b    2
c    3
dtype: int64

In [16]:
# When `index` is given, values are looked up by label; none of 'e', 'f', 'd'
# is a key of the dict, so every entry is NaN and the dtype becomes float64.
pd.Series({'a':1,'b':2,'c':3},index=['e','f','d'])

e   NaN
f   NaN
d   NaN
dtype: float64

In [17]:
# From a list: the values get the default integer labels.
pd.Series([1,2,3])

0    1
1    2
2    3
dtype: int64

In [18]:
# Mixing a str and an int yields an object-dtype Series.
pd.Series(['a',1])

0    a
1    1
dtype: object

In [20]:
# From a scalar: the value is repeated once for every label in `index`.
pd.Series(5,index=[1,2,3,4,5,6,])

1    5
2    5
3    5
4    5
5    5
6    5
dtype: int64

### Series is ndarray-like

In [21]:
# With the default RangeIndex, 0 is the first label, so this returns the first
# value. s.iloc[0] is the explicit spelling for purely positional access.
s[0]

-0.5083520596421597

In [22]:
# Slicing works as on an ndarray and keeps the matching index labels.
s[:3]

0   -0.508352
1   -0.687531
2   -0.584319
dtype: float64

In [23]:
# Boolean mask: keep only the values strictly greater than the median.
s[s>s.median()]

0   -0.508352
4   -0.043011
dtype: float64

In [24]:
# A list of labels selects those elements in the order requested.
s[[4,3,1]]

4   -0.043011
3   -0.754704
1   -0.687531
dtype: float64

In [25]:
# NumPy ufuncs apply element-wise and return a Series with the same index.
np.exp(s)

0    0.601486
1    0.502816
2    0.557485
3    0.470150
4    0.957901
dtype: float64

### Series is dict-like

In [27]:
# Dict-style lookup: 0 is used as an index label (key).
s[0]

-0.5083520596421597

In [28]:
# `in` tests membership among the index labels, not among the values.
2 in s

True

In [29]:
# get() returns None for a missing label instead of raising, so the cell
# shows no output.
s.get('f')

In [30]:
s.get('f','none') # like dict.get(): returns the supplied default when the label is missing

'none'

In [31]:
# get() with a label that exists returns the stored value.
s.get(1)

-0.687530892697016

### Vectorized operations and label alignment with Series

### Name attribute

In [32]:
# Rebind `s` to a new random Series that carries a name; the name is shown
# in the repr next to the dtype.
s = pd.Series(data=np.random.randn(5), name='something')

In [33]:
# Display the named Series ("Name: something" appears in the footer).
s

0   -0.586496
1    0.272442
2    2.759564
3   -1.227503
4    0.714770
Name: something, dtype: float64

In [34]:
# rename() with a scalar returns a new Series carrying the new name; `s`
# keeps its own name, as its later display shows.
s2 = s.rename("different")

In [35]:
# Same values as `s`, but the name is now 'different'.
s2

0   -0.586496
1    0.272442
2    2.759564
3   -1.227503
4    0.714770
Name: different, dtype: float64

In [36]:
# The original Series still carries the name 'something'.
s

0   -0.586496
1    0.272442
2    2.759564
3   -1.227503
4    0.714770
Name: something, dtype: float64

## DataFrame

1. 字典的关键字表示列标签
2. DF的行标签是各个Series的索引（或内层字典的键）的并集
3. 可以自己传入index和columns；原数据中缺失的标签用NaN填充

In [45]:
# A dict of Series becomes a DataFrame: the dict key is the column label and
# the Series index supplies the row labels.
d = {'one': pd.Series(np.random.randn(3), index=list('abc'))}
df = pd.DataFrame(data=d)

In [46]:
# Display the one-column frame with row labels a, b, c.
df

Unnamed: 0,one
a,1.829928
b,1.152277
c,0.515089


In [47]:
pd.DataFrame(d,index=['a','1']) # an explicit index is applied to the Series: 'a' is kept, the unknown label '1' becomes NaN

Unnamed: 0,one
a,1.829928
1,


从列表或者ndarrays的字典产生
**一定要保持一样的长度**

In [50]:
# `index` supplies the row labels for the equal-length lists. `columns` picks
# which columns appear: 'one' exists in the dict, '1' does not (so it is all
# NaN), and 'two' is left out because it is not requested.
pd.DataFrame({'one':[1,2,3,4,5],'two':[7,8,9,10,11]},index = ['a','b','c','d','e'],columns=['1','one'])

Unnamed: 0,1,one
a,,1
b,,2
c,,3
d,,4
e,,5


In [51]:
# Structured (record) array with two zero-initialised rows and three named
# fields: 'A' (32-bit int), 'B' (32-bit float) and 'C' (10-byte string).
# 'S10' replaces the legacy 'a10' spelling: the 'a' type code is a deprecated
# alias for bytes strings, and both name the same dtype (the repr shows 'S10').
data = np.zeros((2,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'S10')])

In [52]:
# Display the structured array; each row is a tuple of the three fields.
data

array([(0, 0., b''), (0, 0., b'')],
      dtype=[('A', '<i4'), ('B', '<f4'), ('C', 'S10')])

从字典的列表中产生：每一个字典相当于一行。此时可以指定index，其长度必须与列表的长度一致；columns起到选择（并排序）列的作用，不在字典键中的列会以NaN填充；index与columns也可以同时指定。

In [53]:
# Two records with partly different keys; the first one has no 'c'.
data2 = [dict(a=1, b=2), dict(a=5, b=10, c=20)]

In [54]:
# Display the list of dicts.
data2

[{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]

In [55]:
# Each dict becomes one row; the columns are the union of the keys, and the
# key missing from the first dict ('c') is NaN in that row.
pd.DataFrame(data2)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [66]:
# Same frame, with explicit row labels (one per dict in the list).
pd.DataFrame(data2, index=['first', 'second'])

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [67]:
# `columns` chooses which keys become columns and in what order.
pd.DataFrame(data2,columns=['a','b','c'])

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


从一个元组的字典中产生-----多重标签

从一个Series产生：得到只有一列的DataFrame，列标签是该Series的名字，也可以重新指定

### 列选择，增加，删除

可以像字典那样操作
选择：每一列相当于一个ndarray（Series）；删除：用del或pop；增加：插入另一个Series时按index对齐，index不匹配的位置用NaN表示，多余的项被丢弃；插入ndarray时长度必须与DataFrame一致；用insert可以把列插入到任意位置

### assign method

In [81]:
# Load the iris dataset bundle, then build one 2-D array whose last column is
# the class label appended to the feature columns.
iris1 = datasets.load_iris()
# np.c_ stacks a 1-D input as a column, so the target vector needs neither a
# manual reshape(150, 1) (which hard-coded the sample count) nor an extra
# np.array() copy.
iris = np.c_[iris1.data, iris1.target]
iris

array([[5.1, 3.5, 1.4, 0.2, 0. ],
       [4.9, 3. , 1.4, 0.2, 0. ],
       [4.7, 3.2, 1.3, 0.2, 0. ],
       [4.6, 3.1, 1.5, 0.2, 0. ],
       [5. , 3.6, 1.4, 0.2, 0. ],
       [5.4, 3.9, 1.7, 0.4, 0. ],
       [4.6, 3.4, 1.4, 0.3, 0. ],
       [5. , 3.4, 1.5, 0.2, 0. ],
       [4.4, 2.9, 1.4, 0.2, 0. ],
       [4.9, 3.1, 1.5, 0.1, 0. ],
       [5.4, 3.7, 1.5, 0.2, 0. ],
       [4.8, 3.4, 1.6, 0.2, 0. ],
       [4.8, 3. , 1.4, 0.1, 0. ],
       [4.3, 3. , 1.1, 0.1, 0. ],
       [5.8, 4. , 1.2, 0.2, 0. ],
       [5.7, 4.4, 1.5, 0.4, 0. ],
       [5.4, 3.9, 1.3, 0.4, 0. ],
       [5.1, 3.5, 1.4, 0.3, 0. ],
       [5.7, 3.8, 1.7, 0.3, 0. ],
       [5.1, 3.8, 1.5, 0.3, 0. ],
       [5.4, 3.4, 1.7, 0.2, 0. ],
       [5.1, 3.7, 1.5, 0.4, 0. ],
       [4.6, 3.6, 1. , 0.2, 0. ],
       [5.1, 3.3, 1.7, 0.5, 0. ],
       [4.8, 3.4, 1.9, 0.2, 0. ],
       [5. , 3. , 1.6, 0.2, 0. ],
       [5. , 3.4, 1.6, 0.4, 0. ],
       [5.2, 3.5, 1.5, 0.2, 0. ],
       [5.2, 3.4, 1.4, 0.2, 0. ],
       [4.7, 3

In [82]:
# Rebind `iris` to a DataFrame of the feature matrix only, with the dataset's
# feature names as column labels (the target column is not included here).
iris = pd.DataFrame(iris1.data,columns=iris1.feature_names)

In [76]:
# Preview the first five rows.
iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [83]:
# assign() returns a copy of the frame with one extra column; here the new
# column is sepal width divided by sepal length, computed from the frame
# passed to the callable (which is `iris` itself).
iris.assign(
    sepal_ratio=lambda frame: frame[iris1.feature_names[1]] / frame[iris1.feature_names[0]]
)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),sepal_ratio
0,5.1,3.5,1.4,0.2,0.686275
1,4.9,3.0,1.4,0.2,0.612245
2,4.7,3.2,1.3,0.2,0.680851
3,4.6,3.1,1.5,0.2,0.673913
4,5.0,3.6,1.4,0.2,0.720000
5,5.4,3.9,1.7,0.4,0.722222
6,4.6,3.4,1.4,0.3,0.739130
7,5.0,3.4,1.5,0.2,0.680000
8,4.4,2.9,1.4,0.2,0.659091
9,4.9,3.1,1.5,0.1,0.632653


In [84]:
# Method chain: keep the rows whose sepal length exceeds 5, derive two ratio
# columns, then scatter-plot one ratio against the other.
# The row filter is a callable passed to .loc because the feature names
# contain spaces and parentheses, which are awkward inside a query()
# expression string. The original passed `str > int` to query(), left the
# plot arguments unquoted, and misspelled PetalRatio.
(iris.loc[lambda x: x[iris1.feature_names[0]] > 5]
 .assign(SepalRatio=lambda x: x[iris1.feature_names[1]] / x[iris1.feature_names[0]],
         PetalRatio=lambda x: x[iris1.feature_names[3]] / x[iris1.feature_names[2]])
 .plot(kind='scatter', x='SepalRatio', y='PetalRatio'))

In [85]:
# Rebind `df` to a 5x3 frame of standard-normal draws with columns A, B, C.
df = pd.DataFrame(data=np.random.randn(5, 3), columns=list('ABC'))

In [86]:
# Display the random frame.
df

Unnamed: 0,A,B,C
0,-1.137566,-0.03494,1.64009
1,-0.31508,-0.014534,0.172069
2,-1.842109,-0.442642,-0.673982
3,-0.152348,1.792957,1.005501
4,2.968072,0.447327,-0.923681


In [91]:
# assign() with a callable: the callable receives the frame and returns the
# new column, here the row-wise sum A + B + C. `df` itself is not modified.
df.assign(D=lambda x: x['A']+x['B']+x['C'])

Unnamed: 0,A,B,C,D
0,-1.137566,-0.03494,1.64009,0.467583
1,-0.31508,-0.014534,0.172069,-0.157544
2,-1.842109,-0.442642,-0.673982,-2.958732
3,-0.152348,1.792957,1.005501,2.64611
4,2.968072,0.447327,-0.923681,2.491718


In [93]:
# Filter first, then derive: the callable sees only the rows kept by query(),
# so D is computed on the filtered frame (rows with A < 0).
df.query('A<0').assign(D=lambda x:x['A']*10)

Unnamed: 0,A,B,C,D
0,-1.137566,-0.03494,1.64009,-11.375659
1,-0.31508,-0.014534,0.172069,-3.150801
2,-1.842109,-0.442642,-0.673982,-18.421086
3,-0.152348,1.792957,1.005501,-1.523484


### indexing selection

df[]单个选择列,且一次只能选择一个，切片选择行  
loc[]等同于二维dict用标签  
iloc等同于ndarrays用整数  
用attribute的方式获取列

In [94]:
# `df` is unchanged by the assign() calls above: still only A, B, C.
df

Unnamed: 0,A,B,C
0,-1.137566,-0.03494,1.64009
1,-0.31508,-0.014534,0.172069
2,-1.842109,-0.442642,-0.673982
3,-0.152348,1.792957,1.005501
4,2.968072,0.447327,-0.923681


In [96]:
# Selecting one column by label returns a Series named after that column.
df['A']

0   -1.137566
1   -0.315080
2   -1.842109
3   -0.152348
4    2.968072
Name: A, dtype: float64

### Data alignment and arithmetic

当两个不一致的df相加时，不一致的都添加然后赋值NaN  
Df +- Series --> broadcasting row-wise

In [97]:
# Summary of the frame: index range, per-column non-null counts and dtypes,
# and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
A    5 non-null float64
B    5 non-null float64
C    5 non-null float64
dtypes: float64(3)
memory usage: 200.0 bytes


In [99]:
# to_string() renders the whole frame as plain text in tabular form.
print(df.to_string())

          A         B         C
0 -1.137566 -0.034940  1.640090
1 -0.315080 -0.014534  0.172069
2 -1.842109 -0.442642 -0.673982
3 -0.152348  1.792957  1.005501
4  2.968072  0.447327 -0.923681


In [104]:
# Sets the global display width option to 2 characters for the rest of the
# session. NOTE(review): presumably an experiment to force wrapped output;
# confirm the intent, and consider pd.option_context to keep it local.
pd.set_option('display.width',2)

In [106]:
# `df.<TAB>` is an interactive tab-completion gesture (it lists the frame's
# attributes, including its columns) and is not valid Python once saved in a
# cell. Print the frame instead so the cell runs and shows how the frame
# renders under the display.width option set just before it.
print(df)