
# 内建函数transform
### transform可以产生一个标量值，并广播到各分组的尺寸数据中
### transform可以产生一个与输入分组尺寸相同的对象
### transform不可改变它的输入

In [1]:
from datetime import datetime
import pandas as pd
import numpy as np

In [5]:
# Sample frame with twelve rows: the keys cycle through a, b, c and the
# values run from 0.0 to 11.0.
df = pd.DataFrame(
    {
        "key": ["a", "b", "c"] * 4,
        "values": np.arange(12.0),
    }
)
df

Unnamed: 0,key,values
0,a,0.0
1,b,1.0
2,c,2.0
3,a,3.0
4,b,4.0
5,c,5.0
6,a,6.0
7,b,7.0
8,c,8.0
9,a,9.0
10,b,10.0
11,c,11.0


In [7]:
# Group the rows by "key" and select the "values" column explicitly.
# Bracket selection is used instead of attribute access because ".values"
# is also the name of the DataFrame attribute that returns the underlying
# array, which makes the attribute form easy to misread.
g = df.groupby("key")["values"]

In [8]:
# Mean of the selected column within each group: one result per key.
g.mean()

key
a    4.5
b    5.5
c    6.5
Name: values, dtype: float64

In [9]:
# Compute each group's mean and broadcast it back onto the rows of that
# group, so the result has one entry per original row.
g.transform(lambda grp: grp.mean())

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: values, dtype: float64

In [10]:
# Same aggregation requested by name: passing the string "mean" produces
# the same per-row result as the lambda version.
g.transform("mean")

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: values, dtype: float64

In [12]:
# Rank the values inside each group, smallest first; the result keeps the
# shape of the input.
g.transform(lambda grp: grp.rank(ascending=True))

0     1.0
1     1.0
2     1.0
3     2.0
4     2.0
5     2.0
6     3.0
7     3.0
8     3.0
9     4.0
10    4.0
11    4.0
Name: values, dtype: float64

In [14]:
# Standardize every value against its own group: subtract the group mean
# and divide by the group standard deviation.
group_mean = g.transform("mean")
group_std = g.transform("std")
normalized = (df["values"] - group_mean) / group_std
normalized

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: values, dtype: float64

# DataFrame 数据与 NumPy 数据的转换

In [16]:
# Small numeric frame used for the DataFrame <-> NumPy conversion examples.
data = pd.DataFrame(
    {
        "x0": [1, 2, 3, 4, 5],
        "x1": [-0.1, 0.01, 0.5, -3, 0.2],
        "y0": [1.2, 4.5, 5, 3, 3],
    }
)
data

Unnamed: 0,x0,x1,y0
0,1,-0.1,1.2
1,2,0.01,4.5
2,3,0.5,5.0
3,4,-3.0,3.0
4,5,0.2,3.0


In [18]:
# Convert the DataFrame to a NumPy array with the .values attribute:
data.values

array([[ 1.  , -0.1 ,  1.2 ],
       [ 2.  ,  0.01,  4.5 ],
       [ 3.  ,  0.5 ,  5.  ],
       [ 4.  , -3.  ,  3.  ],
       [ 5.  ,  0.2 ,  3.  ]])

In [19]:
# Column labels of the DataFrame.
data.columns

Index(['x0', 'x1', 'y0'], dtype='object')

In [20]:
# Going back the other way: wrap the two-dimensional ndarray in a DataFrame
# and pass the column names explicitly.
df2 = pd.DataFrame(
    data.values,
    columns=["x0", "x1", "y0"],
)
df2

Unnamed: 0,x0,x1,y0
0,1.0,-0.1,1.2
1,2.0,0.01,4.5
2,3.0,0.5,5.0
3,4.0,-3.0,3.0
4,5.0,0.2,3.0


In [21]:
# Work on a copy so the original frame is left alone, then add a column of
# strings next to the numeric ones.
data3 = data.copy()
data3["string"] = list("abcde")
data3

Unnamed: 0,x0,x1,y0,string
0,1,-0.1,1.2,a
1,2,0.01,4.5,b
2,3,0.5,5.0,c
3,4,-3.0,3.0,d
4,5,0.2,3.0,e


In [22]:
# With a string column mixed in with the numeric ones, .values yields an
# array of dtype=object.
data3.values

array([[1, -0.1, 1.2, 'a'],
       [2, 0.01, 4.5, 'b'],
       [3, 0.5, 5.0, 'c'],
       [4, -3.0, 3.0, 'd'],
       [5, 0.2, 3.0, 'e']], dtype=object)

In [23]:
# Names of the columns to select as a subset of the DataFrame.
model_cols=["x0","x1"]

In [24]:
# Take all rows of the model_cols columns and convert them to a NumPy array.
data.loc[:,model_cols].values

array([[ 1.  , -0.1 ],
       [ 2.  ,  0.01],
       [ 3.  ,  0.5 ],
       [ 4.  , -3.  ],
       [ 5.  ,  0.2 ]])

In [25]:
# Only 'a' and 'c' are declared as categories, so the remaining labels
# become missing values in the new column.
# NOTE(review): 'catagory' looks like a misspelling of 'category'; it is
# kept because the following cells refer to the column by this name.
data["catagory"] = pd.Categorical(
    ["a", "b", "c", "d", "e"],
    categories=["a", "c"],
)
data

Unnamed: 0,x0,x1,y0,catagory
0,1,-0.1,1.2,a
1,2,0.01,4.5,
2,3,0.5,5.0,c
3,4,-3.0,3.0,
4,5,0.2,3.0,


In [27]:
# To replace the 'catagory' column with dummy variables: build the dummy
# columns first, then drop the original column and join the result.
# NOTE(review): the prefix "cataggory" is spelled differently from the
# column name "catagory"; confirm whether that is intended.

dummies = pd.get_dummies(data["catagory"], prefix="cataggory")
dummies

Unnamed: 0,cataggory_a,cataggory_c
0,1,0
1,0,0
2,0,1
3,0,0
4,0,0


In [29]:
# New frame without the categorical column; drop() returns a new object,
# so data itself still has the column.
data_xith_dummies = data.drop(columns="catagory")
data_xith_dummies

Unnamed: 0,x0,x1,y0
0,1,-0.1,1.2
1,2,0.01,4.5
2,3,0.5,5.0
3,4,-3.0,3.0
4,5,0.2,3.0


In [30]:
# Drop the categorical column and attach the dummy-variable columns in its
# place.
data_xith_dummies1 = data.drop(columns="catagory").join(dummies)
data_xith_dummies1

Unnamed: 0,x0,x1,y0,cataggory_a,cataggory_c
0,1,-0.1,1.2,1,0
1,2,0.01,4.5,0,0
2,3,0.5,5.0,0,1
3,4,-3.0,3.0,0,0
4,5,0.2,3.0,0,0
