In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# we don't like warnings
# you can comment the following 2 lines if you'd like to
import warnings
warnings.filterwarnings('ignore')

sns.set()

# Graphics in SVG format are more sharp and legible
#%config InlineBackend.figure_format = 'svg'

### groupby

In [2]:

df = pd.DataFrame({'key1':list('aabba'),
                  'key2': ['one','two','one','two','one'],
                  'data1': np.random.randn(5),
                  'data2': np.random.randn(5)})
df


Unnamed: 0,key1,key2,data1,data2
0,a,one,1.082617,0.093074
1,a,two,0.868984,-0.547033
2,b,one,0.589184,-0.512862
3,b,two,0.362422,-2.459838
4,a,one,-2.34652,0.481837


In [3]:
grouped=df['data1'].groupby(df['key1'])
grouped.mean()

key1
a   -0.131640
b    0.475803
Name: data1, dtype: float64

In [4]:
states=np.array(['Ohio','California','California','Ohio','Ohio'])
years=np.array([2005,2005,2006,2005,2006])
df['data1'].groupby([states,years]).mean()

California  2005    0.868984
            2006    0.589184
Ohio        2005    0.722519
            2006   -2.346520
Name: data1, dtype: float64

In [5]:
df.groupby('key1').mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.13164,0.009293
b,0.475803,-1.48635


In [10]:
for name, group in df.groupby('key1'):
    print (name)
    print (group)

a
  key1 key2     data1     data2
0    a  one  1.082617  0.093074
1    a  two  0.868984 -0.547033
4    a  one -2.346520  0.481837
b
  key1 key2     data1     data2
2    b  one  0.589184 -0.512862
3    b  two  0.362422 -2.459838


In [11]:
for (k1,k2),group in df.groupby(['key1','key2']):
    print ('===k1,k2:')
    print (k1,k2)
    print ('===k3:')
    print (group)

===k1,k2:
a one
===k3:
  key1 key2     data1     data2
0    a  one  1.082617  0.093074
4    a  one -2.346520  0.481837
===k1,k2:
a two
===k3:
  key1 key2     data1     data2
1    a  two  0.868984 -0.547033
===k1,k2:
b one
===k3:
  key1 key2     data1     data2
2    b  one  0.589184 -0.512862
===k1,k2:
b two
===k3:
  key1 key2     data1     data2
3    b  two  0.362422 -2.459838


In [13]:
piece=dict(list(df.groupby('key1')))
piece['a']

Unnamed: 0,key1,key2,data1,data2
0,a,one,1.082617,0.093074
1,a,two,0.868984,-0.547033
4,a,one,-2.34652,0.481837


In [16]:
df.groupby(['key1','key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.287456
a,two,-0.547033
b,one,-0.512862
b,two,-2.459838


In [24]:
people=pd.DataFrame(np.random.randn(5,5),
                   columns=list('abcde'),
                   index=['Joe','Steve','Wes','Jim','Travis'])

people.ix[2:3,['b','c']]=np.nan #设置几个nan
people


Unnamed: 0,a,b,c,d,e
Joe,-0.300022,1.249401,-0.681748,0.323971,1.126348
Steve,-0.943565,-1.992419,-1.169443,0.014601,0.251953
Wes,-1.818113,,,0.066929,-0.427958
Jim,-0.184395,1.316471,0.627734,-0.368668,-0.211192
Travis,-0.014976,-0.55673,-0.686327,0.089777,2.845932


In [25]:
mapping={'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'}
by_column=people.groupby(mapping,axis=1)
by_column.sum()

Unnamed: 0,blue,red
Joe,-0.357777,2.075727
Steve,-1.154842,-2.68403
Wes,0.066929,-2.246072
Jim,0.259066,0.920884
Travis,-0.59655,2.274226


In [26]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-2.30253,2.565873,-0.054014,0.022232,0.487198
5,-0.943565,-1.992419,-1.169443,0.014601,0.251953
6,-0.014976,-0.55673,-0.686327,0.089777,2.845932


In [27]:
key_list=['one','one','one','two','two'] 
people.groupby([len, key_list]).sum()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-2.118135,1.249401,-0.681748,0.3909,0.698389
3,two,-0.184395,1.316471,0.627734,-0.368668,-0.211192
5,one,-0.943565,-1.992419,-1.169443,0.014601,0.251953
6,two,-0.014976,-0.55673,-0.686327,0.089777,2.845932


### meshgrid

In [6]:
x = np.arange(-5, 5, 0.1)
y = np.arange(-5, 5, 0.1)
xx, yy = np.meshgrid(x, y, sparse=Ture)
z = np.sin(xx**2 + yy**2) / (xx**2 + yy**2)
h = plt.contourf(x,y,z)

NameError: name 'Ture' is not defined

In [None]:
nx,ny = (3, 2)
x = np.linspace(0,1,nx)
y = np.linspace(0,1,ny)
print(x, y)

xv, yv = np.meshgrid(x, y, sparse=False, indexing='ij')
xv
yv

In [None]:
for i in range(nx):
    for j in range(ny):
        print(xv[i,j], yv[i,j])

In [None]:
from sklearn import tree
X = [[0, 0], [1, 1]]
Y = [0, 1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, Y)

In [None]:
clf.predict([[2., 2.]])

In [None]:
clf.predict_proba([[2., 2.]])

In [None]:
from sklearn.datasets import load_iris
from sklearn import tree
iris = load_iris()
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)

func:`export_graphviz` 出导出还支持各种美化，包括通过他们的类着色节点（或回归值），如果需要，使用显式变量和类名。Jupyter notebook也可以自动找出相同的模块

In [None]:
import graphviz
#dot_data = tree.export_graphviz(clf, out_file='1.data')
#graph = graphviz.Source(dot_data)

dot_data = tree.export_graphviz(clf, out_file=None,
                            feature_names=iris.feature_names, 
                            class_names=iris.target_names, 
                            filled=True, rounded=True, 
                            special_characters=True) 
graph = graphviz.Source(dot_data) 
graph.render("iris")
graph

In [None]:
clf.predict(iris.data[:1, :])

In [None]:
clf.predict_proba(iris.data[:1, :])