#  matplotlib常用技巧

导入数据包

In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt

设置绘图样式

In [2]:
plt.style.use('classic')

- %matplotlib notebook会在Notebook中启动交互式图形
- %matplotlib inline会在Notebook中启动静态图形|

In [3]:
%matplotlib inline

In [4]:
import numpy as np
x = np.linspace(0, 10, 100)
fig = plt.figure()
plt.plot(x, np.sin(x), '-')
plt.plot(x, np.cos(x), '--')

[<matplotlib.lines.Line2D at 0x7f447e352b70>]

## 将图形保存为文件

In [5]:
fig.savefig('my_figure.png')

In [6]:
!ls -lh my_figure.png

-rw-r--r-- 1 kesci users 26K Jan  9 12:58 my_figure.png


确认一下是否是我们想要的内容

In [7]:
from IPython.display import Image
Image('my_figure.png')

查看系统支持的格式

In [8]:
fig.canvas.get_supported_filetypes()

{'ps': 'Postscript',
 'eps': 'Encapsulated Postscript',
 'pdf': 'Portable Document Format',
 'pgf': 'PGF code for LaTeX',
 'png': 'Portable Network Graphics',
 'raw': 'Raw RGBA bitmap',
 'rgba': 'Raw RGBA bitmap',
 'svg': 'Scalable Vector Graphics',
 'svgz': 'Scalable Vector Graphics',
 'jpg': 'Joint Photographic Experts Group',
 'jpeg': 'Joint Photographic Experts Group',
 'tif': 'Tagged Image File Format',
 'tiff': 'Tagged Image File Format'}

# 两种画图接口

## MATLAB风格接口

In [9]:
plt.figure() # 创建图形
# 创建两个子图中的第一个，设置坐标
plt.subplot(2, 1, 1) # (行、 列、 子图编号)
plt.plot(x, np.sin(x))

# 创建两个子图中的第二个设置坐标轴
plt.subplot(2, 1, 2)
plt.plot(x, np.cos(x))

[<matplotlib.lines.Line2D at 0x7f447b7b7668>]

## 面向对象接口

In [10]:
# 先创建图形网格
# ax是一个包含两个Axes对象的数组
fig, ax = plt.subplots(2)
# 每个对象上调用plot()方法
ax[0].plot(x, np.sin(x))
ax[1].plot(x, np.cos(x))

[<matplotlib.lines.Line2D at 0x7f447b701b70>]

# 简易线形图

In [11]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import numpy as np

In [12]:
fig = plt.figure()
ax = plt.axes()

In [13]:
fig = plt.figure()
ax = plt.axes()
x = np.linspace(0, 10, 1000)
ax.plot(x, np.sin(x))

[<matplotlib.lines.Line2D at 0x7f447b64ce80>]

In [14]:
plt.plot(x, np.sin(x))

[<matplotlib.lines.Line2D at 0x7f447b59bcf8>]

如果想在一张图中创建多条线，可以重复条用plot命令

In [15]:
plt.plot(x, np.sin(x))
plt.plot(x, np.cos(x))

[<matplotlib.lines.Line2D at 0x7f447b5b60f0>]

## 调整图形：线条的颜色与风格

In [16]:
plt.plot(x, np.sin(x - 0), color='blue') # 标准颜色名称
plt.plot(x, np.sin(x - 1), color='g') # 缩写颜色代码（rgbcmyk）
plt.plot(x, np.sin(x - 2), color='0.75') # 范围在0~1的灰度值
plt.plot(x, np.sin(x - 3), color='#FFDD44') # 十六进制（RRGGBB, 00~FF）
plt.plot(x, np.sin(x - 4), color=(1.0, 0.2, 0.3)) # RGB元祖，范围在0~1
plt.plot(x, np.sin(x - 5), color='chartreuse') # HTML颜色名称

[<matplotlib.lines.Line2D at 0x7f447b578ef0>]

In [18]:
plt.plot(x, x + 0, linestyle='solid')
plt.plot(x, x + 1, linestyle='dashed')
plt.plot(x, x + 2, linestyle='dashdot')
plt.plot(x, x + 3, linestyle='dotted')

# 简写形式
plt.plot(x, x + 4, linestyle='-') # 实线
plt.plot(x, x + 5, linestyle='--') # 虚线
plt.plot(x, x + 6, linestyle='-.') # 点划线
plt.plot(x, x + 7, linestyle=':'); # 实点线

In [19]:
plt.plot(x, x + 0, '-g') # 绿色实线
plt.plot(x, x + 1, '--c') # 青色虚线
plt.plot(x, x + 2, '-.k') # 黑色点划线
plt.plot(x, x + 3, ':r'); # 红色实点线

## 调整图形：坐标上下限

In [20]:
plt.plot(x, np.sin(x))
plt.xlim(-1, 11)
plt.ylim(-1.5, 1.5);

坐标逆序显示，并设置刻度值

In [21]:
plt.plot(x, np.sin(x))
plt.xlim(10, 0)
plt.ylim(1.2, -1.2);

In [22]:
plt.plot(x, np.sin(x))
plt.axis([-1, 11, -1.5, 1.5]);

In [23]:
plt.plot(x, np.sin(x))
plt.axis('tight');

In [26]:
plt.plot(x, np.sin(x))
plt.axis('equal'); # 图形分辨率1:1

## 设置图形标签

In [27]:
plt.plot(x, np.sin(x))
plt.title("A Sine Curve")
plt.xlabel("x")
plt.ylabel("sin(x)");

In [28]:
plt.plot(x, np.sin(x), '-g', label='sin(x)')
plt.plot(x, np.cos(x), ':b', label='cos(x)')
plt.axis('equal')
plt.legend();

一些MATLAB风格的方法和面向对象方法的转换：

- plt.xlabel() -> ax.set_xlabel()
- plt.ylabel() -> ax.set_ylabel()
- plt.xlim() -> ax.set_xlim()
- plt.ylim() -> ax.set_ylim()
- plt.title() -> ax.set_title()

# 简易散点图

In [30]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import numpy as np

In [31]:
x = np.linspace(0, 10, 30)
y = np.sin(x)
plt.plot(x, y, 'o', color='black');

In [33]:
rng = np.random.RandomState(0)
for marker in ['o', '.', ',', 'X', '+', 'v', '^', '<', '>', 's', 'd']:
    plt.plot(rng.rand(5), rng.rand(5), marker, label="marker='{0}'".format(marker))
    plt.legend(numpoints=1)
    plt.xlim(0, 1.8);
    

In [34]:
plt.plot(x, y, '-ok');

In [35]:
plt.plot(x, y, '-p', color='gray',markersize=15, linewidth=4, markerfacecolor='white', markeredgecolor='gray', markeredgewidth=2)
plt.ylim(-1.2, 1.2);

## 用plt.scatter划散点图

In [37]:
plt.scatter(x, y, marker='o');

In [39]:
rng = np.random.RandomState(0)
x = rng.randn(100)
y = rng.randn(100)
colors = rng.rand(100)
sizes = 1000 * rng.rand(100)
plt.scatter(x, y, c=colors, s=sizes, alpha=0.3, cmap='viridis')
plt.colorbar(); # 显示颜色调

In [41]:
from sklearn.datasets import load_iris
iris = load_iris()
features = iris.data.T

plt.scatter(features[0], features[1], alpha=0.2, s=100*features[3], c=iris.target, cmap='viridis')
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1]);

# 可视化异常处理

## 基本误差线

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import numpy as np

In [2]:
x = np.linspace(0, 10, 50)
dy = 0.8
y = np.sin(x) + dy * np.random.randn(50)
plt.errorbar(x, y, yerr=dy, fmt='.k');

In [3]:
plt.errorbar(x, y, yerr=dy, fmt='o', color='black', ecolor='lightgray', elinewidth=3, capsize=0);

## 连续误差

调用Scikit-Learn程序库API里面的`高斯过程回归方法（GRR）`来演示 

导入包失败

In [5]:
from sklearn.gaussian_process import GaussianProcess

# 定义模型和要画的数据
model = lambda x: x * np.sin(x)
xdata = np.array([1, 3, 5, 6, 8])
ydata = model(xdata)

# 计算高斯过程你和结果
gp = GaussianProcess('corr=cubic', theta0=1e-2, thetaL=1e-4, thetaU=1E-1, random_start=100)
gp.fit(xdata[:, np.newaxis], ydata)

xfit = np.linspace(0, 10, 100)
yfit, MSE  =gp.predict(xfit[:, np.newaxis], eval_MSE=True)
dyfit = 2 * np.sqrt(MSE) # 2*sigma~96%置信区间

# 将数据可视化
plt.plot(xdata, ydata, 'or')
plt.plot(xfit, yfit, '-', color='gray')
plt.fill_between(xfit, yfit - dyfit, yfit + dyfit, color='gray', alpha=0.2)
plt.xlim(0, 10);

ImportError: cannot import name 'GaussianProcess'

In [6]:
import GaussianProcess

ModuleNotFoundError: No module named 'GaussianProcess'

In [9]:
!pip install GaussianProcess

Collecting GaussianProcess
[31m  Could not find a version that satisfies the requirement GaussianProcess (from versions: )[0m
[31mNo matching distribution found for GaussianProcess[0m


## 高密度与等高线图

**三维函数的数据可视化**

首先用函数$z = f(x, y)$演示一个等高图， 按照下面的方式生成函数$f$样本数据

等高图可以通过plt.contour函数来创建，它需要三个参数：$x$轴、$y$轴、$z$轴三个坐标轴的网格数据，$x$轴与$y$轴表示图形中的位置，而$z$将通过等高线的等级来表示

In [19]:
def f(x, y):
    return np.sin(x) ** 10 + np.cos(10 + y * x) * np.cos(x)

In [20]:
x = np.linspace(0, 5, 50)
y = np.linspace(0, 5, 40)
X, Y = np.meshgrid(x, y)
Z = f(X, Y)

In [21]:
plt.contour(X, Y, Z, colors='black');

In [22]:
plt.contour(X, Y, Z, 20, cmap='RdGy'); # RdGY（红-灰，Red-Dray的缩写）

In [23]:
plt.contourf(X, Y, Z, 20, cmap='RdGy')
plt.colorbar();

In [26]:
# 渲染成渐变图
plt.imshow(Z, extent=[0, 5, 0, 5], origin='lower', cmap='RdGy')
plt.colorbar()
plt.axis(aspect='image');

- plt.imshow()不支持用x轴和y轴数据设置表格， 而是必须通过extent参数设置图形的坐标范围[xmin, xmax, ymin, ymax]。
- plt.imshow()默认使用标准的图形定义数组，就是远点位于左上角（浏览器都是这样），而不是绝大多数等高线使用的左下角，这一点在显示网格数据的时候必须调整。
- plt.imshow()会自动调整坐标轴的精度以适应数据显示，可以通过plt.axis(aspect='image)来设置x周与y轴的单位。

In [28]:
# 加上数据标签
contours = plt.contour(X, Y, Z, 3, colors='black')
plt.clabel(contours, inline=True, fontsize=8)
plt.imshow(Z, extent=[0, 5, 0, 5], origin='lower', cmap='RdGy', alpha=0.5)
plt.colorbar();

# 频次直方图、数据区间划分和分布密度

In [29]:
data = np.random.randn(1000)
plt.hist(data);

In [30]:
# 个性化的频次直方图
plt.hist(data, bins=30, normed=True, alpha=0.5, histtype='stepfilled', color='steelblue', edgecolor='none');

The 'normed' kwarg was deprecated in Matplotlib 2.1 and will be removed in 3.1. Use 'density' instead.
  alternative="'density'", removal="3.1")


In [31]:
x1 = np.random.normal(0, 0.8, 1000)
x2 = np.random.normal(-2, 1, 1000)
x3 = np.random.normal(3, 2, 1000)
kwargs = dict(histtype='stepfilled', alpha=0.3, normed=True, bins=40)
plt.hist(x1, **kwargs)
plt.hist(x2, **kwargs)
plt.hist(x3, **kwargs);

The 'normed' kwarg was deprecated in Matplotlib 2.1 and will be removed in 3.1. Use 'density' instead.
  alternative="'density'", removal="3.1")


**二维频次直方图与数据区间划分**

In [33]:
mean = [0, 0]
cov = [[1, 1], [1, 2]]
x, y = np.random.multivariate_normal(mean, cov, 10000).T

In [35]:
plt.hist2d(x, y, bins=30, cmap='Blues')
cb = plt.colorbar()
cb.set_label('counts in bin')

In [36]:
# 调整样式
plt.hexbin(x, y, gridsize=30, cmap='Blues')
cb = plt.colorbar(label='count in bin')

**使用核密度估计（KDE）**

In [38]:
from scipy.stats import gaussian_kde

# 拟合数组维度[Ndim, Nsamples]
data = np.vstack([x, y])
kde = gaussian_kde(data)

# 用一队规则的网格数据进行拟合
xgrid = np.linspace(-3.5, 3.5, 40)
ygrid = np.linspace(-6, 6, 40)
Xgrid, Ygrid = np.meshgrid(xgrid, ygrid)
Z = kde.evaluate(np.vstack([Xgrid.ravel(), Ygrid.ravel()]))

# 画出结果图
plt.imshow(Z.reshape(Xgrid.shape), origin='lower', aspect='auto', extent=[-3.5, 3.5, -6, 6], cmap='Blues')
cb = plt.colorbar(label="density")

# 配置图例

In [39]:
plt.style.use('classic')

In [40]:
x = np.linspace(0, 10, 1000)
fig, ax = plt.subplots()
ax.plot(x, np.sin(x), '-b', label='Sine')
ax.plot(x, np.cos(x), '--r', label='Cosine')
ax.axis('equal')
leg = ax.legend();

设置图例位置，并取消外边框

In [41]:
ax.legend(loc='upper left', frameon=False)
fig

设置图例的标签列数

In [43]:
ax.legend(frameon=False, loc='lower center', ncol=2)
fig

图例自定义圆角边框、增加阴影、改变外框透明度，或者改变文字间距

In [46]:
ax.legend(fancybox=True, framealpha=1, shadow=True, borderpad=1)
fig

更多信息参考plt.legend程序文档

## 选择图例显示的元素

In [47]:
y = np.sin(x[:, np.newaxis] + np.pi * np.arange(0, 2, 0.5))

# lines变量是一组plt.Line2D实例
lines = plt.plot(x, y)
plt.legend(lines[:2], ['fiest', 'second']);

In [49]:
plt.plot(x, y[:, 0], label='first')
plt.plot(x, y[:, 1], label='second')
plt.plot(x, y[:, 2:])
plt.legend(framealpha=1, frameon=True);

## 在图例中显示不同尺寸的点

In [51]:
import pandas as pd
cities = pd.read_csv('/home/kesci/input/Handbook6828/california_cities.csv')

# 提取感兴趣的数据
lat, lon = cities['latd'], cities['longd']
population, area = cities['population_total'], cities['area_total_km2']

# 用不同尺寸和颜色的散点图表示数据，但是不带标签
plt.scatter(lon, lat, label=None, c=np.log10(population), cmap='viridis', s = area, linewidth=0, alpha=0.5)
plt.axis(aspect='equal')
plt.xlabel('longitude')
plt.ylabel('latitude')
plt.colorbar(label='log$_{10}$(population)')
plt.clim(3, 7)

# 下面创建一个图例
# 画一些带标签和尺寸的控列表
for area in [100, 300, 500]:
    plt.scatter([], [], c='k', alpha=0.3, s=area, label=str(area) + 'km$^2$')
    plt.legend(scatterpoints=1, frameon=False, labelspacing=1, title="City Area")
    plt.title("California Cities: Area and Population");

  return f(*args, **kwds)


## 同时显示多个图例

In [54]:
fig, ax = plt.subplots()
lines = []
styles = ['-', '--', '-.', ':']
x = np.linspace(0, 10, 1000)

for i in range(4):
    lines += ax.plot(x, np.sin(x - i * np.pi / 2), styles[i], color='black')
    ax.axis('equal')
    
# 设置第一个图例要显示线条和标签
ax.legend(lines[:2], ['line A', 'line B'], loc='upper right', frameon=False)

# 创建第二个图例， 通过add_artist方法添加到图上
from matplotlib.legend import Legend
leg = Legend(ax, lines[2:], ['line C', 'line D'], loc='lower right', frameon=False)
ax.add_artist(leg);

# 配置颜色条

In [2]:
import matplotlib.pyplot as plt
plt.style.use('classic')
%matplotlib inline
import numpy as np

In [3]:
x = np.linspace(0, 10, 1000)
I = np.sin(x) * np.cos(x[:, np.newaxis])
plt.imshow(I)
plt.colorbar();

## 配置

In [4]:
plt.imshow(I, cmap='gray');

In [5]:
from matplotlib.colors import LinearSegmentedColormap
def grayscale_cmap(cmap):
    """ 为配色方案显示灰度图"""
    cmap = plt.cm.get_cmap(cmap)
    colors = cmap(np.arange(cmap.N))
    
    # 将RGBA色转为不同凉都的灰度值
    RGB_weight = [0.299, 0.587, 0.114]
    luminance = np.sqrt(np.dot(colors[:, :3] ** 2, RGB_weight))
    colors[:, :3] = luminance[:, np.newaxis]
    return LinearSegmentedColormap.from_list(cmap.name + "_gray", colors, cmap.N)
    
def view_colormap(cmap):
    """用等价值的灰度图表示配色方案"""
    cmap = plt.cm.get_cmap(cmap)
    colors = cmap(np.arange(cmap.N))
    cmap = grayscale_cmap(cmap)
    grayscale = cmap(np.arange(cmap.N))
    fig, ax = plt.subplots(2, figsize=(6, 2), subplot_kw=dict(xticks=[], yticks=[]))
    ax[0].imshow([colors], extent=[0, 10, 0, 1])
    ax[1].imshow([grayscale], extent=[0, 10, 0, 1])

In [6]:
view_colormap('jet')

In [7]:
view_colormap('viridis')

In [8]:
view_colormap('cubehelix')

In [9]:
view_colormap('RdBu')

**颜色刻度的限制与扩展功能的设置**

In [10]:
"""为图形像素设置1%噪点"""
speckles = (np.random.random(I.shape) < 0.01)
I[speckles] = np.random.normal(0, 3, np.count_nonzero(speckles))
plt.figure(figsize=(10, 3.5))
plt.subplot(1, 2, 1)
plt.imshow(I, cmap='RdBu')
plt.colorbar()
plt.subplot(1, 2, 2)
plt.imshow(I, cmap='RdBu')
plt.colorbar(extend='both')
plt.clim(-1, 1);

离散型颜色条

In [11]:
plt.imshow(I, cmap=plt.cm.get_cmap('Blues', 6))
plt.colorbar()
plt.clim(-1, 1);

## 案例：手写数字

In [13]:
"""加载数字0~5的图形，对其进行可视化"""
from sklearn.datasets import load_digits
digits = load_digits(n_class=6)

fig, ax = plt.subplots(8, 8, figsize=(6, 6))
for i, axi in enumerate(ax.flat):
    axi.imshow(digits.images[i], cmap='binary')
    axi.set(xticks=[], yticks=[])

In [15]:
"""用IsoMap方法将数字投影到二维空间"""
from sklearn.manifold import Isomap
iso = Isomap(n_components=2)
projection = iso.fit_transform(digits.data)

"""画图"""
plt.scatter(projection[:, 0], projection[:, 1], lw=0.1, c=digits.target, cmap=plt.cm.get_cmap('cubehelix', 6))
plt.colorbar(ticks=range(6), label='digit value')
plt.clim(-0.5, 5.5);

# 多子图

In [16]:
%matplotlib inline
import matplotlib.pyplot as plot
plt.style.use('seaborn-white')
import numpy as np

## 手动创建子图

plt.axes

In [17]:
ax1 = plt.axes() # 默认坐标轴
ax2 = plt.axes([0.65, 0.65, 0.2, 0.2])

In [18]:
fig = plt.figure()
ax1 = fig.add_axes([0.1, 0.5, 0.8, 0.4], xticklabels=[], ylim=(-1.2, 1.2))
ax2 = fig.add_axes([0.1, 0.1, 0.8, 0.4], ylim=(-1.2, 1.2))
x = np.linspace(0, 10)
ax1.plot(np.sin(x))
ax2.plot(np.cos(x));

## 简易网格图
`plt.subplot`

In [19]:
for i in range(1, 7):
    plt.subplot(2, 3, i)
    plt.text(0.5, 0.5, str((2, 3, i)), fontsize=18, ha='center')

In [20]:
fig = plot.figure()
fig.subplots_adjust(hspace=0.4, wspace=0.4)
for i in range(1, 7):
    ax = fig.add_subplot(2, 3, i)
    ax.text(0.5, 0.5, str((2, 3, i)), fontsize=18, ha='center')

## 一行代码创建
`plt.subplots`

In [21]:
fig, ax = plt.subplots(2, 3, sharex='col', sharey='row')

In [24]:
# 坐标轴存放在一个NumPy数组中， 按照[row, col]取值
for i in range(2):
    for j in range(3):
        ax[i, j].text(0.5, 0.5, str((i, j)), fontsize=18, ha='center')
        
fig

## 复杂排列方式
`plt.GridSpec`

In [25]:
grid = plt.GridSpec(2, 3, wspace=0.4, hspace=0.3)

In [26]:
plt.subplot(grid[0, 0])
plt.subplot(grid[0, 1:])
plt.subplot(grid[1, :2])
plt.subplot(grid[1, 2]);

In [30]:
# 创建一些正态分布数据
mean = [0, 0]
cov = [[1, 1], [1, 2]]
x, y = np.random.multivariate_normal(mean, cov, 3000).T

# 设置坐标轴和网格配置方式
fig = plot.figure(figsize=(6, 6))
grid = plt.GridSpec(4, 4, hspace=0.2, wspace=0.2)
main_ax = fig.add_subplot(grid[:-1, 1:])
y_hist = fig.add_subplot(grid[:-1, 0], xticklabels=[], sharey=main_ax)
x_hist = fig.add_subplot(grid[-1, 1:], yticklabels=[], sharex=main_ax)

# 柱坐标轴画散点图
main_ax.plot(x, y, 'ok', markersize=3, alpha=0.2)

# 次坐标轴画频次直方图
x_hist.hist(x, 40, histtype='stepfilled', orientation='vertical', color='gray')
x_hist.invert_yaxis()

y_hist.hist(y, 40, histtype='stepfilled', orientation='horizontal', color='gray')
y_hist.invert_xaxis()

# 文字与注释

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
plt.style.use('seaborn-whitegrid')
import numpy as np
import pandas as pd

  'Matplotlib is building the font cache using fc-list. '
  return f(*args, **kwds)


## 案例：节假日对美国出生率的影响

In [3]:
births = pd.read_csv('/home/kesci/input/Handbook6828/births.csv')
quartiles = np.percentile(births['births'], [25, 50, 75])
mu, sig = quartiles[1], 0.74 * (quartiles[2] - quartiles[0])
births = births.query('(births > @mu -5 * @sig) & (births < @mu + 5 * @sig)')
births['day'] = births['day'].astype(int)
births.index = pd.to_datetime(10000 * births.year + 100 * births.month + births.day, format='%Y%m%d')
births_by_date = births.pivot_table('births', [births.index.month, births.index.day])
births_by_date.index = [pd.datetime(2012, month, day) for (month, day) in births_by_date.index]

In [4]:
fig, ax = plt.subplots(figsize=(12, 4))
births_by_date.plot(ax=ax);

增加注释

In [5]:
fig, ax = plt.subplots(figsize=(12, 4))
births_by_date.plot(ax=ax)

# 在图上增加文字标签
style = dict(size=10, color='gray')
ax.text('2012-1-1', 3950, "New Year's Day", **style)
ax.text('2012-7-4', 4250, "Independence Day", ha='center', **style)
ax.text('2012-9-4', 4850, "Labor Day", ha='center', **style)
ax.text('2012-10-31', 4600, "Halloween", ha='right', **style)
ax.text('2012-11-25', 4450, "Thanksgiving", ha='center', **style)
ax.text('2012-12-25', 3850, "Christmas", ha='right', **style)

# 设置坐标轴标题
ax.set(title='USA births by day of year (1969-1988)', ylabel='average dail births')

# 设置x轴坐标标题
ax.xaxis.set_major_locator(mpl.dates.MonthLocator())
ax.xaxis.set_major_locator(mpl.dates.MonthLocator(bymonthday=15))
ax.xaxis.set_major_formatter(plt.NullFormatter())
ax.xaxis.set_major_formatter(mpl.dates.DateFormatter('%h'));

用三中不同变换方式将文字画在不同位置

In [6]:
fig, ax = plt.subplots(facecolor='lightgray')
ax.axis([0, 10, 0, 10])

# transform=ax.transData是默认值
ax.text(1, 5, ". Data: (1, 5)", transform=ax.transData)
ax.text(0.5, 0.1, ". Axes: (0.5, 0.1)", transform=ax.transAxes)
ax.text(0.2, 0.2, ". Figure: (0.2, 0.2)", transform=fig.transFigure);

In [7]:
ax.set_xlim(0, 2)
ax.set_ylim(-6, 6)
fig

## 箭头与注释

In [8]:
fig, ax = plt.subplots()
x = np.linspace(0, 20, 1000)
ax.plot(x, np.cos(x))
ax.axis('equal')
ax.annotate('local maximun', xy=(6.28, 1), xytext=(10, 4), arrowprops=dict(facecolor='black', shrink=0.05))
ax.annotate('local maximun', xy=(5 * np.pi, -1), xytext=(2, -6), arrowprops=dict(arrowstyle='->', connectionstyle="angle3,angleA=0,angleB=-90"));

In [20]:
fig, ax = plt.subplots(figsize=(12, 4))
births_by_date.plot(ax=ax)

# 在图上增加箭头标签
ax.annotate("New Year's Day", xy=('2012-1-1', 4100), xycoords='data', xytext=(50, -30), 
  textcoords='offset points', arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=-0.2"))

ax.annotate("Independence Day", xy=('2012-7-4', 4250), xycoords='data', 
  bbox=dict(boxstyle="round", fc="none", ec="gray"), xytext=(10, -40), textcoords='offset points', ha='center', 
  arrowprops=dict(arrowstyle="->"))

ax.annotate("Labor Day", xy=('2012-9-4', 4850), xycoords='data', xytext=(0, -20), textcoords='offset points', ha='center')

ax.annotate('', xy=('2012-9-1', 4850), xycoords='data', textcoords='data',
  xytext=('2012-9-7', 4850), arrowprops={'arrowstyle' : '|-|, widthA=0.2, widthB=0.2', })

ax.annotate("Halloween", xy=('2012-10-31', 4600), xycoords='data', 
  xytext=(-80, -40), textcoords='offset points', 
  arrowprops=dict(arrowstyle="fancy", fc='0.6', ec='none', connectionstyle='angle3,angleA=0,angleB=-90'))
 

ax.annotate("Thankgiving", xy=('2012-11-25', 4500), xycoords='data', 
  bbox=dict(boxstyle="round4,pad=.5", fc="0.9"), xytext=(-120, -60), textcoords='offset points', 
  arrowprops=dict(arrowstyle="->", connectionstyle='angle,angleA=0,angleB=80,rad=20'))

ax.annotate("Christmas", xy=('2012-12-25', 3850), xycoords='data', 
  xytext=(-30, 0), textcoords='offset points', ha='right', size=13, va='center', bbox=dict(boxstyle="round4", alpha=0.1),
  arrowprops=dict(arrowstyle="wedge,tail_width=0.5", alpha=0.1));

# 设置坐标轴标题
ax.set(title='USA births by day of year (1969-1988)', ylabel='average dail births')

# 设置x轴坐标标题
ax.xaxis.set_major_locator(mpl.dates.MonthLocator())
ax.xaxis.set_major_locator(mpl.dates.MonthLocator(bymonthday=15))
ax.xaxis.set_major_formatter(plt.NullFormatter())
ax.xaxis.set_major_formatter(mpl.dates.DateFormatter('%h'));

ax.set_ylim(3600, 5400);

# 自定义坐标轴刻度

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import numpy as np
ax = plt.axes(xscale='log', yscale='log')

In [2]:
print(ax.xaxis.get_major_locator())
print(ax.xaxis.get_minor_locator())

<matplotlib.ticker.LogLocator object at 0x7f8d61dec0b8>
<matplotlib.ticker.LogLocator object at 0x7f8d61dd8e80>


In [3]:
print(ax.xaxis.get_major_formatter())
print(ax.xaxis.get_minor_formatter())

<matplotlib.ticker.LogFormatterSciNotation object at 0x7f8d61dd8ac8>
<matplotlib.ticker.LogFormatterSciNotation object at 0x7f8d61dd8ba8>


## 隐藏刻度与标签

In [4]:
ax = plt.axes()
ax.plot(np.random.rand(50))

ax.yaxis.set_major_locator(plt.NullLocator())
ax.yaxis.set_major_formatter(plt.NullFormatter())

举例

In [5]:
fig, ax = plt.subplots(5, 5, figsize=(5, 5))
fig.subplots_adjust(hspace=0, wspace=0)

# 从scikit-learn获取一些人脸相片数据
from sklearn.datasets import fetch_olivetti_faces
faces = fetch_olivetti_faces().images

for i in range(5):
    for j in range(5):
        ax[i, j].yaxis.set_major_locator(plt.NullLocator())
        ax[i, j].yaxis.set_major_formatter(plt.NullFormatter())
        ax[i, j].imshow(faces[10 * i + j], cmap='bone')

downloading Olivetti faces from https://ndownloader.figshare.com/files/5976027 to /home/kesci/scikit_learn_data


## 增减刻度数量

刻度拥挤的图形

In [6]:
fig, ax = plt.subplots(4, 4, sharex=True, sharey=True)

In [7]:
#　为每个坐标轴设置主要刻度定位器
for axi in ax.flat:
    axi.xaxis.set_major_locator(plt.MaxNLocator(2))
    axi.yaxis.set_major_locator(plt.MaxNLocator(1))
    
fig

## 花哨的刻度格式

In [8]:
fig, ax = plt.subplots()
x = np.linspace(0, 3 * np.pi, 1000)
ax.plot(x, np.sin(x), lw=3, label='Sine')
ax.plot(x, np.cos(x), lw=3, label='Cosine')

ax.grid(True)
ax.legend(frameon=False)
ax.axis('equal')
ax.set_xlim(0, 3 * np.pi);

In [9]:
ax.xaxis.set_major_locator(plt.MultipleLocator(np.pi / 2))
ax.xaxis.set_major_locator(plt.MultipleLocator(np.pi / 4))
fig

看起来有点奇怪所以改改

In [10]:
def format_func(value, tick_number):
    # 找到π/2的倍数刻度
    N = int(np.round(2 * value / np.pi))
    if N == 0:
        return "0"
    elif N == 1:
        return r"$\pi/2$"
    elif N == 2:
        return r"$\pi$"
    elif N % 2 > 0:
        return r"${0}\pi/2$".format(N)
    else:
        return r"${0}\pi$".format(N // 2)
        
ax.xaxis.set_major_formatter(plt.FuncFormatter(format_func))
fig

matplotlib是支持LaTeX的

# 配置文件与样式表

## 手动配置图形

In [11]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('classic')
import numpy as np


In [12]:
x = np.random.randn(1000)
plt.hist(x);

美化图片

In [13]:
# 用灰色背景
ax = plt.axes(facecolor='#E6E6E6') # 原文是axisbg，但是报错，所以我修改了
ax.set_axisbelow(True)

# 画上白色的网格线
plt.grid(color='w', linestyle='solid')

# 隐藏坐标轴的线条
for spine in ax.spines.values():
    spine.set_visible(False)

# 隐藏上边与右边的刻度
ax.xaxis.tick_bottom()
ax.yaxis.tick_left()

# 弱化刻度与标签
ax.tick_params(colors='gray', direction='out')
for tick in ax.get_xticklabels():
    tick.set_color('gray')
for tick in ax.get_yticklabels():
    tick.set_color('gray')

# 设置频次直方图轮廓色与填充色
ax.hist(x, edgecolor='#E6E6E6', color='#EE6666');

## 修改默认配置

In [14]:
IPython_default = plt.rcParams.copy()

In [15]:
from matplotlib import cycler
colors = cycler('color', ['#EE6666', '#3388BB', '#9988DD', '#EECC55', '#88BB44', '#FFBBBB'])
plt.rc('axes', facecolor='#E6E6E6', edgecolor='none', axisbelow=True, grid=True, prop_cycle=colors)
plt.rc('grid', color='w', linestyle='solid')
plt.rc('xtick', direction='out', color='gray')
plt.rc('ytick', direction='out', color='gray')
plt.rc('patch', edgecolor='#E6E6E6')
plt.rc('lines', linewidth=2)

In [16]:
plt.hist(x);

In [17]:
for i in range(4):
    plt.plot(np.random.rand(10))

## 样式表

In [18]:
def hist_and_lines():
    np.random.seed(0)
    fig, ax = plt.subplots(1, 2, figsize=(11, 4))
    ax[0].hist(np.random.rand(1000))
    for i in range(3):
        ax[1].plot(np.random.rand(10))
    ax[1].legend(['a', 'b', 'c'], loc='lower left')

In [19]:
plt.rcParams.update(IPython_default);

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))
The text.latex.unicode rcparam was deprecated in Matplotlib 2.2 and will be removed in 3.1.
  "2.2", name=key, obj_type="rcparam", addendum=addendum)


In [20]:
hist_and_lines()

In [21]:
with plt.style.context('fivethirtyeight'):
    hist_and_lines()

In [22]:
with plt.style.context('ggplot'):
    hist_and_lines()

In [23]:
with plt.style.context('bmh'):
    hist_and_lines()

In [24]:
with plt.style.context('dark_background'):
    hist_and_lines()

In [25]:
with plt.style.context('grayscale'):
    hist_and_lines()

In [26]:
import seaborn
hist_and_lines()

  return f(*args, **kwds)


# 画三维图

In [27]:
from mpl_toolkits import mplot3d

In [28]:
fig = plt.figure()
ax = plt.axes(projection='3d')

## 三维数据点与线

In [29]:
ax = plt.axes(projection='3d')

# 三维先的数据
zline = np.linspace(0, 15, 1000)
xline = np.sin(zline)
yline = np.cos(zline)
ax.plot3D(xline, yline, zline, 'gray')

# 三维散点的数据
zdata = 15 * np.random.random(100)
xdata = np.sin(zdata) + 0.1 + np.random.randn(100)
ydata = np.cos(zdata) + 0.1 + np.random.randn(100)
ax.scatter3D(xdata, ydata, zdata, c=zdata, cmap='Greens');

## 三维等高图

In [30]:
def f(x, y):
    return np.sin(np.sqrt(x ** 2 + y ** 2))
x = np.linspace(-6, 6, 30)
y = np.linspace(-6, 6, 30)

X, Y = np.meshgrid(x, y)
Z = f(X, Y)

In [31]:
fig = plt.figure()
ax = plt.axes(projection='3d')
ax.contour3D(X, Y, Z, 50, cmap='binary')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z');

In [32]:
ax.view_init(60, 35)
fig

## 线框图和曲面图

In [34]:
fig = plt.figure()
ax = plt.axes(projection='3d')
ax.plot_wireframe(X, Y, Z, color='black')
ax.set_title('wireframe');

In [35]:
ax = plt.axes(projection='3d')
ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap='viridis', edgecolor='none')
ax.set_title('surface');

切片可视化

In [37]:
r = np.linspace(0, 6, 20)
theta = np.linspace(-0.9 * np.pi, 0.8 * np.pi, 40)
r, theta = np.meshgrid(r, theta)

X = r * np.sin(theta)
Y = r * np.cos(theta)
Z = f(X, Y)
ax = plt.axes(projection='3d')
ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap='viridis', edgecolor='none');

## 曲面三角剖分

In [38]:
theta = 2 * np.pi * np.random.random(1000)
r = 6 * np.random.random(1000)
x = np.ravel(r * np.sin(theta))
y = np.ravel(r * np.cos(theta))
z = f(x, y)

In [39]:
ax = plt.axes(projection='3d')
ax.scatter(x, y, z, c=z, cmap='viridis', linewidth=0.5);

修补

In [40]:
ax = plt.axes(projection='3d')
ax.plot_trisurf(x, y, z, cmap='viridis',edgecolor='none');

案例：莫比乌斯带

> 莫比乌斯带是把一根纸条扭转180度后，再把两头粘起来做成的纸带圈。很神奇，因为他只有一个面！

绘图参数：由于它是一条二维带，因此需要两个内在维度。把一个维度定义为$\theta$，取值范围为0~2$\pi$；另一个维度是$\omega$，取值范围是-1~1，表示莫比乌斯带的宽度：

In [41]:
theta = np.linspace(0, 2 * np.pi, 30)
w = np.linspace(-0.25, 0.25, 8)
w, theta = np.meshgrid(w, theta)

In [42]:
phi = 0.5 * theta

In [43]:
# x - y 平面内的半径
r = 1 + w * np.cos(phi)

x = np.ravel(r * np.cos(theta))
y = np.ravel(r * np.sin(theta))
z = np.ravel(w * np.sin(phi))

In [44]:
# 用基本参数
from matplotlib.tri import Triangulation
tri = Triangulation(np.ravel(w), np.ravel(theta))

ax = plt.axes(projection='3d')
ax.plot_trisurf(x, y, z, triangles=tri.triangles, cmap='viridis', linewidths=0.2);
ax.set_xlim(-1, 1);
ax.set_ylim(-1, 1);
ax.set_zlim(-1, 1);

用Basemap可视化地理数据 

暂时安装不了这个组件，所以后期有空再做

# 用Seaborn做数据可视化

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('classic')
import numpy as np
import pandas as pd

创建一些数据

In [4]:
rng = np.random.RandomState(0)
x = np.linspace(0, 10, 500)
y = np.cumsum(rng.randn(500, 6), 0)

In [5]:
# 用matplotloib默认样式画
plt.plot(x, y)
plt.legend('ABCDEF', ncol=2, loc='upper left');

使用Seaborn

In [6]:
import seaborn as sns
sns.set()

In [7]:
# 画同样的图
plt.plot(x, y)
plt.legend('ABCDEF', ncol=2, loc='upper left');

## Seaborn图形介绍

1.频次直方图、KDE和密度图

In [8]:
data = np.random.multivariate_normal([0, 0], [[5, 2], [2, 2]], size=2000)
data = pd.DataFrame(data, columns=['x', 'y'])

for col in 'xy':
    plt.hist(data[col], normed=True, alpha=0.5)

The 'normed' kwarg was deprecated in Matplotlib 2.1 and will be removed in 3.1. Use 'density' instead.
  alternative="'density'", removal="3.1")
The 'normed' kwarg was deprecated in Matplotlib 2.1 and will be removed in 3.1. Use 'density' instead.
  alternative="'density'", removal="3.1")


In [9]:
for col in 'xy':
    sns.kdeplot(data[col], shade=True)

频次和KDE结合起来

In [10]:
sns.distplot(data['x'])
sns.distplot(data['y']);

如果是二维数据集，还可以得到一个二维可视化图

In [11]:
sns.kdeplot(data);



In [12]:
with sns.axes_style('white'):
    sns.jointplot('x', 'y', data, kind='kde');

改变图形

In [13]:
with sns.axes_style('white'):
    sns.jointplot('x', 'y', data, kind='hex');

2.矩阵图

使用著名的鸢尾花数据集来演示

In [14]:
iris = sns.load_dataset('iris')
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [15]:
sns.pairplot(iris, hue='species', size=2.5);



样本的多维度关系

3.分娩频次直方图

服务员收取小费数据

In [16]:
tips = sns.load_dataset('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [17]:
tips['tip_pct'] = 100 * tips['tip'] / tips['total_bill']

grid = sns.FacetGrid(tips, row='sex', col='time', margin_titles=True)
grid.map(plt.hist, "tip_pct", bins=np.linspace(0, 40, 15));

4.因子图

In [18]:
with sns.axes_style(style='ticks'):
    g = sns.factorplot("day", "total_bill", "sex", data=tips, kind="box")
    g.set_axis_labels("Day", "Total Bill");



5.联合分布

In [19]:
with sns.axes_style('white'):
    sns.jointplot("total_bill", "tip", data=tips, kind='hex')

自动KDE和回归

In [20]:
sns.jointplot("total_bill", "tip", data=tips, kind='reg');

6.条形图

In [21]:
planets = sns.load_dataset('planets')
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [22]:
with sns.axes_style('white'):
    g = sns.factorplot("year", data=planets, aspect=2, kind='count', color='steelblue')
    g.set_xticklabels(step=5)



In [23]:
with sns.axes_style('white'):
    g = sns.factorplot("year", data=planets, aspect=4.0, kind='count', hue='method', order=range(2001, 2015))
    g.set_ylabels('Number of Planets Discovered')



## 案例：探索马拉松比赛成绩数据

In [24]:
!curl -O https://raw.githubusercontent.com/jakevdp/marathon-data/master/marathon-data.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  836k  100  836k    0     0  48125      0  0:00:17  0:00:17 --:--:-- 55214


In [25]:
data = pd.read_csv('marathon-data.csv')
data.head()

Unnamed: 0,age,gender,split,final
0,33,M,01:05:38,02:08:51
1,32,M,01:06:26,02:09:28
2,31,M,01:06:49,02:10:42
3,38,M,01:06:16,02:13:45
4,31,M,01:06:32,02:13:59


In [26]:
data.dtypes

age        int64
gender    object
split     object
final     object
dtype: object

写一个把字符串变成时间类型的函数

In [38]:
import datetime as dt
def convert_time(s):
    h, m, s = map(int, s.split(':'))
    return dt.timedelta(hours=h, minutes=m, seconds=s) # 书上使用的是pd.datetools.timedelta()，但是已经被弃用

data = pd.read_csv('marathon-data.csv', converters={
    'split' : convert_time,
    'final' : convert_time
})
data.head()

Unnamed: 0,age,gender,split,final
0,33,M,01:05:38,02:08:51
1,32,M,01:06:26,02:09:28
2,31,M,01:06:49,02:10:42
3,38,M,01:06:16,02:13:45
4,31,M,01:06:32,02:13:59


In [39]:
data.dtypes

age                 int64
gender             object
split     timedelta64[ns]
final     timedelta64[ns]
dtype: object


将时间换成秒

In [40]:
data['split_sec'] = data['split'].astype(int) / 1E9
data['final_sec'] = data['final'].astype(int) / 1E9
data.head()

Unnamed: 0,age,gender,split,final,split_sec,final_sec
0,33,M,01:05:38,02:08:51,3938.0,7731.0
1,32,M,01:06:26,02:09:28,3986.0,7768.0
2,31,M,01:06:49,02:10:42,4009.0,7842.0
3,38,M,01:06:16,02:13:45,3976.0,8025.0
4,31,M,01:06:32,02:13:59,3992.0,8039.0


In [42]:
with sns.axes_style('white'):
    g = sns.jointplot("split_sec", "final_sec", data, kind='hex')
    g.ax_joint.plot(np.linspace(4000, 16000), np.linspace(8000, 32000), ':k')

增加一列，衡量前半程或者后半程加速的程度

In [43]:
data['split_frac'] = 1 -2 * data['split_sec'] / data['final_sec']
data.head()

Unnamed: 0,age,gender,split,final,split_sec,final_sec,split_frac
0,33,M,01:05:38,02:08:51,3938.0,7731.0,-0.018756
1,32,M,01:06:26,02:09:28,3986.0,7768.0,-0.026262
2,31,M,01:06:49,02:10:42,4009.0,7842.0,-0.022443
3,38,M,01:06:16,02:13:45,3976.0,8025.0,0.009097
4,31,M,01:06:32,02:13:59,3992.0,8039.0,0.006842


画出差异系数分布图

In [45]:
sns.distplot(data['split_frac'], kde=False);
plt.axvline(0, color="k", linestyle='--');

In [47]:
sum(data.split_frac < 0)

251

在大约4万名马拉松比赛选手中，只有251人能做做到后半程加速。

In [52]:
g = sns.PairGrid(data, vars=['age', 'split_sec', 'final_sec', 'split_frac'], hue='gender', palette='RdBu_r')
g.map(plt.scatter, alpha=0.8)
g.add_legend();

In [53]:
sns.kdeplot(data.split_frac[data.gender=='M'], label='men', shade=True)
sns.kdeplot(data.split_frac[data.gender=='W'], label='women', shade=True)
plt.xlabel('split_frac');

用小提琴图进行两种分布对比

In [54]:
sns.violinplot('gender', 'split_frac', data=data, palette=['lightblue', 'lightpink']);

上面是性别组成的，下面看由年龄组成的小提琴图

In [55]:
data['age_dec'] = data.age.map(lambda age: 10 * (age // 10))
data.head()

Unnamed: 0,age,gender,split,final,split_sec,final_sec,split_frac,age_dec
0,33,M,01:05:38,02:08:51,3938.0,7731.0,-0.018756,30
1,32,M,01:06:26,02:09:28,3986.0,7768.0,-0.026262,30
2,31,M,01:06:49,02:10:42,4009.0,7842.0,-0.022443,30
3,38,M,01:06:16,02:13:45,3976.0,8025.0,0.009097,30
4,31,M,01:06:32,02:13:59,3992.0,8039.0,0.006842,30


In [58]:
men = (data.gender == 'M')
women = (data.gender == 'W')

with sns.axes_style(style=None):
    sns.violinplot('age_dec', 'split_frac', hue='gender', data=data, split=True, inner='quartile', palette=['lightblue', 'lightpink']);

发现80周岁的女选手比同年龄段的男选手成绩好，这可能是选手太少

In [59]:
(data.age > 80).sum()

7

In [60]:
((data.age > 80) & (data.gender == 'W')).sum()

2

看看后半程加速选手的数据

In [62]:
g = sns.lmplot('final_sec', 'split_frac', col='gender', data=data, markers='.', scatter_kws=dict(color='c'))
g.map(plt.axhline, y=0.1, color='k', ls=':');

看得出来后半程加速的都是成绩在15000秒的种子选手，低于这个成绩的很少有显著的后半程加速。