## Ch08 绘图和可视化

绘图是数据分析工作中最重要的任务之一，是探索过程的一部分。

### matplotlib API入门

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy.random import randn

In [None]:
%matplotlib inline

 ipython --pylab
 In [1]: import numpy as np                    
 In [2]: plot(np.arange(10)) 
 Out[2]: [<matplotlib.lines.Line2D at 0x7fcb355c0208>]

#### Figure和Subplot

In [None]:
# Mirrors what one would type in an `ipython --pylab` session.
# Create a new, empty Figure; the following cell attaches subplots to it.
fig = plt.figure()

In [None]:
# Lay out a 2x2 grid on `fig` and claim its first three slots
# (slots are numbered from 1).
ax1, ax2, ax3 = [fig.add_subplot(2, 2, slot) for slot in (1, 2, 3)]

In [None]:
# Plot four y-values against their implicit 0..3 index on the current axes.
# NOTE(review): with the inline backend each cell may start a fresh figure, so
# this might not land on the subplots created above -- confirm.
plt.plot([1.5,3.5,-2,1.6])

In [None]:
# 50-step random walk (cumulative sum of normal draws) drawn with the
# 'k--' format string: black, dashed.
plt.plot(randn(50).cumsum(), 'k--')

In [None]:
# Histogram of 100 normal draws on the first subplot; the return value is
# discarded into `_`.
samples = randn(100)
_ = ax1.hist(samples, bins=20, color='k', alpha=0.3)

# Noisy diagonal on the second subplot: y = x + 3 * noise.
xs = np.arange(30)
ax2.scatter(xs, xs + 3 * randn(30))

In [None]:
# Close every open figure.
plt.close('all')

In [None]:
# Build a figure together with a 2-row by 3-column array of subplots.
fig, axes = plt.subplots(nrows=2, ncols=3)
# Echo the array of Axes objects.
axes

**pyplot.subplots的选项**
- nrows
- ncols
- sharex
- sharey
- subplot_kw
- **fig_kw

#### 调整subplot周围的间距

In [None]:
# Signature reference for the spacing knobs: left/bottom/right/top position
# the edges of the subplot area, wspace/hspace set the padding between
# subplots. Every argument is None here, so no value is overridden.
plt.subplots_adjust(left=None, bottom=None, right=None, top=None,
                wspace=None, hspace=None)

In [None]:
# 2x2 grid of histograms whose panels share both the x and the y axis.
fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)

# `axes.flat` walks the grid row by row, the same order as nested i/j loops.
for panel in axes.flat:
    panel.hist(randn(500), bins=50, color='k', alpha=0.5)

# Remove all horizontal and vertical padding between the panels.
plt.subplots_adjust(wspace=0, hspace=0)

#### 颜色、标记和线形

In [None]:
# Sample data and an Axes to draw on, so the cell runs on its own: no `ax`,
# `x` or `y` exists yet when the notebook is executed from top to bottom.
x = np.arange(10)
y = randn(10).cumsum()
fig, ax = plt.subplots()

# Two equivalent spellings of "green dashed line":
ax.plot(x, y, 'g--')                       # compact format string
ax.plot(x, y, linestyle='--', color='g')   # explicit keyword arguments

In [None]:
# 'ko--' = black ('k'), circle markers ('o'), dashed line ('--').
plt.plot(randn(30).cumsum(),'ko--')

In [None]:
# One random walk drawn twice: with the default point-to-point
# interpolation, then as a step function.
data = randn(30).cumsum()
plt.plot(data, 'k--', label='Default')

# Keyword arguments for the stepped rendering of the same data.
step_style = dict(
    linestyle='-',
    color='r',
    marker='o',
    drawstyle='steps-post',
    label='steps-post',
)
plt.plot(data, **step_style)

# Let matplotlib pick the legend position.
plt.legend(loc='best')

#### 刻度、标签和图例

    pyplot接口的设计目的就是交互使用，含有诸如xlim,xticks,xticklabels
    
    - plt.xlim 图表的范围
    - plt.xticks 刻度位置
    - plt.xticklabels 刻度标签
    所有的上面方法都是对当前或者最近创建的AxesSubplot起作用的。例如：
    - plt.xlim对应于ax.get_xlim,ax.set_xlim
    

##### 设置标题、轴标签、刻度以及刻度标签

In [None]:
# One figure holding a single subplot.
fig = plt.figure()
ax = fig.add_subplot(111)  # shorthand for add_subplot(1, 1, 1)

# Draw a 1000-step random walk on it.
walk = randn(1000).cumsum()
ax.plot(walk)

**要修改X轴的刻度，最简单的办法是使用** 
- set_xticks
- set_xticklabels

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111)  # shorthand for add_subplot(1, 1, 1)

# Pin the x ticks to five positions and replace their numeric labels with
# names, rotated 30 degrees and set in a small font.
tick_positions = [0, 250, 500, 750, 1000]
tick_names = ['one', 'two', 'three', 'four', 'five']
ticks = ax.set_xticks(tick_positions)
label = ax.set_xticklabels(tick_names, rotation=30, fontsize='small')

# Title and x-axis caption.
ax.set_title('My first matplotlib plot')
ax.set_xlabel('Stages')

ax.plot(randn(1000).cumsum())

##### 添加图例

In [None]:
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# (format string, legend label, colour) for each random walk.
# The format strings carry only the line/marker style. Putting a colour
# letter in them as well (e.g. 'k--') while also passing `color=` makes
# matplotlib warn that the colour is redundantly defined; the keyword wins
# in that case, so the rendered colours here are unchanged.
series_styles = [
    ('-', 'one', 'r'),     # solid line
    ('--', 'two', 'g'),    # dashed line
    ('.', 'three', 'b'),   # point markers only
]
for fmt, name, colour in series_styles:
    ax.plot(randn(1000).cumsum(), fmt, label=name, color=colour)

# Build the legend from the `label` of each line; 'best' lets matplotlib
# choose the position.
ax.legend(loc='best')

#### 注解以及在Subplot上面绘图
    绘制一些自定义的注解（文本、箭头或者其他的图形）
- text
- arrow
- annotate

In [None]:
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Place the text with its anchor at data coordinates (0, 0).
x = 0
y = 0
ax.text(x, y, 'HelloWorld', family='monospace', fontsize=30)

# Red dashed random walk. The format string holds only the line style:
# combining a colour letter ('k--') with `color='r'` triggers matplotlib's
# "redundantly defined" warning, and the keyword would win anyway.
ax.plot(randn(1000).cumsum(), '--', label='one', color='r')

In [None]:
from datetime import datetime

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Load the CSV using its first column, parsed as dates, for the index,
# then pick out the SPX column.
data = pd.read_csv('../pydata/ch08/spx.csv', index_col=0, parse_dates=True)
spx = data['SPX']

spx.plot(ax=ax, style='k-')

# (date, caption) pairs to call out on the chart.
crisis_data = [
    (datetime(2007, 10, 11), 'Peak of bull market'),
    (datetime(2008, 3, 12), 'Bear Stearns Fails'),
    (datetime(2008, 9, 15), 'Lehman Bankruptcy'),
]

for date, label in crisis_data:
    # Series value as of the event date.
    level = spx.asof(date)
    # Arrow tip sits 50 above the series, the caption 200 above it.
    ax.annotate(
        label,
        xy=(date, level + 50),
        xytext=(date, level + 200),
        arrowprops={'facecolor': 'black'},
        horizontalalignment='left',
        verticalalignment='top',
    )

# Restrict the view to 2007 through 2010 and a fixed y range.
ax.set_xlim(['1/1/2007', '1/1/2011'])
ax.set_ylim([600, 1800])

ax.set_title('Important dates in 2008-2009 financial crisis')

In [None]:
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Three shapes, each built with explicit keyword geometry.
rect = plt.Rectangle(xy=(0.2, 0.75), width=0.4, height=0.15,
                     color='k', alpha=0.3)
circ = plt.Circle(xy=(0.7, 0.2), radius=0.15, color='b', alpha=0.3)
triangle_vertices = [[0.15, 0.15], [0.35, 0.4], [0.2, 0.6]]
pgon = plt.Polygon(xy=triangle_vertices, color='g', alpha=0.5)

# A patch only appears once it has been added to an Axes.
ax.add_patch(rect)
ax.add_patch(circ)
ax.add_patch(pgon)

#### 将图表保存到文件

In [None]:
# Save the current figure as SVG (the format follows the file extension).
# NOTE(review): under the inline backend the "current" figure in a new cell
# may be a fresh, empty one -- confirm this writes the intended plot.
plt.savefig('figpath.svg')
# Save `fig` as a 400-dpi PNG with the surrounding whitespace trimmed.
fig.savefig('figpath.png', dpi=400, bbox_inches='tight')

In [None]:
from io import BytesIO
# Write the current figure into an in-memory file-like object instead of
# a path on disk.
buffer = BytesIO()
plt.savefig(buffer)
# Raw bytes of whatever savefig wrote into the buffer.
plot_data = buffer.getvalue()

**Figure.savefig的选项**
- fname
- dpi
- facecolor,edgecolor
- format
- bbox_inches

#### matplotlib配置
    自带的配色方案
- 图像大小
- subplot边距
- 配色方案
- 字体大小
- 网格类型

    配置方式有两种：
- plt.rc()函数
- .matplotlibrc配置脚本

In [None]:
# Set the global default figure size to 10 x 10 (inches); it applies to
# figures created after this call.
plt.rc('figure', figsize=(10, 10))

In [None]:
# Global font defaults, applied through the 'font' rc group.
font_options = {
    'family': 'monospace',
    'weight': 'bold',
    'size': 12,
}

plt.rc('font', **font_options)

# Test it as below: draw a figure so the new font settings can be inspected.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Place the text with its anchor at data coordinates (0, 0).
x = 0
y = 0
ax.text(x, y, 'HelloWorld', family='monospace', fontsize=30)

# Red dashed random walk. The format string holds only the line style:
# combining a colour letter ('k--') with `color='r'` triggers matplotlib's
# "redundantly defined" warning, and the keyword would win anyway.
ax.plot(randn(1000).cumsum(), '--', label='one', color='r')

### pandas中的绘图函数
    虽然matplotlib能够画图，但是需要创建很多的对象来控制。比如：
    
- 数据展示（图表类型）
    - 线型图
    - 柱状图
    - 盒状图
    - 散布图
    - 等值线图
- 图例
- 标题
- 刻度标签
- 注解

    上面的功能可以在pandas中简单实现

#### 线型图
    Series和DataFrame都有一个用于生成各类图表的plot方法。默认是线型图

In [None]:
from pandas import Series, DataFrame

In [None]:
# Ten-step random walk indexed 0, 10, ..., 90; Series.plot() draws a line
# chart by default, with the index on the x axis.
walk_index = np.arange(0, 100, 10)
s = Series(randn(10).cumsum(), index=walk_index)
s.plot()

In [None]:
# Four independent random walks (cumulative sum down the rows), one per
# column; DataFrame.plot() draws each column as its own line.
walks = randn(10, 4).cumsum(axis=0)
df = DataFrame(walks,
               columns=['A', 'B', 'C', 'D'],
               index=np.arange(0, 100, 10))
df.plot()

    Series.plot方法的参数

- label 用于图例的标签
- ax  要在其上绘制的matplotlib subplot对象
- style 如： 'ko--'
- alpha 不透明度，0-1之间
- kind line,bar,barh,kde
- logy 在Y轴上使用对数标尺
- use_index 将对象的索引用作刻度标签
- rot 刻度标签的旋转度（0-360之间）
- xticks X轴刻度
- yticks Y轴刻度
- xlim X轴界限
- ylim Y轴界限
- grid 显示轴网格线（默认打开）

    专用于DataFrame的plot的选项
- subplots
- sharex
- sharey
- figsize
- title
- legend
- sort_columns

#### 柱状图

In [None]:
# Two stacked subplots: vertical bars on top, horizontal bars below.
fig, axes = plt.subplots(nrows=2, ncols=1)

# Sixteen uniform random values labelled 'a' through 'p'.
letters = list('abcdefghijklmnop')
data = Series(np.random.rand(16), index=letters)

# The `.plot.bar` / `.plot.barh` accessors are the method form of
# `plot(kind='bar')` / `plot(kind='barh')`.
data.plot.bar(ax=axes[0], color='k', alpha=0.7)
data.plot.barh(ax=axes[1], color='r', alpha=0.7)

In [None]:
# 6x4 table of uniform random values; the columns Index is named 'Genus'.
row_labels = ['one', 'two', 'three', 'four', 'five', 'six']
genus = pd.Index(['A', 'B', 'C', 'D'], name='Genus')
df = DataFrame(np.random.rand(6, 4), index=row_labels, columns=genus)
df  # bare expression mid-cell: only a cell's last expression is echoed
# Bar chart of the table (method form of plot(kind='bar')).
df.plot.bar()

In [None]:
# Open a new, empty figure.
plt.figure()

In [None]:
# Horizontal bars with stacked=True, drawn half-transparent
# (method form of plot(kind='barh')).
df.plot.barh(stacked=True, alpha=0.5)

In [None]:
# Load the tipping data set before previewing it: when the notebook is run
# from top to bottom, no earlier cell has defined `tips`, so a bare
# `tips.head()` raises NameError.
tips = pd.read_csv('../pydata/ch08/tips.csv')
# Show the first five rows.
tips.head()

In [None]:
tips = pd.read_csv('../pydata/ch08/tips.csv')

# Count rows for every (day, size) combination. Bracket access is required
# for 'size' because `tips.size` is the DataFrame's own attribute.
party_counts = pd.crosstab(tips['day'], tips['size'])
party_counts

In [None]:
# Keep every row but only the columns labelled 2 through 5; `.loc` slices
# by label and includes both endpoints.
party_counts = party_counts.loc[:,2:5]

In [None]:
# Divide each row by its own total so that every row sums to 1.
row_totals = party_counts.sum(axis=1).astype(float)
party_pcts = party_counts.div(row_totals, axis=0)
party_pcts

# Stacked bars (method form of plot(kind='bar', stacked=True)).
party_pcts.plot.bar(stacked=True)

#### 直方图和密度图
    直方图（histogram）是一种对值频率进行离散化显示的柱状图

In [None]:
# Tip as a fraction of the total bill, stored as a new column.
tips['tip_pct'] = tips['tip'].div(tips['total_bill'])
# Histogram of that fraction using 12 bins.
tip_fraction = tips['tip_pct']
tip_fraction.hist(bins=12)

In [None]:
# Same histogram through the plot accessor (method form of kind='hist').
tips['tip_pct'].plot.hist(bins=12)

In [None]:
# Kernel density estimate of the tip fraction (method form of kind='kde').
tips['tip_pct'].plot.kde()

In [None]:
# Bimodal sample built from two normal components of 200 draws each:
# mean 0 / standard deviation 1, and mean 10 / standard deviation 4.
comp1 = np.random.normal(0, 1, size=200)
comp2 = np.random.normal(10, 4, size=200)
values = Series(np.concatenate([comp1, comp2]))

# Normalised histogram so it shares a scale with the density curve.
# `density=True` replaces the old `normed=True` keyword, which current
# matplotlib releases no longer accept.
values.plot(kind='hist', bins=100, alpha=0.3, color='k', density=True)

# Kernel density estimate drawn as a black dashed line.
values.plot(kind='kde', style='k--')


#### 散布图
    散布图(Scatter plot)是观察两个一维数据之间关系的有效手段

In [None]:
# Load the macro data set and keep the four columns used below.
macro = pd.read_csv('../pydata/ch08/macrodata.csv')
columns_of_interest = ['cpi', 'm1', 'tbilrate', 'unemp']
data = macro[columns_of_interest]

In [None]:
# Log-differences: take the natural log, difference consecutive rows, and
# drop the rows left with missing values.
log_levels = np.log(data)
trans_data = log_levels.diff().dropna()
# Echo the last five rows.
trans_data.tail(5)

In [None]:
# Scatter one transformed column against another; the same two column
# names are interpolated into the title.
x_col, y_col = 'm1', 'unemp'
plt.scatter(trans_data[x_col], trans_data[y_col])
plt.title('Changes in log %s vs. log %s' % (x_col, y_col))

In [None]:
# Grid of pairwise scatter plots for every pair of columns, with a kernel
# density estimate of each column on the diagonal (diagonal='kde').
pd.plotting.scatter_matrix(trans_data, diagonal='kde', 
                  color='k', alpha=0.3)

### 绘制地图： 图形化显示海地地震危机数据

### Python图形化工具生态系统