diff --git a/.gitignore b/.gitignore index 300dbef05b06f..f21889b9b421b 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ *.DS_Store ._* docs/_build/ +docs/api/ +docs/doxyoutput/ diff --git a/docs/conf.py b/docs/conf.py index bb399a47f9138..b1ea576299a44 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -172,3 +172,46 @@ # A list of files that should not be packed into the epub file. epub_exclude_files = ['search.html'] + +extensions = [ + # there may be others here already, e.g. 'sphinx.ext.mathjax' + 'breathe', + 'exhale' +] + +# Setup the breathe extension +breathe_projects = { + "My Project": "./doxyoutput/xml" +} +breathe_default_project = "My Project" + +# Setup the exhale extension +exhale_args = { + # These arguments are required + "containmentFolder": "./api", + "rootFileName": "library_root.rst", + "rootFileTitle": "Library API", + "doxygenStripFromPath": "..", + # Suggested optional arguments + "createTreeView": True, + # TIP: if using the sphinx-bootstrap-theme, you need + # "treeViewIsBootstrap": True, + "exhaleExecutesDoxygen": True, + "exhaleDoxygenStdin": "INPUT = paddle_include_file" +} + +# Tell sphinx what the primary language being documented is. +primary_domain = 'cpp' + +# Tell sphinx what the pygments highlight language should be. +highlight_language = 'cpp' + +import os + +on_rtd = os.environ.get('READTHEDOCS', None) == 'True' + +if not on_rtd: # only import and set the theme if we're building docs locally + import sphinx_rtd_theme + html_theme = 'sphinx_rtd_theme' + #html_theme = "alabaster" + html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] diff --git a/docs/index.rst b/docs/index.rst index c1b14a0d1e458..30af359119269 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -38,7 +38,6 @@ Welcome to Paddle-Inference's documentation! tools/visual tools/x2paddle - .. toctree:: :maxdepth: 1 @@ -48,10 +47,10 @@ Welcome to Paddle-Inference's documentation! benchmark/benchmark .. toctree:: - :maxdepth: 1 + :maxdepth: 2 :caption: API文档 - api_reference/cxx_api_doc + api/library_root .. toctree:: :maxdepth: 1 diff --git a/docs/introduction/quick_start.md b/docs/introduction/quick_start.md deleted file mode 100644 index 2792cda710c29..0000000000000 --- a/docs/introduction/quick_start.md +++ /dev/null @@ -1,175 +0,0 @@ -# Quick Start - -

-**前提准备**

-接下来我们会通过几段Python代码的方式对Paddle Inference使用进行介绍, -为了能够成功运行代码,请您在环境中(Mac, Windows,Linux)安装不低于1.7版本的Paddle, -安装Paddle 请参考[飞桨官网主页](https://www.paddlepaddle.org.cn/)。 - -## 导出预测模型文件 - -在模型训练期间,我们通常使用Python来构建模型结构,比如: - -```python -import paddle.fluid as fluid -res = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu", param_attr=param_attr) -``` - -在模型部署时,我们需要提前将这种Python表示的结构以及参数序列化到磁盘中。那是如何做到的呢? - -在模型训练过程中或者模型训练结束后,我们可以通过`save_inference_model` 接口来导出标准化的模型文件。 - -我们用一个简单的代码例子来展示下导出模型文件的这一过程。 - - -```python -import paddle -import paddle.fluid as fluid -# 建立一个简单的网络,网络的输入的shape为[batch, 3, 28, 28] -image_shape = [3, 28, 28] - -img = fluid.layers.data(name='image', shape=image_shape, dtype='float32', append_batch_size=True) -# 模型包含两个Conv层 -conv1 = fluid.layers.conv2d( - input=img, - num_filters=8, - filter_size=3, - stride=2, - padding=1, - groups=1, - act=None, - bias_attr=True) - -out = fluid.layers.conv2d( - input=conv1, - num_filters=8, - filter_size=3, - stride=2, - padding=1, - groups=1, - act=None, - bias_attr=True) - -place = fluid.CPUPlace() -exe = fluid.Executor(place) -# 创建网络中的参数变量,并初始化参数变量 -exe.run(fluid.default_startup_program()) - -# 如果存在预训练模型 -# def if_exist(var): -# return os.path.exists(os.path.join("./ShuffleNet", var.name)) -# fluid.io.load_vars(exe, "./pretrained_model", predicate=if_exist) -# 保存模型到model目录中,只保存与输入image和输出与推理相关的部分网络 -fluid.io.save_inference_model(dirname='./sample_model', feeded_var_names=['image'], target_vars = [out], executor=exe, model_filename='model', params_filename='params') -``` - -该程序运行结束后,会在本目录中生成一个sample_model目录,目录中包含model, params 两个文件,model文件表示模型的结构文件,params表示所有参数的融合文件。 - - -飞桨提供了**两种标准**的模型文件,一种为Combined方式, 一种为No-Combined的方式。 - -- Combined的方式 - -``` -fluid.io.save_inference_model(dirname='./sample_model', feeded_var_names=['image'], target_vars = [out], executor=exe, model_filename='model', params_filename='params') -``` -`model_filename`,`params_filename`表示要生成的模型结构文件、融合参数文件的名字。 - - -* No-Combined的方式 - -``` -fluid.io.save_inference_model(dirname='./sample_model', feeded_var_names=['image'], target_vars = [out], executor=exe) -``` - -如果不指定`model_filename`,`params_filename`,会在sample_model目录下生成`__model__` 模型结构文件,以及一系列的参数文件。 - - -在模型部署期间,**我们更推荐使用Combined的方式**,因为涉及模型上线加密的场景时,这种方式会更友好一些。 - - - -## 加载模型预测 - -1)使用load_inference方式 - -我们可以使用`load_inference_model` 接口加载训练好的模型(以`sample_model`模型举例),并复用训练框架的前向计算,直接完成推理。示例程序如下所示: - -``` -import paddle.fluid as fluid -import numpy as np - -data = np.ones((1, 3, 28, 28)).astype(np.float32) -exe = fluid.Executor(fluid.CPUPlace()) - -# 加载Combined的模型需要指定model_filename, params_filename -# 加载No-Combined的模型不需要指定model_filename, params_filename -[inference_program, feed_target_names, fetch_targets] = \ - fluid.io.load_inference_model(dirname='sample_model', executor=exe, model_filename='model', params_filename='params') - -with fluid.program_guard(inference_program): - results = exe.run(inference_program, - feed={feed_target_names[0]: data}, - fetch_list=fetch_targets, return_numpy=False) - - print (np.array(results[0]).shape) - # (1, 8, 7, 7) -``` - -在上述方式中,在模型加载后会按照执行顺序将所有的OP进行拓扑排序,在运行期间Op会按照排序一一运行,整个过程中运行的为训练中前向的OP,期间不会有任何的优化(OP融合,显存优化,预测Kernel针对优化)。 因此,`load_inference_model`的方式预测期间很可能不会有很好的性能表现,此方式比较适合用来做实验(测试模型的效果、正确性等)使用,并不适用于真正的部署上线。接下来我们会重点介绍Paddle Inference的使用。 - -2)使用Paddle Inference Api 方式 - -不同于 `load_inference_model` 方式,Paddle Inference 在模型加载后会进行一系列的优化,包括: Kernel优化,OP横向,纵向融合,显存/内存优化,以及MKLDNN,TensorRT的集成等,性能和吞吐会得到大幅度的提升。这些优化会在之后的文档中进行详细的介绍。 - -那我们先用一个简单的代码例子来介绍Paddle Inference 的使用。 - 
-```Python -from paddle.fluid.core import AnalysisConfig -from paddle.fluid.core import create_paddle_predictor - -import numpy as np - -# 配置运行信息 -# config = AnalysisConfig("./sample_model") # 加载non-combined 模型格式 -config = AnalysisConfig("./sample_model/model", "./sample_model/params") # 加载combine的模型格式 - -config.switch_use_feed_fetch_ops(False) -config.enable_memory_optim() -config.enable_use_gpu(1000, 0) - -# 根据config创建predictor -predictor = create_paddle_predictor(config) - -img = np.ones((1, 3, 28, 28)).astype(np.float32) - -# 准备输入 -input_names = predictor.get_input_names() -input_tensor = predictor.get_input_tensor(input_names[0]) -input_tensor.reshape(img.shape) -input_tensor.copy_from_cpu(img.copy()) - -# 运行 -predictor.zero_copy_run() - -# 获取输出 -output_names = predictor.get_output_names() -output_tensor = predictor.get_output_tensor(output_names[0]) -output_data = output_tensor.copy_to_cpu() - -print (output_data) -``` - - -上述的代码例子,我们通过加载一个简答模型以及随机输入的方式,展示了如何使用Paddle Inference进行模型预测。可能对于刚接触Paddle Inferenece同学来说,代码中会有一些陌生名词出现,比如AnalysisConfig, Predictor 等。先不要着急,接下来的文章中会对这些概念进行详细的介绍。 - - -

-**相关链接**

- -[Python API 使用介绍](../user_guides/inference_python_api) - -[C++ API使用介绍](../user_guides/cxx_api) - -[Python 使用样例](https://github.com/PaddlePaddle/Paddle-Inference-Demo/tree/master/python) - -[C++ 使用样例](https://github.com/PaddlePaddle/Paddle-Inference-Demo/tree/master/c%2B%2B) - diff --git a/docs/introduction/quick_start.rst b/docs/introduction/quick_start.rst new file mode 100644 index 0000000000000..c00ba9cea6292 --- /dev/null +++ b/docs/introduction/quick_start.rst @@ -0,0 +1,176 @@ +Quick Start +================= + +**前提准备** +接下来我们会通过几段Python代码的方式对Paddle Inference使用进行介绍, +为了能够成功运行代码,请您在环境中(Mac, Windows,Linux)安装不低于1.7版本的Paddle, +安装Paddle 请参考 `飞桨官网主页 `_。 + +导出预测模型文件 +---------------- + +在模型训练期间,我们通常使用Python来构建模型结构,比如: + +.. code:: python + + import paddle.fluid as fluid + res = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu", param_attr=param_attr) + +在模型部署时,我们需要提前将这种Python表示的结构以及参数序列化到磁盘中。那是如何做到的呢? + +在模型训练过程中或者模型训练结束后,我们可以通过save_inference_model接口来导出标准化的模型文件。 + +我们用一个简单的代码例子来展示下导出模型文件的这一过程。 + + +.. code:: python + + import paddle + import paddle.fluid as fluid + # 建立一个简单的网络,网络的输入的shape为[batch, 3, 28, 28] + image_shape = [3, 28, 28] + + img = fluid.layers.data(name='image', shape=image_shape, dtype='float32', append_batch_size=True) + # 模型包含两个Conv层 + conv1 = fluid.layers.conv2d( + input=img, + num_filters=8, + filter_size=3, + stride=2, + padding=1, + groups=1, + act=None, + bias_attr=True) + + out = fluid.layers.conv2d( + input=conv1, + num_filters=8, + filter_size=3, + stride=2, + padding=1, + groups=1, + act=None, + bias_attr=True) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + # 创建网络中的参数变量,并初始化参数变量 + exe.run(fluid.default_startup_program()) + + # 如果存在预训练模型 + # def if_exist(var): + # return os.path.exists(os.path.join("./ShuffleNet", var.name)) + # fluid.io.load_vars(exe, "./pretrained_model", predicate=if_exist) + # 保存模型到model目录中,只保存与输入image和输出与推理相关的部分网络 + fluid.io.save_inference_model(dirname='./sample_model', feeded_var_names=['image'], target_vars = [out], executor=exe, model_filename='model', params_filename='params') + +该程序运行结束后,会在本目录中生成一个sample_model目录,目录中包含model, params 两个文件,model文件表示模型的结构文件,params表示所有参数的融合文件。 + + +飞桨提供了 **两种标准** 的模型文件,一种为Combined方式, 一种为No-Combined的方式。 + +- Combined的方式 + +.. code:: python + + fluid.io.save_inference_model(dirname='./sample_model', feeded_var_names=['image'], target_vars = [out], executor=exe, model_filename='model', params_filename='params') + +model_filename,params_filename表示要生成的模型结构文件、融合参数文件的名字。 + + +* No-Combined的方式 + +.. code:: python + + fluid.io.save_inference_model(dirname='./sample_model', feeded_var_names=['image'], target_vars = [out], executor=exe) + +如果不指定model_filename,params_filename,会在sample_model目录下生成__model__ 模型结构文件,以及一系列的参数文件。 + + +在模型部署期间,**我们更推荐使用Combined的方式**,因为涉及模型上线加密的场景时,这种方式会更友好一些。 + + + +加载模型预测 +---------------- + +1)使用load_inference方式 + +我们可以使用load_inference_model接口加载训练好的模型(以sample_model模型举例),并复用训练框架的前向计算,直接完成推理。 +示例程序如下所示: + +.. 
code:: python + + import paddle.fluid as fluid + import numpy as np + + data = np.ones((1, 3, 28, 28)).astype(np.float32) + exe = fluid.Executor(fluid.CPUPlace()) + + # 加载Combined的模型需要指定model_filename, params_filename + # 加载No-Combined的模型不需要指定model_filename, params_filename + [inference_program, feed_target_names, fetch_targets] = \ + fluid.io.load_inference_model(dirname='sample_model', executor=exe, model_filename='model', params_filename='params') + + with fluid.program_guard(inference_program): + results = exe.run(inference_program, + feed={feed_target_names[0]: data}, + fetch_list=fetch_targets, return_numpy=False) + + print (np.array(results[0]).shape) + # (1, 8, 7, 7) + +在上述方式中,在模型加载后会按照执行顺序将所有的OP进行拓扑排序,在运行期间Op会按照排序一一运行,整个过程中运行的为训练中前向的OP,期间不会有任何的优化(OP融合,显存优化,预测Kernel针对优化)。 因此,load_inference_model的方式预测期间很可能不会有很好的性能表现,此方式比较适合用来做实验(测试模型的效果、正确性等)使用,并不适用于真正的部署上线。接下来我们会重点介绍Paddle Inference的使用。 + +2)使用Paddle Inference API方式 + +不同于 load_inference_model方式,Paddle Inference 在模型加载后会进行一系列的优化,包括: Kernel优化,OP横向,纵向融合,显存/内存优化,以及MKLDNN,TensorRT的集成等,性能和吞吐会得到大幅度的提升。这些优化会在之后的文档中进行详细的介绍。 + +那我们先用一个简单的代码例子来介绍Paddle Inference 的使用。 + +.. code:: + + from paddle.fluid.core import AnalysisConfig + from paddle.fluid.core import create_paddle_predictor + + import numpy as np + + # 配置运行信息 + # config = AnalysisConfig("./sample_model") # 加载non-combined 模型格式 + config = AnalysisConfig("./sample_model/model", "./sample_model/params") # 加载combine的模型格式 + + config.switch_use_feed_fetch_ops(False) + config.enable_memory_optim() + config.enable_use_gpu(1000, 0) + + # 根据config创建predictor + predictor = create_paddle_predictor(config) + + img = np.ones((1, 3, 28, 28)).astype(np.float32) + + # 准备输入 + input_names = predictor.get_input_names() + input_tensor = predictor.get_input_tensor(input_names[0]) + input_tensor.reshape(img.shape) + input_tensor.copy_from_cpu(img.copy()) + + # 运行 + predictor.zero_copy_run() + + # 获取输出 + output_names = predictor.get_output_names() + output_tensor = predictor.get_output_tensor(output_names[0]) + output_data = output_tensor.copy_to_cpu() + + print (output_data) + +上述的代码例子,我们通过加载一个简答模型以及随机输入的方式,展示了如何使用Paddle Inference进行模型预测。可能对于刚接触Paddle Inferenece同学来说,代码中会有一些陌生名词出现,比如AnalysisConfig, Predictor 等。先不要着急,接下来的文章中会对这些概念进行详细的介绍。 + + +**相关链接** + +`Python API 使用介绍 <../user_guides/inference_python_api.html>`_ +`C++ API使用介绍 <../user_guides/cxx_api.html>`_ +`Python 使用样例 `_ +`C++ 使用样例 `_ + diff --git a/docs/introduction/summary.md b/docs/introduction/summary.md index 5935151a26e01..16ead666d12c2 100644 --- a/docs/introduction/summary.md +++ b/docs/introduction/summary.md @@ -1,9 +1,11 @@ - -# 概述 +概述 +======== Paddle Inference为飞桨核心框架推理引擎。Paddle Inference功能特性丰富,性能优异,针对不同平台不同的应用场景进行了深度的适配优化,做到高吞吐、低时延,保证了飞桨模型在服务器端即训即用,快速部署。 -### 特性 +特性 +------- + - 通用性。支持对Paddle训练出的所有模型进行预测。 - 内存/显存复用。在推理初始化阶段,对模型中的OP输出Tensor 进行依赖分析,将两两互不依赖的Tensor在内存/显存空间上进行复用,进而增大计算并行量,提升服务吞吐量。 @@ -15,26 +17,28 @@ Paddle Inference为飞桨核心框架推理引擎。Paddle Inference功能特性 - 高性能CPU/GPU Kernel。内置同Intel、Nvidia共同打造的高性能kernel,保证了模型推理高性能的执行。 -- 子图集成[TensorRT](https://developer.nvidia.com/tensorrt)。Paddle Inference采用子图的形式集成TensorRT,针对GPU推理场景,TensorRT可对一些子图进行优化,包括OP的横向和纵向融合,过滤冗余的OP,并为OP自动选择最优的kernel,加快推理速度。 +- 子图集成 `TensorRT `_。Paddle Inference采用子图的形式集成TensorRT,针对GPU推理场景,TensorRT可对一些子图进行优化,包括OP的横向和纵向融合,过滤冗余的OP,并为OP自动选择最优的kernel,加快推理速度。 - 集成MKLDNN -- 支持加载PaddleSlim量化压缩后的模型。 [PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim)是飞桨深度学习模型压缩工具,Paddle Inference可联动PaddleSlim,支持加载量化、裁剪和蒸馏后的模型并部署,由此减小模型存储空间、减少计算占用内存、加快模型推理速度。其中在模型量化方面,[Paddle Inference在X86 
CPU上做了深度优化](https://github.com/PaddlePaddle/PaddleSlim/tree/80c9fab3f419880dd19ca6ea30e0f46a2fedf6b3/demo/mkldnn_quant/quant_aware),常见分类模型的单线程性能可提升近3倍,ERNIE模型的单线程性能可提升2.68倍。 +- 支持加载PaddleSlim量化压缩后的模型。 `PaddleSlim `_ 是飞桨深度学习模型压缩工具,Paddle Inference可联动PaddleSlim,支持加载量化、裁剪和蒸馏后的模型并部署,由此减小模型存储空间、减少计算占用内存、加快模型推理速度。其中在模型量化方面,`Paddle Inference在X86 CPU上做了深度优化 `_ ,常见分类模型的单线程性能可提升近3倍,ERNIE模型的单线程性能可提升2.68倍。 -### 支持系统及硬件 +支持系统及硬件 +------------ 支持服务器端X86 CPU、NVIDIA GPU芯片,兼容Linux/macOS/Windows系统。 同时也支持NVIDIA Jetson嵌入式平台。 -### 语言支持 +语言支持 +------------ - 支持Pyhton语言 - 支持C++ 语言 - 支持Go语言 - 支持R语言 -
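The features listed above all surface through the `AnalysisConfig` C++ API whose header is added later in this patch (`docs/paddle_include_file/paddle_analysis_config.h`). As a rough sketch only — the model paths, memory-pool size and TensorRT sizes below are placeholders, not values prescribed by these docs — turning on GPU execution, the TensorRT subgraph engine and memory/显存 reuse looks roughly like this:

```c++
#include "paddle_inference_api.h"

// Minimal sketch: wire up the optimizations described above via AnalysisConfig.
// "./model_dir/model" and "./model_dir/params" are placeholder paths for a
// combined-format model.
int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./model_dir/model", "./model_dir/params");

  config.EnableUseGpu(100, 0);   // 100 MB initial GPU memory pool, device 0
  config.EnableTensorRtEngine(1 << 30 /*workspace_size*/,
                              1 /*max_batch_size*/,
                              3 /*min_subgraph_size*/,
                              paddle::AnalysisConfig::Precision::kFloat32,
                              false /*use_static*/,
                              false /*use_calib_mode*/);
  config.EnableMemoryOptim();    // memory/GPU-memory reuse described above
  // config.EnableMKLDNN();      // CPU-side alternative when no GPU is used

  auto predictor = paddle::CreatePaddlePredictor(config);
  return predictor != nullptr ? 0 : 1;
}
```

The same switches are exposed in the Python `AnalysisConfig` as well, as the quick start above already shows with `enable_use_gpu` and `enable_memory_optim`.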

-### 下一步 +**下一步**

+**下一步** -- 如果您刚接触Paddle Inference, 请访问[Quick start](./quick_start)。 +- 如果您刚接触Paddle Inference, 请访问 `Quick start <./quick_start.html>`_。 diff --git a/docs/paddle_include_file/paddle_analysis_config.h b/docs/paddle_include_file/paddle_analysis_config.h new file mode 100644 index 0000000000000..2002d1f76abfe --- /dev/null +++ b/docs/paddle_include_file/paddle_analysis_config.h @@ -0,0 +1,579 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file paddle_analysis_config.h +/// +/// \brief Paddle Analysis Config API信息 +/// +/// \author paddle-infer@baidu.com +/// \date 2020-03-20 +/// \since 1.7 +/// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +/*! \file */ + +// Here we include some header files with relative paths, for that in deploy, +// the abstract path of this header file will be changed. +#include "paddle_api.h" // NOLINT +#include "paddle_pass_builder.h" // NOLINT +#ifdef PADDLE_WITH_MKLDNN +#include "paddle_mkldnn_quantizer_config.h" // NOLINT +#endif + +namespace paddle { + +class AnalysisPredictor; +struct MkldnnQuantizerConfig; + +/// +/// \brief configuration manager for AnalysisPredictor. +/// \since 1.7.0 +/// +/// AnalysisConfig manages configurations of AnalysisPredictor. +/// During inference procedure, there are many parameters(model/params path, +/// place of inference, etc.) +/// to be specified, and various optimizations(subgraph fusion, memory +/// optimazation, TensorRT engine, etc.) +/// to be done. Users can manage these settings by creating and modifying an +/// AnalysisConfig, +/// and loading it into AnalysisPredictor. +/// +struct AnalysisConfig { + AnalysisConfig() = default; + /// + /// \brief Construct a new AnalysisConfig from another + /// AnalysisConfig. + /// + /// \param[in] other another AnalysisConfig + /// + explicit AnalysisConfig(const AnalysisConfig& other); + /// + /// \brief Construct a new AnalysisConfig from a no-combined model. + /// + /// \param[in] model_dir model directory of the no-combined model. + /// + explicit AnalysisConfig(const std::string& model_dir); + /// + /// \brief Construct a new AnalysisConfig from a combined model. + /// + /// \param[in] prog_file model file path of the combined model. + /// \param[in] params_file params file path of the combined model. + /// + explicit AnalysisConfig(const std::string& prog_file, + const std::string& params_file); + /// + /// \brief Precision of inference in TensorRT. + /// + enum class Precision { + kFloat32 = 0, ///< fp32 + kInt8, ///< int8 + kHalf, ///< fp16 + }; + + /// + /// \brief Set the no-combined model dir path. + /// + /// \param model_dir model dir path. + /// + void SetModel(const std::string& model_dir) { model_dir_ = model_dir; } + + /// + /// \brief Set the combined model with two specific pathes for program and + /// parameters. + /// + /// \param prog_file_path model file path of the combined model. 
+ /// \param params_file_path params file path of the combined model. + /// + void SetModel(const std::string& prog_file_path, + const std::string& params_file_path); + /// + /// \brief Set the model file path of a combined model. + /// + /// \param x model file path. + /// + void SetProgFile(const std::string& x) { prog_file_ = x; } + /// + /// \brief Set the params file path of a combined model. + /// + /// \param x params file path. + /// + void SetParamsFile(const std::string& x) { params_file_ = x; } + + /// + /// \brief Set the path of optimization cache directory. + /// + /// \param opt_cache_dir the path of optimization cache directory. + /// + void SetOptimCacheDir(const std::string& opt_cache_dir) { + opt_cache_dir_ = opt_cache_dir; + } + /// + /// \brief Get the model directory path. + /// + /// \return const std::string& The model directory path. + /// + const std::string& model_dir() const { return model_dir_; } + /// + /// \brief Get the program file path. + /// + /// \return const std::string& The program file path. + /// + const std::string& prog_file() const { return prog_file_; } + /// + /// \brief Get the combined parameters file. + /// + /// \return const std::string& The combined parameters file. + /// + const std::string& params_file() const { return params_file_; } + + // Padding related. + + /// + /// \brief Turn off FC Padding. + /// + /// + void DisableFCPadding(); + /// + /// \brief A boolean state telling whether fc padding is used. + /// + /// \return bool Whether fc padding is used. + /// + bool use_fc_padding() const { return use_fc_padding_; } + + // GPU related. + + /// + /// \brief Turn on GPU. + /// + /// \param memory_pool_init_size_mb initial size of the GPU memory pool in MB. + /// \param device_id device_id the GPU card to use (default is 0). + /// + void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0); + /// + /// \brief Turn off GPU. + /// + /// + void DisableGpu(); + /// + /// \brief A boolean state telling whether the GPU is turned on. + /// + /// \return bool Whether the GPU is turned on. + /// + bool use_gpu() const { return use_gpu_; } + /// + /// \brief Get the GPU device id. + /// + /// \return int The GPU device id. + /// + int gpu_device_id() const { return device_id_; } + /// + /// \brief Get the initial size in MB of the GPU memory pool. + /// + /// \return int The initial size in MB of the GPU memory pool. + /// + int memory_pool_init_size_mb() const { return memory_pool_init_size_mb_; } + /// + /// \brief Get the proportion of the initial memory pool size compared to the + /// device. + /// + /// \return float The proportion of the initial memory pool size. + /// + float fraction_of_gpu_memory_for_pool() const; + + // CUDNN related. + /// + /// \brief Turn on CUDNN. + /// + /// + void EnableCUDNN(); + /// + /// \brief A boolean state telling whether to use CUDNN. + /// + /// \return bool Whether to use CUDNN. + /// + bool cudnn_enabled() const { return use_cudnn_; } + + /// + /// \brief Control whether to perform IR graph optimization. + /// If turned off, the AnalysisConfig will act just like a NativeConfig. + /// + /// \param x Whether the ir graph optimization is actived. + /// + void SwitchIrOptim(int x = true) { enable_ir_optim_ = x; } + /// + /// \brief A boolean state telling whether the ir graph optimization is + /// actived. + /// + /// \return bool Whether to use ir graph optimization. 
+ /// + bool ir_optim() const { return enable_ir_optim_; } + + /// + /// \brief INTERNAL Determine whether to use the feed and fetch operators. + /// Just for internal development, not stable yet. + /// When ZeroCopyTensor is used, this should be turned off. + /// + /// \param x Whether to use the feed and fetch operators. + /// + void SwitchUseFeedFetchOps(int x = true) { use_feed_fetch_ops_ = x; } + /// + /// \brief A boolean state telling whether to use the feed and fetch + /// operators. + /// + /// \return bool Whether to use the feed and fetch operators. + /// + bool use_feed_fetch_ops_enabled() const { return use_feed_fetch_ops_; } + + /// + /// \brief Control whether to specify the inputs' names. + /// The ZeroCopyTensor type has a name member, assign it with the + /// corresponding + /// variable name. This is used only when the input ZeroCopyTensors passed to + /// the + /// AnalysisPredictor.ZeroCopyRun() cannot follow the order in the training + /// phase. + /// + /// \param x Whether to specify the inputs' names. + /// + void SwitchSpecifyInputNames(bool x = true) { specify_input_name_ = x; } + /// + /// \brief A boolean state tell whether the input ZeroCopyTensor names + /// specified should + /// be used to reorder the inputs in AnalysisPredictor.ZeroCopyRun(). + /// + /// \return bool Whether to specify the inputs' names. + /// + bool specify_input_name() const { return specify_input_name_; } + + /// + /// \brief Turn on the TensorRT engine. + /// The TensorRT engine will accelerate some subgraphes in the original Fluid + /// computation graph. In some models such as resnet50, GoogleNet and so on, + /// it gains significant performance acceleration. + /// + /// \param workspace_size The memory size(in byte) used for TensorRT + /// workspace. + /// \param max_batch_size The maximum batch size of this prediction task, + /// better set as small as possible for less performance loss. + /// \param min_subgrpah_size The minimum TensorRT subgraph size needed, if a + /// subgraph is smaller than this, it will not be transferred to TensorRT + /// engine. + /// \param precision The precision used in TensorRT. + /// \param use_static Serialize optimization information to disk for reusing. + /// \param use_calib_mode Use TRT int8 calibration(post training + /// quantization). + /// + /// + void EnableTensorRtEngine(int workspace_size = 1 << 20, + int max_batch_size = 1, int min_subgraph_size = 3, + Precision precision = Precision::kFloat32, + bool use_static = false, + bool use_calib_mode = true); + /// + /// \brief A boolean state telling whether the TensorRT engine is used. + /// + /// \return bool Whether the TensorRT engine is used. + /// + bool tensorrt_engine_enabled() const { return use_tensorrt_; } + /// + /// \brief Set min, max, opt shape for TensorRT Dynamic shape mode. + /// \param min_input_shape The min input shape of the subgraph input. + /// \param max_input_shape The max input shape of the subgraph input. + /// \param opt_input_shape The opt input shape of the subgraph input. + /// \param disable_trt_plugin_fp16 Setting this parameter to true means that + /// TRT plugin will not run fp16. + /// + void SetTRTDynamicShapeInfo( + std::map> min_input_shape, + std::map> max_input_shape, + std::map> optim_input_shape, + bool disable_trt_plugin_fp16 = false); + /// + /// \brief Turn on the usage of Lite sub-graph engine. + /// + /// \param precision_mode Precion used in Lite sub-graph engine. + /// \param passes_filter Set the passes used in Lite sub-graph engine. 
+ /// \param ops_filter Operators not supported by Lite. + /// + void EnableLiteEngine( + AnalysisConfig::Precision precision_mode = Precision::kFloat32, + const std::vector& passes_filter = {}, + const std::vector& ops_filter = {}); + + /// + /// \brief A boolean state indicating whether the Lite sub-graph engine is + /// used. + /// + /// \return bool whether the Lite sub-graph engine is used. + /// + bool lite_engine_enabled() const { return use_lite_; } + + /// + /// \brief Control whether to debug IR graph analysis phase. + /// This will generate DOT files for visualizing the computation graph after + /// each analysis pass applied. + /// + /// \param x whether to debug IR graph analysis phase. + /// + void SwitchIrDebug(int x = true); + + /// + /// \brief Turn on MKLDNN. + /// + /// + void EnableMKLDNN(); + /// + /// \brief Set the cache capacity of different input shapes for MKLDNN. + /// Default value 0 means not caching any shape. + /// + /// \param capacity The cache capacity. + /// + void SetMkldnnCacheCapacity(int capacity); + /// + /// \brief A boolean state telling whether to use the MKLDNN. + /// + /// \return bool Whether to use the MKLDNN. + /// + bool mkldnn_enabled() const { return use_mkldnn_; } + + /// + /// \brief Set the number of cpu math library threads. + /// + /// \param cpu_math_library_num_threads The number of cpu math library + /// threads. + /// + void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads); + /// + /// \brief An int state telling how many threads are used in the CPU math + /// library. + /// + /// \return int The number of threads used in the CPU math library. + /// + int cpu_math_library_num_threads() const { + return cpu_math_library_num_threads_; + } + + /// + /// \brief Transform the AnalysisConfig to NativeConfig. + /// + /// \return NativeConfig The NativeConfig transformed. + /// + NativeConfig ToNativeConfig() const; + /// + /// \brief Specify the operator type list to use MKLDNN acceleration. + /// + /// \param op_list The operator type list. + /// + void SetMKLDNNOp(std::unordered_set op_list) { + mkldnn_enabled_op_types_ = op_list; + } + + /// + /// \brief Turn on MKLDNN quantization. + /// + /// + void EnableMkldnnQuantizer(); + + /// + /// \brief A boolean state telling whether the MKLDNN quantization is enabled. + /// + /// \return bool Whether the MKLDNN quantization is enabled. + /// + bool mkldnn_quantizer_enabled() const { return use_mkldnn_quantizer_; } + + /// + /// \brief Get MKLDNN quantizer config. + /// + /// \return MkldnnQuantizerConfig* MKLDNN quantizer config. + /// + MkldnnQuantizerConfig* mkldnn_quantizer_config() const; + + /// + /// \brief Specify the memory buffer of program and parameter. + /// Used when model and params are loaded directly from memory. + /// + /// \param prog_buffer The memory buffer of program. + /// \param prog_buffer_size The size of the model data. + /// \param params_buffer The memory buffer of the combined parameters file. + /// \param params_buffer_size The size of the combined parameters data. + /// + void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size, + const char* params_buffer, size_t params_buffer_size); + /// + /// \brief A boolean state telling whether the model is set from the CPU + /// memory. + /// + /// \return bool Whether model and params are loaded directly from memory. + /// + bool model_from_memory() const { return model_from_memory_; } + + /// + /// \brief Turn on memory optimize + /// NOTE still in development. 
+ /// + void EnableMemoryOptim(); + /// + /// \brief A boolean state telling whether the memory optimization is + /// activated. + /// + /// \return bool Whether the memory optimization is activated. + /// + bool enable_memory_optim() const; + + /// + /// \brief Turn on profiling report. + /// If not turned on, no profiling report will be generated. + /// + void EnableProfile(); + /// + /// \brief A boolean state telling whether the profiler is activated. + /// + /// \return bool Whether the profiler is activated. + /// + bool profile_enabled() const { return with_profile_; } + + /// + /// \brief Mute all logs in Paddle inference. + /// + void DisableGlogInfo(); + /// + /// \brief A boolean state telling whether logs in Paddle inference are muted. + /// + /// \return bool Whether logs in Paddle inference are muted. + /// + bool glog_info_disabled() const { return !with_glog_info_; } + + /// + /// \brief Set the AnalysisConfig to be invalid. + /// This is to ensure that an AnalysisConfig can only be used in one + /// AnalysisPredictor. + /// + void SetInValid() const { is_valid_ = false; } + /// + /// \brief A boolean state telling whether the AnalysisConfig is valid. + /// + /// \return bool Whether the AnalysisConfig is valid. + /// + bool is_valid() const { return is_valid_; } + + friend class ::paddle::AnalysisPredictor; + + /// + /// \brief Get a pass builder for customize the passes in IR analysis phase. + /// NOTE: Just for developer, not an official API, easy to be broken. + /// + /// + PassStrategy* pass_builder() const; + void PartiallyRelease(); + + protected: + // Update the config. + void Update(); + + std::string SerializeInfoCache(); + + protected: + // Model pathes. + std::string model_dir_; + mutable std::string prog_file_; + mutable std::string params_file_; + + // GPU related. + bool use_gpu_{false}; + int device_id_{0}; + uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. + + bool use_cudnn_{false}; + + // Padding related + bool use_fc_padding_{true}; + + // TensorRT related. + bool use_tensorrt_{false}; + // For workspace_size, refer it from here: + // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting + int tensorrt_workspace_size_{1 << 30}; + // While TensorRT allows an engine optimized for a given max batch size + // to run at any smaller size, the performance for those smaller + // sizes may not be as well-optimized. Therefore, Max batch is best + // equivalent to the runtime batch size. + int tensorrt_max_batchsize_{1}; + // We transform the Ops that can be converted into TRT layer in the model, + // and aggregate these Ops into subgraphs for TRT execution. + // We set this variable to control the minimum number of nodes in the + // subgraph, 3 as default value. + int tensorrt_min_subgraph_size_{3}; + Precision tensorrt_precision_mode_{Precision::kFloat32}; + bool trt_use_static_engine_{false}; + bool trt_use_calib_mode_{true}; + std::map> min_input_shape_{}; + std::map> max_input_shape_{}; + std::map> optim_input_shape_{}; + bool disable_trt_plugin_fp16_{false}; + + // memory reuse related. 
+ bool enable_memory_optim_{false}; + + bool use_mkldnn_{false}; + std::unordered_set mkldnn_enabled_op_types_; + + bool model_from_memory_{false}; + + bool enable_ir_optim_{true}; + bool use_feed_fetch_ops_{true}; + bool ir_debug_{false}; + + bool specify_input_name_{false}; + + int cpu_math_library_num_threads_{1}; + + bool with_profile_{false}; + + bool with_glog_info_{true}; + + // A runtime cache, shouldn't be transferred to others. + std::string serialized_info_cache_; + + mutable std::unique_ptr pass_builder_; + + bool use_lite_{false}; + std::vector lite_passes_filter_; + std::vector lite_ops_filter_; + Precision lite_precision_mode_; + + // mkldnn related. + int mkldnn_cache_capacity_{0}; + bool use_mkldnn_quantizer_{false}; + std::shared_ptr mkldnn_quantizer_config_; + + // If the config is already used on a predictor, it becomes invalid. + // Any config can only be used with one predictor. + // Variables held by config can take up a lot of memory in some cases. + // So we release the memory when the predictor is set up. + mutable bool is_valid_{true}; + std::string opt_cache_dir_; +}; + +} // namespace paddle diff --git a/docs/paddle_include_file/paddle_api.h b/docs/paddle_include_file/paddle_api.h new file mode 100644 index 0000000000000..240ec08b72240 --- /dev/null +++ b/docs/paddle_include_file/paddle_api.h @@ -0,0 +1,407 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +/*! \file paddle_api.h + */ + +/*! \mainpage Paddle Inference APIs + * \section intro_sec Introduction + * The Paddle inference library aims to offer an high performance inference SDK + * for Paddle users. + */ + +#include +#include +#include +#include +#include + +/*! \namespace paddle + */ +namespace paddle { + +/// \brief Paddle data type. +enum PaddleDType { + FLOAT32, + INT64, + INT32, + UINT8, + // TODO(Superjomn) support more data types if needed. +}; + +/// \brief Memory manager for PaddleTensor. +/// +/// The PaddleBuf holds a buffer for data input or output. The memory can be +/// allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf +/// should be reused for better performance. +/// +/// For user allocated memory, the following API can be used: +/// - PaddleBuf(void* data, size_t length) to set an external memory by +/// specifying the memory address and length. +/// - Reset(void* data, size_t length) to reset the PaddleBuf with an external +/// memory. +/// ATTENTION, for user allocated memory, deallocation should be done by users +/// externally after the program finished. The PaddleBuf won't do any allocation +/// or deallocation. +/// +/// To have the PaddleBuf allocate and manage the memory: +/// - PaddleBuf(size_t length) will allocate a memory of size `length`. +/// - Resize(size_t length) resize the memory to no less than `length`, +/// ATTENTION +/// if the allocated memory is larger than `length`, nothing will done. 
+/// +/// Usage: +/// +/// Let PaddleBuf manage the memory internally. +/// \code{cpp} +/// const int num_elements = 128; +/// PaddleBuf buf(num_elements/// sizeof(float)); +/// \endcode +/// +/// Or +/// \code{cpp} +/// PaddleBuf buf; +/// buf.Resize(num_elements/// sizeof(float)); +/// \endcode +/// Works the exactly the same. +/// +/// One can also make the `PaddleBuf` use the external memory. +/// \code{cpp} +/// PaddleBuf buf; +/// void* external_memory = new float[num_elements]; +/// buf.Reset(external_memory, num_elements*sizeof(float)); +/// ... +/// delete[] external_memory; // manage the memory lifetime outside. +/// \endcode +/// +class PaddleBuf { + public: + /// + /// \brief PaddleBuf allocate memory internally, and manage it. + /// + /// \param[in] length The length of data. + /// + explicit PaddleBuf(size_t length) + : data_(new char[length]), length_(length), memory_owned_(true) {} + /// + /// \brief Set external memory, the PaddleBuf won't manage it. + /// + /// \param[in] data The start address of the external memory. + /// \param[in] length The length of data. + /// + PaddleBuf(void* data, size_t length) + : data_(data), length_(length), memory_owned_{false} {} + /// + /// \brief Copy only available when memory is managed externally. + /// + /// \param[in] other another `PaddleBuf` + /// + explicit PaddleBuf(const PaddleBuf& other); + /// + /// \brief Resize the memory. + /// + /// \param[in] length The length of data. + /// + void Resize(size_t length); + /// + /// \brief Reset to external memory, with address and length set. + /// + /// \param[in] data The start address of the external memory. + /// \param[in] length The length of data. + /// + void Reset(void* data, size_t length); + /// + /// \brief Tell whether the buffer is empty. + /// + bool empty() const { return length_ == 0; } + /// + /// \brief Get the data's memory address. + /// + void* data() const { return data_; } + /// + /// \brief Get the memory length. + /// + size_t length() const { return length_; } + + ~PaddleBuf() { Free(); } + PaddleBuf& operator=(const PaddleBuf&); + PaddleBuf& operator=(PaddleBuf&&); + PaddleBuf() = default; + PaddleBuf(PaddleBuf&& other); + + private: + void Free(); + void* data_{nullptr}; ///< pointer to the data memory. + size_t length_{0}; ///< number of memory bytes. + bool memory_owned_{true}; +}; + +/// +/// \brief Basic input and output data structure for PaddlePredictor. +/// +struct PaddleTensor { + PaddleTensor() = default; + std::string name; ///< variable name. + std::vector shape; + PaddleBuf data; ///< blob of data. + PaddleDType dtype; + std::vector> lod; ///< Tensor+LoD equals LoDTensor +}; + +enum class PaddlePlace { kUNK = -1, kCPU, kGPU }; + +/// \brief Represents an n-dimensional array of values. +/// The ZeroCopyTensor is used to store the input or output of the network. +/// Zero copy means that the tensor supports direct copy of host or device data +/// to device, +/// eliminating additional CPU copy. ZeroCopyTensor is only used in the +/// AnalysisPredictor. +/// It is obtained through PaddlePredictor::GetinputTensor() +/// and PaddlePredictor::GetOutputTensor() interface. +class ZeroCopyTensor { + public: + /// \brief Reset the shape of the tensor. + /// Generally it's only used for the input tensor. + /// Reshape must be called before calling mutable_data() or copy_from_cpu() + /// \param shape The shape to set. + void Reshape(const std::vector& shape); + + /// \brief Get the memory pointer in CPU or GPU with specific data type. 
+ /// Please Reshape the tensor first before call this. + /// It's usually used to get input data pointer. + /// \param place The place of the tensor. + template + T* mutable_data(PaddlePlace place); + + /// \brief Get the memory pointer directly. + /// It's usually used to get the output data pointer. + /// \param[out] place To get the device type of the tensor. + /// \param[out] size To get the data size of the tensor. + /// \return The tensor data buffer pointer. + template + T* data(PaddlePlace* place, int* size) const; + + /// \brief Copy the host memory to tensor data. + /// It's usually used to set the input tensor data. + /// \param data The pointer of the data, from which the tensor will copy. + template + void copy_from_cpu(const T* data); + + /// \brief Copy the tensor data to the host memory. + /// It's usually used to get the output tensor data. + /// \param[out] data The tensor will copy the data to the address. + template + void copy_to_cpu(T* data); + + /// \brief Return the shape of the Tensor. + std::vector shape() const; + + /// \brief Set lod info of the tensor. + /// More about LOD can be seen here: + /// https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#lodtensor + /// \param x the lod info. + void SetLoD(const std::vector>& x); + /// \brief Return the lod info of the tensor. + std::vector> lod() const; + /// \brief Return the name of the tensor. + const std::string& name() const { return name_; } + void SetPlace(PaddlePlace place, int device = -1) { + place_ = place; + device_ = device; + } + + /// \brief Return the data type of the tensor. + /// It's usually used to get the output tensor data type. + /// \return The data type of the tensor. + PaddleDType type() const; + + protected: + explicit ZeroCopyTensor(void* scope) : scope_{scope} {} + void SetName(const std::string& name) { name_ = name; } + void* FindTensor() const; + + private: + std::string name_; + bool input_or_output_; + friend class AnalysisPredictor; + void* scope_{nullptr}; + // The corresponding tensor pointer inside Paddle workspace is cached for + // performance. + mutable void* tensor_{nullptr}; + PaddlePlace place_; + PaddleDType dtype_; + int device_; +}; + +/// \brief A Predictor for executing inference on a model. +/// Base class for AnalysisPredictor and NativePaddlePredictor. +class PaddlePredictor { + public: + struct Config; + PaddlePredictor() = default; + PaddlePredictor(const PaddlePredictor&) = delete; + PaddlePredictor& operator=(const PaddlePredictor&) = delete; + + /// \brief This interface takes input and runs the network. + /// There are redundant copies of data between hosts in this operation, + /// so it is more recommended to use the zecopyrun interface + /// \param[in] inputs An list of PaddleTensor as the input to the network. + /// \param[out] output_data Pointer to the tensor list, which holds the output + /// paddletensor + /// \param[in] batch_size This setting has been discarded and can be ignored. + /// \return Whether the run is successful + virtual bool Run(const std::vector& inputs, + std::vector* output_data, + int batch_size = -1) = 0; + + /// \brief Used to get the name of the network input. + /// Be inherited by AnalysisPredictor, Only used in ZeroCopy scenarios. + /// \return Input tensor names. + virtual std::vector GetInputNames() { return {}; } + + /// \brief Get the input shape of the model. + /// \return A map contains all the input names and shape defined in the model. 
+ virtual std::map> GetInputTensorShape() { + return {}; + } + + /// \brief Used to get the name of the network output. + /// Be inherited by AnalysisPredictor, Only used in ZeroCopy scenarios. + /// \return Output tensor names. + virtual std::vector GetOutputNames() { return {}; } + + /// \brief Get the input ZeroCopyTensor by name. + /// Be inherited by AnalysisPredictor, Only used in ZeroCopy scenarios. + /// The name is obtained from the GetInputNames() interface. + /// \param name The input tensor name. + /// \return Return the corresponding input ZeroCopyTensor. + virtual std::unique_ptr GetInputTensor( + const std::string& name) { + return nullptr; + } + + /// \brief Get the output ZeroCopyTensor by name. + /// Be inherited by AnalysisPredictor, Only used in ZeroCopy scenarios. + /// The name is obtained from the GetOutputNames() interface. + /// \param name The output tensor name. + /// \return Return the corresponding output ZeroCopyTensor. + virtual std::unique_ptr GetOutputTensor( + const std::string& name) { + return nullptr; + } + /// \brief Run the network with zero-copied inputs and outputs. + /// Be inherited by AnalysisPredictor and only used in ZeroCopy scenarios. + /// This will save the IO copy for transfering inputs and outputs to predictor + /// workspace + /// and get some performance improvement. + /// To use it, one should call the AnalysisConfig.SwitchUseFeedFetchOp(true) + /// and then use the `GetInputTensor` and `GetOutputTensor` + /// to directly write or read the input/output tensors. + /// \return Whether the run is successful + virtual bool ZeroCopyRun() { return false; } + + /// \brief Clone an existing predictor + /// When using clone, the same network will be created, + /// and the parameters between them are shared. + /// \return unique_ptr which contains the pointer of predictor + virtual std::unique_ptr Clone() = 0; + + /// \brief Destroy the Predictor. + virtual ~PaddlePredictor() = default; + + virtual std::string GetSerializedProgram() const { + assert(false); // Force raise error. + return "NotImplemented"; + } + + /// \brief Base class for NativeConfig and AnalysisConfig. + struct Config { + std::string model_dir; /*!< path to the model directory. */ + }; +}; + +/// +/// \brief configuration manager for `NativePredictor`. +/// +/// `AnalysisConfig` manages configurations of `NativePredictor`. +/// During inference procedure, there are many parameters(model/params path, +/// place of inference, etc.) +/// +struct NativeConfig : public PaddlePredictor::Config { + /// GPU related fields. + bool use_gpu{false}; + int device{0}; + float fraction_of_gpu_memory{ + -1.f}; ///< Change to a float in (0,1] if needed. + + std::string prog_file; + std::string + param_file; ///< Specify the exact path of program and parameter files. + + bool specify_input_name{false}; ///< Specify the variable's name of each + ///< input if input tensors don't follow the + ///< `feeds` and `fetches` of the phase + ///< `save_inference_model`. + + /// Set and get the number of cpu math library threads. + void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) { + cpu_math_library_num_threads_ = cpu_math_library_num_threads; + } + int cpu_math_library_num_threads() const { + return cpu_math_library_num_threads_; + } + + protected: + int cpu_math_library_num_threads_{1}; ///< number of cpu math library (such + ///< as MKL, OpenBlas) threads for each + ///< instance. +}; + +/// +/// \brief A factory to help create different predictors. 
+/// +/// Usage: +/// +/// \code{.cpp} +/// NativeConfig config; +/// ... // change the configs. +/// auto native_predictor = CreatePaddlePredictor(config); +/// \endcode +/// +/// FOR EXTENSION DEVELOPER: +/// Different predictors are designated by config type. Similar configs can be +/// merged, but there shouldn't be a huge config containing different fields for +/// more than one kind of predictors. +//// +template +std::unique_ptr CreatePaddlePredictor(const ConfigT& config); + +/// NOTE The following APIs are too trivial, we will discard it in the following +/// versions. +/// +enum class PaddleEngineKind { + kNative = 0, ///< Use the native Fluid facility. + kAutoMixedTensorRT, ///< Automatically mix Fluid with TensorRT. + kAnalysis, ///< More optimization. +}; + +template +std::unique_ptr CreatePaddlePredictor(const ConfigT& config); + +int PaddleDtypeSize(PaddleDType dtype); + +std::string get_version(); + +} // namespace paddle diff --git a/docs/paddle_include_file/paddle_inference_api.h b/docs/paddle_include_file/paddle_inference_api.h new file mode 100644 index 0000000000000..6f30ad95f168c --- /dev/null +++ b/docs/paddle_include_file/paddle_inference_api.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains the definition of a simple Inference API for Paddle. + * + * ATTENTION: It requires some C++11 features, for lower version C++ or C, we + * might release another API. + */ + +#pragma once + +#include +#include +#include +#include + +#include "paddle_analysis_config.h" // NOLINT +#include "paddle_api.h" // NOLINT diff --git a/docs/paddle_include_file/paddle_mkldnn_quantizer_config.h b/docs/paddle_include_file/paddle_mkldnn_quantizer_config.h new file mode 100644 index 0000000000000..6ddbef78f9d4c --- /dev/null +++ b/docs/paddle_include_file/paddle_mkldnn_quantizer_config.h @@ -0,0 +1,198 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file paddle_mkldnn_quantizer_config.h +/// +/// \brief Mkldnn quantizer config. +/// +/// \author paddle-infer@baidu.com +/// \date 2020-01-01 +/// \since 1.7.0 +/// + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle_api.h" // NOLINT + +namespace paddle { + +/// +/// \brief Algorithms for finding scale of quantized Tensors. 
+/// +enum class ScaleAlgo { + NONE, ///< Do not compute scale + MAX, ///< Find scale based on the max absolute value + MAX_CH, ///< Find scale based on the max absolute value per output channel + MAX_CH_T, ///< Find scale based on the max absolute value per output channel + ///< of a transposed tensor + KL, ///< Find scale based on KL Divergence +}; + +/// +/// \class MkldnnQuantizerConfig +/// +/// \brief Config for mkldnn quantize. +/// +/// The MkldnnQuantizerConfig is used to configure Mkldnn's quantization +/// parameters, including scale algorithm, warmup data, warmup batch size, +/// quantized op list, etc. +/// +/// It is not recommended to use this config directly, please refer to +/// AnalysisConfig::mkldnn_quantizer_config() +/// +struct MkldnnQuantizerConfig { + /// + /// \brief Construct a new Mkldnn Quantizer Config object + /// + MkldnnQuantizerConfig(); + + /// + /// \brief Set the scale algo + /// + /// Specify a quantization algorithm for a connection (input/output) of the + /// operator type. + /// \param[in] op_type_name the operator's name. + /// \param[in] conn_name name of the connection (input/output) of the + /// operator. + /// \param[in] algo the algorithm for computing scale. + /// + void SetScaleAlgo(std::string op_type_name, std::string conn_name, + ScaleAlgo algo) { + rules_[op_type_name][conn_name] = algo; + } + + /// + /// \brief Get the scale algo + /// + /// Get the quantization algorithm for a connection (input/output) of the + /// operator type. + /// + /// \param[in] op_type_name the operator's name. + /// \param[in] conn_name name of the connection (input/output) of the + /// operator. + /// \return the scale algo. + /// + ScaleAlgo scale_algo(const std::string& op_type_name, + const std::string& conn_name) const; + + /// + /// \brief Set the warmup data + /// + /// Set the batch of data to be used for warm-up iteration. + /// + /// \param[in] data batch of data. + /// + void SetWarmupData(std::shared_ptr> data) { + warmup_data_ = data; + } + + /// + /// \brief Get the warmup data + /// + /// Get the batch of data used for warm-up iteration. + /// + /// \return the warm up data + /// + std::shared_ptr> warmup_data() const { + return warmup_data_; + } + + /// + /// \brief Set the warmup batch size + /// + /// Set the batch size for warm-up iteration. + /// + /// \param[in] batch_size warm-up batch size + /// + void SetWarmupBatchSize(int batch_size) { warmup_bs_ = batch_size; } + + /// + /// \brief Get the warmup batch size + /// + /// Get the batch size for warm-up iteration. 
+ /// + /// \return the warm up batch size + int warmup_batch_size() const { return warmup_bs_; } + + /// + /// \brief Set quantized op list + /// + /// In the quantization process, set the op list that supports quantization + /// + /// \param[in] op_list List of quantized ops + /// + void SetEnabledOpTypes(std::unordered_set op_list) { + enabled_op_types_ = op_list; + } + + /// + /// \brief Get quantized op list + /// + /// \return list of quantized ops + /// + const std::unordered_set& enabled_op_types() const { + return enabled_op_types_; + } + + /// + /// \brief Set the excluded op ids + /// + /// \param[in] op_ids_list excluded op ids + /// + void SetExcludedOpIds(std::unordered_set op_ids_list) { + excluded_op_ids_ = op_ids_list; + } + + /// + /// \brief Get the excluded op ids + /// + /// \return exclude op ids + /// + const std::unordered_set& excluded_op_ids() const { + return excluded_op_ids_; + } + + /// + /// \brief Set default scale algorithm + /// + /// \param[in] algo Method for calculating scale in quantization process + /// + void SetDefaultScaleAlgo(ScaleAlgo algo) { default_scale_algo_ = algo; } + + /// + /// \brief Get default scale algorithm + /// + /// \return Method for calculating scale in quantization + /// process + /// + ScaleAlgo default_scale_algo() const { return default_scale_algo_; } + + protected: + std::map> rules_; + std::unordered_set enabled_op_types_; + std::unordered_set excluded_op_ids_; + std::shared_ptr> warmup_data_; + int warmup_bs_{1}; + ScaleAlgo default_scale_algo_{ScaleAlgo::MAX}; +}; + +} // namespace paddle diff --git a/docs/paddle_include_file/paddle_pass_builder.h b/docs/paddle_include_file/paddle_pass_builder.h new file mode 100644 index 0000000000000..bce463182d509 --- /dev/null +++ b/docs/paddle_include_file/paddle_pass_builder.h @@ -0,0 +1,221 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +/// +/// \file paddle_pass_builder.h +/// +/// \brief Class Paddle Passs Builder and its subclasses(pass strategies). +/// \section sec_intro Introduction +/// This class aims to build passes for paddle and define passes' strategies. +/// +/// \author paddle-infer@baidu.com +/// \date 2020-3-23 +/// \since 1.7 + +/// \namespace paddle +namespace paddle { + +/// \class PaddlePassBuilder +/// \brief This class build passes based on vector input. It is part of +/// inference API. Users can build passes, insert new passes, delete passes +/// using this class and its functions. +/// +/// Example Usage: +/// Build a new pass. +/// \code{cpp} +/// const vector passes(1, "conv_relu_mkldnn_fuse_pass"); +/// PaddlePassBuilder builder(passes); +/// \endcode +class PaddlePassBuilder { + public: + /// \brief Constructor of the class. It stores the input passes. + /// \param[in] passes passes' types. + explicit PaddlePassBuilder(const std::vector &passes) + : passes_(passes) {} + + /// \brief Stores the input passes. 
+ /// \param[in] passes passes' types. + void SetPasses(std::initializer_list passes) { + passes_ = passes; + } + + /// \brief Append a pass to the end of the passes. + /// \param[in] pass_type the type of the new pass. + void AppendPass(const std::string &pass_type); + + /// \brief Insert a pass to a specific position. + /// \param[in] idx the position to insert. + /// \param[in] pass_type the type of insert pass. + void InsertPass(size_t idx, const std::string &pass_type); + + /// \brief Delete the pass at certain position 'idx'. + /// \param[in] idx the position to delete. + void DeletePass(size_t idx); + + /// \brief Delete all passes that has a certain type 'pass_type'. + /// \param[in] pass_type the certain pass type to be deleted. + void DeletePass(const std::string &pass_type); + + /// \brief Delete all the passes. + void ClearPasses(); + + /// \brief Append an analysis pass. + /// \param[in] pass the type of the new analysis pass. + void AppendAnalysisPass(const std::string &pass); + + /// \brief Visualize the computation graph after each pass by generating a DOT + /// language file, one can draw them with the Graphviz toolkit. + void TurnOnDebug(); + /// \brief Human-readable information of the passes. + std::string DebugString(); + + /// \brief Get information of passes. + /// \return Return list of the passes. + const std::vector &AllPasses() const { return passes_; } + + /// \brief Get information of analysis passes. + /// \return Return list of analysis passes. + std::vector AnalysisPasses() const { + auto passes = analysis_passes_; + // To make sure the ir_graph_to_program should be the last pass so any + // modication of IR will persist to the program. + passes.push_back("ir_graph_to_program_pass"); + return passes; + } + + protected: + /// \cond Protected + std::vector analysis_passes_{ + {"ir_graph_build_pass", "ir_graph_clean_pass", "ir_analysis_pass", + "ir_params_sync_among_devices_pass", "adjust_cudnn_workspace_size_pass", + "inference_op_replace_pass"}}; + std::vector passes_; + /// \endcond +}; + +/// \class PassStrategy +/// \brief This class defines the pass strategies like whether to use gpu/cuDNN +/// kernel/MKLDNN. +class PassStrategy : public PaddlePassBuilder { + public: + /// \brief Constructor of PassStrategy class. It works the same as + /// PaddlePassBuilder class. \param[in] passes passes' types. + explicit PassStrategy(const std::vector &passes) + : PaddlePassBuilder(passes) {} + + /// \brief Enable the use of cuDNN kernel. + virtual void EnableCUDNN() {} + + /// \brief Enable the use of MKLDNN. + /// The MKLDNN control exists in both CPU and GPU mode, because there can + /// still be some CPU kernels running in GPU mode. + virtual void EnableMKLDNN() {} + + /// \brief Enable MKLDNN quantize optimization. + virtual void EnableMkldnnQuantizer() {} + + /// \brief Check if we are using gpu. + /// \return A bool variable implying whether we are in gpu mode. + bool use_gpu() const { return use_gpu_; } + + /// \brief Default destructor. + virtual ~PassStrategy() = default; + + protected: + /// \cond Protected + bool use_gpu_{false}; + bool use_mkldnn_{false}; + /// \endcond +}; + +/// \class CpuPassStrategy +/// \brief The CPU passes controller, it is used in AnalysisPredictor with CPU +/// mode. +class CpuPassStrategy : public PassStrategy { + public: + /// \brief Default constructor of CpuPassStrategy. + CpuPassStrategy(); + + /// \brief Construct by copying another CpuPassStrategy object. + /// \param[in] other The CpuPassStrategy object we want to copy. 
+ explicit CpuPassStrategy(const CpuPassStrategy &other) + : PassStrategy(other.AllPasses()) { + use_gpu_ = other.use_gpu_; + use_mkldnn_ = other.use_mkldnn_; + use_mkldnn_quantizer_ = other.use_mkldnn_quantizer_; + } + /// \brief Default destructor. + virtual ~CpuPassStrategy() = default; + + /// \brief Enable the use of cuDNN kernel. + void EnableCUDNN() override; + + /// \brief Enable the use of MKLDNN. + void EnableMKLDNN() override; + + /// \brief Enable MKLDNN quantize optimization. + void EnableMkldnnQuantizer() override; + + protected: + /// \cond Protected + bool use_mkldnn_quantizer_{false}; + /// \endcond +}; + +/// \class GpuPassStrategy +/// \brief The GPU passes controller, it is used in AnalysisPredictor with GPU +/// mode. +class GpuPassStrategy : public PassStrategy { + public: + /// \brief Default constructor of GpuPassStrategy. + GpuPassStrategy(); + + /// \brief Construct by copying another GpuPassStrategy object. + /// \param[in] other The GpuPassStrategy object we want to copy. + explicit GpuPassStrategy(const GpuPassStrategy &other) + : PassStrategy(other.AllPasses()) { + use_gpu_ = true; + use_cudnn_ = other.use_cudnn_; + } + + /// \brief Enable the use of cuDNN kernel. + void EnableCUDNN() override; + + /// \brief Not supported in GPU mode yet. + void EnableMKLDNN() override; + + /// \brief Not supported in GPU mode yet. + void EnableMkldnnQuantizer() override; + + /// \brief Default destructor. + virtual ~GpuPassStrategy() = default; + + protected: + /// \cond Protected + bool use_cudnn_{false}; + /// \endcond +}; +/// \brief List of tensorRT subgraph passes. +extern const std::vector kTRTSubgraphPasses; + +/// \brief List of lite subgraph passes. +extern const std::vector kLiteSubgraphPasses; + +} // namespace paddle diff --git a/docs/requirements.txt b/docs/requirements.txt index f11fa32f6f465..694de92684243 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -2,3 +2,4 @@ sphinx recommonmark sphinx_markdown_tables sphinx_rtd_theme +exhale diff --git a/docs/user_guides/cxx_api.md b/docs/user_guides/cxx_api.md deleted file mode 100644 index 708ffda0d17ed..0000000000000 --- a/docs/user_guides/cxx_api.md +++ /dev/null @@ -1,230 +0,0 @@ -# 使用C++预测 -为了简单方便地进行推理部署,飞桨提供了一套高度优化的C++ API推理接口。下面对各主要API使用方法进行详细介绍。 - -在[使用流程](./tutorial)一节中,我们了解到Paddle Inference预测包含了以下几个方面: - -- 配置推理选项 -- 创建predictor -- 准备模型输入 -- 模型推理 -- 获取模型输出 - -那我们先用一个简单的程序介绍这一过程: - -```c++ -std::unique_ptr CreatePredictor() { - // 通过AnalysisConfig配置推理选项 - AnalysisConfig config; - config.SetModel(“./resnet50/model”, - "./resnet50/params"); - config.EnableUseGpu(100, 0); - config.SwitchUseFeedFetchOps(false); - config.EnableMKLDNN(); - config.EnableMemoryOptim(); - // 创建predictor - return CreatePaddlePredictor(config); -} - -void Run(paddle::PaddlePredictor *predictor, - const std::vector& input, - const std::vector& input_shape, - std::vector *out_data) { - // 准备模型的输入 - int input_num = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies()); - - auto input_names = predictor->GetInputNames(); - auto input_t = predictor->GetInputTensor(input_names[0]); - input_t->Reshape(input_shape); - input_t->copy_from_cpu(input.data()); - // 模型推理 - CHECK(predictor->ZeroCopyRun()); - - // 获取模型的输出 - auto output_names = predictor->GetOutputNames(); - // there is only one output of Resnet50 - auto output_t = predictor->GetOutputTensor(output_names[0]); - std::vector output_shape = output_t->shape(); - int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, 
std::multiplies()); - out_data->resize(out_num); - output_t->copy_to_cpu(out_data->data()); -} -``` - -以上的程序中`CreatePredictor`函数对推理过程进行了配置以及创建了Predictor。 `Run`函数进行了输入数据的准备、模型推理以及输出数据的获取过程。 - -接下来我们依次对程序中出现的AnalysisConfig,Predictor,模型输入,模型输出做一个详细的介绍。 - -## 一:关于AnalysisConfig - -AnalysisConfig管理AnalysisPredictor的推理配置,提供了模型路径设置、推理引擎运行设备选择以及多种优化推理流程的选项。配置中包括了必选配置以及可选配置。 - -#### 1. 必选配置 - -**a. 设置模型和参数路径** - -从磁盘加载模型时,根据模型和参数文件存储方式不同,设置AnalysisConfig加载模型和参数的路径有两种形式: - -* non-combined形式:模型文件夹`model_dir`下存在一个模型文件和多个参数文件时,传入模型文件夹路径,模型文件名默认为`__model__`。 使用方式为:`config->SetModel("./model_dir");`。 -* combined形式:模型文件夹`model_dir`下只有一个模型文件`model`和一个参数文件`params`时,传入模型文件和参数文件路径。使用方式为:`config->SetModel("./model_dir/model", "./model_dir/params");`。 -* 内存加载模式:如果模型是从内存加载(模型必须为combined形式),可以使用 - - ```c++ - std::ifstream in_m(FLAGS_dirname + "/model"); - std::ifstream in_p(FLAGS_dirname + "/params"); - std::ostringstream os_model, os_param; - os_model << in_m.rdbuf(); - os_param << in_p.rdbuf(); - config.SetModelBuffer(os_model.str().data(), os_model.str().size(), os_param.str().data(), os_param.str().size()); - ``` - -Paddle Inference有两种格式的模型,分别为`non-combined` 以及 `combined`。这两种类型我们在[Quick Start](../introduction/quick_start)一节中提到过,忘记的同学可以回顾下。 - -**b. 关闭Feed,Fetch op** -`config->SwitchUseFeedFetchOps(false); // 关闭feed和fetch OP使用,使用ZeroCopy接口必须设置此项` - -我们用一个小的例子来说明我们为什么要关掉它们。 -假设我们有一个模型,模型运行的序列为: -`input -> FEED_OP -> feed_out -> CONV_OP -> conv_out -> FETCH_OP -> output` - -序列中大些字母的`FEED_OP`, `CONV_OP`, `FETCH_OP` 为模型中的OP, 小写字母的`input`,`feed_out`,`output` 为模型中的变量。 - -在ZeroCopy模式下,我们通过`predictor->GetInputTensor(input_names[0])`获取的模型输入为`FEED_OP`的输出, 即`feed_out`,我们通过`predictor->GetOutputTensor(output_names[0])`接口获取的模型输出为`FETCH_OP`的输入,即`conv_out`,这种情况下,我们在运行期间就没有必要运行feed和fetch OP了,因此需要设置`config->SwitchUseFeedFetchOps(false)`来关闭feed和fetch op。 - - -#### 2. 可选配置 - -**a. 加速CPU推理** - -``` -// 开启MKLDNN,可加速CPU推理,要求预测库带MKLDNN功能。 -config->EnableMKLDNN(); -// 可以设置CPU数学库线程数math_threads,可加速推理。 -// 注意:math_threads * 外部线程数 需要小于总的CPU的核心数目,否则会影响预测性能。 -config->SetCpuMathLibraryNumThreads(10); - -``` - -**b. 使用GPU推理** - -``` -// EnableUseGpu后,模型将运行在GPU上。 -// 第一个参数表示预先分配显存数目,第二个参数表示设备的ID。 -config->EnableUseGpu(100, 0); -``` - -如果使用的预测lib带Paddle-TRT子图功能,可以打开TRT选项进行加速, 详细的请访问[Paddle-TensorRT文档](../optimize/paddle_trt): - -``` -// 开启TensorRT推理,可提升GPU推理性能,需要使用带TensorRT的推理库 -config->EnableTensorRtEngine(1 << 30 /*workspace_size*/, - batch_size /*max_batch_size*/, - 3 /*min_subgraph_size*/, - AnalysisConfig::Precision::kFloat32 /*precision*/, - false /*use_static*/, - false /*use_calib_mode*/); -``` -通过计算图分析,Paddle可以自动将计算图中部分子图融合,并调用NVIDIA的 TensorRT 来进行加速。 - - -**c. 内存/显存优化** - -``` -config->EnableMemoryOptim(); // 开启内存/显存复用 -``` -该配置设置后,在模型图分析阶段会对图中的变量进行依赖分类,两两互不依赖的变量会使用同一块内存/显存空间,缩减了运行时的内存/显存占用(模型较大或batch较大时效果显著)。 - - -**d. debug开关** - - -``` -// 该配置设置后,会关闭模型图分析阶段的任何图优化,预测期间运行同训练前向代码一致。 -config->SwitchIrOptim(false); -``` - -``` -// 该配置设置后,会在模型图分析的每个阶段后保存图的拓扑信息到.dot文件中,该文件可用graphviz可视化。 -config->SwitchIrDebug(); -``` - -## 二:关于PaddlePredictor - -PaddlePredictor 是在模型上执行推理的预测器,根据AnalysisConfig中的配置进行创建。 - - -``` -std::unique_ptr predictor = CreatePaddlePredictor(config); -``` - -CreatePaddlePredictor 期间首先对模型进行加载,并且将模型转换为由变量和运算节点组成的计算图。接下来将进行一系列的图优化,包括OP的横向纵向融合,删除无用节点,内存/显存优化,以及子图(Paddle-TRT)的分析,加速推理性能,提高吞吐。 - - -## 三:输入输出 - -#### 1. 准备输入 - -**a. 获取模型所有输入的tensor名字** - -``` -std::vector input_names = predictor->GetInputNames(); -``` - -**b. 
获取对应名字下的tensor** - -``` -// 获取第0个输入 -auto input_t = predictor->GetInputTensor(input_names[0]); -``` - -**c. 将数据copy到tensor中** - -``` -// 在copy前需要设置tensor的shape -input_t->Reshape({batch_size, channels, height, width}); -// tensor会根据上述设置的shape从input_data中拷贝对应数目的数据到tensor中。 -input_t->copy_from_cpu(input_data /*数据指针*/); -``` - -当然我们也可以用mutable_data获取tensor的数据指针: - -``` -// 参数可为PaddlePlace::kGPU, PaddlePlace::kCPU -float *input_d = input_t->mutable_data(PaddlePlace::kGPU); -``` - -#### 2. 获取输出 -**a. 获取模型所有输出的tensor名字** - -``` -std::vector out_names = predictor->GetOutputNames(); -``` - -**b. 获取对应名字下的tensor** - -``` -// 获取第0个输出 -auto output_t = predictor->GetOutputTensor(out_names[0]); -``` - -**c. 将数据copy到tensor中** - -``` -std::vector out_data; -// 获取输出的shpae -std::vector output_shape = output_t->shape(); -int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); -out_data->resize(out_num); -output_t->copy_to_cpu(out_data->data()); -``` - -我们可以用data接口获取tensor的数据指针: - -``` -// 参数可为PaddlePlace::kGPU, PaddlePlace::kCPU -int output_size; -float *output_d = output_t->data(PaddlePlace::kGPU, &output_size); -``` - -### 下一步 - -看到这里您是否已经对Paddle Inference的C++使用有所了解了呢?请访问[这里](https://github.com/PaddlePaddle/Paddle-Inference-Demo/tree/master/c%2B%2B)进行样例测试。 diff --git a/docs/user_guides/cxx_api.rst b/docs/user_guides/cxx_api.rst new file mode 100644 index 0000000000000..22d62600d08aa --- /dev/null +++ b/docs/user_guides/cxx_api.rst @@ -0,0 +1,245 @@ +使用C++预测 +========== +为了简单方便地进行推理部署,飞桨提供了一套高度优化的C++ API推理接口。下面对各主要API使用方法进行详细介绍。 + +在 `使用流程 <./tutorial.html>`_ 一节中,我们了解到Paddle Inference预测包含了以下几个方面: + +- 配置推理选项 +- 创建predictor +- 准备模型输入 +- 模型推理 +- 获取模型输出 + +那我们先用一个简单的程序介绍这一过程: + +.. code:: c++ + + std::unique_ptr CreatePredictor() { + // 通过AnalysisConfig配置推理选项 + AnalysisConfig config; + config.SetModel(“./resnet50/model”, + "./resnet50/params"); + config.EnableUseGpu(100, 0); + config.SwitchUseFeedFetchOps(false); + config.EnableMKLDNN(); + config.EnableMemoryOptim(); + // 创建predictor + return CreatePaddlePredictor(config); + } + + void Run(paddle::PaddlePredictor *predictor, + const std::vector& input, + const std::vector& input_shape, + std::vector *out_data) { + // 准备模型的输入 + int input_num = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies()); + + auto input_names = predictor->GetInputNames(); + auto input_t = predictor->GetInputTensor(input_names[0]); + input_t->Reshape(input_shape); + input_t->copy_from_cpu(input.data()); + // 模型推理 + CHECK(predictor->ZeroCopyRun()); + + // 获取模型的输出 + auto output_names = predictor->GetOutputNames(); + // there is only one output of Resnet50 + auto output_t = predictor->GetOutputTensor(output_names[0]); + std::vector output_shape = output_t->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); + out_data->resize(out_num); + output_t->copy_to_cpu(out_data->data()); + } + + +以上的程序中 **CreatePredictor** 函数对推理过程进行了配置以及创建了Predictor。 **Run** 函数进行了输入数据的准备、模型推理以及输出数据的获取过程。 + +接下来我们依次对程序中出现的AnalysisConfig,Predictor,模型输入,模型输出做一个详细的介绍。 + +一:关于AnalysisConfig +------------------ + +AnalysisConfig管理AnalysisPredictor的推理配置,提供了模型路径设置、推理引擎运行设备选择以及多种优化推理流程的选项。配置中包括了必选配置以及可选配置。 + +1. 必选配置 +>>>>>>>>>>>> + +**a. 
设置模型和参数路径** + +从磁盘加载模型时,根据模型和参数文件存储方式不同,设置AnalysisConfig加载模型和参数的路径有两种形式: + +* **non-combined形式** :模型文件夹model_dir下存在一个模型文件和多个参数文件时,传入模型文件夹路径,模型文件名默认为__model__。 使用方式为: `config->SetModel("./model_dir")`;。 +* **combined形式** :模型文件夹model_dir下只有一个模型文件`model`和一个参数文件params时,传入模型文件和参数文件路径。 使用方式为: `config->SetModel("./model_dir/model", "./model_dir/params");`。 +* 内存加载模式:如果模型是从内存加载(模型必须为combined形式),可以使用 + +.. code:: c++ + + std::ifstream in_m(FLAGS_dirname + "/model"); + std::ifstream in_p(FLAGS_dirname + "/params"); + std::ostringstream os_model, os_param; + os_model << in_m.rdbuf(); + os_param << in_p.rdbuf(); + config.SetModelBuffer(os_model.str().data(), os_model.str().size(), os_param.str().data(), os_param.str().size()); + +Paddle Inference有两种格式的模型,分别为 **non-combined** 以及 **combined** 。这两种类型我们在 `Quick Start <../introduction/quick_start.html>`_ 一节中提到过,忘记的同学可以回顾下。 + +**b. 关闭Feed,Fetch op** + +config->SwitchUseFeedFetchOps(false); // 关闭feed和fetch OP使用,使用ZeroCopy接口必须设置此项` + +我们用一个小的例子来说明我们为什么要关掉它们。 +假设我们有一个模型,模型运行的序列为: +**input -> FEED_OP -> feed_out -> CONV_OP -> conv_out -> FETCH_OP -> output** + +序列中大些字母的FEED_OP, CONV_OP, FETCH_OP 为模型中的OP, 小写字母的input,feed_out,output 为模型中的变量。 + +在ZeroCopy模式下,我们通过 `predictor->GetInputTensor(input_names[0])` 获取的模型输入为FEED_OP的输出, 即feed_out,我们通过 `predictor->GetOutputTensor(output_names[0])` 接口获取的模型输出为FETCH_OP的输入,即conv_out,这种情况下,我们在运行期间就没有必要运行feed和fetch OP了,因此需要设置 `config->SwitchUseFeedFetchOps(false)` 来关闭feed和fetch op。 + + +2. 可选配置 +>>>>>>>>>> + +**a. 加速CPU推理** + +.. code:: + + // 开启MKLDNN,可加速CPU推理,要求预测库带MKLDNN功能。 + config->EnableMKLDNN(); + // 可以设置CPU数学库线程数math_threads,可加速推理。 + // 注意:math_threads * 外部线程数 需要小于总的CPU的核心数目,否则会影响预测性能。 + config->SetCpuMathLibraryNumThreads(10); + + +**b. 使用GPU推理** + +.. code:: + + // EnableUseGpu后,模型将运行在GPU上。 + // 第一个参数表示预先分配显存数目,第二个参数表示设备的ID。 + config->EnableUseGpu(100, 0); + + +如果使用的预测lib带Paddle-TRT子图功能,可以打开TRT选项进行加速, 详细的请访问 `Paddle-TensorRT文档 <../optimize/paddle_trt.html>`_: + +.. code:: c++ + + // 开启TensorRT推理,可提升GPU推理性能,需要使用带TensorRT的推理库 + config->EnableTensorRtEngine(1 << 30 /*workspace_size*/, + batch_size /*max_batch_size*/, + 3 /*min_subgraph_size*/, + AnalysisConfig::Precision::kFloat32 /*precision*/, + false /*use_static*/, + false /*use_calib_mode*/); + +通过计算图分析,Paddle可以自动将计算图中部分子图融合,并调用NVIDIA的 TensorRT 来进行加速。 + + +**c. 内存/显存优化** + +.. code:: c++ + + config->EnableMemoryOptim(); // 开启内存/显存复用 + +该配置设置后,在模型图分析阶段会对图中的变量进行依赖分类,两两互不依赖的变量会使用同一块内存/显存空间,缩减了运行时的内存/显存占用(模型较大或batch较大时效果显著)。 + + +**d. debug开关** + + +.. code:: c++ + + // 该配置设置后,会关闭模型图分析阶段的任何图优化,预测期间运行同训练前向代码一致。 + config->SwitchIrOptim(false); + // 该配置设置后,会在模型图分析的每个阶段后保存图的拓扑信息到.dot文件中,该文件可用graphviz可视化。 + config->SwitchIrDebug(); + + +二:关于PaddlePredictor +----------------------- +PaddlePredictor 是在模型上执行推理的预测器,根据AnalysisConfig中的配置进行创建。 + + +.. code:: c++ + + std::unique_ptr predictor = CreatePaddlePredictor(config); + + +CreatePaddlePredictor 期间首先对模型进行加载,并且将模型转换为由变量和运算节点组成的计算图。接下来将进行一系列的图优化,包括OP的横向纵向融合,删除无用节点,内存/显存优化,以及子图(Paddle-TRT)的分析,加速推理性能,提高吞吐。 + + +三:输入输出 +-------------------------- + +1. 准备输入 +>>>>>>>>>>>>>>>>> + +**a. 获取模型所有输入的tensor名字** + +.. code:: c++ + + std::vector input_names = predictor->GetInputNames(); + +**b. 获取对应名字下的tensor** + + +.. code:: c++ + + // 获取第0个输入 + auto input_t = predictor->GetInputTensor(input_names[0]); + +**c. 将数据copy到tensor中** + +.. 
code:: c++ + + // 在copy前需要设置tensor的shape + input_t->Reshape({batch_size, channels, height, width}); + // tensor会根据上述设置的shape从input_data中拷贝对应数目的数据到tensor中。 + input_t->copy_from_cpu(input_data /*数据指针*/); + +当然我们也可以用mutable_data获取tensor的数据指针: + +.. code:: c++ + + // 参数可为PaddlePlace::kGPU, PaddlePlace::kCPU + float *input_d = input_t->mutable_data(PaddlePlace::kGPU); + + +2. 获取输出 +>>>>>>>> + +**a. 获取模型所有输出的tensor名字** + +.. code:: c++ + + std::vector out_names = predictor->GetOutputNames(); + +**b. 获取对应名字下的tensor** + +.. code:: c++ + + // 获取第0个输出 + auto output_t = predictor->GetOutputTensor(out_names[0]); + +**c. 将数据copy到tensor中** + +.. code:: c++ + + std::vector out_data; + // 获取输出的shpae + std::vector output_shape = output_t->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); + out_data->resize(out_num); + output_t->copy_to_cpu(out_data->data()); + + +我们可以用data接口获取tensor的数据指针: + +.. code:: c++ + + // 参数可为PaddlePlace::kGPU, PaddlePlace::kCPU + int output_size; + float *output_d = output_t->data(PaddlePlace::kGPU, &output_size); + +**下一步** + +看到这里您是否已经对Paddle Inference的C++使用有所了解了呢?请访问 `这里 `_ 进行样例测试。 diff --git a/docs/user_guides/inference_python_api.md b/docs/user_guides/inference_python_api.md deleted file mode 100644 index ce9bc8dd90d12..0000000000000 --- a/docs/user_guides/inference_python_api.md +++ /dev/null @@ -1,213 +0,0 @@ -# 使用Python预测 - -Paddle Inference提供了高度优化的Python 和C++ API预测接口,本篇文档主要介绍Python API,使用C++ API进行预测的文档可以参考[这里](./cxx_api.md)。 -下面是详细的使用说明。 - -使用Python预测API预测包含以下几个主要步骤: - -- 配置推理选项 -- 创建Predictor -- 准备模型输入 -- 模型推理 -- 获取模型输出 - -我们先从一个简单程序入手,介绍这一流程: - -``` python -def create_predictor(): - # 通过AnalysisConfig配置推理选项 - config = AnalysisConfig("./resnet50/model", "./resnet50/params") - config.switch_use_feed_fetch_ops(False) - config.enable_use_gpu(100, 0) - config.enable_mkldnn() - config.enable_memory_optim() - predictor = create_paddle_predictor(config) - return predictor - -def run(predictor, data): - # 准备模型输入 - input_names = predictor.get_input_names() - for i, name in enumerate(input_names): - input_tensor = predictor.get_input_tensor(name) - input_tensor.reshape(data[i].shape) - input_tensor.copy_from_cpu(data[i].copy()) - - # 执行模型推理 - predictor.zero_copy_run() - - results = [] - # 获取模型输出 - output_names = predictor.get_output_names() - for i, name in enumerate(output_names): - output_tensor = predictor.get_output_tensor(name) - output_data = output_tensor.copy_to_cpu() - results.append(output_data) - - return results -``` - -以上的程序中`create_predictor `函数对推理过程进行了配置以及创建了Predictor。 `run `函数进行了输入数据的准备、模型推理以及输出数据的获取过程。 - -在接下来的部分中,我们会依次对程序中出现的AnalysisConfig,Predictor,模型输入,模型输出进行详细的介绍。 - -## 一、推理配置管理器AnalysisConfig -AnalysisConfig管理AnalysisPredictor的推理配置,提供了模型路径设置、推理引擎运行设备选择以及多种优化推理流程的选项。配置中包括了必选配置以及可选配置。 - -### 1. 
必选配置 -#### a.设置模型和参数路径 -* non-combined形式:模型文件夹`model_dir`下存在一个模型文件和多个参数文件时,传入模型文件夹路径,模型文件名默认为`__model__`。 使用方式为: - -``` python -config.set_model("./model_dir") -``` -* combined形式:模型文件夹`model_dir`下只有一个模型文件`model`和一个参数文件`params`时,传入模型文件和参数文件路径。使用方式为: - -``` python -config.set_model("./model_dir/model", "./model_dir/params") -``` - -* 内存加载模式:如果模型是从内存加载,可以使用: - -``` python -import os -model_buffer = open('./resnet50/model','rb') -params_buffer = open('./resnet50/params','rb') -model_size = os.fstat(model_buffer.fileno()).st_size -params_size = os.fstat(params_buffer.fileno()).st_size -config.set_model_buffer(model_buffer.read(), model_size, params_buffer.read(), params_size) -``` - -关于`non-combined` 以及 `combined`模型介绍,请参照[这里](../introduction/quick_start.md)。 - -#### b. 关闭feed与fetch OP -`config.switch_use_feed_fetch_ops(False) # 关闭feed和fetch OP使用,使用ZeroCopy接口必须设置此项` - -我们用一个小的例子来说明我们为什么要关掉它们。 -假设我们有一个模型,模型运行的序列为: -`input -> FEED_OP -> feed_out -> CONV_OP -> conv_out -> FETCH_OP -> output` - -序列中大写字母的`FEED_OP`, `CONV_OP`, `FETCH_OP` 为模型中的OP, 小写字母的`input`,`feed_out`,`output` 为模型中的变量。 - -在ZeroCopy模式下,我们通过`predictor.get_input_tensor(input_names[0])`获取的模型输入为`FEED_OP`的输出, 即`feed_out`,我们通过`predictor.get_output_tensor(output_names[0])`接口获取的模型输出为`FETCH_OP`的输入,即`conv_out`,这种情况下,我们在运行期间就没有必要运行feed和fetch OP了,因此需要设置`config.switch_use_feed_fetch_ops(False)`来关闭feed和fetch op。 - - -### 2. 可选配置 - -#### a. 加速CPU推理 - -``` python -# 开启MKLDNN,可加速CPU推理,要求预测库带MKLDNN功能。 -config.enable_mkldnn() -# 可以设置CPU数学库线程数math_threads,可加速推理。 -# 注意:math_threads * 外部线程数 需要小于总的CPU的核心数目,否则会影响预测性能。 -config.set_cpu_math_library_num_threads(10) - -``` - -#### b. 使用GPU推理 - -``` python -# enable_use_gpu后,模型将运行在GPU上。 -# 第一个参数表示预先分配显存数目,第二个参数表示设备的ID。 -config.enable_use_gpu(100, 0) -``` - -如果使用的预测lib带Paddle-TRT子图功能,可以打开TRT选项进行加速: - -``` python -# 开启TensorRT推理,可提升GPU推理性能,需要使用带TensorRT的推理库 -config.enable_tensorrt_engine(1 << 30, # workspace_size - batch_size, # max_batch_size - 3, # min_subgraph_size - AnalysisConfig.Precision.Float32, # precision - False, # use_static - False, # use_calib_mode - ) -``` -通过计算图分析,Paddle可以自动将计算图中部分子图融合,并调用NVIDIA的 TensorRT 来进行加速。 -使用Paddle-TensorRT 预测的完整方法可以参考[这里](../optimize/paddle_trt.md)。 - - -#### c. 内存/显存优化 - -``` python -config.enable_memory_optim() # 开启内存/显存复用 -``` -该配置设置后,在模型图分析阶段会对图中的变量进行依赖分类,两两互不依赖的变量会使用同一块内存/显存空间,缩减了运行时的内存/显存占用(模型较大或batch较大时效果显著)。 - - -#### d. debug开关 - - -``` python -# 该配置设置后,会关闭模型图分析阶段的任何图优化,预测期间运行同训练前向代码一致。 -config.switch_ir_optim(False) -``` - -``` python -# 该配置设置后,会在模型图分析的每个阶段后保存图的拓扑信息到.dot文件中,该文件可用graphviz可视化。 -config.switch_ir_debug(True) -``` - -## 二、预测器PaddlePredictor - -PaddlePredictor 是在模型上执行推理的预测器,根据AnalysisConfig中的配置进行创建。 - -``` python -predictor = create_paddle_predictor(config) -``` - -create_paddle_predictor 期间首先对模型进行加载,并且将模型转换为由变量和运算节点组成的计算图。接下来将进行一系列的图优化,包括OP的横向纵向融合,删除无用节点,内存/显存优化,以及子图(Paddle-TRT)的分析,加速推理性能,提高吞吐。 - - -## 三:输入/输出 - -### 1. 准备输入 - -#### a. 获取模型所有输入的Tensor名字 - -``` python -input_names = predictor.get_input_names() -``` - -#### b. 获取对应名字下的Tensor - -``` python -# 获取第0个输入 -input_tensor = predictor.get_input_tensor(input_names[0]) -``` - -#### c. 将输入数据copy到Tensor中 - -``` python -# 在copy前需要设置Tensor的shape -input_tensor.reshape((batch_size, channels, height, width)) -# Tensor会根据上述设置的shape从input_data中拷贝对应数目的数据。input_data为numpy数组。 -input_tensor.copy_from_cpu(input_data) -``` - -### 2. 获取输出 -#### a. 获取模型所有输出的Tensor名字 - -``` python -output_names = predictor.get_output_names() -``` - -#### b. 
获取对应名字下的Tensor - -``` python -# 获取第0个输出 -output_tensor = predictor.get_output_tensor(ouput_names[0]) -``` - -#### c. 将数据copy到Tensor中 - -``` python -# output_data为numpy数组 -output_data = output_tensor.copy_to_cpu() -``` - - -## 下一步 - -看到这里您是否已经对 Paddle Inference 的 Python API 使用有所了解了呢?请访问[这里](https://github.com/PaddlePaddle/Paddle-Inference-Demo/tree/master/python)进行样例测试。 diff --git a/docs/user_guides/inference_python_api.rst b/docs/user_guides/inference_python_api.rst new file mode 100644 index 0000000000000..cb5d1a6253fee --- /dev/null +++ b/docs/user_guides/inference_python_api.rst @@ -0,0 +1,217 @@ +使用Python预测 +=============== + +Paddle Inference提供了高度优化的Python 和C++ API预测接口,本篇文档主要介绍Python API,使用C++ API进行预测的文档可以参考 `这里 <./cxx_api.html> `_。 +下面是详细的使用说明。 + +使用Python预测API预测包含以下几个主要步骤: + +- 配置推理选项 +- 创建Predictor +- 准备模型输入 +- 模型推理 +- 获取模型输出 + +我们先从一个简单程序入手,介绍这一流程: + +.. code:: python + + def create_predictor(): + # 通过AnalysisConfig配置推理选项 + config = AnalysisConfig("./resnet50/model", "./resnet50/params") + config.switch_use_feed_fetch_ops(False) + config.enable_use_gpu(100, 0) + config.enable_mkldnn() + config.enable_memory_optim() + predictor = create_paddle_predictor(config) + return predictor + + def run(predictor, data): + # 准备模型输入 + input_names = predictor.get_input_names() + for i, name in enumerate(input_names): + input_tensor = predictor.get_input_tensor(name) + input_tensor.reshape(data[i].shape) + input_tensor.copy_from_cpu(data[i].copy()) + + # 执行模型推理 + predictor.zero_copy_run() + + results = [] + # 获取模型输出 + output_names = predictor.get_output_names() + for i, name in enumerate(output_names): + output_tensor = predictor.get_output_tensor(name) + output_data = output_tensor.copy_to_cpu() + results.append(output_data) + + return results + + +以上的程序中 **create_predictor** 函数对推理过程进行了配置以及创建了Predictor。 **run** 函数进行了输入数据的准备、模型推理以及输出数据的获取过程。 + +在接下来的部分中,我们会依次对程序中出现的AnalysisConfig,Predictor,模型输入,模型输出进行详细的介绍。 + +一、推理配置管理器AnalysisConfig +---------------------------- +AnalysisConfig管理AnalysisPredictor的推理配置,提供了模型路径设置、推理引擎运行设备选择以及多种优化推理流程的选项。配置中包括了必选配置以及可选配置。 + +1. 必选配置 +>>>>>>>>>>>> + +**a.设置模型和参数路径** + +* **Non-combined形式**:模型文件夹 model_dir 下存在一个模型文件和多个参数文件时,传入模型文件夹路径,模型文件名默认为__model__。 使用方式为: `config.set_model("./model_dir")` + +* Combined形式:模型文件夹 model_dir 下只有一个模型文件 model 和一个参数文件params时,传入模型文件和参数文件路径。使用方式为: `config.set_model("./model_dir/model", "./model_dir/params")` + +* 内存加载模式:如果模型是从内存加载,可以使用: + + .. code:: python + + import os + model_buffer = open('./resnet50/model','rb') + params_buffer = open('./resnet50/params','rb') + model_size = os.fstat(model_buffer.fileno()).st_size + params_size = os.fstat(params_buffer.fileno()).st_size + config.set_model_buffer(model_buffer.read(), model_size, params_buffer.read(), params_size) + + +关于 non-combined 以及 combined 模型介绍,请参照 `这里 <../introduction/quick_start.html>`_。 + +**b. 关闭feed与fetch OP** + +config.switch_use_feed_fetch_ops(False) # 关闭feed和fetch OP + +2. 可选配置 +>>>>>>>>> + +**a. 加速CPU推理** + +.. code:: python + + # 开启MKLDNN,可加速CPU推理,要求预测库带MKLDNN功能。 + config.enable_mkldnn() + # 可以设置CPU数学库线程数math_threads,可加速推理。 + # 注意:math_threads * 外部线程数 需要小于总的CPU的核心数目,否则会影响预测性能。 + config.set_cpu_math_library_num_threads(10) + + +**b. 使用GPU推理** + +.. code:: python + + # enable_use_gpu后,模型将运行在GPU上。 + # 第一个参数表示预先分配显存数目,第二个参数表示设备的ID。 + config.enable_use_gpu(100, 0) + +如果使用的预测lib带Paddle-TRT子图功能,可以打开TRT选项进行加速: + +.. 
code:: python + + + # 开启TensorRT推理,可提升GPU推理性能,需要使用带TensorRT的推理库 + config.enable_tensorrt_engine(1 << 30, # workspace_size + batch_size, # max_batch_size + 3, # min_subgraph_size + AnalysisConfig.Precision.Float32, # precision + False, # use_static + False, # use_calib_mode + ) + +通过计算图分析,Paddle可以自动将计算图中部分子图融合,并调用NVIDIA的 TensorRT 来进行加速。 +使用Paddle-TensorRT 预测的完整方法可以参考 `这里 <../optimize/paddle_trt.html>`_。 + + +**c. 内存/显存优化** + +.. code:: python + + config.enable_memory_optim() # 开启内存/显存复用 + +该配置设置后,在模型图分析阶段会对图中的变量进行依赖分类,两两互不依赖的变量会使用同一块内存/显存空间,缩减了运行时的内存/显存占用(模型较大或batch较大时效果显著)。 + + +**d. debug开关** + + +.. code:: python + + # 该配置设置后,会关闭模型图分析阶段的任何图优化,预测期间运行同训练前向代码一致。 + config.switch_ir_optim(False) + + +.. code:: python + + # 该配置设置后,会在模型图分析的每个阶段后保存图的拓扑信息到.dot文件中,该文件可用graphviz可视化。 + config.switch_ir_debug(True) + +二、预测器PaddlePredictor +---------------------- + +PaddlePredictor 是在模型上执行推理的预测器,根据AnalysisConfig中的配置进行创建。 + +.. code:: python + + predictor = create_paddle_predictor(config) + + +create_paddle_predictor 期间首先对模型进行加载,并且将模型转换为由变量和运算节点组成的计算图。接下来将进行一系列的图优化,包括OP的横向纵向融合,删除无用节点,内存/显存优化,以及子图(Paddle-TRT)的分析,加速推理性能,提高吞吐。 + + +三:输入/输出 +--------------- + +1.准备输入 +>>>>>>>>>>>> + +**a. 获取模型所有输入的Tensor名字** + +.. code:: python + + input_names = predictor.get_input_names() + +**b. 获取对应名字下的Tensor** + +.. code:: python + + # 获取第0个输入 + input_tensor = predictor.get_input_tensor(input_names[0]) + +**c. 将输入数据copy到Tensor中** + +.. code:: python + + # 在copy前需要设置Tensor的shape + input_tensor.reshape((batch_size, channels, height, width)) + # Tensor会根据上述设置的shape从input_data中拷贝对应数目的数据。input_data为numpy数组。 + input_tensor.copy_from_cpu(input_data) + + +2.获取输出 +>>>>>>>>> + +**a. 获取模型所有输出的Tensor名字** + +.. code::python + + output_names = predictor.get_output_names() + +**b. 获取对应名字下的Tensor** + +.. code:: python + + # 获取第0个输出 + output_tensor = predictor.get_output_tensor(ouput_names[0]) + +**c. 将数据copy到Tensor中** + +.. code:: python + + # output_data为numpy数组 + output_data = output_tensor.copy_to_cpu() + + +**下一步** + +看到这里您是否已经对 Paddle Inference 的 Python API 使用有所了解了呢?请访问 `这里 `_ 进行样例测试。 diff --git a/docs/user_guides/source_compile.md b/docs/user_guides/source_compile.rst similarity index 62% rename from docs/user_guides/source_compile.md rename to docs/user_guides/source_compile.rst index ecc852306418f..9ece9b2cd9155 100644 --- a/docs/user_guides/source_compile.md +++ b/docs/user_guides/source_compile.rst @@ -1,126 +1,142 @@ -# 源码编译 +源码编译 +======== -### 什么时候需要源码编译? +什么时候需要源码编译? +-------------- 深度学习的发展十分迅速,对科研或工程人员来说,可能会遇到一些需要自己开发op的场景,可以在python层面编写op,但如果对性能有严格要求的话则必须在C++层面开发op,对于这种情况,需要用户源码编译飞桨,使之生效。 -此外对于绝大多数使用C++将模型部署上线的工程人员来说,您可以直接通过飞桨官网下载已编译好的预测库,快捷开启飞桨使用之旅。[飞桨官网](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/inference_deployment/inference/build_and_install_lib_cn.html)提供了多个不同环境下编译好的预测库。如果用户环境与官网提供环境不一致(如cuda 、cudnn、tensorrt版本不一致等),或对飞桨源代码有修改需求,或希望进行定制化构建,可查阅本文档自行源码编译得到预测库。 +此外对于绝大多数使用C++将模型部署上线的工程人员来说,您可以直接通过飞桨官网下载已编译好的预测库,快捷开启飞桨使用之旅。`飞桨官网 `_ 提供了多个不同环境下编译好的预测库。如果用户环境与官网提供环境不一致(如cuda 、cudnn、tensorrt版本不一致等),或对飞桨源代码有修改需求,或希望进行定制化构建,可查阅本文档自行源码编译得到预测库。 + +编译原理 +--------- + +**一:目标产物** -## 编译原理 -


飞桨框架的源码编译包括源代码的编译和链接,最终生成的目标产物包括: - 含有 C++ 接口的头文件及其二进制库:用于C++环境,将文件放到指定路径即可开启飞桨使用之旅。 - - Python Wheel 形式的安装包:用于Python环境,此安装包需要参考[飞桨安装教程](https://www.paddlepaddle.org.cn/)进行安装操作。也就是说,前面讲的pip安装属于在线安装,这里属于本地安装。 + - Python Wheel 形式的安装包:用于Python环境,此安装包需要参考 `飞桨安装教程 `_ 进行安装操作。也就是说,前面讲的pip安装属于在线安装,这里属于本地安装。 + +**二:基础概念** -


飞桨主要由C++语言编写,通过pybind工具提供了Python端的接口,飞桨的源码编译主要包括编译和链接两步。 * 编译过程由编译器完成,编译器以编译单元(后缀名为 .cc 或 .cpp 的文本文件)为单位,将 C++ 语言 ASCII 源代码翻译为二进制形式的目标文件。一个工程通常由若干源码文件组织得到,所以编译完成后,将生成一组目标文件。 * 链接过程使分离编译成为可能,由链接器完成。链接器按一定规则将分离的目标文件组合成一个能映射到内存的二进制程序文件,并解析引用。由于这个二进制文件通常包含源码中指定可被外部用户复用的函数接口,所以也被称作函数库。根据链接规则不同,链接可分为静态和动态链接。静态链接对目标文件进行归档;动态链接使用地址无关技术,将链接放到程序加载时进行。 配合包含声明体的头文件(后缀名为 .h 或 .hpp),用户可以复用程序库中的代码开发应用。静态链接构建的应用程序可独立运行,而动态链接程序在加载运行时需到指定路径下搜寻其依赖的二进制库。 -
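为了更直观地理解上面所说的编译、链接两步,下面补充一个与飞桨无关的最小 C++ 示意(文件名与编译命令均为举例,仅供说明):

.. code:: c++

    // ---- add.h:头文件,只含声明 ----
    int Add(int a, int b);

    // ---- add.cc:编译单元之一,编译后得到目标文件 add.o ----
    #include "add.h"
    int Add(int a, int b) { return a + b; }

    // ---- main.cc:编译单元之二,通过头文件复用 Add ----
    #include <iostream>
    #include "add.h"
    int main() { std::cout << Add(1, 2) << std::endl; }

    // ---- 编译与链接(以 GCC 为例)----
    // g++ -c add.cc  -o add.o       # 编译,得到目标文件
    // g++ -c main.cc -o main.o      # 编译,得到目标文件
    // g++ main.o add.o -o demo      # 链接,得到可执行文件
    // ar  rcs libadd.a add.o        # 也可先把目标文件归档为静态库再参与链接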


+**三:编译方式** 飞桨框架的设计原则之一是满足不同平台的可用性。然而,不同操作系统惯用的编译和链接器是不一样的,使用它们的命令也不一致。比如,Linux 一般使用 GNU 编译器套件(GCC),Windows 则使用 Microsoft Visual C++(MSVC)。为了统一编译脚本,飞桨使用了支持跨平台构建的 CMake,它可以输出上述编译器所需的各种 Makefile 或者 Project 文件。 -为方便编译,框架对常用的CMake命令进行了封装,如仿照 Bazel工具封装了 cc_binary 和 cc_library ,分别用于可执行文件和库文件的产出等,对CMake感兴趣的同学可在 cmake/generic.cmake 中查看具体的实现逻辑。Paddle的CMake中集成了生成python wheel包的逻辑,对如何生成wheel包感兴趣的同学可参考[相关文档](https://packaging.python.org/tutorials/packaging-projects/)。 +为方便编译,框架对常用的CMake命令进行了封装,如仿照 Bazel工具封装了 cc_binary 和 cc_library ,分别用于可执行文件和库文件的产出等,对CMake感兴趣的同学可在 cmake/generic.cmake 中查看具体的实现逻辑。Paddle的CMake中集成了生成python wheel包的逻辑,对如何生成wheel包感兴趣的同学可参考 `相关文档 `_ 。 -## 编译步骤 +编译步骤 +----------- 飞桨分为 CPU 版本和 GPU 版本。如果您的计算机没有 Nvidia GPU,请选择 CPU 版本构建安装。如果您的计算机含有 Nvidia GPU( 1.0 且预装有 CUDA / CuDNN,也可选择 GPU 版本构建安装。本节简述飞桨在常用环境下的源码编译方式,欢迎访问飞桨官网获取更详细内容。请阅读本节内容。 -


+**推荐配置及依赖项** 1、稳定的互联网连接,主频 1 GHz 以上的多核处理器,9 GB 以上磁盘空间。 2、Python 版本 2.7 或 3.5 以上,pip 版本 9.0 及以上;CMake v3.5 及以上;Git 版本 2.17 及以上。请将可执行文件放入系统环境变量中以方便运行。 3、GPU 版本额外需要 Nvidia CUDA 9 / 10,CuDNN v7 及以上版本。根据需要还可能依赖 NCCL 和 TensorRT。 -### 基于Ubuntu 18.04 +基于Ubuntu 18.04 +------------ -


+**一:环境准备** 除了本节开头提到的依赖,在 Ubuntu 上进行飞桨的源码编译,您还需要准备 GCC8 编译器等工具,可使用下列命令安装: -``` -sudo apt-get install gcc g++ make cmake git vim unrar python3 python3-dev python3-pip swig wget patchelf libopencv-dev -pip3 install numpy protobuf wheel setuptools -``` +.. code:: shell + + sudo apt-get install gcc g++ make cmake git vim unrar python3 python3-dev python3-pip swig wget patchelf libopencv-dev + pip3 install numpy protobuf wheel setuptools 若需启用 cuda 加速,需准备 cuda、cudnn、nccl。上述工具的安装请参考 nvidia 官网,以 cuda10.1,cudnn7.6 为例配置 cuda 环境。 -``` -# cuda -sh cuda_10.1.168_418.67_linux.run -export PATH=/usr/local/cuda-10.1/bin${PATH:+:${PATH}} -export LD_LIBRARY_PATH=/usr/local/cuda-10.1/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} +.. code:: shell -# cudnn -tar -xzvf cudnn-10.1-linux-x64-v7.6.4.38.tgz -sudo cp -a cuda/include/cudnn.h /usr/local/cuda/include/ -sudo cp -a cuda/lib64/libcudnn* /usr/local/cuda/lib64/ + # cuda + sh cuda_10.1.168_418.67_linux.run + export PATH=/usr/local/cuda-10.1/bin${PATH:+:${PATH}} + export LD_LIBRARY_PATH=/usr/local/cuda-10.1/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} -# nccl -# install nccl local deb 参考https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html -sudo dpkg -i nccl-repo-ubuntu1804-2.5.6-ga-cuda10.1_1-1_amd64.deb -# 根据安装提示,还需要执行sudo apt-key add /var/nccl-repo-2.5.6-ga-cuda10.1/7fa2af80.pub -sudo apt update -sudo apt install libnccl2 libnccl-dev + # cudnn + tar -xzvf cudnn-10.1-linux-x64-v7.6.4.38.tgz + sudo cp -a cuda/include/cudnn.h /usr/local/cuda/include/ + sudo cp -a cuda/lib64/libcudnn* /usr/local/cuda/lib64/ -sudo ldconfig -``` + # nccl + # install nccl local deb 参考https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html + sudo dpkg -i nccl-repo-ubuntu1804-2.5.6-ga-cuda10.1_1-1_amd64.deb + # 根据安装提示,还需要执行sudo apt-key add /var/nccl-repo-2.5.6-ga-cuda10.1/7fa2af80.pub + sudo apt update + sudo apt install libnccl2 libnccl-dev + sudo ldconfig -> 编译飞桨过程中可能会打开很多文件,Ubuntu 18.04 默认设置最多同时打开的文件数是1024(参见 ulimit -a),需要更改这个设定值。 + +**编译飞桨过程中可能会打开很多文件,Ubuntu 18.04 默认设置最多同时打开的文件数是1024(参见 ulimit -a),需要更改这个设定值。** 在 /etc/security/limits.conf 文件中添加两行。 -``` -* hard noopen 102400 -* soft noopen 102400 -``` +.. code:: shell + + * hard noopen 102400 + * soft noopen 102400 + 重启计算机,重启后执行以下指令,请将${user}切换成当前用户名。 -``` -su ${user} -ulimit -n 102400 -``` +.. code:: shell -


+ su ${user} + ulimit -n 102400 + + +**二:编译命令** 使用 Git 将飞桨代码克隆到本地,并进入目录,切换到稳定版本(git tag显示的标签名,如v1.7.1)。 -> 飞桨使用 develop 分支进行最新特性的开发,使用 release 分支发布稳定版本。在 GitHub 的 Releases 选项卡中,可以看到飞桨版本的发布记录。 +**飞桨使用 develop 分支进行最新特性的开发,使用 release 分支发布稳定版本。在 GitHub 的 Releases 选项卡中,可以看到飞桨版本的发布记录。** -``` -git clone https://github.com/PaddlePaddle/Paddle.git -cd Paddle -git checkout v1.7.1 -``` +.. code:: shell + + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + git checkout v1.7.1 下面以 GPU 版本为例说明编译命令。其他环境可以参考“CMake编译选项表”修改对应的cmake选项。比如,若编译 CPU 版本,请将 WITH_GPU 设置为 OFF。 -``` -# 创建并进入 build 目录 -mkdir build_cuda && cd build_cuda -# 执行cmake指令 -cmake -DPY_VERSION=3 \ - -DWITH_TESTING=OFF \ - -DWITH_MKL=ON \ - -DWITH_GPU=ON \ - -DON_INFER=ON \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ - .. -# 使用make编译 +.. code:: shell + + # 创建并进入 build 目录 + mkdir build_cuda && cd build_cuda + # 执行cmake指令 + cmake -DPY_VERSION=3 \ + -DWITH_TESTING=OFF \ + -DWITH_MKL=ON \ + -DWITH_GPU=ON \ + -DON_INFER=ON \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + .. + +**使用make编译** + make -j4 -# 编译成功后可在dist目录找到生成的.whl包 + +**编译成功后可在dist目录找到生成的.whl包** + pip3 install python/dist/paddlepaddle-1.7.1-cp36-cp36m-linux_x86_64.whl -# 预测库编译 + +**预测库编译** + make inference_lib_dist -j4 -``` -


+**cmake编译环境表** 以下介绍的编译方法都是通用步骤,根据环境对应修改cmake选项即可。 @@ -141,34 +157,41 @@ make inference_lib_dist -j4 ## 基于Windows 10 -


+**一:环境准备** + 除了本节开头提到的依赖,在 Windows 10 上编译飞桨,您还需要准备 Visual Studio 2015 Update3 以上版本。本节以 Visual Studio 企业版 2019(C++ 桌面开发,含 MSVC 14.24)、Python 3.8 为例介绍编译过程。 在命令提示符输入下列命令,安装必需的 Python 组件。 -`pip3 install numpy protobuf wheel` +.. code:: shell + + pip3 install numpy protobuf wheel -


+**二:编译命令** 使用 Git 将飞桨代码克隆到本地,并进入目录,切换到稳定版本(git tag显示的标签名,如v1.7.1)。 -> 飞桨使用 develop 分支进行最新特性的开发,使用 release 分支发布稳定版本。在 GitHub 的 Releases 选项卡中,可以看到 Paddle 版本的发布记录。 +**飞桨使用 develop 分支进行最新特性的开发,使用 release 分支发布稳定版本。在 GitHub 的 Releases 选项卡中,可以看到 Paddle 版本的发布记录。** -``` -git clone https://github.com/PaddlePaddle/Paddle.git -cd Paddle -git checkout v1.7.1 -``` +.. code:: shell + + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + git checkout v1.7.1 + 创建一个构建目录,并在其中执行 CMake,生成解决方案文件 Solution File,以编译 CPU 版本为例说明编译命令,其他环境可以参考“CMake编译选项表”修改对应的cmake选项。 -``` -mkdir build -cd build -cmake .. -G "Visual Studio 16 2019" -A x64 -DWITH_GPU=OFF -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release -DPY_VERSION=3 -``` -
+.. code:: shell + + mkdir build + cd build + cmake .. -G "Visual Studio 16 2019" -A x64 -DWITH_GPU=OFF -DWITH_TESTING=OFF + -DCMAKE_BUILD_TYPE=Release -DPY_VERSION=3 + +.. image:: https://agroup-bos.cdn.bcebos.com/1b21aff9424cb33a98f2d1e018d8301614caedda + 使用 Visual Studio 打开解决方案文件,在窗口顶端的构建配置菜单中选择 Release x64,单击生成解决方案,等待构建完毕即可。 -


+**cmake编译环境表** |选项|说明|默认值| |:--:|:--:|:--:| @@ -184,55 +207,61 @@ cmake .. -G "Visual Studio 16 2019" -A x64 -DWITH_GPU=OFF -DWITH_TESTING=OFF -DC |`CUDA_ARCH_NAM`E|是否只针对当前 CUDA 架构编译| All: 编译所有可支持的 CUDA 架构;Auto: 自动识别当前环境的架构编译| |TENSORRT_ROOT|TensorRT lib的路径,该路径指定后会编译TRT子图功能 eg: /paddle/nvidia/TensorRT/ | / | -
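上表中的 TENSORRT_ROOT 只决定编译期是否把 TRT 子图功能编进预测库;推理时还需要在 AnalysisConfig 中显式开启 TensorRT。下面是一个 C++ 示意片段(模型路径与 max_batch_size 均为假设值):

.. code:: c++

    #include "paddle_inference_api.h"

    // 仅作示意:要求所用预测库在编译时已指定 TENSORRT_ROOT
    paddle::AnalysisConfig MakeTrtConfig() {
      paddle::AnalysisConfig config;
      config.SetModel("./resnet50/model", "./resnet50/params");  // 假设的模型路径
      config.EnableUseGpu(100, 0);          // 预分配 100MB 显存,使用 0 号 GPU
      config.EnableTensorRtEngine(1 << 30,  // workspace_size
                                  1,        // max_batch_size,假设为 1
                                  3,        // min_subgraph_size
                                  paddle::AnalysisConfig::Precision::kFloat32,
                                  false,    // use_static
                                  false);   // use_calib_mode
      return config;
    }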

+**结果验证** + +**一:python whl包** -


编译完毕后,会在 python/dist 目录下生成一个文件名类似 paddlepaddle-1.7.1-cp36-cp36m-linux_x86_64.whl 的 Python Wheel 安装包,安装测试的命令为: -`pip3 install python/dist/paddlepaddle-1.7.1-cp36-cp36m-linux_x86_64.whl` +.. code:: shell + + pip3 install python/dist/paddlepaddle-1.7.1-cp36-cp36m-linux_x86_64.whl + +安装完成后,可以使用 python3 进入python解释器,输入以下指令,出现 `Your Paddle Fluid is installed succesfully! ` ,说明安装成功。 -安装完成后,可以使用 python3 进入python解释器,输入以下指令,出现`Your Paddle Fluid is installed succesfully! `,说明安装成功。 -``` -import paddle.fluid as fluid -fluid.install_check.run_check() -``` +.. code:: python -


+ import paddle.fluid as fluid + fluid.install_check.run_check() + + +**二:c++ lib** 预测库编译后,所有产出均位于build目录下的fluid_inference_install_dir目录内,目录结构如下。version.txt 中记录了该预测库的版本信息,包括Git Commit ID、使用OpenBlas或MKL数学库、CUDA/CUDNN版本号。 -``` -build/fluid_inference_install_dir -├── CMakeCache.txt -├── paddle -│ ├── include -│ │ ├── paddle_anakin_config.h -│ │ ├── paddle_analysis_config.h -│ │ ├── paddle_api.h -│ │ ├── paddle_inference_api.h -│ │ ├── paddle_mkldnn_quantizer_config.h -│ │ └── paddle_pass_builder.h -│ └── lib -│ ├── libpaddle_fluid.a (Linux) -│ ├── libpaddle_fluid.so (Linux) -│ └── libpaddle_fluid.lib (Windows) -├── third_party -│ ├── boost -│ │ └── boost -│ ├── eigen3 -│ │ ├── Eigen -│ │ └── unsupported -│ └── install -│ ├── gflags -│ ├── glog -│ ├── mkldnn -│ ├── mklml -│ ├── protobuf -│ ├── xxhash -│ └── zlib -└── version.txt -``` +.. code:: shell + + build/fluid_inference_install_dir + ├── CMakeCache.txt + ├── paddle + │ ├── include + │ │ ├── paddle_anakin_config.h + │ │ ├── paddle_analysis_config.h + │ │ ├── paddle_api.h + │ │ ├── paddle_inference_api.h + │ │ ├── paddle_mkldnn_quantizer_config.h + │ │ └── paddle_pass_builder.h + │ └── lib + │ ├── libpaddle_fluid.a (Linux) + │ ├── libpaddle_fluid.so (Linux) + │ └── libpaddle_fluid.lib (Windows) + ├── third_party + │ ├── boost + │ │ └── boost + │ ├── eigen3 + │ │ ├── Eigen + │ │ └── unsupported + │ └── install + │ ├── gflags + │ ├── glog + │ ├── mkldnn + │ ├── mklml + │ ├── protobuf + │ ├── xxhash + │ └── zlib + └── version.txt + Include目录下包括了使用飞桨预测库需要的头文件,lib目录下包括了生成的静态库和动态库,third_party目录下包括了预测库依赖的其它库文件。 -您可以编写应用代码,与预测库联合编译并测试结果。请参“[C++ 预测库 API 使用](https://aistudio.baidu.com/bjcpu/user/166411/248511/notebooks/248511.ipynb?redirects=1#C++%E9%A2%84%E6%B5%8BAPI)一节。 +您可以编写应用代码,与预测库联合编译并测试结果。请参 `C++ 预测库 API 使用 `_ 一节。 diff --git a/docs/user_guides/tutorial.md b/docs/user_guides/tutorial.md deleted file mode 100644 index 679d10a8626a4..0000000000000 --- a/docs/user_guides/tutorial.md +++ /dev/null @@ -1,60 +0,0 @@ -## 使用流程 - -### 一: 模型准备 -Paddle Inference目前支持的模型结构为PaddlePaddle深度学习框架产出的模型格式。因此,在您开始使用 Paddle Inference框架前您需要准备一个由PaddlePaddle框架保存的模型。 如果您手中的模型是由诸如Caffe2、Tensorflow等框架产出的,那么我们推荐您使用 X2Paddle 工具进行模型格式转换。 - -### 二: 环境准备 - -**1) Python 环境** - -安装Python环境有以下三种方式: - - a. 参照[官方主页](https://www.paddlepaddle.org.cn/)的引导进行pip安装。 - - b. 参照接下来的[预测库编译](./source_compile)页面进行自行编译。 - - c. 使用docker镜像 - - # 拉取镜像,该镜像预装Paddle 1.8 Python环境 - docker pull hub.baidubce.com/paddlepaddle/paddle:1.8.0-gpu-cuda10.0-cudnn7-trt6 - - export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" - export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') - export NVIDIA_SMI="-v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi" - - docker run $CUDA_SO $DEVICES $NVIDIA_SMI --name trt_open --privileged --security-opt seccomp=unconfined --net=host -v $PWD:/paddle -it hub.baidubce.com/paddlepaddle/paddle:1.8.0-gpu-cuda10.0-cudnn7-trt6 /bin/bash - -**2) C++ 环境** - -获取c++预测库有以下三种方式: - -a. [官网](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/inference_deployment/inference/build_and_install_lib_cn.html#linux)下载预编译库 - -b. 
使用docker镜像 - - # 拉取镜像,在容器内主目录~/下存放c++预编译库。 - docker pull hub.baidubce.com/paddlepaddle/paddle:1.8.0-gpu-cuda10.0-cudnn7-trt6 - - export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" - export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') - export NVIDIA_SMI="-v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi" - - docker run $CUDA_SO $DEVICES $NVIDIA_SMI --name trt_open --privileged --security-opt seccomp=unconfined --net=host -v $PWD:/paddle -it hub.baidubce.com/paddlepaddle/paddle:1.8.0-gpu-cuda10.0-cudnn7-trt6 /bin/bash - -c. 参照接下来的[预测库编译](./source_compile)页面进行自行编译。 - -### 三:使用Paddle Inference执行预测 - -使用Paddle Inference进行推理部署的流程如下所示。 -
- -1) 配置推理选项。`AnalysisConfig`是飞桨提供的配置管理器API。在使用Paddle Inference进行推理部署过程中,需要使用`AnalysisConfig`详细地配置推理引擎参数,包括但不限于在何种设备(CPU/GPU)上部署(`config.EnableUseGPU`)、加载模型路径、开启/关闭计算图分析优化、使用MKLDNN/TensorRT进行部署的加速等。参数的具体设置需要根据实际需求来定。 - -2) 创建`AnalysisPredictor`。`AnalysisPredictor`是Paddle Inference提供的推理引擎。你只需要简单的执行一行代码即可完成预测引擎的初始化,`std::unique_ptr predictor = CreatePaddlePredictor(config)`,config为1步骤中创建的`AnalysisConfig`。 - -3) 准备输入数据。执行 `auto input_names = predictor->GetInputNames()`,您会获取到模型所有输入tensor的名字,同时通过执行`auto tensor = predictor->GetInputTensor(input_names[i])`; 您可以获取第i个输入的tensor,通过`tensor->copy_from_cpu(data)` 方式,将data中的数据拷贝到tensor中。 - -4) 调用predictor->ZeroCopyRun()执行推理。 - -5) 获取推理输出。执行 `auto out_names = predictor->GetOutputNames()`,您会获取到模型所有输出tensor的名字,同时通过执行`auto tensor = predictor->GetOutputTensor(out_names[i])`; 您可以获取第i个输出的tensor。通过 `tensor->copy_to_cpu(data)` 将tensor中的数据copy到data指针上。 -。 diff --git a/docs/user_guides/tutorial.rst b/docs/user_guides/tutorial.rst new file mode 100644 index 0000000000000..f9f79fe7120f7 --- /dev/null +++ b/docs/user_guides/tutorial.rst @@ -0,0 +1,69 @@ +使用流程 +=========== + +一: 模型准备 +--------------- + +Paddle Inference目前支持的模型结构为PaddlePaddle深度学习框架产出的模型格式。因此,在您开始使用 Paddle Inference框架前您需要准备一个由PaddlePaddle框架保存的模型。 如果您手中的模型是由诸如Caffe2、Tensorflow等框架产出的,那么我们推荐您使用 X2Paddle 工具进行模型格式转换。 + +二: 环境准备 +--------------- + +**1) Python 环境** + +安装Python环境有以下三种方式: + +a. 参照 `官方主页 `_ 的引导进行pip安装。 + +b. 参照接下来的 `预测库编译 <./source_compile.html>`_ 页面进行自行编译。 + +c. 使用docker镜像 + +.. code:: shell + + # 拉取镜像,该镜像预装Paddle 1.8 Python环境 + docker pull hub.baidubce.com/paddlepaddle/paddle:1.8.0-gpu-cuda10.0-cudnn7-trt6 + + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + export NVIDIA_SMI="-v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi" + + docker run $CUDA_SO $DEVICES $NVIDIA_SMI --name trt_open --privileged --security-opt seccomp=unconfined --net=host -v $PWD:/paddle -it hub.baidubce.com/paddlepaddle/paddle:1.8.0-gpu-cuda10.0-cudnn7-trt6 /bin/bash + +**2) C++ 环境** + +获取c++预测库有以下三种方式: + +a. `官网 `_ 下载预编译库 + +b. 使用docker镜像 + +.. code:: shell + + # 拉取镜像,在容器内主目录~/下存放c++预编译库。 + docker pull hub.baidubce.com/paddlepaddle/paddle:1.8.0-gpu-cuda10.0-cudnn7-trt6 + + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + export NVIDIA_SMI="-v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi" + + docker run $CUDA_SO $DEVICES $NVIDIA_SMI --name trt_open --privileged --security-opt seccomp=unconfined --net=host -v $PWD:/paddle -it hub.baidubce.com/paddlepaddle/paddle:1.8.0-gpu-cuda10.0-cudnn7-trt6 /bin/bash + +c. 参照接下来的 `预测库编译 <./source_compile.html>`_页面进行自行编译。 + +三:使用Paddle Inference执行预测 +----------------- + +使用Paddle Inference进行推理部署的流程如下所示。 + +.. 
image:: https://ai-studio-static-online.cdn.bcebos.com/10d5cee239374bd59e41283b3233f49dc306109da9d540b48285980810ab4e36 + +1) 配置推理选项。 **AnalysisConfig** 是飞桨提供的配置管理器API。在使用Paddle Inference进行推理部署过程中,需要使用 **AnalysisConfig** 详细地配置推理引擎参数,包括但不限于在何种设备(CPU/GPU)上部署( **config.EnableUseGPU** )、加载模型路径、开启/关闭计算图分析优化、使用MKLDNN/TensorRT进行部署的加速等。参数的具体设置需要根据实际需求来定。 + +2) 创建 **AnalysisPredictor** 。 **AnalysisPredictor** 是Paddle Inference提供的推理引擎。你只需要简单的执行一行代码即可完成预测引擎的初始化 **std::unique_ptr predictor = CreatePaddlePredictor(config)** ,config为1步骤中创建的 **AnalysisConfig**。 + +3) 准备输入数据。执行 **auto input_names = predictor->GetInputNames()** ,您会获取到模型所有输入tensor的名字,同时通过执行 **auto tensor = predictor->GetInputTensor(input_names[i])** ; 您可以获取第i个输入的tensor,通过 **tensor->copy_from_cpu(data)** 方式,将data中的数据拷贝到tensor中。 + +4) 调用predictor->ZeroCopyRun()执行推理。 + +5) 获取推理输出。执行 **auto out_names = predictor->GetOutputNames()** ,您会获取到模型所有输出tensor的名字,同时通过执行 **auto tensor = predictor->GetOutputTensor(out_names[i])** ; 您可以获取第i个输出的tensor。通过 **tensor->copy_to_cpu(data)** 将tensor中的数据copy到data指针上
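把上述五个步骤串起来,就是下面这样一个最小的 C++ 示例(模型路径与输入 shape 均为假设值,这里以 ResNet50 的 combined 模型为例,完整工程请参考官方 Demo):

.. code:: c++

    #include <functional>
    #include <iostream>
    #include <memory>
    #include <numeric>
    #include <vector>
    #include "paddle_inference_api.h"

    int main() {
      // 1) 配置推理选项
      paddle::AnalysisConfig config;
      config.SetModel("./resnet50/model", "./resnet50/params");  // 假设的模型路径
      config.EnableUseGpu(100, 0);
      config.SwitchUseFeedFetchOps(false);  // 使用 ZeroCopy 接口需关闭 feed/fetch OP
      config.EnableMemoryOptim();

      // 2) 创建 AnalysisPredictor
      auto predictor = paddle::CreatePaddlePredictor(config);

      // 3) 准备输入数据(shape 为假设值)
      std::vector<int> input_shape = {1, 3, 224, 224};
      int input_num = std::accumulate(input_shape.begin(), input_shape.end(), 1,
                                      std::multiplies<int>());
      std::vector<float> input_data(input_num, 1.0f);
      auto input_names = predictor->GetInputNames();
      auto input_t = predictor->GetInputTensor(input_names[0]);
      input_t->Reshape(input_shape);
      input_t->copy_from_cpu(input_data.data());

      // 4) 执行推理
      predictor->ZeroCopyRun();

      // 5) 获取推理输出
      auto output_names = predictor->GetOutputNames();
      auto output_t = predictor->GetOutputTensor(output_names[0]);
      std::vector<int> output_shape = output_t->shape();
      int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
                                    std::multiplies<int>());
      std::vector<float> out_data(out_num);
      output_t->copy_to_cpu(out_data.data());
      std::cout << "output num: " << out_num << std::endl;
      return 0;
    }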