# basic
---

# pandas
---
```python
# Read the CSV data, set the column names, and pop the specified column out of the frame.
# NOTE(review): `names=[]` and `pop([])` look like placeholders -- presumably to be
# replaced with the real column names and the label column name.
train = pd.read_csv(train_path, names=[], header=0) # return a DataFrame
train_x, train_y = train, train.pop([])
```



# Dataset
---
`tf.data`模块主要用于加载数据，预处理和传输到模型中。可以从`numpy.arrays`和csv文件中读取数据。

- `Dataset` - Base class
- `TextLineDataset` - Reads lines from text file.
- `TFRecordDataset` - Reads records from TFRecord files.
- `FixedLengthRecordDataset` - Reads fixed-size records from binary files.
- `Iterator`

```python

# 把数据集转成Dataset格式
def train_input_fn(features, labels, batch_size):
    """Build a shuffled, repeating, batched Dataset from in-memory data.

    Args:
        features: Mapping-like object of feature name -> values; it is
            converted with ``dict()`` before slicing.
        labels: Labels to pair with the features.
        batch_size: Number of examples per batch.

    Returns:
        A ``tf.data.Dataset`` whose elements are ``(features, labels)`` batches.
    """
    # from_tensor_slices takes a single `tensors` argument, so the feature
    # dict and the labels must be packed together into one tuple.
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    # Shuffle with a 1000-element buffer, repeat, then batch.
    return dataset.shuffle(1000).repeat().batch(batch_size)

# Create a one-shot iterator over `dataset` and ask it for its next element.
dataset.make_one_shot_iterator().get_next()
```
### read data from csv
---

```python
# Build the dataset: read the file line by line and skip the first line
# (presumably the CSV header row -- confirm against the data file).
ds = tf.data.TextLineDataset(train_path).skip(1)

# Build a CSV line parser.
# Column names, in the order the fields appear in each line.
COLUMNS = ['SepalLength', 'SepalWidth',
       'PetalLength', 'PetalWidth',
       'label']
# One default per column, passed to the CSV decoder: float defaults for the
# four measurement columns and an int default for the label column.
FIELD_DEFAULTS = [[0.0], [0.0], [0.0], [0.0], [0]]

def _parse_line(line):
    """Decode one CSV line into a ``(features, label)`` pair.

    The line is decoded with ``FIELD_DEFAULTS``, the decoded fields are keyed
    by the names in ``COLUMNS``, and the ``'label'`` entry is removed from the
    feature dict and returned separately.
    """
    decoded = tf.decode_csv(line, FIELD_DEFAULTS)
    features = {name: value for name, value in zip(COLUMNS, decoded)}
    return features, features.pop('label')

# Parse the lines: apply _parse_line to every element of the dataset.
ds = ds.map(_parse_line)
```

# argparse
---
```python
# Create the command-line parser.
parser = argparse.ArgumentParser()

# Register an optional integer flag `--name` that defaults to 10 (help text left empty).
parser.add_argument('--name', type=int, default=10, help='')

# parse_known_args returns (namespace, remaining_args): unrecognised arguments
# are collected in the second element instead of causing an error; they are
# discarded here.
FLAGS, _ = parser.parse_known_args()
```

# tf.feature_column
---
用于将数据转成Estimator可用的格式。
```python
# Represent the feature `key` as a numeric column.
tf.feature_column.numeric_column(key)

# Categorical column whose values are mapped through an explicit vocabulary list.
tf.feature_column.categorical_column_with_vocabulary_list(key, vocabulary_list)

# When the set of possible values is not known in advance, use a hash-bucket
# column instead; each value is mapped to a number.
tf.feature_column.categorical_column_with_hash_bucket(key, hash_bucket_size, dtype=tf.string)

# Split a column into ranges (buckets) at the given boundaries.
tf.feature_column.bucketized_column(source_column, boundaries)

# Combine several columns into one; `keys` is a list containing the column names.
tf.feature_column.crossed_column(keys, hash_bucket_size, hash_key=None)
```

In [21]:
tf.squeeze?

# tf.estimator
---
```python
# DNNClassifier: built from the hidden-layer sizes and the feature columns,
# here configured for 2 classes.
classifier = tf.estimator.DNNClassifier(hidden_units, feature_columns, n_classes=2)

# Train, predict and evaluate each take an input function that supplies the data.
classifier.train(input_fn, hooks=None, steps=None)
classifier.predict(input_fn, predict_keys)
classifier.evaluate(input_fn)
```
---
**input_fn**是一个返回Dataset对象的函数，输出应该是有两个元素的元组。
- `features` - 一个python的字典类型
    - key为特征的名字
    - value：包含特征值的列表
- `label` - 一个数组，为每个样本的label。

```python
def input_evaluation_set():
    """Return a small hard-coded ``(features, labels)`` pair.

    ``features`` maps each of the four feature names to a NumPy array holding
    two sample values; ``labels`` is a NumPy array with the label of each of
    the two samples.
    """
    columns = (
        ('SepalLength', [6.4, 5.0]),
        ('SepalWidth', [2.8, 2.3]),
        ('PetalLength', [5.6, 3.3]),
        ('PetalWidth', [2.2, 1.0]),
    )
    features = {name: np.array(values) for name, values in columns}
    return features, np.array([2, 1])
```
用法2：
```python
# Run configuration for the estimator (model directory plus further settings, elided here).
run_config = tf.estimator.RunConfig(model_dir,...)

# Hyper-parameter container; handed to the estimator as `params` below.
model_params = tf.contrib.training.HParams()

# Custom estimator built from a user-supplied model_fn, the run config and the hyper-parameters.
estimator = tf.estimator.Estimator(model_fn, config=run_config, params=model_params)

# Training and evaluation specs (arguments omitted in these notes) --
# presumably intended for tf.estimator.train_and_evaluate; confirm.
train_spec = tf.estimator.TrainSpec()

eval_spec = tf.estimator.EvalSpec()
```

## Define the model
---
### Define the input layer
```python
# Write an input function
def train_input_fn(features, labels, batch_size):
    """Return the next-element tensors of a shuffled, repeating, batched dataset.

    Args:
        features: Mapping-like object of feature name -> values; converted
            with ``dict()`` before slicing.
        labels: Labels to pair with the features.
        batch_size: Number of examples per batch.

    Returns:
        The ``(features, labels)`` pair produced by a one-shot iterator over
        the dataset.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    dataset = dataset.shuffle(1000).repeat().batch(batch_size)
    return dataset.make_one_shot_iterator().get_next()


# Create feature columns: one numeric column per feature key
my_feature_columns = [
    tf.feature_column.numeric_column(key=key) for key in train_x.keys()
]


# Write a model function
def my_model_fn(features, labels, mode, params):
    """Model function for a custom Estimator: a small fully connected classifier.

    Args:
        features: Feature dict produced by the input function.
        labels: Label tensor produced by the input function.
        mode: One of ``tf.estimator.ModeKeys.{TRAIN, EVAL, PREDICT}``.
        params: Dict with the keys ``'feature_columns'``, ``'hidden_units'``
            and ``'n_classes'``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` matching ``mode``.

    Raises:
        ValueError: If ``mode`` is not one of the three supported mode keys.
    """
    # Define the input layer
    net = tf.feature_column.input_layer(features, params['feature_columns'])

    # Define hidden layers, one dense ReLU layer per entry in hidden_units
    for units in params['hidden_units']:
        net = tf.layers.dense(net, units=units, activation=tf.nn.relu)

    # Define the output layer (no activation: raw logits)
    logits = tf.layers.dense(net, params['n_classes'], activation=None)

    # Predict operation
    predicted_classes = tf.argmax(logits, 1)
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'class_ids': predicted_classes[:, tf.newaxis],
            'probabilities': tf.nn.softmax(logits),
            'logits': logits
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # Loss function
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    # Evaluate operation
    accuracy = tf.metrics.accuracy(labels=labels, predictions=predicted_classes, name='acc_op')

    metrics = {'accuracy': accuracy}
    # accuracy is a (value, update_op) pair; log the second element
    tf.summary.scalar('accuracy', accuracy[1])

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)

    # Train operation
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
        train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

    raise ValueError('Unsupported mode: {}'.format(mode))


# Define classifier; the params keys must match the ones read inside my_model_fn
classifier = tf.estimator.Estimator(model_fn=my_model_fn,
                                    params={
                                        'feature_columns': my_feature_columns,
                                        'hidden_units': [10, 10],
                                        'n_classes': 3
                                    })

# Define train: runs my_model_fn with mode == ModeKeys.TRAIN.
# NOTE: the input dataset repeats, so pass `steps=` to bound the training run.
classifier.train(input_fn=lambda: train_input_fn(train_x, train_y, 500))
```


# tf.logging
---
```python
# Turn on logging output at INFO level.
tf.logging.set_verbosity(tf.logging.INFO)
```

# checkpoints
---
用于estimator保存模型,配置保存间隔，和最大保存文件数。
```python
# Checkpoint settings for the estimator: how often to save and how many files to keep.
my_checkpointing_config = tf.estimator.RunConfig(
    save_checkpoints_secs=20*60,  # save a checkpoint every 20 minutes
    keep_checkpoint_max=10,       # keep at most 10 checkpoint files
)

# Pass the checkpoint settings to the estimator through `config`.
classifier = tf.estimator.DNNClassifier(
    feature_columns=my_feature_columns,
    hidden_units=[10, 10],
    n_classes=3,
    model_dir='xxx/xxx',
    config=my_checkpointing_config,
)
```

# tf.layers
---
该模块主要用于创建神经网络，提供了创建全连接层，卷积层，激活函数，dropout regularization。案例为**mnist**

> CNNS(conv(relu)-pool-conv(relu)-pool-...-conv(relu)-dense-dense-output)

- **Convolutional layers**, which apply convolution filters to the image and then a ReLU activation function to the output.
- **Pooling layers**, which reduce the dimensionality of the feature maps.
- **Dense layers**, which perform classification on the extracted features.

> Build a model to classify the images in the MNIST dataset.

1. conv1, `weights = [-1, 5, 5, 32]`, with ReLU
2. pool1, `kernel_size=[1, 2, 2, 1]`, `stride=[1, 2, 2, 1]`
3. conv2, `weights=[-1, 5, 5, 64]`, with ReLU
4. pool2, `kernel_size=[1, 2, 2, 1]`, `stride=[1, 2, 2, 1]`
5. dense1, `[-1, 1024]`, `dropout(0.4)`,
6. dense2, `[-1, 10]`

In [1]:
import tensorflow as tf

In [11]:
tf.estimator.DNNClassifier?

In [12]:
import numpy as np

In [13]:
np.tile?