In [1]:
import pandas as pd
from sklearn import preprocessing
import time

In [2]:
from collections import Counter
from tqdm import tqdm
import random
from scipy import sparse
import numpy as np

In [3]:
# Load the training set from the local CSV; later cells read its `value` and `label` columns.
train_pd = pd.read_csv("./data/train.csv", encoding="utf-8")

In [4]:
# Class balance: how many rows carry each label value.
Counter(train_pd.label)

Counter({0: 27500, 1: 2500})

In [5]:
# Scratch arithmetic.
# NOTE(review): the label counts shown above sum to 30000 rows, so this may have been
# meant as 2500/30000 (share of label-1 rows) - confirm.
2500/3000

0.8333333333333334

In [6]:
# Split every training string into its individual characters.
#   word_list  -> one list of characters per row
#   vocab_list -> every character of every row, flattened into a single list
word_list, vocab_list = [], []
for line in tqdm(train_pd.value):
    chars = list(line)
    word_list.append(chars)
    vocab_list.extend(chars)

100%|█████████████████████████████████████████████████████████████████████████| 30000/30000 [00:00<00:00, 91490.53it/s]


In [7]:
# Character frequencies over the whole training set.
c = Counter(vocab_list)
# The distinct characters, sorted, as a NumPy array (this is the vocabulary).
vocab = np.sort(np.array(list(c)))
vocab

array(['%', '&', '*', '+', '-', '.', '0', '1', '2', '3', '4', '5', '6',
       '7', '8', '9', '=', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
       'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
       'W', 'X', 'Y', 'Z', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
       'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u',
       'v', 'w', 'x', 'y', 'z'],
      dtype='<U1')

In [8]:
# Fit a LabelEncoder on the vocabulary so each character maps to an integer code.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(vocab)
# Bare expression: not displayed, because only the last expression of a cell is shown.
label_encoder.classes_
# Display the integer code assigned to each vocabulary entry.
label_encoder.transform(vocab)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69], dtype=int64)

In [9]:
# Encode every row: one list of integer character codes per training string
# (label_encoder.transform is called once per row).
word_labelencoder = [list(label_encoder.transform(w)) for w in word_list]

In [10]:
# Sanity check on the first row. The first expression is not displayed (only the last
# expression of a cell is); the second decodes the integer codes back to characters.
word_labelencoder[0]
label_encoder.inverse_transform(word_labelencoder[0])

array(['k', 'e', 'y', 'o', 'n', 'e', '=', 'R', '8', '9', 'y', '0', 'w',
       'd', 'I', 'l', 'T', 'H', 'A', '2', 'z', 'g', 'y', 'd', '1', 'C',
       'C', '&', 'k', 'e', 'y', 't', 'w', 'o', '=', 'j', 'Q', 'u', 'e',
       'r', 'y', '2', '1', '4', '0', '5', '6', '2', '4', '6', '5', '1',
       '8', '8', '0', '3', '5', '2', '1', '6', '7', '_', '1', '4', '7',
       '8', '1', '4', '5', '2', '3', '3', '2', '2', '2'],
      dtype='<U1')

In [11]:
# Pack the encoded rows into a 1-D object array holding one Python list per row.
# The rows have different lengths, so the array is filled element by element:
# np.array(...) on a ragged list of lists raises ValueError on recent NumPy versions
# unless dtype=object is requested, and filling explicitly also keeps the result 1-D
# even if every row happened to have the same length.
x_batch = np.empty(len(word_labelencoder), dtype=object)
for row_idx, encoded_row in enumerate(word_labelencoder):
    x_batch[row_idx] = encoded_row

# Labels as a plain NumPy array, aligned row-for-row with x_batch.
y_batch = np.array(train_pd.label.copy())

x_batch.shape, y_batch.shape

((30000,), (30000,))

In [12]:
# Boolean-mask selection of the rows whose label is 1.
x_pos = x_batch[y_batch==1]
x_pos.shape

(2500,)

In [13]:
# Under-sample the label-0 rows so that both classes end up with the same row count.
# replace=False draws every row at most once: sampling with replacement (the default)
# produces duplicate rows, which can then land on both sides of the later train/test split.
# The sample size follows x_pos (selected in the previous cell) instead of a literal.
x_neg = x_batch[y_batch==0]
x_neg = np.random.choice(x_neg, size=min(len(x_pos), len(x_neg)), replace=False)
x_neg.shape

(2500,)

In [14]:
# Compare the row counts of the two class subsets.
x_pos.shape, x_neg.shape

((2500,), (2500,))

In [40]:
# Rebuild the data set from the two class subsets: label-1 rows first, then label-0 rows.
x_batch = np.concatenate((np.ravel(x_pos), np.ravel(x_neg)))

# Matching labels, in the same order as the rows above.
pos_labels = [1] * len(x_pos)
neg_labels = [0] * len(x_neg)
y_batch = np.concatenate([pos_labels, neg_labels])

In [41]:
# Preview the first two encoded rows only; displaying the whole object array prints
# every integer list it contains.
x_batch[:2]

array([ list([54, 48, 68, 58, 57, 48, 16, 53, 62, 58, 57, 59, 14, 6, 13, 1, 54, 48, 68, 63, 66, 58, 16, 49, 58, 58, 0, 8, 11, 8, 13, 0, 8, 11, 8, 15, 0, 8, 11, 8, 15, 0, 8, 11, 8, 15, 0, 8, 11, 8, 6, 39, 17, 25, 36, 22, 31, 34, 0, 8, 11, 8, 6, 20, 21, 28, 17, 41, 0, 8, 11, 8, 6, 0, 8, 11, 8, 13, 6, 0, 8, 11, 9, 17, 6, 0, 8, 11, 9, 17, 11, 0, 8, 11, 8, 13, 4, 4, 1, 54, 48, 68, 63, 51, 61, 48, 48, 16, 13, 15, 12, 10, 13, 6, 10, 8, 9, 10]),
       list([54, 48, 68, 58, 57, 48, 16, 13, 15, 13, 11, 14, 13, 12, 9, 9, 14, 1, 54, 48, 68, 63, 66, 58, 16, 6, 3, 37, 30, 25, 31, 30, 3, 35, 21, 28, 21, 19, 36, 3, 17, 21, 35, 43, 20, 21, 19, 34, 41, 32, 36, 0, 8, 14, 17, 21, 35, 43, 21, 30, 19, 34, 41, 32, 36, 0, 8, 14, 19, 31, 30, 19, 17, 36, 0, 8, 14, 6, 67, 13, 14, 13, 9, 13, 7, 12, 19, 12, 15, 12, 21, 12, 17, 12, 8, 12, 11, 12, 13, 12, 15, 12, 21, 0, 8, 19, 0, 8, 14, 35, 21, 28, 21, 19, 36, 3, 19, 31, 30, 19, 17, 36, 0, 8, 14, 36, 17, 18, 28, 21, 43, 30, 17, 29, 21, 0, 8, 19, 6, 67, 13, 14, 13, 

In [16]:
# Shuffle rows and labels together by applying one random permutation of the row
# indices to both arrays.
# NOTE(review): no random seed is set in the visible cells, so the order differs per run;
# the cell also overwrites x_batch / y_batch, so re-running it reshuffles again.
index = np.arange(len(x_batch))
np.random.shuffle(index)

x_batch = x_batch[index]
y_batch = y_batch[index]

In [17]:
# Row count after balancing and shuffling.
x_batch.shape

(5000,)

In [18]:
# Scratch arithmetic: 80% of 5000 rows (Batcher below uses the same 0.8 factor for its split).
5000*0.8

4000.0

In [36]:
# NOTE(review): the recorded 2-D output implies this ran after the one-hot cell further
# down (In [27]). At this position in a top-to-bottom run, x_batch[0] is still a plain
# Python list, which has no .shape - confirm and move below the one-hot cell.
x_batch[0].shape

(300, 70)

In [37]:
# Shape of the second row; same out-of-order caveat as the previous cell: .shape only
# exists once the rows have been one-hot encoded into arrays (In [27]).
x_batch[1].shape

(69, 70)

1670

In [19]:
class Batcher():
    """Hold a train/test split of (x, y) and serve sequential training mini-batches.

    The first `train_fraction` of the rows (in the order given) become the training
    set and the remainder the test set; no shuffling is done here.

    Args:
        x: sliceable collection of sequences (each element must support len()).
        y: sliceable collection of labels, aligned with x.
        train_fraction: share of rows used for training (default 0.8).
    """
    def __init__(self, x, y, train_fraction=0.8):
        self.train_size = int(len(x)*train_fraction)
        self.train_x = x[:self.train_size]
        self.train_y = y[:self.train_size]
        self.test_x = x[self.train_size:]
        self.test_y = y[self.train_size:]
        # Index of the first row of the next training batch.
        self.start = 0

    def next_batch(self, batch_size):
        """Return the next `batch_size` training rows, wrapping to the start when exhausted.

        Returns:
            (batch_x, batch_y, seq_lens) where seq_lens[i] == len(batch_x[i]).

        The cursor wraps only when the requested batch would run past the end of the
        training set; a batch ending exactly on the last training row is still served.
        If batch_size exceeds the training set, every call returns the whole training set.
        """
        # Strictly greater-than: a batch ending exactly at train_size is valid and must
        # not be skipped.
        if self.start + batch_size > self.train_size:
            self.start = 0
        s_index = self.start
        e_index = s_index + batch_size
        self.start = e_index
        batch_x = self.train_x[s_index:e_index]
        batch_y = self.train_y[s_index:e_index]
        return batch_x, batch_y, [len(w) for w in batch_x]

In [20]:
import tensorflow as tf

In [21]:
# ---- Constants ----

# Image size, 32 x 256
OUTPUT_SHAPE = (32, 256)

# Maximum number of training epochs
num_epochs = 10000

# LSTM: hidden units per cell, number of stacked layers, number of output classes
num_hidden, num_layers, num_classes = 64, 1, 2

# obj = gen_id_card()
# num_classes = obj.len + 1 + 1  # 10 digits + blank + ctc blank

# Learning-rate schedule: initial rate, steps between decays, decay factor
INITIAL_LEARNING_RATE = 0.001
DECAY_STEPS = 5000
LEARNING_RATE_DECAY_FACTOR = 0.9  # The learning rate decay factor
MOMENTUM = 0.9

# A checkpoint is written every REPORT_STEPS optimisation steps
REPORT_STEPS = 100

DIGITS = '0123456789'

# Mini-batches per epoch, rows per mini-batch, and their product
BATCHES = 20
BATCH_SIZE = 500
TRAIN_SIZE = BATCH_SIZE * BATCHES

In [22]:
def get_train_model():
    """Build the LSTM graph and return its logits together with the input placeholders.

    Returns:
        (logits, inputs, targets, seq_len, W, b) where
        - logits is time-major: [max_timesteps, batch_size, num_classes];
        - inputs is a float32 placeholder [batch, time, OUTPUT_SHAPE[0]];
        - targets is an int32 sparse placeholder;
        - seq_len is an int32 placeholder with one length per batch row;
        - W, b are the weights and bias of the output projection.

    NOTE(review): the feature width is OUTPUT_SHAPE[0] (32), while the one-hot rows built
    elsewhere in this notebook are len(vocab) wide - confirm which width is intended.
    """
    inputs = tf.placeholder(tf.float32, [None, None, OUTPUT_SHAPE[0]])
    targets = tf.sparse_placeholder(tf.int32)
    seq_len = tf.placeholder(tf.int32, [None])

    with tf.name_scope("lstm"):
        # One separate LSTMCell instance per layer (the same cell object must not be
        # repeated), stacked and actually passed to dynamic_rnn so that num_layers
        # takes effect.
        cells = [tf.contrib.rnn.LSTMCell(num_hidden, state_is_tuple=True)
                 for _ in range(num_layers)]
        stack = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
        outputs, _ = tf.nn.dynamic_rnn(stack, inputs, seq_len, dtype=tf.float32)

        # Batch size is only known at run time.
        batch_s = tf.shape(inputs)[0]

        # [batch_size*max_timesteps, num_hidden]
        outputs = tf.reshape(outputs, [-1, num_hidden])
        W = tf.Variable(tf.truncated_normal([num_hidden,  num_classes], stddev=0.1), name="W")
        b = tf.Variable(tf.constant(0.0, shape=[num_classes]), name="b")
        # [batch_size*max_timesteps, num_classes]
        logits = tf.matmul(outputs, W) + b
        # [batch_size, max_timesteps, num_classes]
        logits = tf.reshape(logits, [batch_s, -1, num_classes])
        # Swap axes 0 and 1 => [max_timesteps, batch_size, num_classes]
        logits = tf.transpose(logits, (1, 0, 2))

    return logits, inputs, targets, seq_len, W, b

In [23]:
# Non-trainable step counter; a later cell hands it to the optimiser as its global_step.
global_step = tf.Variable(0, trainable=False)

# Staircase exponential decay of the learning rate, driven by global_step.
learning_rate = tf.train.exponential_decay(
    learning_rate=INITIAL_LEARNING_RATE,
    global_step=global_step,
    decay_steps=DECAY_STEPS,
    decay_rate=LEARNING_RATE_DECAY_FACTOR,
    staircase=True,
)

# Build the model graph under the "rnn" name scope and keep its handles at module level.
with tf.name_scope("rnn"):
    logits, inputs, targets, seq_len, W, b = get_train_model()

In [24]:
# `targets` is a sparse tensor (it comes from tf.sparse_placeholder).
loss = tf.nn.ctc_loss(labels=targets, inputs=logits, sequence_length=seq_len)
cost = tf.reduce_mean(loss)

# optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=MOMENTUM).minimize(cost, global_step=global_step)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss, global_step=global_step)

# Label error rate: mean edit distance between the beam-search decoding and the targets.
# The validation step of the training cell fetches `acc`, so it has to exist at module
# level; these are the same ops that train() builds for itself.
decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, seq_len, merge_repeated=False)
acc = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), targets))

In [25]:
def do_batch():
    """Run one optimisation step on the next training mini-batch.

    Pulls BATCH_SIZE rows from the module-level batcher `sqli`, feeds them to the
    graph through the module-level `session`, prints the batch cost and step count,
    and writes a checkpoint whenever the step count is a positive multiple of
    REPORT_STEPS.

    Returns:
        (batch_cost, step_count)
    """
    batch_inputs, batch_targets, batch_seq_len = sqli.next_batch(BATCH_SIZE)

    feed = {
        inputs: batch_inputs,
        targets: batch_targets,
        seq_len: batch_seq_len,
    }
    fetches = [loss, targets, logits, seq_len, cost, global_step, optimizer]
    results = session.run(fetches, feed)
    # Only the mean cost (position 4) and the step counter (position 5) are used.
    b_cost = results[4]
    steps = results[5]

    print(b_cost, steps)
    checkpoint_due = steps > 0 and steps % REPORT_STEPS == 0
    if checkpoint_due:
        saver.save(session, "ocr.model", global_step=steps)
    return b_cost, steps

In [26]:
# Display the integer code of each vocabulary entry again (same call as in In [8]).
label_encoder.transform(vocab)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69], dtype=int64)

In [27]:
# One-hot encode every row: each integer code selects the matching row of an identity
# matrix, so a row of n codes becomes an (n, len(vocab)) array.
# The identity matrix is built once here instead of once per row.
# NOTE(review): this overwrites x_batch with its own transform, so running the cell a
# second time would index the matrix with already-encoded arrays - run it once per kernel.
one_hot_rows = np.eye(len(vocab))
x_batch = [one_hot_rows[item] for item in x_batch]

In [30]:
# Display the configured mini-batch size.
BATCH_SIZE

500

In [29]:
# Number of rows in one training batch.
# NOTE(review): `sqli` is only created in the training cell further down, so this raises
# NameError on a top-to-bottom run; the call also advances the batcher's cursor.
len(sqli.next_batch(BATCH_SIZE)[0])

NameError: name 'sqli' is not defined

In [31]:
# NOTE(review): `sqli` is first assigned in the training cell below; at this position a
# top-to-bottom run raises NameError.
sqli

NameError: name 'sqli' is not defined

In [32]:
# Train for num_epochs epochs of BATCHES mini-batches each, then evaluate on the
# held-out rows of the batcher after every epoch.
# NOTE(review): the recorded run stopped with "ValueError: setting an array element with
# a sequence." - presumably because rows of different lengths are fed without padding,
# and because `targets` is a sparse placeholder but receives the plain label slice - confirm.
init = tf.global_variables_initializer()
# Batcher keeps the first 80% of the rows for training and the rest for testing.
sqli = Batcher(x_batch,y_batch)
with tf.Session() as session:
    session.run(init)
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)
    for curr_epoch in range(num_epochs):
        print("Epoch.......", curr_epoch)
        # train_ler is never updated below, so the epoch log always reports it as 0.
        train_cost = train_ler = 0
        for batch in range(BATCHES):
            start = time.time()
            # `c` rebinds the module-level name used for the character Counter earlier on.
            c, steps = do_batch()
            # Weight each batch's mean cost by the batch size; normalised after the loop.
            train_cost += c * BATCH_SIZE
            seconds = time.time() - start
            print("Step:", steps, ", batch seconds:", seconds)
            
        train_cost /= TRAIN_SIZE
            
#         train_inputs, train_targets, train_seq_len = get_next_batch(BATCH_SIZE)
#         train_inputs, train_targets, train_seq_len = sqli.next_batch(BATCH_SIZE)
        # Despite the train_* names, these are the held-out test rows of the batcher.
        train_inputs = sqli.test_x
        train_targets = sqli.test_y
        train_seq_len = [len(w) for w in train_inputs]
        val_feed = {inputs: train_inputs,
                        targets: train_targets,
                        seq_len: train_seq_len}
 
        # NOTE(review): `acc` is not defined at module level in the visible cells (only
        # inside train()), so this fetch would raise NameError - confirm.
        val_cost, val_ler, lr, steps = session.run([cost, acc, learning_rate, global_step], feed_dict=val_feed)
 
        log = "Epoch {}/{}, steps = {}, train_cost = {:.3f}, train_ler = {:.3f}, val_cost = {:.3f}, val_ler = {:.3f}, time = {:.3f}s, learning_rate = {}"
        # `start` was last set at the beginning of the final batch, so "time" covers that
        # batch plus the validation step, not the whole epoch.
        print(log.format(curr_epoch + 1, num_epochs, steps, train_cost, train_ler, val_cost, val_ler, time.time() - start, lr))

Epoch....... 0


ValueError: setting an array element with a sequence.

In [None]:
# NOTE(review): `x` is not assigned in any earlier cell; it is first bound further down
# as a tf.placeholder, so this raises NameError on a top-to-bottom run.
x

In [None]:
def train():
    """Build the CTC training graph and run the whole training loop in its own session.

    Creates the step counter, a staircase exponentially-decayed learning rate, the
    model from get_train_model(), the CTC loss with an Adam optimiser, and a
    beam-search decoder whose mean edit distance to the targets is reported as the
    label error rate. It then runs num_epochs epochs of BATCHES mini-batches each,
    saves a checkpoint every REPORT_STEPS steps, and prints one summary line per epoch.

    NOTE(review): get_next_batch and decode_sparse_tensor are not defined in the visible
    cells; they must exist before this function is called - confirm.
    """
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                                global_step,
                                                DECAY_STEPS,
                                                LEARNING_RATE_DECAY_FACTOR,
                                                staircase=True)
    logits, inputs, targets, seq_len, W, b = get_train_model()

    # `targets` is a sparse tensor
    loss = tf.nn.ctc_loss(labels=targets,inputs=logits, sequence_length=seq_len)
    cost = tf.reduce_mean(loss)

    #optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,momentum=MOMENTUM).minimize(cost, global_step=global_step)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss,global_step=global_step)

    # As described earlier, after splitting into blocks we look for the class probability
    # distribution of each block; ctc_beam_search_decoder keeps the K most probable
    # candidates at each step.
    # The greedy alternative keeps only the single most probable one, i.e. K=1:
    # ctc_greedy_decoder.
    decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, seq_len, merge_repeated=False)

    # Mean edit distance between the best decoding and the targets (label error rate).
    acc = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), targets))

    init = tf.global_variables_initializer()

    def report_accuracy(decoded_list, test_targets):
        """Print a per-sample comparison of decoded vs. expected sequences and the hit rate."""
        original_list = decode_sparse_tensor(test_targets)
        detected_list = decode_sparse_tensor(decoded_list)
        true_numer = 0

        # The lists are compared position by position, so they must have equal length.
        if len(original_list) != len(detected_list):
            print("len(original_list)", len(original_list), "len(detected_list)", len(detected_list),
                  " test and detect length desn't match")
            return
        print("T/F: original(length) <-------> detectcted(length)")
        for idx, number in enumerate(original_list):
            detect_number = detected_list[idx]
            hit = (number == detect_number)
            print(hit, number, "(", len(number), ") <-------> ", detect_number, "(", len(detect_number), ")")
            if hit:
                true_numer = true_numer + 1
        print("Test Accuracy:", true_numer * 1.0 / len(original_list))

    def do_report():
        """Decode one freshly drawn batch and print its accuracy report."""
        test_inputs,test_targets,test_seq_len = get_next_batch(BATCH_SIZE)
        test_feed = {inputs: test_inputs,
                     targets: test_targets,
                     seq_len: test_seq_len}
        dd, log_probs, accuracy = session.run([decoded[0], log_prob, acc], test_feed)
        report_accuracy(dd, test_targets)

    def do_batch():
        """Run one optimisation step; report and checkpoint every REPORT_STEPS steps."""
        train_inputs, train_targets, train_seq_len = get_next_batch(BATCH_SIZE)

        feed = {inputs: train_inputs, targets: train_targets, seq_len: train_seq_len}

        b_loss,b_targets, b_logits, b_seq_len,b_cost, steps, _ = session.run([loss, targets, logits, seq_len, cost, global_step, optimizer], feed)

        print(b_cost, steps)
        if steps > 0 and steps % REPORT_STEPS == 0:
            do_report()
            save_path = saver.save(session, "ocr.model", global_step=steps)
        return b_cost, steps

    with tf.Session() as session:
        session.run(init)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)
        # range, not xrange: the notebook runs on Python 3, where xrange does not exist.
        for curr_epoch in range(num_epochs):
            print("Epoch.......", curr_epoch)
            # train_ler is never updated below, so the epoch log always reports it as 0.
            train_cost = train_ler = 0
            for batch in range(BATCHES):
                start = time.time()
                c, steps = do_batch()
                # Weight each batch's mean cost by the batch size; normalised after the loop.
                train_cost += c * BATCH_SIZE
                seconds = time.time() - start
                print("Step:", steps, ", batch seconds:", seconds)

            train_cost /= TRAIN_SIZE

            # Validation metrics are computed on one more batch from get_next_batch.
            train_inputs, train_targets, train_seq_len = get_next_batch(BATCH_SIZE)
            val_feed = {inputs: train_inputs,
                        targets: train_targets,
                        seq_len: train_seq_len}

            val_cost, val_ler, lr, steps = session.run([cost, acc, learning_rate, global_step], feed_dict=val_feed)

            log = "Epoch {}/{}, steps = {}, train_cost = {:.3f}, train_ler = {:.3f}, val_cost = {:.3f}, val_ler = {:.3f}, time = {:.3f}s, learning_rate = {}"
            print(log.format(curr_epoch + 1, num_epochs, steps, train_cost, train_ler, val_cost, val_ler, time.time() - start, lr))

In [None]:
# train()

In [None]:
# pd.read_csv("./data/test.csv", encoding="utf-8")

In [None]:
# pd.read_csv("./data/test2.csv", encoding="utf-8")

In [None]:
'''
A Bidirectional Recurrent Neural Network (LSTM) implementation example using TensorFlow library.
This example is using the MNIST database of handwritten digits (http://yann.lecun.com/exdb/mnist/)
Long Short Term Memory paper: http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf

Author: Aymeric Damien
Project: https://github.com/aymericdamien/TensorFlow-Examples/
'''

from __future__ import print_function

import tensorflow as tf
from tensorflow.contrib import rnn
import numpy as np

# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
# Load the MNIST data sets from "/tmp/data/" with one-hot labels, then display the result.
# NOTE(review): hard-coded absolute path; read_data_sets presumably downloads the data
# when it is missing - confirm.
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
mnist

In [None]:
# Pull one very large batch to inspect the array shapes.
# NOTE(review): 100000 may exceed the number of training images - confirm how next_batch
# behaves in that case.
batch_x, batch_y = mnist.train.next_batch(100000)
batch_x.shape, batch_y.shape

In [None]:
# Display the image batch fetched in the previous cell (this shows the whole array).
batch_x

In [None]:

'''
To classify images using a bidirectional recurrent neural network, we consider
every image row as a sequence of pixels. Because MNIST image shape is 28*28px,
we will then handle 28 sequences of 28 steps for every sample.
'''

# NOTE: this cell rebinds names used by the earlier model in this notebook
# (learning_rate, x, y); the two experiments share one namespace.

# Parameters
learning_rate = 0.001

# Can be understood as the total number of samples used during training
training_iters = 100000

# Number of samples per training step (mini-batch size)
batch_size = 128

# Used for display: progress is printed every `display_step` steps
display_step = 10

# Network Parameters
# n_steps*n_input is simply the image itself, with each row assigned to one time step.
n_input = 28 # MNIST data input (img shape: 28*28)
n_steps = 28 # timesteps

# Hidden layer size
n_hidden = 128 # hidden layer num of features
n_classes = 10 # MNIST total classes (0-9 digits)

# tf Graph input
# In [None, n_steps, n_input], None means the size of that dimension is not fixed
x = tf.placeholder("float", [None, n_steps, n_input])
y = tf.placeholder("float", [None, n_classes])

# Define weights
weights = {
    # Hidden layer weights => 2*n_hidden because of forward + backward cells
    'out': tf.Variable(tf.random_normal([2*n_hidden, n_classes]))
}
biases = {
    'out': tf.Variable(tf.random_normal([n_classes]))
}


def BiRNN(x, weights, biases):
    """Bidirectional LSTM classifier head: return the class logits for a batch.

    Args:
        x: input tensor laid out as (batch_size, n_steps, n_input).
        weights: dict whose 'out' entry is the output projection matrix.
        biases: dict whose 'out' entry is the output bias.

    Returns:
        The last time step's bidirectional output, projected linearly with
        weights['out'] and biases['out'].
    """
    # static_bidirectional_rnn wants a Python list of n_steps tensors, each of shape
    # (batch_size, n_input), so split the input along the time axis (axis 1).
    step_inputs = tf.unstack(x, n_steps, 1)

    # One LSTM cell per direction.
    forward_cell = rnn.BasicLSTMCell(n_hidden, forget_bias=1.0)
    backward_cell = rnn.BasicLSTMCell(n_hidden, forget_bias=1.0)

    try:
        outputs, _, _ = rnn.static_bidirectional_rnn(
            forward_cell, backward_cell, step_inputs, dtype=tf.float32)
    except Exception: # Old TensorFlow version only returns outputs not states
        outputs = rnn.static_bidirectional_rnn(
            forward_cell, backward_cell, step_inputs, dtype=tf.float32)

    # Linear activation on the output of the final time step.
    final_output = outputs[-1]
    return tf.matmul(final_output, weights['out']) + biases['out']

# Build the classifier graph on the placeholders defined above.
pred = BiRNN(x, weights, biases)

# Define loss and optimizer
# softmax_cross_entropy_with_logits：Measures the probability error in discrete classification tasks in which the classes are mutually exclusive
# return a 1-D Tensor of length batch_size of the same type as logits with the softmax cross entropy loss.
# reduce_mean averages over all values (no axis is specified here).
# NOTE: cost / optimizer rebind the names used by the earlier CTC model in this notebook.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Evaluate model
# A prediction is correct when the arg-max class of the logits equals that of the one-hot label.
correct_pred = tf.equal(tf.argmax(pred,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initializing the variables
init = tf.global_variables_initializer()

# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    step = 1
    # Keep training until reach max iterations
    # (training_iters counts samples, hence the multiplication by batch_size)
    while step * batch_size < training_iters:
        batch_x, batch_y = mnist.train.next_batch(batch_size)
        # Reshape data to get 28 seq of 28 elements
        batch_x = batch_x.reshape((batch_size, n_steps, n_input))
        # Run optimization op (backprop)
        sess.run(optimizer, feed_dict={x: batch_x, y: batch_y})
        if step % display_step == 0:
            # Calculate batch accuracy
            # (acc and loss below rebind module-level names to plain Python values)
            acc = sess.run(accuracy, feed_dict={x: batch_x, y: batch_y})
            # Calculate batch loss
            loss = sess.run(cost, feed_dict={x: batch_x, y: batch_y})
            print("Iter " + str(step*batch_size) + ", Minibatch Loss= " + \
                  "{:.6f}".format(loss) + ", Training Accuracy= " + \
                  "{:.5f}".format(acc))
        step += 1
    print("Optimization Finished!")

    # Calculate accuracy for 128 mnist test images
    test_len = 128
    test_data = mnist.test.images[:test_len].reshape((-1, n_steps, n_input))
    test_label = mnist.test.labels[:test_len]
    print("Testing Accuracy:", \
        sess.run(accuracy, feed_dict={x: test_data, y: test_label}))

In [None]:
# Scratch experiment with tf.one_hot.
# NOTE(review): depth is 1 while every index is 5 or larger; tf.one_hot presumably yields
# all-zero rows for indices outside [0, depth) - confirm this is the intended probe.
indices = [5,6,7]
depth = 1
one_hot = tf.one_hot(indices, depth) 
with tf.Session() as sess:
    res = sess.run(one_hot)
    print(res)

In [None]:
# 10x10 identity matrix: the lookup table used for one-hot encoding in the cell below.
np.identity(10)

In [None]:
# Identity matrix just large enough to one-hot the largest value in `b`.
# NOTE(review): `b` as a list is only assigned in the next cell; on a top-to-bottom run
# this line sees whatever `b` holds at that point (or raises NameError) - confirm the order.
np.identity(max(b)+1)

In [None]:
# One-hot encoding through embedding_lookup: each integer selects a row of an identity matrix.
a = [5, 4, 5] 
b = [1, 2, 3]
# one-hot the integers in `a` against a fixed 10x10 identity matrix
one_hot_a = tf.nn.embedding_lookup(np.identity(10), a)
# one-hot the integers in `b` against an identity matrix sized from max(b)
one_hot_b = tf.nn.embedding_lookup(np.identity(max(b)+1), b)
with tf.Session() as sess:
    res, res2 = sess.run([one_hot_a, one_hot_b])
    print(res)
    print(res2)