# Mini-batchizer
train 資料可能很大，跑 Mini-batch 時應該儘量不增加記憶體使用量 (用 reference 的)。

In [1]:
# Import numpy
import numpy as np

# Import pandas
import pandas as pd
from pandas import Series, DataFrame

# Import util
import time
import re
import sys
import gc
import os

## 資料 scenario：一個問題 n 個選項

#### Define Class

In [None]:
class mini_batcher_one_q_multiple_r():
    '''
    Mini-batch sampler for data where every question comes with the same
    number of candidate options and one correct option index.

    Two sampling modes are offered, each with its own shuffled index pool
    and read pointer:
        * sigmoid       : one sample = (question, one option, 0/1 label)
        * cross entropy : one sample = (question, all options, one-hot label)
    '''

    def __init__(self, x1, x2, y):
        '''
        Parameters
            x1: np.ndarray. Questions.
            x2: np.ndarray. x2[i] holds the options of question x1[i]; every
                row must have the same number of options. The sigmoid sampler
                indexes it as x2[i, j], so it has to be at least 2-D.
            y : np.ndarray of int. y[i] is the index (into x2[i]) of the
                correct option for x1[i].
        Raises
            AssertionError: an input is not an np.ndarray, the three lengths
                differ, or the rows of x2 have different lengths.
        Note I:
            x1, x2, y are stored by reference (no copy is made here) and are
            never modified by this class. Batches are built with integer-array
            indexing, so each returned batch is a new array holding only the
            selected elements.
        Note II:
            # of samples in an epoch for sigmoid = # of (question, option) pairs
            # of samples in an epoch for cross entropy = # of questions
        '''
        if not all(isinstance(arr, np.ndarray) for arr in (x1, x2, y)):
            raise AssertionError('x1, x2, y should be np.ndarray')
        if len(x1) != len(x2) or len(x1) != len(y):
            raise AssertionError('len(x1), len(x2), len(y) should be the same')
        if len(x2) and any(len(options) != len(x2[0]) for options in x2):
            raise AssertionError('Each element of x2 should be the same length')
        self._x1 = x1
        self._x2 = x2
        self._y = y
        self._sigmoid_pointer = 0
        # One (question index, option index) row per candidate answer.
        self._sigmoid_idx_pool = np.array(
            [(i, j) for i in range(len(x2)) for j in range(len(x2[i]))]
        )
        self._entropy_pointer = 0
        self._entropy_idx_pool = np.arange(len(x1))
        np.random.shuffle(self._sigmoid_idx_pool)
        np.random.shuffle(self._entropy_idx_pool)

    def next_batch_4_sigmoid(self, batch_size):
        '''
        Return the next batch of (question, option, label) samples.

        When fewer than batch_size samples remain in the current epoch, the
        remainder is skipped, the pool is reshuffled and the batch is taken
        from the start of the new epoch. If batch_size exceeds the pool size,
        the whole pool is returned (fewer than batch_size samples).

        Returns
            (questions, options, labels); labels is an int8 array with 1
            where the option is the correct answer and 0 otherwise.
        '''
        f = self._sigmoid_pointer
        t = self._sigmoid_pointer + batch_size
        if t > len(self._sigmoid_idx_pool):
            f = 0
            t = batch_size
            np.random.shuffle(self._sigmoid_idx_pool)
        self._sigmoid_pointer = t
        idx = self._sigmoid_idx_pool[f:t]
        idx_0 = idx[:, 0]
        idx_1 = idx[:, 1]
        labels = np.array(self._y[idx_0] == idx_1, dtype=np.int8)
        return self._x1[idx_0], self._x2[idx_0, idx_1], labels

    def next_batch_4_cross_entropy(self, batch_size):
        '''
        Return the next batch of (question, all options, one-hot answer).

        Epoch wrap-around behaves as in next_batch_4_sigmoid.

        Returns
            (questions, options, onehot); onehot has shape
            (len(batch), # of options) with a single 1 per row at the
            position given by y.
        '''
        f = self._entropy_pointer
        t = self._entropy_pointer + batch_size
        if t > len(self._entropy_idx_pool):
            f = 0
            t = batch_size
            np.random.shuffle(self._entropy_idx_pool)
        self._entropy_pointer = t
        idx = self._entropy_idx_pool[f:t]
        # Width comes from the instance's own options, not from a global x2.
        onehot = np.zeros((len(idx), len(self._x2[0])))
        onehot[np.arange(len(idx)), self._y[idx]] = 1
        return self._x1[idx], self._x2[idx], onehot

#### Loading Data for Demo

In [None]:
# Load the demo table; its dialogue, options and answer columns are read below.
sample = pd.read_csv('datas/sample_test_data.txt')
# Bare expression so the notebook cell displays the loaded DataFrame.
sample

In [None]:
# Split each dialogue / options string into its utterances: a speaker tag
# such as "A:" is turned into a tab, the text is split on tabs, and pieces
# that are empty or whitespace-only are dropped.
x1 = np.array([
    [part for part in re.sub('[A-Z]:', '\t', dialogue).split('\t') if part.strip()]
    for dialogue in sample.dialogue.values
])
x2 = np.array([
    [part for part in re.sub('[A-Z]:', '\t', options).split('\t') if part.strip()]
    for options in sample.options.values
])
# Index of the correct option for each question.
y = np.array(sample.answer.values)

In [None]:
# Spot-check one parsed row (index 27): its question, its options and its answer.
print(x1[27])
print(x2[27])
print(y[27])

#### Demo

In [None]:
# Sigmoid scenario: each sample is one (question, option, 0/1 label) triple.
data_loader = mini_batcher_one_q_multiple_r(x1, x2, y)
batch_size = 5
batch_q, batch_r, batch_ans = data_loader.next_batch_4_sigmoid(batch_size)
# Iterate over what was actually returned rather than range(batch_size):
# the batcher returns fewer samples when its pool is smaller than batch_size.
for question, option, label in zip(batch_q, batch_r, batch_ans):
    print('  Q:', question)
    print('  R:', option)
    print('ans:', label)
    print('')

In [None]:
# Cross entropy scenario: each sample is (question, all options, one-hot answer).
data_loader = mini_batcher_one_q_multiple_r(x1, x2, y)
batch_size = 5
batch_q, batch_r, batch_ans = data_loader.next_batch_4_cross_entropy(batch_size)
# Iterate over what was actually returned rather than range(batch_size):
# the batcher returns fewer samples when its pool is smaller than batch_size.
for question, options, onehot in zip(batch_q, batch_r, batch_ans):
    print('  Q:', question)
    for j, option in enumerate(options):
        print('R%2d: %s' % (j, option))
    print('ans:', onehot)
    print('')

## 資料 scenario：多個不同長度的文本，轉成一筆一筆的 (上句, 下句, 0 or 1)

In [2]:
class mini_batcher_corpus():
    '''
    Mini-batch sampler that turns several texts (each a sequence of
    sentences) into (sentence, candidate next sentence, 0 or 1) samples.
    '''

    def __init__(self, corpus, n_wrong=1):
        '''
        Parameters:
            corpus : list of texts; each text is a list of sentences (2D).
            n_wrong: int. # of wrong answers to be generated for each question.
        Raises:
            AssertionError: n_wrong > 0 but the corpus yields fewer than two
                (sentence, next sentence) pairs, so no wrong answer exists.
        Note I:
            A flattened (1D) object array of the sentences is built for
            convenience. Its elements are the very same sentence objects as
            in the outside corpus, so mutating a sentence outside is visible
            inside the class as well.
        '''
        sentences = [s for c in corpus for s in c]
        # Fill an object array element by element: np.array() on token lists
        # would either fail (ragged lengths) or build a copied 2D array
        # (equal lengths) instead of one slot per sentence.
        self._corpus = np.empty(len(sentences), dtype=object)
        for pos, sentence in enumerate(sentences):
            self._corpus[pos] = sentence
        self._pointer = 0

        # The last sentence of each text has no successor, so it cannot be a
        # question. Empty texts contribute no sentence and no border.
        lengths = np.array([len(c) for c in corpus], dtype=np.intp)
        border_idx = np.cumsum(lengths[lengths > 0]) - 1
        que_idx = np.delete(np.arange(len(sentences)), border_idx)
        ans_idx = que_idx + 1

        # Rows are (question index, answer index, label).
        positives = np.stack(
            [que_idx, ans_idx, np.ones(len(que_idx), dtype=np.int32)], axis=1
        )
        negatives = [
            np.stack(
                [que_idx, self.__get_wrong_idx(ans_idx), np.zeros(len(que_idx), dtype=np.int32)],
                axis=1,
            )
            for _ in range(n_wrong)
        ]
        self._dt_pool = np.vstack([positives, *negatives])
        np.random.shuffle(self._dt_pool)

        self.data_num = len(self._dt_pool)

    def __get_wrong_idx(self, ans_idx):
        '''
        Generate a shuffled version of ans_idx in which every element differs
        from the element of ans_idx at the same position.

        Raises:
            AssertionError: ans_idx has fewer than two elements.
        '''
        # Raised explicitly (not via `assert`) so the guard survives `python -O`;
        # with a single element the repair loop below could never terminate.
        if len(ans_idx) <= 1:
            raise AssertionError('Need at least 2 answers to generate wrong answers')
        idx = ans_idx.copy()
        np.random.shuffle(idx)
        for i in np.where(idx == ans_idx)[0]:
            # An earlier swap may already have moved this position.
            if idx[i] != ans_idx[i]:
                continue
            # Swap with a random partner such that neither position ends up
            # holding its own correct answer.
            t = np.random.randint(len(ans_idx))
            while t == i or idx[i] == ans_idx[t] or idx[t] == ans_idx[i]:
                t = np.random.randint(len(ans_idx))
            idx[i], idx[t] = idx[t], idx[i]
        return idx

    def next_batch(self, batch_size):
        '''
        Return the next batch as (x1, x2, y): sentences, candidate next
        sentences, and labels (1 = x2 really follows x1, 0 = wrong answer).

        When fewer than batch_size samples remain in the current epoch, the
        remainder is skipped, the pool is reshuffled and the batch is taken
        from the start. If batch_size exceeds data_num, all data_num samples
        are returned.
        '''
        f = self._pointer
        t = self._pointer + batch_size
        if t > self.data_num:
            f = 0
            t = batch_size
            np.random.shuffle(self._dt_pool)
        self._pointer = t
        dt = self._dt_pool[f:t]
        x1 = self._corpus[dt[:, 0]]
        x2 = self._corpus[dt[:, 1]]
        y = dt[:, 2]
        return x1, x2, y

#### Demo

In [3]:
# Each line of the file becomes one text: it is split on tabs into sentences,
# and each sentence is whitespace-split into a list of tokens.
# NOTE(review): open() is called without an encoding, so the platform default
# is used — confirm the file's encoding and pass it explicitly.
with open('datas/training_data/下課花路米.txt') as f:
    corpus = [[s.split() for s in line.split('\t')] for line in f]

In [4]:
# Build the batcher over the loaded corpus, leaving n_wrong at its default.
data_loader = mini_batcher_corpus(corpus)

In [5]:
# Draw one batch and print each (sentence, candidate next sentence, label).
batch_size = 10
x1, x2, y = data_loader.next_batch(batch_size)
# Iterate over what was actually returned rather than range(batch_size):
# the batcher returns fewer samples when it holds fewer than batch_size.
for question, reply, label in zip(x1, x2, y):
    print('x1:', question)
    print('x2:', reply)
    print('y :', label)
    print('')

x1: ['我們', '正要', '體驗', '傳統', '屋', '的', '一天']
x2: ['而且', '我', '跟', '你', '說']
y : 1

x1: ['特辣', '的', '韓式', '泡菜']
x2: ['你', '可以', '試試看']
y : 1

x1: ['我', '去', '找', '專家', '好不好']
x2: ['所以', '本來', '只要', '做', '這麼', '小', '隻', '就', '對', '了']
y : 0

x1: ['而且', '我', '發現', '到']
x2: ['其實', '平常', '很多', '東西']
y : 1

x1: ['要給', '大家', '欣賞']
x2: ['我們', '三個']
y : 1

x1: ['酵素', '它', '可以', '幫助', '我們', '腸胃', '消化']
x2: ['翻修', '成', '現在', '看到', '的', '這個', '模樣', '啦']
y : 0

x1: ['那', '不過', '這個', '嫁妝']
x2: ['對']
y : 0

x1: ['這種', '東西', '質感', '好壞', '還是', '有差']
x2: ['你', '不', '記得', '之前', '小升', '他', '有', '去', '金門']
y : 1

x1: ['那棵', '嗎']
x2: ['這個', '好像', '黃黃', '綠綠的', '還沒開', '耶']
y : 0

x1: ['好', '用力']
x2: ['所以', '說', '使用', '蠟筆', '就', '不會']
y : 0

