In [182]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow_datasets as tfds
import tensorflow as tf
import numpy as np

import pandas as pd
import glob
import matplotlib.pyplot as plt

## Google Colab

In [2]:
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


# 先行研究

## 発話エンコーダ設定

In [107]:
utter_embedding = tf.keras.layers.Embedding(1000, 300)
utter_gru = tf.keras.layers.GRU(512)

## 文脈エンコーダ設定

In [None]:
context_gru = tf.keras.layers.GRU(513)

## 対話行為エンコーダ設定

In [None]:
dialog_embedding = tf.keras.layers.Embedding(1000, 100)
dialog_gru = tf.keras.layers.GRU(128)

## 分類器設定

## 損失関数設定

## 最適化関数設定

## 実行(前処理)

## filepath(colab)

In [45]:
#train_data = "drive/My Drive/研究/swda_data/train_set.txt"
#test_data = "../Switchboard-Corpus/swda_data/test_set.txt"
#val_data = "drive/My Drive/研究/swda_data/val_set.txt"

## filepath(local)

In [183]:
train_data = "../Switchboard-Corpus/swda_data/train_set.txt"
test_data = "../Switchboard-Corpus/swda_data/test_set.txt"
val_data = "../Switchboard-Corpus/swda_data/val_set.txt"

## パラメータ設定

In [184]:
BUFFER_SIZE = 50000
BATCH_SIZE = 64
TAKE_SIZE = 5000

## ファイル取得

In [185]:
train_dataset = []
f = open(train_data, "r", encoding='utf-8')
for row in f:
    train_dataset.append(row.strip())
f.close()

In [186]:
val_dataset = []
f = open(val_data, "r", encoding='utf-8')
for row in f:
    val_dataset.append(row.strip())
f.close()

In [187]:
test_dataset = []
f = open(val_data, "r", encoding='utf-8')
for row in f:
    test_dataset.append(row.strip())
f.close()

## 発話者,発話,発話ラベル

In [188]:
utter_user = []
utter = []
utter_label = []
datasets = [train_dataset, val_dataset, test_dataset]

for d in datasets:
    for j in d:
        for i,v in enumerate(j.split("|")):
            if i == 0:
                utter_user.append(v)
            elif i==1:
                utter.append(v)
            else:
                utter_label.append(v)
print("finish")

finish


In [189]:
labels = set(utter_label)
utter_labels = []
for i in utter_label:
    for j,v in enumerate(labels):
        if i == v:
            utter_labels.append(j)
        

In [190]:
utter_labels[:10]

[35, 20, 0, 30, 11, 5, 38, 5, 5, 6]

In [191]:
len(utter_label),len(utter),len(utter_user)

(198934, 198934, 198934)

## データセット化

In [116]:
train_datasets = tf.data.Dataset.from_tensor_slices((utter, utter_labels))

In [11]:
text, label = next(iter(train_datasets.take(1)))
print(text)
print(label)

tf.Tensor(b'Okay.', shape=(), dtype=string)
tf.Tensor(35, shape=(), dtype=int32)


In [122]:
lists = []
for (i, ex) in enumerate(train_datasets.take(5)):
    print(ex)

(<tf.Tensor: shape=(), dtype=string, numpy=b'Okay.'>, <tf.Tensor: shape=(), dtype=int32, numpy=35>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'So, What kind of experience do you, do you have, then with child care?'>, <tf.Tensor: shape=(), dtype=int32, numpy=20>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'I guess, I think, uh, I wonder if that worked.'>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Does it say something?'>, <tf.Tensor: shape=(), dtype=int32, numpy=30>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'I think it usually does.'>, <tf.Tensor: shape=(), dtype=int32, numpy=11>)


## トークナイザー

In [19]:
## ボキャブラリーリスト
vocabulary_set = set()
## トークナイザー
tokenizer = tfds.features.text.Tokenizer()

## 分かち書き
for text_tensor,_ in train_datasets:
    some_tokens = tokenizer.tokenize(text_tensor.numpy())
    vocabulary_set.update(some_tokens)
    
## ボキャブラリーリスト作成
vocab_size = len(vocabulary_set)

In [20]:
print(vocab_size)

22042


In [23]:
i,v = next(iter(train_datasets))
print(i,v)

tf.Tensor(b'Okay.', shape=(), dtype=string) tf.Tensor(35, shape=(), dtype=int32)


## エンコード(トークンに変換する)

In [15]:
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [16]:
example_text = next(iter(train_datasets))[0].numpy()
print(example_text)

b'Okay.'


In [97]:
encoded_example = encoder.encode(example_text)
print(encoded_example)

[14576]


In [29]:
def encode(token, label):
    token = encoder.encode(token.numpy())
    return token, label

@tf.function
def tf_encoder(utter, label):
    encoded_text, label = tf.py_function(encode,[utter, label],[tf.int64, tf.int32])
    encoded_text.set_shape([None])
    label.set_shape([])
    return encoded_text, label

MAX_LENGTH = 40
def filter_max_length(x,y,max_length=MAX_LENGTH):
    return tf.logical_and(tf.size(x) <= max_length,
                        tf.size(y) <= max_length)

In [164]:
all_train_data_encode = train_datasets.map(tf_encoder)

In [206]:
for i,v in all_train_data_encode.take(5):
    print(i)
    print(v)

tf.Tensor([2981], shape=(1,), dtype=int64)
tf.Tensor(35, shape=(), dtype=int32)
tf.Tensor(
[  340  1558 13301 20978 17001  8746  1569  8746  1569 21848 20246 20655
 20347 14578], shape=(14,), dtype=int64)
tf.Tensor(20, shape=(), dtype=int32)
tf.Tensor([  782 12244   782 10095  7140   782  7663 11769 10658 21222], shape=(10,), dtype=int64)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor([ 4836 13340  5999  2994], shape=(4,), dtype=int64)
tf.Tensor(30, shape=(), dtype=int32)
tf.Tensor([  782 10095 13340 10685 16518], shape=(5,), dtype=int64)
tf.Tensor(11, shape=(), dtype=int32)


In [201]:
TAKE_SIZE = 1000
max_len = 40

In [203]:
all_train_data_encode = all_train_data_encode.filter(filter_max_length)

## 訓練データ
utter_train_data = all_train_data_encode.skip(TAKE_SIZE)
utter_train_data = utter_train_data.padded_batch(64, padded_shapes=([max_len], []), drop_remainder=True)

## テストデータ
utter_test_data = all_train_data_encode.take(TAKE_SIZE)
utter_test_data = utter_test_data.padded_batch(64, padded_shapes=([max_len], []), drop_remainder=True)

In [204]:
utter_test_data

<PaddedBatchDataset shapes: ((64, 40), (64,)), types: (tf.int64, tf.int32)>

In [207]:
for ex,er in utter_train_data.take(5):
    print(ex)
    print(er)

tf.Tensor(
[[15731     0     0 ...     0     0     0]
 [12213 13756     0 ...     0     0     0]
 [15193   718   782 ...     0     0     0]
 ...
 [12213 13756     0 ...     0     0     0]
 [  155 11266  7052 ...     0     0     0]
 [12213 13756     0 ...     0     0     0]], shape=(64, 40), dtype=int64)
tf.Tensor(
[32 40 11 26 26 26 40 26 11 11 11 11 40 26 11 11 40 11 40 11 11 11 26 40
 26 26  6 11 11 40 26 40 11 40 11 26 26 26 26 26 40 11 11 11 37 11 11 40
 11 11 26 26 40 11 11 11 40 11 38 20 11 40 11 40], shape=(64,), dtype=int32)
tf.Tensor(
[[ 9226     0     0 ...     0     0     0]
 [20996 19781     0 ...     0     0     0]
 [ 4826 11769  2523 ...     0     0     0]
 ...
 [  833  3142  8562 ...     0     0     0]
 [15193 13340 18973 ...     0     0     0]
 [ 2470 20061 11313 ...     0     0     0]], shape=(64, 40), dtype=int64)
tf.Tensor(
[40 37 37 40 26 11 40 26 40 20 20 11 26  8 11 11 40 11 26 40 11 11 11 26
 40 11 11 40 11 40 11 11 40 35 20 11 40 11 11 11 11 26 37 11 40 11 26 26

DataLossError: Attempted to pad to a smaller size than the input element.

In [170]:
for ex,er in utter_train_data.take(5):
    print(ex)
    print(er)

tf.Tensor(
[[ 1569 15626  4494 ...     0     0     0]
 [ 6612 14537   782 ...     0     0     0]
 [  782  2555 21848 ...     0     0     0]
 ...
 [  833  7140 12835 ...   560  4494 20814]
 [ 3363   782 10095 ...     0     0     0]
 [ 9226     0     0 ...     0     0     0]], shape=(64, 45), dtype=int64)
tf.Tensor(
[26 37 11 37 40 40 26 26 26 40  6  6 26  6 38 26 26  6 26  6 26 11 11 11
 30 11 40 11 40 26 11 11 37 35 11  0 34 37 11 11 40 11 37 11 11 40 11 11
 40 20 11 40 11 40 11  8 11 11 11 11 40 26 11 40], shape=(64,), dtype=int32)
tf.Tensor(
[[  833 12313  4356 ...     0     0     0]
 [12213  1569  6099 ...     0     0     0]
 [ 9226     0     0 ...     0     0     0]
 ...
 [12213 13756     0 ...     0     0     0]
 [15193  5572  1656 ...     0     0     0]
 [12213 13756     0 ...     0     0     0]], shape=(64, 41), dtype=int64)
tf.Tensor(
[11 11 40 26 11 26 11 26  6 11 26 40 26 40 11 11 11 11 11 11 26 40 30  9
 18 11 26 26 37 11 32 11 11 11 11 40 11 20 11 10 28 11 11 26 11 40 11 11

In [120]:
lists_hoge = []
for i, ex in enumerate(train_datasets):
    if i == 10:
        break
        
    lists_hoge.append(ex)
    if i%window_size == 4:
        for j in lists_hoge:
            print(j)
        lists_hoge = []

(<tf.Tensor: shape=(), dtype=string, numpy=b'Okay.'>, <tf.Tensor: shape=(), dtype=int32, numpy=35>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'So, What kind of experience do you, do you have, then with child care?'>, <tf.Tensor: shape=(), dtype=int32, numpy=20>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'I guess, I think, uh, I wonder if that worked.'>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Does it say something?'>, <tf.Tensor: shape=(), dtype=int32, numpy=30>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'I think it usually does.'>, <tf.Tensor: shape=(), dtype=int32, numpy=11>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'You might try, uh,'>, <tf.Tensor: shape=(), dtype=int32, numpy=5>)
(<tf.Tensor: shape=(), dtype=string, numpy=b"I don't know,">, <tf.Tensor: shape=(), dtype=int32, numpy=38>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'hold it down a little longer,'>, <tf.Tensor: shape=(), dtype=int32, numpy=5>)
(<tf.Tensor: sha

In [54]:
vocab_size += 1

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 300),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(512)),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(41)
])

In [146]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 300)         6614100   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 1024)              2500608   
_________________________________________________________________
dense_7 (Dense)              (None, 512)               524800    
_________________________________________________________________
dense_8 (Dense)              (None, 41)                21033     
Total params: 9,660,541
Trainable params: 9,660,541
Non-trainable params: 0
_________________________________________________________________


In [55]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [56]:
model.fit(utter_train_data, 
          epochs=3, 
          validation_data=utter_test_data,
          validation_steps=30)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f5a80114e20>

In [35]:
"""
train_data_set = utter_train_dataset.map(tf_encoder)
train_data_set = train_data_set.filter(filter_max_length)
train_data_set = train_data_set.cache()
train_data_set = train_data_set.padded_batch(BATCH_SIZE, padded_shapes=(100,100))
train_data_set.prefetch(tf.data.experimental.AUTOTUNE)
"""

'\ntrain_data_set = utter_train_dataset.map(tf_encoder)\ntrain_data_set = train_data_set.filter(filter_max_length)\ntrain_data_set = train_data_set.cache()\ntrain_data_set = train_data_set.padded_batch(BATCH_SIZE, padded_shapes=(100,100))\ntrain_data_set.prefetch(tf.data.experimental.AUTOTUNE)\n'

## コンテキストEncoder層

In [176]:
class ContextEncoder(tf.keras.Model):
    def __init__(self, units, batch_size):
        super(ContextEncoder, self).__init__()
        self.units = units
        self.batch_size = batch_size
        self.context_gru = tf.keras.layers.GRU(self.units,
                                     return_sequences=True,
                                     return_state=True,
                                     recurrent_activation='sigmoid',
                                     recurrent_initializer='glorot_uniform')
                                     
    def call(self, x, hidden):
        output, states = self.gru(x, initial_state=hidden)
        return output, states
        

### context encode

In [177]:
context_encoder = ContextEncoder(513, 64)

## 発話Encoder層

In [167]:
class UtterEncoder(tf.keras.Model):
    def __init__(self, vocabsize, embedding_dim, units, batch_size, output_dim):
        super(UtterEncoder, self).__init__()
        self.units = units
        self.batch_size = batch_size
        self.output_dim = output_dim
        self.vocabsize = vocabsize
        self.embedding = tf.keras.layers.Embedding(vocabsize, embedding_dim)
        self.utter_gru = tf.keras.layers.GRU(self.units,
                                     return_sequences=True,
                                     return_state=True,
                                     recurrent_activation='sigmoid',
                                     recurrent_initializer='glorot_uniform')
        self.fc1 = tf.keras.layers.Dense(self.units, activation='relu')
        self.fc2 = tf.keras.layers.Dense(self.output_dim)
        
    def call(self, x, hidden):
        x = self.embedding(x)
        output, states = self.utter_gru(x, initial_state=hidden)
        output = tf.reshape(output,(-1, output.shape[2]))
        x = self.fc1(output)
        x = self.fc2(x)
        return x, states
        

### 発話エンコーダ

In [168]:
utter_encoder = UtterEncoder(vocab_size, 300, 512, 64, 41)

In [113]:
vocab_size

22047

### 最適化関数

In [72]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)

### 損失関数

In [97]:
def loss_function(real, preds):
    return tf.keras.losses.sparse_categorical_crossentropy(real, preds)

## 確認

In [106]:
utter_train_data

<PaddedBatchDataset shapes: ((None, None), (None,)), types: (tf.int64, tf.int32)>

In [179]:
y_true = [1, 2]
y_pred = [[0.05, 0.95, 0, 0.08], [0.1, 0.8, 0.1, 1.02], [0.01, 0.10, 1.00, 2.00]]

In [126]:
y_pred = [[0.05, 0.95, 0], [0.1, 0.8, 0.1]]

In [180]:
loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)

ValueError: Shape mismatch: The shape of labels (received (2,)) should equal the shape of logits except for the last dimension (received (3, 4)).

## 実行処理

In [74]:
EPOCHS = 3

In [None]:
class FeedFowardNetwork():
    def __init__(self, units):
        super(FeedFowardNetwork, self).__init__()
        self.units = units
        self.fc = tf.keras.layers.Dense(self.units)
    
    def call(self, output):
        x = self.fc(output)
        return x

### 実行処理

## 訓練データセット作成

### ウィンドウサイズ

In [59]:
window_size = 5

In [None]:
lists_utter_datasets = []
for i in 

### 実行モデル version

In [57]:
print("TensorFlow version: ", tf.__version__)

TensorFlow version:  2.3.0
