In [18]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow_datasets as tfds
import tensorflow as tf
import numpy as np

import pandas as pd
import glob
import matplotlib.pyplot as plt

## Google Colab

In [2]:
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


# 先行研究

## 発話エンコーダ設定

In [107]:
utter_embedding = tf.keras.layers.Embedding(1000, 300)
utter_gru = tf.keras.layers.GRU(512)

## 文脈エンコーダ設定

In [None]:
context_gru = tf.keras.layers.GRU(513)

## 対話行為エンコーダ設定

In [None]:
dialog_embedding = tf.keras.layers.Embedding(1000, 100)
dialog_gru = tf.keras.layers.GRU(128)

## 分類器設定

## 損失関数設定

## 最適化関数設定

## 実行(前処理)

## filepath(colab)

In [45]:
#train_data = "drive/My Drive/研究/swda_data/train_set.txt"
#test_data = "../Switchboard-Corpus/swda_data/test_set.txt"
#val_data = "drive/My Drive/研究/swda_data/val_set.txt"

## filepath(local)

In [2]:
train_data = "../Switchboard-Corpus/swda_data/train_set.txt"
test_data = "../Switchboard-Corpus/swda_data/test_set.txt"
val_data = "../Switchboard-Corpus/swda_data/val_set.txt"

## パラメータ設定

In [3]:
BUFFER_SIZE = 50000
BATCH_SIZE = 64
TAKE_SIZE = 5000

## ファイル取得

In [4]:
train_dataset = []
f = open(train_data, "r", encoding='utf-8')
for row in f:
    train_dataset.append(row.strip())
f.close()

In [5]:
val_dataset = []
f = open(val_data, "r", encoding='utf-8')
for row in f:
    val_dataset.append(row.strip())
f.close()

In [6]:
test_dataset = []
f = open(val_data, "r", encoding='utf-8')
for row in f:
    test_dataset.append(row.strip())
f.close()

## 発話者,発話,発話ラベル

In [7]:
utter_user = []
utter = []
utter_label = []
datasets = [train_dataset, val_dataset, test_dataset]

for d in datasets:
    for j in d:
        for i,v in enumerate(j.split("|")):
            if i == 0:
                utter_user.append(v)
            elif i==1:
                utter.append(v)
            else:
                utter_label.append(v)
print("finish")

finish


In [8]:
labels = set(utter_label)
utter_labels = []
for i in utter_label:
    for j,v in enumerate(labels):
        if i == v:
            utter_labels.append(j)
        

In [9]:
utter_labels[:10]

[35, 20, 0, 30, 11, 5, 38, 5, 5, 6]

## データセット化

In [10]:
train_datasets = tf.data.Dataset.from_tensor_slices((utter, utter_labels))
#utter_val_dataset = tf.data.Dataset.from_tensor_slices((val_utter, val_utter_label))

In [11]:
text, label = next(iter(train_datasets.take(1)))
print(text)
print(label)

tf.Tensor(b'Okay.', shape=(), dtype=string)
tf.Tensor(35, shape=(), dtype=int32)


In [12]:
for train_example, train_label in train_datasets.take(1):
    print(train_example.numpy())
    print(train_label.numpy())

b'Okay.'
35


## トークナイザー

In [19]:
## ボキャブラリーリスト
vocabulary_set = set()
## トークナイザー
tokenizer = tfds.features.text.Tokenizer()

## 分かち書き
for text_tensor,_ in train_datasets:
    some_tokens = tokenizer.tokenize(text_tensor.numpy())
    vocabulary_set.update(some_tokens)
    
## ボキャブラリーリスト作成
vocab_size = len(vocabulary_set)

In [20]:
print(vocab_size)

22042


In [23]:
i,v = next(iter(train_datasets))
print(i,v)

tf.Tensor(b'Okay.', shape=(), dtype=string) tf.Tensor(35, shape=(), dtype=int32)


## エンコード(トークンに変換する)

In [15]:
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [16]:
example_text = next(iter(train_datasets))[0].numpy()
print(example_text)

b'Okay.'


In [97]:
encoded_example = encoder.encode(example_text)
print(encoded_example)

[14576]


In [29]:
def encode(token, label):
    token = encoder.encode(token.numpy())
    return token, label

@tf.function
def tf_encoder(utter, label):
    encoded_text, label = tf.py_function(encode,[utter, label],[tf.int64, tf.int32])
    encoded_text.set_shape([None])
    label.set_shape([])
    return encoded_text, label

MAX_LENGTH = 40
def filter_max_length(x,y,max_length=MAX_LENGTH):
    return tf.logical_and(tf.size(x) <= max_length,
                        tf.size(y) <= max_length)

In [30]:
all_train_data_encode = train_datasets.map(tf_encoder)

In [32]:
i,v = next(iter(all_train_data_encode.take(1)))
print(i,v)

tf.Tensor([2981], shape=(1,), dtype=int64) tf.Tensor(35, shape=(), dtype=int32)


In [38]:
#all_train_data_encode = all_train_data_encode.filter(filter_max_length)

## 訓練データ
utter_train_data = all_train_data_encode.skip(TAKE_SIZE).shuffle(BUFFER_SIZE)
utter_train_data = utter_train_data.padded_batch(BATCH_SIZE)

## テストデータ
utter_test_data = all_train_data_encode.take(TAKE_SIZE)
utter_test_data = utter_test_data.padded_batch(BATCH_SIZE)

In [49]:
for ex in utter_train_data.take(1):
    print(ex)

(<tf.Tensor: shape=(64, 35), dtype=int64, numpy=
array([[  833,   782,   782, ...,     0,     0,     0],
       [ 6388,  5264,  1588, ...,     0,     0,     0],
       [ 2571, 20061,   983, ...,     0,     0,     0],
       ...,
       [20864, 21425,     0, ...,     0,     0,     0],
       [10064,     0,     0, ...,     0,     0,     0],
       [21426, 15040,     0, ...,     0,     0,     0]])>, <tf.Tensor: shape=(64,), dtype=int32, numpy=
array([11, 11, 26, 11, 40, 36, 37, 11, 40, 11,  6, 40, 11, 11, 11, 11, 11,
       40, 11, 11, 20,  6, 11, 11, 40, 40, 11, 11, 40, 11,  6, 26, 17, 11,
       11, 40, 27,  6, 11, 40, 32, 37,  8, 32,  6, 26, 40, 11, 11, 11,  1,
       40, 11, 30, 17, 11, 26, 11,  6,  2,  6, 17, 36, 40], dtype=int32)>)


In [40]:
a,b = next(iter(utter_test_data))
print(a[0])
print(b[1])

tf.Tensor(
[2981    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0], shape=(21,), dtype=int64)
tf.Tensor(20, shape=(), dtype=int32)


In [54]:
vocab_size += 1

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 300),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(512)),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(41)
])

In [55]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [56]:
model.fit(utter_train_data, 
          epochs=3, 
          validation_data=utter_test_data,
          validation_steps=30)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f5a80114e20>

In [35]:
"""
train_data_set = utter_train_dataset.map(tf_encoder)
train_data_set = train_data_set.filter(filter_max_length)
train_data_set = train_data_set.cache()
train_data_set = train_data_set.padded_batch(BATCH_SIZE, padded_shapes=(100,100))
train_data_set.prefetch(tf.data.experimental.AUTOTUNE)
"""

'\ntrain_data_set = utter_train_dataset.map(tf_encoder)\ntrain_data_set = train_data_set.filter(filter_max_length)\ntrain_data_set = train_data_set.cache()\ntrain_data_set = train_data_set.padded_batch(BATCH_SIZE, padded_shapes=(100,100))\ntrain_data_set.prefetch(tf.data.experimental.AUTOTUNE)\n'

In [None]:
"""
class UtterEncoder(tf.keras.Model):
    def __init__(self, vocabsize, embedding_dim, units, batch_size):
        super(UtterEncoder, self).__init__()
        self.units = units
        self.batch_size = batch_size
        self.embedding = tf.keras.Embedding(vocabsize, embedding)
        self.utter_gru = tf.keras.GRU(self.units,
                                     return_sequences=True,
                                     return_state=True,
                                     recurrent_activation='sigmoid',
                                     recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(self.units)
        
    def call(self, x, hidden):
        x = self.embedding(x)
        output, states = self.gru(x, initial_state=hidden)
        output = tf.reshape(output,(-1, output.shape[2]))
        x = self.tf(output)
        return x, states
        
utter_encoder = UtterEncoder(vocab_size, 300, 512, 64)
""""       

In [None]:
"""
class ContextEncoder(tf.keras.Model):
    def __init__(self, units, batch_size):
        super(ContextEncoder, self).__init__()
        self.units = units
        self.batch_size = batch_size
        self.context_gru = tf.keras.GRU(self.units,
                                     return_sequences=True,
                                     return_state=True,
                                     recurrent_activation='sigmoid',
                                     recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(self.units)
                                     
    def call(self, x, hidden):
        output, states = self.gru(x, initial_state=hidden)
        #output = tf.reshape(output,(-1, output.shape[2]))
        #x = self.tf(output)
        return output, states
        
context_encoder = ContextEncoder(513, 64)
"""

In [None]:
class FeedFowardNetwork():
    def __init__(self, units):
        super(FeedFowardNetwork, self).__init__()
        self.units = units
        self.fc = tf.keras.layers.Dense(self.units)
    
    def call(self, output):
        x = self.fc(output)
        return x

## 訓練データセット作成

### ウィンドウサイズ

In [None]:
window_size = 5

### 実行モデル

In [49]:
model = tf.keras.Sequential([
                            tf.keras.layers.Embedding(vocab_size, 300),
                            tf.keras.layers.Bidirectional(tf.keras.layers.GRU(512)),
                            tf.keras.layers.Dense(512, activation='relu'),
                            tf.keras.layers.Dense(1)
])

In [53]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [54]:
history = model.fit(train_dataset, epochs=10)

Epoch 1/10


ValueError: ignored

In [57]:
print("TensorFlow version: ", tf.__version__)

TensorFlow version:  2.3.0
