# 予備実験

## 発話分類

In [3]:
import pandas as pd
import glob
import matplotlib.pyplot as plt
import numpy as np

In [4]:
import tensorflow_datasets as tfds
import tensorflow as tf

## データセット

In [5]:
# Locations of the train / test / validation split files, relative to the
# notebook's working directory.
train_dataset_path, test_dataset_path, val_dataset_path = (
    "../Switchboard-Corpus/swda_data/train_set.txt",
    "../Switchboard-Corpus/swda_data/test_set.txt",
    "../Switchboard-Corpus/swda_data/val_set.txt",
)

In [6]:
# Load the training split: one whitespace-stripped entry per line of the file.
# The context manager closes the file even if reading or decoding raises.
with open(train_dataset_path, "r", encoding='utf-8') as f:
    train_dataset = [row.strip() for row in f]

In [7]:
# Load the validation split: one whitespace-stripped entry per line of the file.
# The context manager closes the file even if reading or decoding raises.
with open(val_dataset_path, "r", encoding='utf-8') as f:
    val_dataset = [row.strip() for row in f]

In [8]:
# Load the test split: one whitespace-stripped entry per line of the file.
# The context manager closes the file even if reading or decoding raises.
with open(test_dataset_path, "r", encoding='utf-8') as f:
    test_dataset = [row.strip() for row in f]

### データセット分割

In [11]:
# Parallel lists for each split. The parsing loops fill them with the first
# field (user), second field (utterance) and remaining field (label) of each
# "|"-separated row, so index k in all three lists refers to the same row.

# Training data
train_utter_user, train_utter, train_utter_label = [], [], []

# Test data
test_utter_user, test_utter, test_utter_label = [], [], []

# Validation data
val_utter_user, val_utter, val_utter_label = [], [], []

In [12]:
# Split every training row of the form "user|utterance|label" into the three
# parallel lists. The lists are later zipped positionally, so they must stay
# the same length:
#   * blank rows are skipped (they would otherwise add a user with no
#     utterance or label);
#   * rows with fewer than three fields are rejected with a clear error;
#   * the first field is the user and the last is the label, so any extra "|"
#     stays inside the utterance instead of being appended as extra labels.
# Rows with exactly three fields are handled exactly as before.
for row_number, row in enumerate(train_dataset, start=1):
    if not row:
        continue
    fields = row.split("|")
    if len(fields) < 3:
        raise ValueError(
            "train_dataset row {}: expected 'user|utterance|label', got {!r}".format(
                row_number, row
            )
        )
    train_utter_user.append(fields[0])
    train_utter.append("|".join(fields[1:-1]))
    train_utter_label.append(fields[-1])

print("finish")

finish


In [13]:
# Split every test row of the form "user|utterance|label" into the three
# parallel lists. The lists are later zipped positionally, so they must stay
# the same length:
#   * blank rows are skipped (they would otherwise add a user with no
#     utterance or label);
#   * rows with fewer than three fields are rejected with a clear error;
#   * the first field is the user and the last is the label, so any extra "|"
#     stays inside the utterance instead of being appended as extra labels.
# Rows with exactly three fields are handled exactly as before.
for row_number, row in enumerate(test_dataset, start=1):
    if not row:
        continue
    fields = row.split("|")
    if len(fields) < 3:
        raise ValueError(
            "test_dataset row {}: expected 'user|utterance|label', got {!r}".format(
                row_number, row
            )
        )
    test_utter_user.append(fields[0])
    test_utter.append("|".join(fields[1:-1]))
    test_utter_label.append(fields[-1])

print("finish")

finish


In [14]:
# Split every validation row of the form "user|utterance|label" into the three
# parallel lists. The lists are later zipped positionally, so they must stay
# the same length:
#   * blank rows are skipped (they would otherwise add a user with no
#     utterance or label);
#   * rows with fewer than three fields are rejected with a clear error;
#   * the first field is the user and the last is the label, so any extra "|"
#     stays inside the utterance instead of being appended as extra labels.
# Rows with exactly three fields are handled exactly as before.
for row_number, row in enumerate(val_dataset, start=1):
    if not row:
        continue
    fields = row.split("|")
    if len(fields) < 3:
        raise ValueError(
            "val_dataset row {}: expected 'user|utterance|label', got {!r}".format(
                row_number, row
            )
        )
    val_utter_user.append(fields[0])
    val_utter.append("|".join(fields[1:-1]))
    val_utter_label.append(fields[-1])

print("finish")

finish


### ラベルデータ数値化

In [15]:
# Read the label inventory: one whitespace-stripped label name per line.
# The line order matters because it later becomes the integer id of each label.
# The context manager closes the file even if reading or decoding raises.
label_path = "../Switchboard-Corpus/swda_data/metadata/labels.txt"
with open(label_path, "r", encoding='utf-8') as f:
    labels = [row.strip() for row in f]

In [24]:
# Map each label string to its position in `labels`, giving every label an
# integer id. If a label string is repeated, its last position wins.
label = {name: index for index, name in enumerate(labels)}

In [27]:
# Integer ids of the training labels; raises KeyError for a label string that
# is missing from `label`.
train_utter_labels = [label[tag] for tag in train_utter_label]

In [28]:
# Integer ids of the test labels; raises KeyError for a label string that is
# missing from `label`.
test_utter_labels = [label[tag] for tag in test_utter_label]

In [29]:
# Integer ids of the validation labels; raises KeyError for a label string that
# is missing from `label`.
val_utter_labels = [label[tag] for tag in val_utter_label]

## データセットをTensorFlowで扱える形にする

In [30]:
# Wrap each split in a tf.data.Dataset that yields one
# (utterance string, integer label) element per row.
train_data, test_data, val_data = (
    tf.data.Dataset.from_tensor_slices(split)
    for split in (
        (train_utter, train_utter_labels),
        (test_utter, test_utter_labels),
        (val_utter, val_utter_labels),
    )
)

In [33]:
# Sanity check: print the first (utterance, label) element of the training set.
first_example = next(iter(train_data))
print(*first_example)

tf.Tensor(b'Okay.', shape=(), dtype=string) tf.Tensor(17, shape=(), dtype=int32)


## トークナイザー

In [31]:
# Set of distinct tokens seen in the training utterances.
vocabulary_set = set()
# Tokenizer used to split an utterance into tokens.
tokenizer = tfds.features.text.Tokenizer()

# Tokenize each training utterance and add its tokens to the vocabulary.
for utterance, _label in train_data:
    vocabulary_set.update(tokenizer.tokenize(utterance.numpy()))

# Number of distinct tokens collected.
# NOTE(review): the text encoder built from this set may reserve extra ids
# (e.g. padding / out-of-vocabulary) — confirm before using this value as an
# embedding size.
vocab_size = len(vocabulary_set)

## エンコーダー生成

In [32]:
# Build the token -> integer-id encoder from a *sorted* vocabulary.
# Iterating a set of strings follows hash order, and string hashing is
# randomized per Python process, so passing the set directly could give the
# same token a different id after a kernel restart. Sorting makes the mapping
# reproducible across runs.
en_code = tfds.features.text.TokenTextEncoder(sorted(vocabulary_set))

In [34]:
def encode(token, label):
    """Encode one utterance with the module-level ``en_code`` encoder.

    The function calls ``token.numpy()``, so it needs an eager tensor;
    ``tf_encoder`` invokes it through ``tf.py_function`` for that reason.

    Args:
        token: Tensor holding the utterance text; must support ``.numpy()``.
        label: Label belonging to the utterance; passed through untouched.

    Returns:
        A ``(token_ids, label)`` pair, where ``token_ids`` is the result of
        ``en_code.encode`` on the utterance.
    """
    token_ids = en_code.encode(token.numpy())
    return token_ids, label

@tf.function
def tf_encoder(utter, label):
    """Apply ``encode`` to one dataset element via ``tf.py_function``.

    Args:
        utter: Utterance tensor passed on to ``encode``.
        label: Label tensor passed on to ``encode``.

    Returns:
        A ``(token_ids, label)`` pair: ``token_ids`` is an int64 tensor
        declared as 1-D with unknown length, ``label`` an int32 tensor
        declared as a scalar.
    """
    token_ids, label_out = tf.py_function(
        func=encode,
        inp=[utter, label],
        Tout=[tf.int64, tf.int32],
    )
    # Declare the static shapes explicitly: a variable-length vector of ids
    # and a scalar label.
    token_ids.set_shape([None])
    label_out.set_shape([])
    return token_ids, label_out

# NOTE: the triple-quoted string below is disabled code kept as a bare string
# literal; nothing in it is executed. As the last expression of the cell it
# only makes the notebook echo the string.
# It sketches a filter that keeps an element when both tf.size(x) and
# tf.size(y) are at most MAX_LENGTH.
# NOTE(review): in this notebook `y` would presumably be the label, so the
# tf.size(y) check looks carried over from a sequence-pair example — confirm
# before re-enabling.
"""
MAX_LENGTH = 40
def filter_max_length(x,y,max_length=MAX_LENGTH):
    return tf.logical_and(tf.size(x) <= max_length,
                        tf.size(y) <= max_length)
"""

'\nMAX_LENGTH = 40\ndef filter_max_length(x,y,max_length=MAX_LENGTH):\n    return tf.logical_and(tf.size(x) <= max_length,\n                        tf.size(y) <= max_length)\n'

## データ設計

In [35]:
# Length of the token dimension of every batch: used as the padded shape of
# the encoded utterances when the datasets are batched with padded_batch.
max_len = 40

In [36]:
# Encode every split: each (utterance, label) element is passed through
# tf_encoder, which turns the utterance into a sequence of token ids.
all_train_data_encode, all_test_data_encode, all_val_data_encode = [
    split.map(tf_encoder) for split in (train_data, test_data, val_data)
]

In [37]:
def _clip_to_max_len(tokens, label):
    """Keep at most ``max_len`` leading token ids; the label passes through."""
    return tokens[:max_len], label


# Batch each split into groups of 64, padding the token sequences to exactly
# `max_len` ids. padded_batch cannot pad an element that is already longer
# than the requested padded shape and fails when it meets one, so sequences
# are clipped to `max_len` first. Sequences of at most `max_len` ids are
# unaffected.
# drop_remainder=True discards a final batch of fewer than 64 elements, which
# keeps the batch dimension static.
# NOTE(review): this also drops the trailing test/validation examples —
# confirm that is acceptable for evaluation.
utter_train_data = all_train_data_encode.map(_clip_to_max_len).padded_batch(
    64, padded_shapes=([max_len], []), drop_remainder=True
)
utter_test_data = all_test_data_encode.map(_clip_to_max_len).padded_batch(
    64, padded_shapes=([max_len], []), drop_remainder=True
)
utter_val_data = all_val_data_encode.map(_clip_to_max_len).padded_batch(
    64, padded_shapes=([max_len], []), drop_remainder=True
)

In [38]:
# Display the three batched datasets to inspect their element shapes and dtypes.
utter_train_data, utter_test_data, utter_val_data

(<PaddedBatchDataset shapes: ((64, 40), (64,)), types: (tf.int64, tf.int32)>,
 <PaddedBatchDataset shapes: ((64, 40), (64,)), types: (tf.int64, tf.int32)>,
 <PaddedBatchDataset shapes: ((64, 40), (64,)), types: (tf.int64, tf.int32)>)