In [None]:
import re
import string

import pandas as pd
import tensorflow as tf
import zhconv  # Traditional -> Simplified Chinese conversion
# Restrict TensorFlow to the first GPU (when one exists) and let it allocate
# device memory on demand instead of grabbing all of it up front.
gpus = tf.config.list_physical_devices("GPU")
if gpus:
    primary_gpu = gpus[0]
    tf.config.experimental.set_memory_growth(primary_gpu, True)
    tf.config.set_visible_devices([primary_gpu], "GPU")
print('tensorflow version {}'.format(tf.__version__))
# tf.keras.layers.experimental.preprocessing.TextVectorization

## 数据加载

### pandas

In [None]:
# Only the first 100,000 rows are needed, so let the parser stop there instead
# of reading the whole file into memory and slicing afterwards.
df = pd.read_csv('../data/文本分类/sms_pub.csv', nrows=100000)
# `pop` removes the label column from `df` and returns it as a Series.
target = df.pop('label')

# Elements are (message, label) pairs; the double brackets keep the message as
# a length-1 vector per row rather than a scalar.
dataset = tf.data.Dataset.from_tensor_slices((df[['message']].values, target.values))

dataset.element_spec

### csv make_csv_dataset

In [None]:
BUFFER_SIZE = 10000  # row-level shuffle buffer used by make_csv_dataset
BATCH_SIZE = 64


# make_csv_dataset already shuffles and batches the rows, so no extra
# `.shuffle(...).batch(...)` is chained afterwards (that would produce batches
# of batches).  `num_epochs=1` makes the dataset finite: the default (None)
# repeats forever, which would keep `adapt()` / `fit()` from ever finishing.
csv_dataset = tf.data.experimental.make_csv_dataset(
    file_pattern='../data/文本分类/sms_pub.csv',
    field_delim=',',
    batch_size=BATCH_SIZE,
    label_name="label",
    select_columns=['message', 'label'],
    num_epochs=1,
    shuffle=True,
    shuffle_buffer_size=BUFFER_SIZE,
)

# make_csv_dataset yields (features_dict, label).  The later cells treat each
# element as a plain (text, label) pair, so unwrap the single feature column.
dataset = csv_dataset.map(
    lambda features, label: (features['message'], label),
    num_parallel_calls=tf.data.AUTOTUNE,
).prefetch(buffer_size=tf.data.AUTOTUNE)

dataset.element_spec


### txt text_dataset_from_directory

In [None]:
batch_size = 32
seed = 42


# `directory` must be a single path string (it was previously wrapped in a
# list, which the directory indexing cannot handle).
# NOTE(review): this loader expects one sub-directory per class containing
# .txt files -- confirm that '../data/文本分类' is laid out that way.
# With subset='validation' only the held-out 20% is returned; the same `seed`
# must be used if the matching 'training' subset is loaded elsewhere.
ds_data = tf.keras.utils.text_dataset_from_directory(
    directory='../data/文本分类',
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)

ds_data

## 文本预处理

In [None]:
def clean_str(string):
    """Normalise one raw text line to Simplified Chinese characters only.

    Every run of characters outside the CJK range U+4E00..U+9FA5 (digits,
    Latin letters, punctuation, whitespace, newlines) is collapsed into a
    single space, the result is stripped, and the remaining text is converted
    to Simplified Chinese with ``zhconv``.

    The original pattern had no ``^`` and therefore deleted the Chinese
    characters themselves, leaving nothing for the conversion step to work on.

    Args:
        string: Raw text.

    Returns:
        The cleaned, simplified text.
    """
    string = re.sub(u'[^\u4e00-\u9fa5]+', ' ', string)
    return zhconv.convert(string.strip(), 'zh-hans')


# Backward-compatible alias for the original (misspelled) function name.
cleaan_str = clean_str


def load_data(data_file):
    """Read a UTF-8 text file and split every line into label and text.

    The first character of each line is taken as the label and the rest of
    the line is cleaned with ``clean_str``.
    NOTE(review): a header row, if the file has one, is processed like any
    other line -- confirm whether it should be skipped.

    Args:
        data_file: Path of the file to read.

    Returns:
        ``[x, y]`` where ``x`` is the list of cleaned texts and ``y`` the list
        of one-character labels, both in file order.
    """
    # The context manager closes the file even if reading fails.
    with open(data_file, 'r', encoding='utf-8') as handle:
        lines = handle.readlines()
    y = [line[:1] for line in lines]
    x = [clean_str(line[1:]) for line in lines]
    return [x, y]


# Keep the result and display only a small preview; echoing the return value
# directly would dump every line of the file into the cell output.
sms_texts, sms_labels = load_data(data_file='../data/文本分类/sms_pub.csv')
sms_texts[:5], sms_labels[:5]


In [None]:
MAX_WORDS = 10_000  # vocabulary budget: only the 10,000 most frequent tokens are considered
MAX_LEN = 250       # number of tokens kept per sample
BATCH_SIZE = 20


# Build the vocabulary: optional text standardisation helper.
def clean_text(text):
    """Standardise a string tensor before tokenisation.

    Lower-cases the text, replaces the literal ``<br />`` tag with a space
    and removes every character listed in ``string.punctuation``.

    Relies on the standard-library ``string`` module being imported at the
    top of the notebook; it was previously missing, so calling this function
    raised ``NameError``.

    Args:
        text: String tensor (or anything ``tf.strings.lower`` accepts).

    Returns:
        The standardised string tensor.
    """
    lowercase = tf.strings.lower(text)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    # Character class matching any single ASCII punctuation mark.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    return tf.strings.regex_replace(stripped_html, punctuation_pattern, '')

# Two vectorizers over the same text:
#   * binary_vectorize_layer -> bag-of-words (multi-hot) vector per sample
#   * int_vectorize_layer    -> sequence of token ids, MAX_LEN long
# `tf.keras.layers.TextVectorization` is the stable location of the layer (the
# `layers.experimental.preprocessing` path is gone in recent releases) and
# 'multi_hot' is the current name of the former 'binary' output mode.
binary_vectorize_layer = tf.keras.layers.TextVectorization(
    # standardize=clean_text, # optional preprocessing, e.g. stripping punctuation or HTML
    # split='whitespace',     # the tokenizer splits on whitespace
    max_tokens=MAX_WORDS - 1,  # one slot is left for the placeholder token
    output_mode='multi_hot',   # bag-of-words; the default 'int' emits one id per token
    )
int_vectorize_layer = tf.keras.layers.TextVectorization(
    # standardize=clean_text, # optional preprocessing, e.g. stripping punctuation or HTML
    # split='whitespace',     # the tokenizer splits on whitespace
    max_tokens=MAX_WORDS - 1,  # one slot is left for the placeholder token
    output_mode='int',         # one integer id per token
    output_sequence_length=MAX_LEN
    )

# Drop the labels: `adapt` only needs the raw text to build the vocabulary.
ds_dataset = dataset.map(lambda text, label: text)
binary_vectorize_layer.adapt(ds_dataset)
int_vectorize_layer.adapt(ds_dataset)
print(binary_vectorize_layer.get_vocabulary()[0:100])
print(int_vectorize_layer.get_vocabulary()[0:100])


In [None]:
def int_vectorize_text(text, label):
  """Run `text` through `int_vectorize_layer`; `label` is passed through untouched."""
  # The layer is called on the text with a trailing axis of size one.
  column = tf.expand_dims(text, -1)
  token_ids = int_vectorize_layer(column)
  return token_ids, label

def binary_vectorize_text(text, label):
  """Run `text` through `binary_vectorize_layer`; `label` is passed through untouched."""
  # The layer is called on the text with a trailing axis of size one.
  column = tf.expand_dims(text, -1)
  bag_of_words = binary_vectorize_layer(column)
  return bag_of_words, label

# Pull one element from the dataset and show it before and after vectorization.
text_batch, label_batch = next(iter(dataset))
first_question = text_batch
first_label = label_batch
print("Question", first_question)
print("Label", first_label)

# Each helper returns (vectorized_text, label); only the text part is shown.
binary_vectorized = binary_vectorize_text(first_question, first_label)[0]
print("'binary' vectorized question:", binary_vectorized)
int_vectorized = int_vectorize_text(first_question, first_label)[0]
print("'int' vectorized question:", int_vectorized)

In [None]:
# Look up two token ids and report the vocabulary size, fetching the
# vocabulary list once and reusing it.
vocabulary = int_vectorize_layer.get_vocabulary()
print("1289 ---> ", vocabulary[1289])
print("313 ---> ", vocabulary[313])
print("Vocabulary size: {}".format(len(vocabulary)))

In [None]:
# Apply each vectorizer to every (text, label) element of the dataset.
binary_train_ds = dataset.map(map_func=binary_vectorize_text)
int_train_ds = dataset.map(map_func=int_vectorize_text)

binary_train_ds

In [None]:
# Dataset.prefetch overlaps data preprocessing with model execution during training.

AUTOTUNE = tf.data.AUTOTUNE

def configure_dataset(dataset):
  """Return `dataset` with caching enabled and an auto-tuned prefetch buffer."""
  cached = dataset.cache()
  return cached.prefetch(buffer_size=AUTOTUNE)

# Add caching and prefetching to both vectorized training datasets.
binary_train_ds = configure_dataset(dataset=binary_train_ds)
int_train_ds = configure_dataset(dataset=int_train_ds)

binary_train_ds


## Model

In [None]:
tf.keras.backend.clear_session()

# Both classifiers emit one logit per class and are trained with sparse
# categorical cross-entropy on integer labels.
# NOTE(review): assumes the labels are binary (0/1) -- confirm against the data.
NUM_LABELS = 2
VALIDATION_FRACTION = 0.2  # share of the batches held out for validation

# Linear classifier meant for the bag-of-words features (binary_train_ds).
binary_model = tf.keras.Sequential([
    tf.keras.layers.Dense(NUM_LABELS)
    ])
binary_model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])

# Embedding + 1-D convolution classifier for the token-id sequences.
int_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(MAX_WORDS + 1, 64, mask_zero=True),
    tf.keras.layers.Conv1D(64, 5, padding="valid", activation="relu", strides=2),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(NUM_LABELS)
])
int_model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])

# `validation_data` must be a dataset (or a tuple of arrays), not a fraction,
# and `validation_split` is not available for tf.data inputs, so the hold-out
# set is carved off explicitly.  Counting the batches is one full pass over
# the dataset; if int_train_ds is cached (see configure_dataset) that pass
# also fills the cache, so `take` and `skip` see the same element order.
num_batches = sum(1 for _ in int_train_ds)
num_val_batches = max(1, int(num_batches * VALIDATION_FRACTION))
int_val_ds = int_train_ds.take(num_val_batches)
int_fit_ds = int_train_ds.skip(num_val_batches)

# To train the bag-of-words model instead, split binary_train_ds the same way
# and call binary_model.fit on it.
history = int_model.fit(int_fit_ds, validation_data=int_val_ds, epochs=5)

history

In [None]:
tf.keras.backend.clear_session()

# End-to-end model that takes raw strings.  The vectorizer emits integer token
# ids; those are arbitrary indices, not magnitudes, so they are embedded and
# pooled into a fixed-size vector before the dense layers.  Previously the ids
# were fed straight into Dense layers as if they were numeric features.
model = tf.keras.Sequential([
  int_vectorize_layer,
  tf.keras.layers.Embedding(MAX_WORDS + 1, 64, mask_zero=True),
  tf.keras.layers.GlobalAveragePooling1D(),  # mask-aware mean over the sequence axis
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid'),
])

# Single sigmoid output + binary cross-entropy.
# NOTE(review): assumes the labels are 0/1 -- confirm against the data.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])

# `shuffle` acts on whole dataset elements (whole batches if `dataset` is
# already batched).
history = model.fit(dataset.shuffle(500), epochs=20)