In [1]:
def _major_minor(version):
    """Return the leading ``(major, minor)`` numbers of a version string as ints.

    Version strings must not be compared directly: string comparison is
    lexicographic, so ``"10.0" >= "2.0"`` is False and ``"0.9" >= "0.20"`` is
    True. Comparing integer tuples gives the intended ordering.

    Raises:
        ValueError: if ``version`` does not start with ``<digits>.<digits>``.
    """
    import re  # local import: this cell runs before the notebook's own `import re`

    match = re.match(r"(\d+)\.(\d+)", version)
    if match is None:
        raise ValueError(f"unrecognised version string: {version!r}")
    return int(match.group(1)), int(match.group(2))

# Scikit-Learn ≥0.20 is required
import sklearn
assert _major_minor(sklearn.__version__) >= (0, 20)

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert _major_minor(tf.__version__) >= (2, 0)

import re

In [2]:
# Common imports
import numpy as np
import pandas as pd
import os

# Seed NumPy first, then TensorFlow, with the same value so that the
# notebook's output stays stable from one run to the next
for _set_seed in (np.random.seed, tf.random.set_seed):
    _set_seed(42)

In [3]:
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Font sizes used on every figure: 14 for axis labels, 12 for x/y tick labels
for rc_group, label_size in (('axes', 14), ('xtick', 12), ('ytick', 12)):
    mpl.rc(rc_group, labelsize=label_size)


In [4]:
# Load the MBTI data set; later cells read its 'type' and 'posts' columns
df = pd.read_csv(filepath_or_buffer='./data/mbti_1.csv')

---

## Thinking

In [20]:
# Third character (index 2) of each type code; the sections below filter on this column
df['t/f'] = [type_code[2] for type_code in df['type']]

In [27]:
# Rows whose third type letter is 'T'
thinking_df = df.loc[df['t/f'] == 'T']
# Join every post into one space-separated string, then strip every run of
# characters that are not ASCII letters or spaces
thinking = re.sub(r'[^A-Za-z ]+', '', ' '.join(thinking_df['posts']))

In [28]:
# Distinct characters of the lower-cased corpus, shown in sorted order
"".join(sorted({char for char in thinking.lower()}))

' abcdefghijklmnopqrstuvwxyz'

In [29]:
# Character-level tokenizer fitted on the cleaned Thinking corpus.
# `thinking` is passed as one plain string (not a list of texts); the cell that
# reads `document_count` relies on that to obtain the total character count.
tokenizer = keras.preprocessing.text.Tokenizer(
    char_level=True,
)
tokenizer.fit_on_texts(texts=thinking)

In [30]:
# Inspect the character -> integer ID mapping learned by the tokenizer (IDs start at 1)
tokenizer.word_index

{' ': 1,
 'e': 2,
 't': 3,
 'i': 4,
 'o': 5,
 'a': 6,
 'n': 7,
 's': 8,
 'r': 9,
 'h': 10,
 'l': 11,
 'd': 12,
 'u': 13,
 'm': 14,
 'y': 15,
 'c': 16,
 'g': 17,
 'w': 18,
 'p': 19,
 'f': 20,
 'b': 21,
 'v': 22,
 'k': 23,
 'j': 24,
 'x': 25,
 'q': 26,
 'z': 27}

In [31]:
# Sanity check: encode a sample string into its list of character IDs
tokenizer.texts_to_sequences(["hello"])

[[10, 2, 11, 11, 5]]

In [32]:
# Sanity check: decode the IDs back to text (decoded characters come back space-separated)
tokenizer.sequences_to_texts([[10, 2, 11, 11, 5]])

['h e l l o']

In [33]:
# Vocabulary size: number of distinct characters the tokenizer has seen
max_id = len(tokenizer.word_index)
# Corpus length: the tokenizer was fitted on a single string, so its document
# count is the total number of characters
dataset_size = tokenizer.document_count

print(
    f'number of distinct characters: {max_id}',
    f'total number of characters: {dataset_size}',
    sep='\n',
)

number of distinct characters: 27
total number of characters: 26164841


In [34]:
# Encode the whole corpus as character IDs; subtracting 1 moves the IDs from
# the tokenizer's 1-based range to a 0-based one
encoded = np.array(tokenizer.texts_to_sequences([thinking]))[0] - 1
# The first 90% of the characters are used for training
train_size = (dataset_size * 90) // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2022-03-01 11:08:23.695552: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-03-01 11:08:23.696521: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [35]:
n_steps = 100
window_length = n_steps + 1  # target = input shifted 1 character ahead

# Pipeline: consecutive windows of window_length characters, each starting
# n_steps after the previous one -> one tensor per window -> batches of a
# single window -> (input, target) pairs -> one-hot encoded inputs.
# NOTE: this cell transforms `dataset` in place, so it must not be run twice
# without re-running the cell that creates `dataset`.
dataset = (
    dataset
    .window(window_length, shift=n_steps, drop_remainder=True)
    .flat_map(lambda window: window.batch(window_length))
    .batch(1)
    .map(lambda windows: (windows[:, :-1], windows[:, 1:]))
    .map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
    .prefetch(1)
)

In [36]:
batch_size = 32
# Cut the training text into batch_size contiguous parts and window each part
# separately, so that row i of every batch is the next window of part i
encoded_parts = np.array_split(encoded[:train_size], batch_size)
datasets = [
    tf.data.Dataset.from_tensor_slices(encoded_part)
    .window(window_length, shift=n_steps, drop_remainder=True)
    .flat_map(lambda window: window.batch(window_length))
    for encoded_part in encoded_parts
]
# Zip the per-part window streams and stack one window from each into a batch,
# then split into (input, target) and one-hot encode the inputs
dataset = (
    tf.data.Dataset.zip(tuple(datasets))
    .map(lambda *windows: tf.stack(windows))
    .map(lambda windows: (windows[:, :-1], windows[:, 1:]))
    .map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
    .prefetch(1)
)

In [37]:
# Two stacked stateful GRU layers followed by a per-time-step softmax over the
# max_id character classes. batch_input_shape fixes the batch dimension to
# batch_size and leaves the time dimension open (None).
# Only input dropout is enabled; an earlier variant that also set
# recurrent_dropout=0.2 is no longer used.
model = keras.models.Sequential()
model.add(
    keras.layers.GRU(
        128,
        return_sequences=True,
        stateful=True,
        dropout=0.2,
        batch_input_shape=[batch_size, None, max_id],
    )
)
model.add(
    keras.layers.GRU(
        128,
        return_sequences=True,
        stateful=True,
        dropout=0.2,
    )
)
model.add(
    keras.layers.TimeDistributed(
        keras.layers.Dense(max_id, activation="softmax")
    )
)

In [38]:
class ResetStatesCallback(keras.callbacks.Callback):
    """Keras callback that resets the model's states at the start of each epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        """Reset the states of ``self.model``; ``epoch`` and ``logs`` are unused.

        ``logs`` defaults to ``None`` so the override has the same signature as
        the ``on_epoch_begin`` hook of ``keras.callbacks.Callback`` and can be
        called with or without a logs dict.
        """
        self.model.reset_states()

In [39]:
# Targets are integer class IDs, hence the sparse cross-entropy loss
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
# Train for up to 50 epochs, resetting the recurrent states at each epoch start
history = model.fit(
    x=dataset,
    epochs=50,
    callbacks=[ResetStatesCallback()],
)

Epoch 1/50


2022-03-01 11:08:53.134383: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-03-01 11:08:54.517208: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-01 11:08:54.926814: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-01 11:08:55.563043: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-01 11:08:57.616966: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-01 11:08:59.039740: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50

KeyboardInterrupt: 

In [None]:
# Persist the model in its current state under ./models/thinking_model1
model.save("./models/thinking_model1")

--- 
## Feeling

In [None]:
# 't/f' holds the third letter of the type code, and the Thinking section
# matches it against upper-case 'T'; the Feeling rows are therefore the ones
# equal to upper-case 'F' (a lower-case 'f' would select no rows).
feeling_df = df[df['t/f'] == 'F']
feeling = ' '.join(feeling_df['posts'])
# Same clean-up as the Thinking corpus: keep only ASCII letters and spaces so
# that both corpora share the same character set
feeling = re.sub(r'[^A-Za-z ]+', '', feeling)

In [None]:
# Distinct characters of the lower-cased Feeling corpus, shown in sorted order
"".join(sorted({char for char in feeling.lower()}))

In [None]:
# Character-level tokenizer fitted on the Feeling corpus.
# NOTE: this rebinds `tokenizer`, so the tokenizer fitted on the Thinking
# corpus is no longer reachable under that name after this cell runs.
tokenizer = keras.preprocessing.text.Tokenizer(
    char_level=True,
)
tokenizer.fit_on_texts(texts=feeling)

In [None]:
# Inspect the character -> integer ID mapping learned from the Feeling corpus
tokenizer.word_index

In [None]:
# Sanity check: encode a sample string into its list of character IDs
tokenizer.texts_to_sequences(["Eto"])

In [None]:
# Sanity check: decode a list of character IDs back to text
tokenizer.sequences_to_texts([[2, 3, 4]])

In [None]:
# Vocabulary size of the Feeling corpus: number of distinct characters seen
max_id_f = len(tokenizer.word_index)
# Corpus length: the tokenizer was fitted on a single string, so its document
# count is the total number of characters
dataset_size_f = tokenizer.document_count

print(
    f'number of distinct characters: {max_id_f}',
    f'total number of characters: {dataset_size_f}',
    sep='\n',
)