# Note

In [None]:
# model on fullnames but make sure that last names aligned at the same position
# this is to overcome the fullname model only focus on firstname part

# Preprocess the data

In [None]:
# change keras default GPU
# os.environ["CUDA_VISIBLE_DEVICES"]="0" # first gpu
# os.environ["CUDA_VISIBLE_DEVICES"]="1" # second gpu
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # runs in cpu

In [None]:
# pip install keras-tuner

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sat Apr 16 16:30:16 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
cd drive/MyDrive/

/content/drive/MyDrive


In [3]:
import tensorflow.keras as keras
import tensorflow as tf
print(keras.__version__)
#print(tf.__version__)
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import string

2.8.0


In [6]:
df = pd.read_csv("nmzpAgeSexFL.csv")

In [7]:
df

Unnamed: 0.1,Unnamed: 0,name_first,name_last,zip5,sex,race,age
0,1,Elizabeth,Walker,32643,F,5,59.049315
1,2,Alton,Palmer,32643,M,5,77.854795
2,3,Alicia,Mc Cleod,32607,F,3,54.402740
3,4,Dale,Scarborough,32643,M,5,69.868493
4,5,Daniel,Walker,32640,M,5,65.572603
...,...,...,...,...,...,...,...
13605732,13710354,William,Walters,32428,M,5,74.487671
13605733,13710355,Matthew,Sawyer,32428,M,5,36.526027
13605734,13710356,Janine,Thomas,32428,F,5,33.857534
13605735,13710357,Angel,Campbell,32431,F,7,22.397260


In [8]:
df=df.loc[df['race'].isin([2,3,4,5])]

In [9]:
# create ASCII dictionary
chars = ['E'] + [chr(i) for i in range(97,123)] + [' ', 'U']
id2char = {i:j for i,j in enumerate(chars)}
char2id = {j:i for i,j in enumerate(chars)}

In [None]:
# the characters here are all ASCII, good
# for name in df['name_combine'].tolist():
#     namechars = list(name)
#     for nc in namechars:
#         if nc not in char2id:
#             print(nc)

In [10]:
def name2id(name, l = 10):
    ids = [0] * l
    for i, c in enumerate(name):
        if i < l:
            if c.isalpha():
                ids[i] = char2id.get(c, char2id['U'])
            elif c in string.punctuation:
                ids[i] = char2id.get(c, char2id[' '])
            else:
                ids[i] = char2id.get(c, char2id['U'])
    return ids

In [11]:
X = [name2id(fn.lower()) + name2id(ln.lower()) for fn, ln in zip(df['name_first'], df['name_last'])]
y = [int(i) for i in df['race'].tolist()]

In [12]:
# convert the output (y) from 2-5 to 0-3
y = [i-2 for i in y]

In [13]:
# Split train and test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [14]:
# check the distribution
print('distribution of all data points', {i: y.count(i) for i in set(y)})
print('distribution of all training data', {i: y_train.count(i) for i in set(y)})
print('distribution of all testing data', {i: y_test.count(i) for i in set(y)})

distribution of all data points {0: 252986, 1: 1836866, 2: 2167700, 3: 8685610}
distribution of all training data {0: 202361, 1: 1469726, 2: 1735192, 3: 6947250}
distribution of all testing data {0: 50625, 1: 367140, 2: 432508, 3: 1738360}


# Train the model

In [15]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM, Bidirectional

num_words = len(id2char)
feature_len = 20 # cut texts after this number of words (among top max_features most common words)
batch_size = 512

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=feature_len)
X_test = sequence.pad_sequences(X_test, maxlen=feature_len)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

num_classes = 4 # np.max(y_train) + 1
print(num_classes, 'classes')

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

10354529 train sequences
2588633 test sequences
Pad sequences (samples x time)
X_train shape: (10354529, 20)
X_test shape: (2588633, 20)
4 classes
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (10354529, 4)
y_test shape: (2588633, 4)


In [None]:
# simple train-test
# first build
model = Sequential()
model.add(Embedding(num_words, 256, input_length=feature_len))
# try out bi-directional LSTM
model.add(Bidirectional(LSTM(512, return_sequences=True, dropout=0.2)))
model.add(Bidirectional(LSTM(512, return_sequences=True, dropout=0.2)))
model.add(Bidirectional(LSTM(512, return_sequences=True, dropout=0.2)))
model.add(Bidirectional(LSTM(512, dropout=0.2)))
model.add(Dense(num_classes, activation='softmax'))

# choose between learning rates
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

# train model
model.fit(X_train, y_train, batch_size=batch_size, epochs=10, validation_split=0.2, verbose=1, callbacks=[callback])
score, acc = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10

In [None]:
# now lets' test
y_pred = model.predict(X_test, batch_size=batch_size, verbose=1)
y_pred_bool = np.argmax(y_pred, axis=1)

print(classification_report(np.argmax(y_test, axis=1), y_pred_bool))
print(confusion_matrix(np.argmax(y_test, axis=1), y_pred_bool))

In [None]:
# Use full data set to train saved model


X_train = sequence.pad_sequences(X, maxlen=feature_len)
y_train = keras.utils.to_categorical(y, num_classes)
print('y_train shape:', y_train.shape)
print('y_train shape:', x_train.shape)

model = Sequential()
model.add(Embedding(num_words, 256, input_length=feature_len))
# try out bi-directional LSTM
model.add(Bidirectional(LSTM(512, return_sequences=True, dropout=0.2)))
model.add(Bidirectional(LSTM(512, return_sequences=True, dropout=0.2)))
model.add(Bidirectional(LSTM(512, return_sequences=True, dropout=0.2)))
model.add(Bidirectional(LSTM(512, dropout=0.2)))
model.add(Dense(num_classes, activation='softmax'))

# choose between learning rates
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

# train model
model.fit(X_train, y_train, batch_size=batch_size, epochs=10, validation_split=0.2, verbose=1, callbacks=[callback])
score, acc = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)


model.save('fullname_aligned_fulldata.h5', include_optimizer=False)
model.save('fullname_aligned_opt_fulldata.h5')

In [None]:
# now lets' test again with full model on original test set to compare
y_pred = model.predict(X_test, batch_size=batch_size, verbose=1)
y_pred_bool = np.argmax(y_pred, axis=1)

print(classification_report(np.argmax(y, axis=1), y_pred_bool))
print(confusion_matrix(np.argmax(y_test, axis=1), y_pred_bool))

              precision    recall  f1-score   support

           0       0.86      0.76      0.81     41861
           1       0.76      0.73      0.75     41904
           2       0.85      0.87      0.86     41940
           3       0.66      0.74      0.70     41707

    accuracy                           0.78    167412
   macro avg       0.78      0.78      0.78    167412
weighted avg       0.78      0.78      0.78    167412

[[31824  2338  2980  4719]
 [ 1461 30797  1110  8536]
 [ 1706  1062 36637  2535]
 [ 1943  6496  2386 30882]]


# Distill the Model

In [None]:
# TODO: shall we try model distillation for compressing the model size?
# so that we will have smaller model to work with
class Distiller(keras.Model):
    def __init__(self, student, teacher):
        super(Distiller, self).__init__()
        self.teacher = teacher
        self.student = student

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
    ):
        """ Configure the distiller.

        Args:
            optimizer: Keras optimizer for the student weights
            metrics: Keras metrics for evaluation
            student_loss_fn: Loss function of difference between student
                predictions and ground-truth
            distillation_loss_fn: Loss function of difference between soft
                student predictions and soft teacher predictions
            alpha: weight to student_loss_fn and 1-alpha to distillation_loss_fn
            temperature: Temperature for softening probability distributions.
                Larger temperature gives softer distributions.
        """
        super(Distiller, self).compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def train_step(self, data):
        # Unpack data
        x, y = data

        # Forward pass of teacher
        teacher_predictions = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
            # Forward pass of student
            student_predictions = self.student(x, training=True)

            # Compute losses
            student_loss = self.student_loss_fn(y, student_predictions)
            distillation_loss = self.distillation_loss_fn(
                tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
                tf.nn.softmax(student_predictions / self.temperature, axis=1),
            )
            loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss

        # Compute gradients
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update the metrics configured in `compile()`.
        self.compiled_metrics.update_state(y, student_predictions)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss}
        )
        return results

    def test_step(self, data):
        # Unpack the data
        x, y = data

        # Compute predictions
        y_prediction = self.student(x, training=False)

        # Calculate the loss
        student_loss = self.student_loss_fn(y, y_prediction)

        # Update the metrics.
        self.compiled_metrics.update_state(y, y_prediction)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results

In [None]:
# Create the student
student = keras.Sequential(
    [
        Embedding(num_words, 32, input_length=feature_len),
        Bidirectional(LSTM(16, return_sequences=True, dropout=0.2)),
        Bidirectional(LSTM(16, return_sequences=True, dropout=0.2)),
        Bidirectional(LSTM(16, return_sequences=True, dropout=0.2)),
        Bidirectional(LSTM(16, dropout=0.2)),
        Dense(num_classes, activation='softmax')
    ],
    name="student",
)

In [None]:
# Initialize and compile distiller
distiller = Distiller(student=student, teacher=model)
distiller.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=[keras.metrics.CategoricalAccuracy()],
    student_loss_fn=keras.losses.CategoricalCrossentropy(),
    distillation_loss_fn=keras.losses.KLDivergence(),
    alpha=0.1,
    temperature=10,
)

# Distill teacher to student
distiller.fit(X_train, y_train, epochs=40)

# Evaluate student on test dataset
distiller.evaluate(X_test, y_test)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


[0.7372231483459473, 0.5687428116798401]

In [None]:
y_pred = distiller.student.predict(X_test, batch_size=32, verbose=1)
y_pred_bool = np.argmax(y_pred, axis=1)

print(classification_report(np.argmax(y_test, axis=1), y_pred_bool))
print(confusion_matrix(np.argmax(y_test, axis=1), y_pred_bool))

 197/5232 [>.............................] - ETA: 43s

In [None]:
distiller.student.save('fullname_aligned_distill_opt.h5')
distiller.student.save('fullname_aligned_distill.h5', include_optimizer=False)

# Tune the Model

In [None]:
model.summary()

In [None]:
model.get_config()

In [None]:
model.optimizer

In [None]:
from keras import backend as K
K.eval(model.optimizer.lr)

In [None]:
# from google.colab import files
# files.download('fullname_2_ethnicity_bilstm.h5') 