## Import libraries and frameworks

In [1]:
import pandas as pd
import numpy as np
from tensorflow import keras
import tensorflow as tf
import pickle

tf.config.run_functions_eagerly(True)


In [2]:
tf.__version__

'2.4.1'

## Our dataset

In [3]:
# Load in our data set.
# `pd.read_csv` already returns a DataFrame, so no extra wrapper is needed.

data = pd.read_csv("names.csv")

In [4]:
# Taking a look at the first 5 items in our data set

data.head()

Unnamed: 0,Year of Birth,Gender,Ethnicity,Child's First Name,Count,Rank
0,2011,FEMALE,HISPANIC,GERALDINE,13,75
1,2011,FEMALE,HISPANIC,GIA,21,67
2,2011,FEMALE,HISPANIC,GIANNA,49,42
3,2011,FEMALE,HISPANIC,GISELLE,38,51
4,2011,FEMALE,HISPANIC,GRACE,36,53


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22035 entries, 0 to 22034
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Year of Birth       22035 non-null  int64 
 1   Gender              22035 non-null  object
 2   Ethnicity           22035 non-null  object
 3   Child's First Name  22035 non-null  object
 4   Count               22035 non-null  int64 
 5   Rank                22035 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 1.0+ MB


In [6]:
data.describe()

Unnamed: 0,Year of Birth,Count,Rank
count,22035.0,22035.0,22035.0
mean,2012.115589,34.512821,57.06467
std,1.170393,40.085777,25.452851
min,2011.0,10.0,1.0
25%,2011.0,13.0,38.0
50%,2012.0,20.0,59.0
75%,2013.0,36.0,78.0
max,2014.0,426.0,102.0


In [7]:
# Value counts of the two categorical columns.
# A cell only renders its last expression automatically, so each result is
# passed to `display` (provided by the IPython kernel) to show both tables.

display(data["Gender"].value_counts())
display(data["Ethnicity"].value_counts())

HISPANIC                      6654
WHITE NON HISPANIC            6178
BLACK NON HISPANIC            3396
ASIAN AND PACIFIC ISLANDER    2983
WHITE NON HISP                1402
ASIAN AND PACI                 716
BLACK NON HISP                 706
Name: Ethnicity, dtype: int64

In [8]:
# Pull out the two columns used below: the first name and the gender

names, genders = data["Child's First Name"], data["Gender"]

## We now build lookup tables to convert our letters to number representations

In [9]:
# Extract all unique characters that appear in any name of the dataset
name_char_set = {ch for name in names for ch in name}

# Sort the characters so the character <-> number mapping is the same on every run
temp_set = sorted(name_char_set)

# char to number encoder
# Numbering starts at 1, which leaves 0 unused by any character
# (0 is the value the padding step fills with).
char_to_number = {char: i for i, char in enumerate(temp_set, start=1)}

# char to number decoder (inverse of the encoder)
number_to_char = {i: char for char, i in char_to_number.items()}

In [10]:
# Save your encoders if you want to. This is very helpful when you want to transfer your models

# with open('char_to_number.enc', 'wb') as handle:
#     pickle.dump(char_to_number, handle, protocol=pickle.HIGHEST_PROTOCOL)


# lets now save the character decoder

# with open('number_to_char.enc', 'wb') as handle:
#     pickle.dump(number_to_char, handle, protocol=pickle.HIGHEST_PROTOCOL)


# Reading saved encoder -- for later use
# with open('char_to_number.enc', 'rb') as handle:
#     saved_encoder = pickle.load(handle)

In [12]:
# Round-trip check: encode "g", then decode the result back to a character

encoded_g = char_to_number.get("g")
encoded_g, number_to_char.get(encoded_g)

(34, 'g')

### Input Transformation

In [13]:
# Encode every name as a list of integer tokens, one token per character

X = [[char_to_number.get(ch) for ch in name] for name in names]

In [15]:
# Sanity check: the first encoded name, its decoded form, and the original name
first_tokens = X[0]
first_tokens, "".join(number_to_char.get(token) for token in first_tokens), names[0]

([8, 6, 19, 2, 13, 5, 10, 15, 6], 'GERALDINE', 'GERALDINE')

In [16]:
# Model parameters

# +1 because character numbers start at 1, so index 0 also needs a slot
VOCAB_SIZE = len(temp_set) + 1
# Fixed sequence length: every name is padded (or truncated) to this many tokens
MAX_LEN = 20  # or max([len(x) for x in X]) - Choosing a big number to be safe

In [17]:
# Pad (or truncate) each token list at its end so every row has length MAX_LEN

X_padded = keras.preprocessing.sequence.pad_sequences(
    X,
    maxlen=MAX_LEN,
    padding="post",
    truncating="post",
    value=0,
)
#  X_padded = X_padded.reshape(-1, 1, 20)

In [18]:
X_padded[0]

array([ 8,  6, 19,  2, 13,  5, 10, 15,  6,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0], dtype=int32)

### Labels Transformation

In [19]:
# One-hot encode the gender column: one indicator column per gender value
# (FEMALE, MALE)

gender_one_hot = pd.get_dummies(data["Gender"])
labels = np.array(gender_one_hot)


## Building The model

#### You can build your own custom model; for mine
#### I used a simple BiLSTM (both forward and backward) followed by some dense layers

In [20]:
# BiLSTM block using a simple function

def BI_LSTM_BLOCK(units, return_sequence=False, input_shape=None):
    """Build a bidirectional LSTM layer from explicit forward/backward LSTMs.

    Args:
        units: Number of units in each direction's LSTM.
        return_sequence: Forwarded to both LSTMs as ``return_sequences``.
        input_shape: Optional input shape forwarded to the ``Bidirectional``
            wrapper. Leave as ``None`` when the layer is called on a tensor.

    Returns:
        A ``keras.layers.Bidirectional`` layer wrapping the two LSTMs.
    """
    # No input_shape on the inner LSTMs: their input shape comes from the
    # tensor the Bidirectional wrapper is called on.
    fwd = keras.layers.LSTM(units, return_sequences=return_sequence, recurrent_dropout=0.2)
    bkw = keras.layers.LSTM(units, return_sequences=return_sequence, go_backwards=True, recurrent_dropout=0.2)

    # `backward_layer` must be passed by keyword: the second positional
    # parameter of Bidirectional is `merge_mode`, not the backward layer.
    wrapper_kwargs = {} if input_shape is None else {"input_shape": input_shape}
    return keras.layers.Bidirectional(layer=fwd, backward_layer=bkw, **wrapper_kwargs)

In [21]:
# building the model

# One row of MAX_LEN integer character tokens per name. Using the constant
# (rather than a literal) keeps the input in sync with the padding length.
_input = keras.Input(shape=(MAX_LEN,))
embedding = keras.layers.Embedding(VOCAB_SIZE, 100, input_length=MAX_LEN)(_input)

# Two BiLSTM branches read the same embedding: the first returns only its
# final output, the second returns the full sequence.
bidirectional_lstm = BI_LSTM_BLOCK(64)
bidirectional_lstm_2 = BI_LSTM_BLOCK(128, return_sequence=True)

blk = bidirectional_lstm(embedding)
blk_2 = bidirectional_lstm_2(embedding)
# Flatten the per-timestep outputs so they can be concatenated with `blk`
bl_2 = keras.layers.Flatten()(blk_2)

concat = keras.layers.Concatenate()([blk, bl_2])

# Dense head ending in one output unit per label column
dense = keras.layers.Dense(192, activation="relu")(concat)
dense = keras.layers.Dense(64, activation="relu")(dense)
output = keras.layers.Dense(2, activation="sigmoid")(dense)

In [22]:
# Shape of the output tensor: (batch size, number of classes)

output.shape

TensorShape([None, 2])

In [23]:
# Wrap the input/output tensors into a trainable model
model = keras.Model(_input, output)

# `learning_rate` is the supported argument name; `lr` is a deprecated alias
opt = keras.optimizers.Adam(learning_rate=1e-3)
loss = keras.losses.BinaryCrossentropy()
model.compile(optimizer=opt, loss=loss, metrics=["accuracy"])

In [24]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 20, 100)      5400        input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 20, 256)      234496      embedding[0][0]                  
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 128)          84480       embedding[0][0]                  
______________________________________________________________________________________________

In [25]:
# Training callbacks: stop once validation accuracy stops improving, and
# write the best model seen so far to disk.

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    min_delta=0,
    patience=5,
    restore_best_weights=True,
    baseline=None,
    verbose=0,
)

save_best_model = tf.keras.callbacks.ModelCheckpoint(
    "gender_model.h5",
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=False,
    save_freq='epoch',
    options=None,
    verbose=0,
)

In [28]:
# `validation_split` holds out the *last* 30% of the arrays exactly as they
# are passed in, before any shuffling. The CSV rows appear to be ordered
# (see the first rows shown earlier), so shuffle once with a fixed seed to
# get a representative, reproducible validation set.
shuffle_rng = np.random.default_rng(42)
shuffled_idx = shuffle_rng.permutation(len(X_padded))

history = model.fit(
    X_padded[shuffled_idx],
    labels[shuffled_idx],
    epochs=100,
    batch_size=240,
    shuffle=True,
    validation_split=0.3,
    callbacks=[early_stopping, save_best_model]
)



In [29]:
# Persist the trained model to disk, then load it back from the same file

TUTORIAL_MODEL_PATH = "gender_model_tut.h5"
model.save(TUTORIAL_MODEL_PATH)
saved_model = keras.models.load_model(TUTORIAL_MODEL_PATH)
# model.save_weights("weight_v2")

### Building Inference function

In [32]:
def process_name(name = ""):
    """Convert a name into a padded token array for the model.

    Args:
        name: The name to encode.

    Returns:
        The padded integer token array produced by ``pad_sequences`` for this
        single name (one row of length ``MAX_LEN``), or ``None`` when ``name``
        is empty or ``None``.
    """
    if not name:
        return None

    # convert the characters into numbers
    # Characters the encoder has never seen are skipped: `.get` would return
    # None for them, which cannot be converted into an integer token.
    name_token = [char_to_number[ch] for ch in name if ch in char_to_number]

    # pad to the max length
    tokens = keras.preprocessing.sequence.pad_sequences([name_token], maxlen=MAX_LEN, truncating="post", padding="post", value=0)

    return tokens

In [33]:
# Encode and pad a sample name the same way as the training inputs
processed_name = process_name("Archibold")

# `predict` returns one row per input; take the row for our single name
pred = model.predict(processed_name)[0]

# Round each score to 0 or 1
# (presumably the columns follow the label order FEMALE, MALE - confirm)
np.round(pred)

array([0., 1.], dtype=float32)