In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed
# Sample dataset: first names paired position-by-position with a gender label.
names = ['John', 'Jane', 'Alice', 'Bob', 'Michael', 'Mary', 'David', 'Sarah', 'James', 'Emily',
         'William', 'Emma', 'Matthew', 'Olivia', 'Daniel', 'Sophia', 'Christopher', 'Isabella', 'Andrew', 'Ava']
genders = ['male', 'female', 'female', 'male', 'male', 'female', 'male', 'female', 'male', 'female',
           'male', 'female', 'male', 'female', 'male', 'female', 'male', 'female', 'male', 'female']

# Guard against a mis-edited dataset: every name needs exactly one known label,
# otherwise the label encoding below would silently map typos to 0 ("female").
assert len(names) == len(genders), "names and genders must have the same length"
assert set(genders) <= {'male', 'female'}, "unexpected gender label in dataset"

# Create a character-level vocabulary (ids start at 1; 0 is left free for padding).
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(names)
print("Vocabulary:", tokenizer.word_index)

# Convert each name to a list of character ids (variable length per name).
name_sequences = tokenizer.texts_to_sequences(names)
print(name_sequences)

# Pad every sequence with trailing zeros up to the longest name.
# pad_sequences already returns a 2-D numpy array, so no extra conversion is needed.
max_length = max(len(seq) for seq in name_sequences)
X = pad_sequences(name_sequences, maxlen=max_length, padding='post')
print(X)

# Binary label encoding: 1 = male, 0 = female (a single 0/1 column, not one-hot).
y = np.array([1 if gender == 'male' else 0 for gender in genders])

# Split dataset into training and testing sets.
# stratify=y keeps the male/female ratio equal in both splits; with only 20 samples
# an unstratified 4-sample test set can easily end up dominated by one class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(len(tokenizer.word_index))
print(X.shape[1])
# Seed the Python, NumPy and TensorFlow RNGs before any weights are created so that
# weight initialisation and batch shuffling repeat on every Restart & Run All.
# NOTE(review): bit-exact results on GPU may additionally need op determinism enabled.
set_random_seed(42)

# Build SimpleRNN model
model = Sequential()
# input_dim is vocabulary size + 1 because id 0 is reserved for padding.
# mask_zero=True makes the downstream SimpleRNN skip the padded (0) timesteps; without
# it the RNN keeps updating its state on trailing padding after the name has ended.
# The deprecated `input_length` argument is omitted: the sequence length is inferred
# from the data at fit time.
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=10, mask_zero=True))
model.add(SimpleRNN(10))
# Single sigmoid unit: output is the predicted probability of label 1.
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=1, verbose=1)

# Evaluate the model on the held-out split
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Vocabulary: {'a': 1, 'i': 2, 'e': 3, 'l': 4, 'm': 5, 'h': 6, 'o': 7, 'r': 8, 's': 9, 'n': 10, 'd': 11, 'j': 12, 'c': 13, 'b': 14, 'v': 15, 'w': 16, 't': 17, 'y': 18, 'p': 19}
[[12, 7, 6, 10], [12, 1, 10, 3], [1, 4, 2, 13, 3], [14, 7, 14], [5, 2, 13, 6, 1, 3, 4], [5, 1, 8, 18], [11, 1, 15, 2, 11], [9, 1, 8, 1, 6], [12, 1, 5, 3, 9], [3, 5, 2, 4, 18], [16, 2, 4, 4, 2, 1, 5], [3, 5, 5, 1], [5, 1, 17, 17, 6, 3, 16], [7, 4, 2, 15, 2, 1], [11, 1, 10, 2, 3, 4], [9, 7, 19, 6, 2, 1], [13, 6, 8, 2, 9, 17, 7, 19, 6, 3, 8], [2, 9, 1, 14, 3, 4, 4, 1], [1, 10, 11, 8, 3, 16], [1, 15, 1]]
[[12  7  6 10  0  0  0  0  0  0  0]
 [12  1 10  3  0  0  0  0  0  0  0]
 [ 1  4  2 13  3  0  0  0  0  0  0]
 [14  7 14  0  0  0  0  0  0  0  0]
 [ 5  2 13  6  1  3  4  0  0  0  0]
 [ 5  1  8 18  0  0  0  0  0  0  0]
 [11  1 15  2 11  0  0  0  0  0  0]
 [ 9  1  8  1  6  0  0  0  0  0  0]
 [12  1  5  3  9  0  0  0  0  0  0]
 [ 3  5  2  4 18  0  0  0  0  0  0]
 [16  2  4  4  2  1  5  0  0  0  0]
 [ 3  5  5  1  0  0  0  0

In [9]:
# Define a function to predict gender for a given name
def predict_gender(name):
    """Predict the gender label for a single name using the trained model.

    The name is encoded with the notebook-level ``tokenizer``, padded to the
    same width as the training matrix ``X``, and passed to ``model``.

    Args:
        name: The name to classify, as a string. Characters that the tokenizer
            did not see while fitting are dropped (no OOV token is configured).

    Returns:
        ``'male'`` if the model's sigmoid output is >= 0.5, else ``'female'``.

    Raises:
        ValueError: If ``name`` contains no character known to the tokenizer
            (e.g. an empty string), since the model input would be all padding.
    """
    # Convert the name to a sequence of character ids.
    name_seq = tokenizer.texts_to_sequences([name])
    if not name_seq[0]:
        raise ValueError(f"Cannot encode name {name!r}: no characters known to the tokenizer")

    # Pad to the width the model was trained on. X is already padded, so its
    # column count is that width; no need to re-scan its rows.
    name_seq = pad_sequences(name_seq, maxlen=X.shape[1], padding='post')

    # predict returns one row per input with a single sigmoid output.
    prediction = model.predict(name_seq)[0][0]

    # Labels were encoded as 1 = male, 0 = female.
    return 'male' if prediction >= 0.5 else 'female'
# Demo: classify one sample name and report the result.
# Note: 'Emma' is one of the names in the dataset defined above, so this checks
# the fitted model on a known example rather than on unseen data.
random_name = 'Emma'
predicted_gender = predict_gender(name=random_name)
print(f"The predicted gender for the name '{random_name}' is: {predicted_gender}")


The predicted gender for the name 'Emma' is: female
