In [5]:
# Gender Classification using Character Embeddings

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Load and preprocess the data
file_path = '..\\data\\mm_names.csv'
data = pd.read_csv(file_path)

# Select relevant columns
names = data['Name'].astype(str)
genders = data['Gender']

# Encode the labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(genders)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(names, labels, test_size=0.4, random_state=42)

# Tokenize the names
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad the sequences
max_sequence_length = max([len(seq) for seq in X_train_seq])
X_train_pad = pad_sequences(X_train_seq, maxlen=max_sequence_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_sequence_length, padding='post')

# Build the model
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 50

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(64),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_pad, y_train, epochs=10, validation_data=(X_test_pad, y_test), batch_size=32)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')




Epoch 1/10




[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.6306 - loss: 0.6289 - val_accuracy: 0.7390 - val_loss: 0.5208
Epoch 2/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.7216 - loss: 0.5373 - val_accuracy: 0.7559 - val_loss: 0.5025
Epoch 3/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.7411 - loss: 0.5078 - val_accuracy: 0.7484 - val_loss: 0.4825
Epoch 4/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.7402 - loss: 0.4976 - val_accuracy: 0.7502 - val_loss: 0.4708
Epoch 5/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.7644 - loss: 0.4676 - val_accuracy: 0.7850 - val_loss: 0.4491
Epoch 6/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - accuracy: 0.7901 - loss: 0.4442 - val_accuracy: 0.7859 - val_loss: 0.4392
Epoch 7/10
[1m134/134[0m [32m━