### Character level LSTM for gender classification 
#### A comparison between word and character level in gender classifiction

In [1]:
#imports 
import re

import numpy as np 
import pandas as pd 

import tensorflow as tf

from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout
from sklearn.model_selection import train_test_split

In [2]:
#load datasets
data = pd.read_csv('name_gender_fix.csv')

In [3]:
data.gender.unique()

array(['m', 'f'], dtype=object)

In [4]:
#preprocessing datasets
data['name'] = data['name'].apply(lambda x: x.lower())

In [41]:
data.head(10)

Unnamed: 0.1,Unnamed: 0,name,gender
0,0,hafizhan shidqi,m
1,1,gandhi wibowo,m
2,2,aldio mahendra purwandrarto,m
3,3,benny putra,m
4,4,vicky vernando dasta,m
5,5,jufianto henri,m
6,6,aan nuraini,f
7,7,abdur rahman,m
8,8,abdurrahman,m
9,9,ade indra sukma,f


In [6]:
#create vocabulary dictionary
human_vocab = set()

for name in data['name']:
    human_vocab.update(tuple(name))
    
vocab_index = {v: k + 1 for k, v in enumerate(human_vocab)}

print(vocab_index)
print(len(vocab_index))


{'b': 1, 'w': 2, 'i': 3, '.': 4, 'd': 5, 'r': 6, 's': 7, 'h': 8, 'm': 9, "'": 10, 'f': 11, 'q': 12, 'g': 13, 'c': 14, 'l': 15, 'y': 16, 'u': 17, 'p': 18, ' ': 19, 't': 20, 'n': 21, 'k': 22, 'v': 23, 'o': 24, 'z': 25, 'e': 26, 'j': 27, 'a': 28}
28


In [46]:
import json 
with open('char_dictionary.json', 'w') as f:
    json.dump(vocab_index, f)

In [7]:
name_datasets = data['name'].apply(lambda x: [vocab_index[key] for key in list(x)])

In [8]:
X = pad_sequences(name_datasets)

In [9]:
Y = pd.get_dummies(data['gender']).values

In [10]:
print(Y[0])

[0 1]


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print('data shape')
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

data shape
(1568, 32) (1568, 2)
(392, 32) (392, 2)


In [24]:
#build model
model = Sequential()

#model.add(Input(shape=(40, len(vocab_index)), dtype='float32'))
#model.add(LSTM(128, input_shape=(40, len(vocab_index)), dropout=0.2, recurrent_dropout=0.2))
#model.add(Dense(3, activation='sigmoid'))

#model.add(Embedding(40, 47, input_length=X.shape[1]))
model.add(Embedding(len(vocab_index) + 1, 16, input_length=X.shape[1]))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='sigmoid'))

#compile model 
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print('model summary')
print(model.summary())

model summary
Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 32, 16)            464       
_________________________________________________________________
lstm_6 (LSTM)                (None, 64)                20736     
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 130       
Total params: 21,330
Trainable params: 21,330
Non-trainable params: 0
_________________________________________________________________
None


In [25]:
batch_size = 64
n_epochs = 12
model.fit(X_train, y_train, batch_size=batch_size, epochs=n_epochs, validation_data=(X_test, y_test), verbose=1)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<tensorflow.python.keras.callbacks.History at 0x7f4b4d77de80>

In [26]:
#test evaluate
score, acc = model.evaluate(X_test, y_test, batch_size=64)
print('score', score)
print('accuracy', acc)

score 0.376253604888916
accuracy 0.831632673740387


In [47]:
#test 
name = 'Aminarti'
name = list(name.lower())
test_dt = [vocab_index[x] for x in name]
test_dt = pad_sequences([test_dt], maxlen=X.shape[1])
print(test_dt)

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  28  9  3 21 28  6 20  3]]


In [48]:
pad = np.array(test_dt[0])
res = model.predict(pad.reshape(1, pad.shape[0]), batch_size=1, verbose=2)[0]
print(res)
if np.argmax(res) == 0:
    print('Female')
elif np.argmax(res) == 1:
    print('Male')

1/1 - 0s
[0.856099   0.13981047]
Female
