In [1]:
import tensorflow as tf

import csv
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
%pylab

from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,auc,roc_curve
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,Activation,Dropout,LSTM,Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint,ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import plot_model

Using matplotlib backend: agg
Populating the interactive namespace from numpy and matplotlib


In [2]:
#data stored in a csv file hosted on a public GitHub repo.
csv_url = "https://raw.githubusercontent.com/Sly1029/Twitter-Android/master/names_dataset.csv"
df = pd.read_csv(csv_url,sep=",")


names = df['name'].apply(lambda x:x.lower())
females =  (df.loc[df['sex']=='F']['sex'])
males = (df.loc[df['sex']=='M']['sex'])

df.head(5)

Unnamed: 0,index,name,sex
0,0,Mary,F
1,1,Anna,F
2,2,Emma,F
3,3,Elizabeth,F
4,4,Minnie,F


In [0]:
#visualizing length of names
plt.figure(figsize(12,8))
plt.hist([len(n) for n in names], bins=36)
plt.title("Length of names")
plt.show()

In [4]:
#class imbalance
print("Females",len(females.to_numpy()))
print("Males",len(males.to_numpy()))

#RESOLVE class imbalance if really needed in future. Might not be an issue with LSTM.

Females 60600
Males 34425


In [5]:
#creating vocab
vocab = set(''.join([str(i) for i in names]))
vocab.add("-")
vocab.add(".")

#"-" denotes END
#'.' denotes NULL CHAR
vocab = list(vocab)
vocab.sort()
print(vocab)
print(len(vocab))

mapping = dict((c,i) for i,c in enumerate(vocab))
print(mapping)


['-', '.', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
28
{'-': 0, '.': 1, 'a': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6, 'f': 7, 'g': 8, 'h': 9, 'i': 10, 'j': 11, 'k': 12, 'l': 13, 'm': 14, 'n': 15, 'o': 16, 'p': 17, 'q': 18, 'r': 19, 's': 20, 't': 21, 'u': 22, 'v': 23, 'w': 24, 'x': 25, 'y': 26, 'z': 27}


In [6]:
#vectorizing y
data_y = []
max_len = 10

my_dict = {'F':0,'M':1}
df = df.replace(my_dict)

final_Y = []
y_vals = df['sex']
data_Y = y_vals.to_numpy()

for i in data_Y:
  if i == 0:
    final_Y.append([1,0])
  else:
    final_Y.append([0,1])

final_Y = np.asarray(final_Y)
final_Y.shape


(95025, 2)

In [7]:
#vectorizing x


final_X = []

for words in names.values:
  final_mapping = []
  trunc_name = str(words)[0:max_len]
  #print(trunc_name)
  tmp = list(trunc_name)
  for items in tmp:
    word = []
    for index in range(30):
      if index < len(items):
        word.append(mapping[items[index]])
        continue
      if index==len(items):
        word.append(mapping["-"])
        continue
      else:
        word.append(mapping["."])
        continue

    b = np.zeros((28,))
    final_mapping.append(np.eye(28)[word][0])
  a = np.zeros((28,))
  index = mapping['-']
  a[index] = 1
  final_mapping.append(a)
  #final_X.append(np.asarray(final_mapping))
  filled = final_mapping[-1]
  #print(filled)
  while len(final_mapping)<=10:
    a = np.zeros((28,))
    index = mapping['.']
    a[index] = 1
    final_mapping.append(a)
  final_X.append(np.asarray(final_mapping))
  #print(np.asarray(final_mapping))

final_X = np.asarray(final_X)

final_X.shape



(95025, 11, 28)

In [17]:
#only use this crap for logistic regression in sklearn
very_final_X = []
for index,items in enumerate(final_X):
  final_items = np.zeros(28,)
  for more in items:
    final_items = final_items + more
    #final_items = sum(more,final_items)
  very_final_X.append(final_items)
  #final_X[index] = np.zeros(28,)
  #final_X[index] = final_items 

very_final_X = np.asarray(very_final_X)
print(very_final_X[0])

[1. 6. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 1. 0.]


In [20]:
#vectorizing y
data_y = []
max_len = 10

my_dict = {'F':0,'M':1}
df = df.replace(my_dict)

final_Y = []
y_vals = df['sex']
data_Y = y_vals.to_numpy()

for i in data_Y:
  if i == 0:
    final_Y.append([1,0])
  else:
    final_Y.append([0,1])

final_Y = np.asarray(data_Y)
final_Y.shape


(95025,)

In [0]:
#logistic regression-split
X_log_train,X_log_test,y_train,y_test = train_test_split(very_final_X,final_Y,test_size=0.1)

In [26]:
#model building - logistic regression
logistic_regression = LogisticRegression(solver='lbfgs',penalty='none',fit_intercept=False)
logistic_regression.fit(X_log_train,y_train)

#training_accuracy
train_predictions = logistic_regression.predict(X_log_train)
score = accuracy_score(train_predictions,y_train)
print(score*100)

#test_accuracy
test_predictions = logistic_regression.predict(X_log_test)
score = accuracy_score(test_predictions,y_test)
print(score*100)

#params
print(logistic_regression.coef_)

72.46088725708006
71.3564137640745
[[ 0.58857223  0.04678496 -1.04497639  0.43164416  0.26131624  0.66172622
  -0.79016378  0.5471032   0.46373086 -0.02462423 -0.83470927  0.78316331
   0.50625019 -0.13211847  0.51473895  0.01971747  0.1975784   0.38144
   0.77008521  0.4106635   0.14508123  0.09254046  0.18922783  0.90189165
   0.85288411  0.88135201 -0.84106897  0.49446344]]


In [8]:
#train test split
X_train,X_test,y_train,y_test = train_test_split(final_X,final_Y,test_size=0.1)
X_train.shape
X_test.shape

'''
X_train = tf.convert_to_tensor(X_train,np.float32)
X_test = tf.convert_to_tensor(X_test,np.float32)
y_train = tf.convert_to_tensor(y_train,np.float32)
y_test = tf.convert_to_tensor(y_test,np.float32)
'''

'\nX_train = tf.convert_to_tensor(X_train,np.float32)\nX_test = tf.convert_to_tensor(X_test,np.float32)\ny_train = tf.convert_to_tensor(y_train,np.float32)\ny_test = tf.convert_to_tensor(y_test,np.float32)\n'

In [9]:
#LSTM
model = Sequential()
model.add(Bidirectional(LSTM(512,return_sequences=True),backward_layer=LSTM(512,return_sequences=True,go_backwards=True),input_shape=(max_len+1,len(vocab))))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(512)))
model.add(Dropout(0.2))
model.add(Dense(2, activity_regularizer=l2(0.002)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [0]:
callback = EarlyStopping(monitor='val_loss', patience=5)
mc = ModelCheckpoint('best_model_9.h5', monitor='val_loss', mode='min', verbose=1)
reduce_lr_acc = ReduceLROnPlateau(monitor='val_accuracy', factor=0.1, patience=2, verbose=1, min_delta=1e-4, mode='max')

In [11]:
batch_size = 256
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=35,verbose=1, validation_data =(X_test, y_test), callbacks=[callback, mc, reduce_lr_acc])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 85522 samples, validate on 9503 samples
Epoch 1/35
Epoch 00001: saving model to best_model_9.h5
Epoch 2/35
Epoch 00002: saving model to best_model_9.h5
Epoch 3/35
Epoch 00003: saving model to best_model_9.h5
Epoch 4/35
Epoch 00004: saving model to best_model_9.h5
Epoch 5/35
Epoch 00005: saving model to best_model_9.h5
Epoch 6/35
Epoch 00006: saving model to best_model_9.h5
Epoch 7/35
Epoch 00007: saving model to best_model_9.h5
Epoch 8/35
Epoch 00008: saving model to best_model_9.h5
Epoch 9/35
Epoch 00009: saving model to best_model_9.h5
Epoch 10/35
Epoch 00010: saving model to best_model_9.h5
Epoch 11/35
Epoch 00011: saving model to best_model_9.h5
Epoch 12/35
Epoch 00012: saving model to best_model_9.h5
Epoch 13/35
Epoch 00013: saving model to best_model_9.h5
Epoch 14/35
Epoch 00014: saving model to best_model_9.h5
Epoch 15/35
Epoch 00015: saving model to best_model_9.h5
Epoch 16/35

In [40]:
#a function to convert names to vectors for custom testing once the model is trained
given_test = ["ben","john"]

final_X = []

def convert_X(X):
  for words in X:
    words = words.lower()
    final_mapping = []
    trunc_name = str(words)[0:max_len]
    #print(trunc_name)
    tmp = list(trunc_name)
    for items in tmp:
      word = []
      for index in range(30):
        if index < len(items):
          word.append(mapping[items[index]])
          continue
        if index==len(items):
          word.append(mapping["-"])
          continue
        else:
          word.append(mapping["."])
          continue

      b = np.zeros((28,))
      final_mapping.append(np.eye(28)[word][0])
    a = np.zeros((28,))
    index = mapping['-']
    a[index] = 1
    final_mapping.append(a)
    #final_X.append(np.asarray(final_mapping))
    filled = final_mapping[-1]
    #print(filled)
    while len(final_mapping)<=10:
      a = np.zeros((28,))
      index = mapping['.']
      a[index] = 1
      final_mapping.append(a)
    final_X.append(np.asarray(final_mapping))
    #print(np.asarray(final_mapping))
  return np.asarray(final_X)

input_list = ["akash","vivek","John","Laila","Samantha","Rohit","Sam","Jupiter","Shiliqua","Eileen","Kasey","Casey"]
testy = convert_X(input_list)

predictions = model.predict(testy)
#print(predictions)
#print(predictions)


#besty = convert_X(X_test)
correct = 0
total = 0
#predictions = model.predict(X_test)
for index,items in enumerate(predictions):
  total+=1
  if items[0] >= items[1]:
    if y_test[index][0] == 1:
      correct+=1
  else:
    if y_test[index][1]==1:
      correct+=1

acc = correct/total * 100
    
for index,items in enumerate(input_list):
  if predictions[index][0] > predictions[index][1]:
    print('Girl ',end="")

  if predictions[index][1]>predictions[index][0]:
    print('Boy ',end="")
  print(items,max(predictions[index].tolist()))






Boy akash 0.9970390796661377
Boy vivek 0.9968423843383789
Boy John 0.9490768313407898
Girl Laila 0.9929890632629395
Girl Samantha 0.9940319657325745
Boy Rohit 0.9195820689201355
Boy Sam 0.9956364035606384
Boy Jupiter 0.9774627685546875
Girl Shiliqua 0.995093584060669
Girl Eileen 0.9929437041282654
Girl Kasey 0.9810515642166138
Girl Casey 0.6886679530143738
