## Gender Classification/Prediction of Names

In [1]:
# Importing EDA libraries

import pandas as pd
import numpy as np

In [2]:
# Load the CSV file. It has no header row, so the column names are supplied
# explicitly through the `names` parameter of pd.read_csv.

df = pd.read_csv('name_gender.csv',names=['Names','Gender','Score'])
df.head()

Unnamed: 0,Names,Gender,Score
0,Aaban,M,1.0
1,Aabha,F,1.0
2,Aabid,M,1.0
3,Aabriella,F,1.0
4,Aada,F,1.0


In [3]:
# Total number of cells in the dataset (rows x columns), not the number of rows
df.size

285075

In [4]:
# Count the missing values in every column

df.isna().sum()

Names     0
Gender    0
Score     0
dtype: int64

In [5]:
# Data type of each column
df.dtypes

Names      object
Gender     object
Score     float64
dtype: object

In [6]:
# Number of female names.
# Count rows with len(): DataFrame.size is rows x columns and would report
# three times the real figure for this three-column frame.

len(df[df.Gender == 'F'])

60304

In [7]:
# Number of male names.
# Count rows with len(): DataFrame.size is rows x columns and would report
# three times the real figure for this three-column frame.

len(df[df.Gender == 'M'])

34721

In [8]:
# Encode the labels numerically: 'F' -> 0 and 'M' -> 1.
# A new frame is built instead of aliasing `df` and replacing in place:
# `df_names = df` shares one object with `df`, and replace(..., inplace=True)
# on a column accessor is chained assignment, which newer pandas versions do
# not propagate back to the parent frame. `df` keeps the original letters.
df_names = df.assign(Gender=df.Gender.map({'F':0,'M':1}))

In [9]:
# Class balance after encoding (0 = female, 1 = male)
df_names.Gender.value_counts()

0    60304
1    34721
Name: Gender, dtype: int64

In [10]:
# Model input: the raw name strings
X_features = df_names.Names

In [None]:
# Ml Packages
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
#Feature Extraction
# NOTE(review): CountVectorizer's default word-level analyzer treats every name
# as a single token, so the vocabulary is simply the list of distinct names
# (see the feature names printed below) and a name not seen during fitting is
# transformed to an all-zero vector. A character-level analyzer
# (e.g. analyzer='char') would let the classifier generalise to unseen names;
# it is left unchanged here because it would alter every score recorded below.
cv = CountVectorizer()
X = cv.fit_transform(X_features)

In [13]:
# Inspect the learned vocabulary.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions call cv.get_feature_names_out() instead.
cv.get_feature_names()

['aaban',
 'aabha',
 'aabid',
 'aabriella',
 'aada',
 'aadam',
 'aadan',
 'aadarsh',
 'aaden',
 'aadesh',
 'aadhav',
 'aadhavan',
 'aadhi',
 'aadhira',
 'aadhvik',
 'aadhya',
 'aadhyan',
 'aadi',
 'aadian',
 'aadil',
 'aadin',
 'aadish',
 'aadison',
 'aadit',
 'aadith',
 'aadithya',
 'aaditri',
 'aaditya',
 'aadiv',
 'aadon',
 'aadrian',
 'aadrika',
 'aadrit',
 'aadvik',
 'aadvika',
 'aadya',
 'aadyn',
 'aafia',
 'aafreen',
 'aagam',
 'aage',
 'aagot',
 'aahaan',
 'aahan',
 'aahana',
 'aahil',
 'aahir',
 'aahliyah',
 'aahna',
 'aahron',
 'aaidan',
 'aaiden',
 'aaidyn',
 'aaila',
 'aailiyah',
 'aailyah',
 'aaima',
 'aaira',
 'aairah',
 'aaisha',
 'aaishah',
 'aaiyana',
 'aaiza',
 'aaja',
 'aajah',
 'aajaylah',
 'aajon',
 'aakanksha',
 'aakarsh',
 'aakash',
 'aakeem',
 'aakilah',
 'aakira',
 'aakiyah',
 'aakriti',
 'aala',
 'aalaiya',
 'aalaiyah',
 'aalana',
 'aalanah',
 'aalani',
 'aalap',
 'aalaya',
 'aalayah',
 'aalayiah',
 'aalayjah',
 'aalayna',
 'aalaysha',
 'aalaysia',
 'aalea',
 

In [14]:
# Hold out a third of the data for testing.
# Features: the count matrix X produced by CountVectorizer
# Target:   the numeric Gender column of df_names
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df_names.Gender,
    test_size=0.33,
    random_state=42,
)

In [15]:
# Fit a multinomial Naive Bayes classifier and show its accuracy on the held-out split
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train, y_train)  # fit() returns the estimator itself
clf.score(X_test, y_test)

0.6328645683854714

In [16]:
# Accuracy on the held-out test split, as a percentage
print(
    "Test Accuracy of our Model",
    round(100 * clf.score(X_test, y_test), 4),
    '%',
)

Test Accuracy of our Model 63.2865 %


In [17]:
# Accuracy on the training split, as a percentage
print(
    "Train Accuracy of our Model",
    round(100 * clf.score(X_train, y_train), 4),
    '%',
)

Train Accuracy of our Model 100.0 %


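### The model clearly overfits

Train accuracy is 100% while test accuracy is only about 63%, which is roughly the share of the majority class (60304 female names out of 95025).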

## Sample Prediction

In [18]:
# Vectorize two sample names with the fitted CountVectorizer and convert the
# result to a dense array
sample_name = ['Misty','Miroslav']
vect = cv.transform(sample_name).toarray()

In [19]:
# Predict the sample names; the labels are encoded as 0 = female, 1 = male
clf.predict(vect)

array([0, 1], dtype=int64)

In [20]:
# lets define a function
def genderpredictor(a):
    """Print the predicted gender ('Female' or 'Male') for a single name.

    The name is vectorized with the notebook-level CountVectorizer ``cv`` and
    classified by the notebook-level model ``clf``. A prediction of 0 is
    reported as female, anything else as male. Nothing is returned.
    """
    features = cv.transform([a]).toarray()
    predicted = clf.predict(features)
    label = 'Female' if predicted == 0 else 'Male'
    print(label)

In [21]:
# Try the helper on a single name
genderpredictor('Martha')

Female


In [22]:
# Run the Naive Bayes helper over a handful of names, one prediction per line
namelist = ['Mary','Mark','Natasha','Masha','Ana','Messi','Toshi','Raheem','Fitzgerald','Luke']
for candidate in namelist:
    genderpredictor(candidate)

Female
Male
Female
Female
Female
Male
Female
Female
Male
Male


Non-English names such as Raheem and Toshi are misclassified.

### Using LSTM 

In [23]:
# import keras libraries
from tensorflow.keras.preprocessing.text import Tokenizer,one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Embedding,Dropout

In [24]:
# Character-level tokenization: every distinct character gets an integer index
# and each name becomes a list of those indices
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(df_names.Names)
sequence_of_int = tokenizer.texts_to_sequences(df_names.Names)

In [25]:
# Pad every tokenized name with trailing zeros to a fixed length of 15.
# Names longer than 15 characters are truncated (pad_sequences' default
# truncating='pre' drops characters from the start).
padsequences=pad_sequences(sequence_of_int,maxlen=15,padding='post')

In [26]:
# (number of names, padded sequence length)
padsequences.shape

(95025, 15)

In [None]:
# One-hot encode the 0/1 gender labels (one column per class)
labels=to_categorical(df_names.Gender)

In [28]:
# 90/10 split of the padded sequences and one-hot labels for the LSTM
from sklearn.model_selection import train_test_split
feature_train, feature_test, label_train, label_test = train_test_split(
    padsequences,
    labels,
    test_size=0.1,
    random_state=42,
)

In [29]:
#define a Sequential Model
model=Sequential()

# Embedding layer: turns each character index into a 64-dimensional vector
# before the LSTMs. The table size comes from the fitted tokenizer (+1 because
# index 0 is the padding value and tokenizer indices start at 1) and the
# sequence length from the padded data, instead of the hard-coded 27 and 15,
# so the layer stays consistent if the character set or padding length changes.
vocab_size=len(tokenizer.word_index)+1
model.add(Embedding(vocab_size,64,input_length=padsequences.shape[1]))

#1st LSTM layer which returns a sequence of vectors of dimension 2048
model.add(LSTM(2048,return_sequences=True))

#2nd LSTM layer returns a single vector of dimension 256
model.add(LSTM(256,return_sequences=False))

#adding a dropout layer to reduce overfitting
model.add(Dropout(0.2))

# Output layer: 2 units (one per class) with sigmoid activation.
# NOTE(review): two sigmoid units trained with binary cross-entropy score the
# classes independently, so the two outputs need not sum to 1. Softmax with
# categorical_crossentropy is the conventional choice for one-hot labels; it is
# left unchanged here so the recorded training results stay valid.
model.add(Dense(2,activation='sigmoid'))

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 15, 64)            1728      
_________________________________________________________________
lstm (LSTM)                  (None, 15, 2048)          17309696  
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               2360320   
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 2)                 514       
Total params: 19,672,258
Trainable params: 19,672,258
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Train for 10 epochs in large batches. The 10% hold-out split is used as the
# validation set here and is reused for the final evaluation further down.
model.fit(
    feature_train,
    label_train,
    epochs=10,
    batch_size=3000,
    validation_data=(feature_test, label_test),
)

Train on 85522 samples, validate on 9503 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1e931fa56c8>

In [38]:
# Prediction on the held-out test data: one row of per-class scores per sample
import sklearn.metrics as m
label_pred=model.predict(feature_test)
label_pred

array([[0.5012706 , 0.4987274 ],
       [0.86967885, 0.12935294],
       [0.938498  , 0.06107036],
       ...,
       [0.96735203, 0.03205542],
       [0.10715456, 0.8920172 ],
       [0.962909  , 0.03655704]], dtype=float32)

In [39]:
# Convert the per-class scores into hard class indices (0 = female, 1 = male).
# The scores are recomputed from the model rather than overwriting `label_pred`
# with a function of itself, so re-running this cell does not fail on an
# already-reduced 1-D array.
label_pred=np.argmax(model.predict(feature_test),axis=1)

In [40]:
# Predicted class index for each test sample
label_pred

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [41]:
# Recover integer class labels from the one-hot encoded test labels
l_test=np.argmax(label_test,axis=1)

In [42]:
# Overall accuracy of the LSTM on the held-out split
m.accuracy_score(l_test,label_pred)

0.8520467220877618

In [43]:
# Classification report: per-class precision, recall and F1 score
print(m.classification_report(l_test,label_pred))

              precision    recall  f1-score   support

           0       0.86      0.91      0.89      5997
           1       0.83      0.75      0.79      3506

    accuracy                           0.85      9503
   macro avg       0.85      0.83      0.84      9503
weighted avg       0.85      0.85      0.85      9503



In [44]:
# Confusion matrix (rows: true class, columns: predicted class)
m.confusion_matrix(l_test,label_pred)

array([[5462,  535],
       [ 871, 2635]], dtype=int64)

In [46]:
# Pickle the fitted tokenizer so inference code can reuse the exact
# character-to-index mapping.
import pickle
# The context manager flushes and closes the file handle deterministically;
# plain 'wb' is enough because the file is only written, never read back here.
with open('tokenizer.pkl','wb') as tokenizer_file:
    pickle.dump(tokenizer,tokenizer_file,protocol=pickle.HIGHEST_PROTOCOL)

In [47]:
# Reload the tokenizer from disk. The context manager closes the file handle,
# and read-only 'rb' avoids requiring write permission on the file.
# NOTE: only unpickle files you created yourself; pickle.load can execute
# arbitrary code from an untrusted file.
with open('tokenizer.pkl','rb') as tokenizer_file:
    tokenizer=pickle.load(tokenizer_file)

In [48]:
# Character vocabulary held by the tokenizer (index -> character)
tokenizer.index_word

{1: 'a',
 2: 'e',
 3: 'n',
 4: 'i',
 5: 'r',
 6: 'l',
 7: 's',
 8: 'h',
 9: 'y',
 10: 'o',
 11: 't',
 12: 'd',
 13: 'm',
 14: 'k',
 15: 'c',
 16: 'u',
 17: 'j',
 18: 'b',
 19: 'v',
 20: 'g',
 21: 'z',
 22: 'w',
 23: 'p',
 24: 'f',
 25: 'q',
 26: 'x'}

In [49]:
def input(n, maxlen=15):
    """Encode a single name as a zero-padded integer sequence for the LSTM.

    NOTE: this function shadows Python's builtin ``input``; the name is kept
    because ``pred`` calls it.

    Parameters
    ----------
    n : str
        The name to encode.
    maxlen : int, default 15
        Length the sequence is padded to. It must match the length used when
        the training sequences were padded.

    Returns
    -------
    The result of ``pad_sequences`` for a one-element batch (one padded row).
    """
    # Wrap the name in a list so the tokenizer encodes it as ONE text, exactly
    # as the training names were encoded. Passing the bare string makes the
    # tokenizer treat every character as a separate text, and any character
    # missing from the vocabulary then yields an empty list that breaks
    # per-character indexing with an IndexError. Encoded this way, unknown
    # characters are simply skipped.
    encoded = tokenizer.texts_to_sequences([n])
    return pad_sequences(encoded, maxlen=maxlen, padding='post')

In [52]:
# Passing a bare string makes the tokenizer treat every character as its own
# text, so the result is one single-element list per character
tokenizer.texts_to_sequences('vishal')

[[19], [4], [7], [8], [1], [6]]

In [51]:
def pred(n):
  """Return the predicted class index for one name (0 = female, 1 = male).

  The name is encoded with ``input`` and scored by the notebook-level
  ``model``; the index of the highest score in the first row is returned.
  """
  scores = model.predict(input(n))
  return np.argmax(scores, axis=1)[0]

In [53]:
# Predict a single name with the LSTM helper: 1 = male, 0 = female
pred('vishal')

1

In [62]:
# Run the LSTM helper over a list of names and print one verdict per line
name_list = ['rajeev','sudha','rahul','john','Chung','manga','Xena','aliabhatt','rithvik','gladiator','optumus','paul']
for candidate in name_list:
    verdict = 'is Female' if pred(candidate) == 0 else 'is Male'
    print(candidate, verdict)

rajeev is Male
sudha is Female
rahul is Male
john is Male
Chung is Male
manga is Female
Xena is Female
aliabhatt is Female
rithvik is Male
gladiator is Male
optumus is Male
paul is Male


In [63]:
# Save the trained Keras model to model.h5 for later reuse
model.save('model.h5')