# Senior Project 2020

## --------------------------------------------Data Preprocessing----------------------------------------

## Import Libraries

In [1]:
# Standard Library
import pandas as pd
import numpy as np

In [2]:
# Data Preprocessing Library
from sklearn.feature_extraction import DictVectorizer
from nltk.tokenize import SyllableTokenizer
from nltk import word_tokenize, ngrams
from sklearn.model_selection import train_test_split

In [3]:
# Model Library
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC  
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import VotingClassifier

In [4]:
#Feature analysis
import matplotlib.pyplot as plt
from statistics import mean 

In [5]:
#Model Tuning Library
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

## Load Name Dataset

In [6]:
df = pd.read_excel('name_dataset.xlsx')

In [7]:
df.shape

(11671, 2)

In [8]:
df.head()

Unnamed: 0,name,gender
0,Yuwadee Klanarong Civil,female
1,Muay Chita,female
2,Passorn DT,female
3,Nut Samsarai,male
4,Jan Jao,female


In [9]:
df.isnull().sum()

name         0
gender    1640
dtype: int64

In [10]:
# Number of Female Names
df[df.gender == 'female'].shape

(5460, 2)

In [11]:
# Number of Male Names
df[df.gender == 'male'].shape

(4571, 2)

In [12]:
# Checking for column name consistency
df.columns

Index(['name', 'gender'], dtype='object')

## Data Cleaning

In [13]:
df = df.dropna(subset = ['gender'])

In [14]:
df = df.reset_index(drop=True)

In [15]:
df.shape

(10031, 2)

In [16]:
df.isnull().sum()

name      0
gender    0
dtype: int64

## Features

#### Extract First Name

In [17]:
df['firstName'] = df['name'].str.split().str[0]

In [18]:
df.head()

Unnamed: 0,name,gender,firstName
0,Yuwadee Klanarong Civil,female,Yuwadee
1,Muay Chita,female,Muay
2,Passorn DT,female,Passorn
3,Nut Samsarai,male,Nut
4,Jan Jao,female,Jan


#### Replacing All F and M with 0 and 1 respectively

In [19]:
df.gender.replace({'female':0,'male':1},inplace=True)

In [20]:
df.head()

Unnamed: 0,name,gender,firstName
0,Yuwadee Klanarong Civil,0,Yuwadee
1,Muay Chita,0,Muay
2,Passorn DT,0,Passorn
3,Nut Samsarai,1,Nut
4,Jan Jao,0,Jan


### Extracting Features

#### Using a custom function for feature analysis : Substring,Syllable n-gram

In [21]:
# Convert Tuple to String like Tuple for function "gender_features()"
def convertTuple(tup) : 
    str =  "("
    for i in range(len(tup)) :
        if i < len(tup)-1 :
            str += "'" + tup[i] + "', "
        else :
            str += "'" + tup[i] + "'"
    str += ")"
    return str

convertTuple(('Yu','wa'))

"('Yu', 'wa')"

In [22]:
# Extract features from names
def gender_features(name,isSubstring,isCharacter,ngrams_mode):
        SSP = SyllableTokenizer()
        name = name.lower()
        features = {}
        if (isSubstring) :
            features["first_letter"] = name[0].lower()
            features["first2_letter"] = name[0:2].lower()
            features["first3_letter"] = name[0:3].lower()
            features["first4_letter"] = name[0:4].lower()
            features["last_letter"] = name[-1].lower()
            features["last2_letter"] = name[-2:].lower()
            features["last3_letter"] = name[-3:].lower()
            features["last4_letter"] = name[-4:].lower()
        if (isCharacter) :
            for letter in "abcdefghijklmnopqrstuvwxyz0123456789.'-":
                features["count({})".format(letter)] = name.lower().count(letter)
        token = SSP.tokenize(name)
        if (ngrams_mode == 1) :
            if len(list(ngrams(token, 1))) > 0 : features["1grams-1"] = list(ngrams(token, 1))[0][0]
            if len(list(ngrams(token, 1))) > 1 : features["1grams-2"] = list(ngrams(token, 1))[1][0]
            if len(list(ngrams(token, 1))) > 2 : features["1grams-3"] = list(ngrams(token, 1))[2][0]
            if len(list(ngrams(token, 1))) > 3 : features["1grams-4"] = list(ngrams(token, 1))[3][0]
            if len(list(ngrams(token, 1))) > 4 : features["1grams-5"] = list(ngrams(token, 1))[4][0]
            if len(list(ngrams(token, 1))) > 5 : features["1grams-6"] = list(ngrams(token, 1))[5][0]
        if (ngrams_mode == 2) :
            if len(list(ngrams(token, 2))) > 0 : features["2grams-1"] = convertTuple(list(ngrams(token, 2))[0])
            if len(list(ngrams(token, 2))) > 1 : features["2grams-2"] = convertTuple(list(ngrams(token, 2))[1])
            if len(list(ngrams(token, 2))) > 2 : features["2grams-3"] = convertTuple(list(ngrams(token, 2))[2])
            if len(list(ngrams(token, 2))) > 3 : features["2grams-4"] = convertTuple(list(ngrams(token, 2))[3])
            if len(list(ngrams(token, 2))) > 4 : features["2grams-5"] = convertTuple(list(ngrams(token, 2))[4])
        return features

#### DictVectorizer - Convert features to sparse matrix

In [23]:
dv = DictVectorizer()
isSubstring = True
isCharacter = True
ngrams_mode = 2 #modes == 1,2
Xfeatures = [gender_features(n,isSubstring,isCharacter,ngrams_mode) for n in df['firstName']]

In [24]:
Xfeatures[:1]

[{'first_letter': 'y',
  'first2_letter': 'yu',
  'first3_letter': 'yuw',
  'first4_letter': 'yuwa',
  'last_letter': 'e',
  'last2_letter': 'ee',
  'last3_letter': 'dee',
  'last4_letter': 'adee',
  'count(a)': 1,
  'count(b)': 0,
  'count(c)': 0,
  'count(d)': 1,
  'count(e)': 2,
  'count(f)': 0,
  'count(g)': 0,
  'count(h)': 0,
  'count(i)': 0,
  'count(j)': 0,
  'count(k)': 0,
  'count(l)': 0,
  'count(m)': 0,
  'count(n)': 0,
  'count(o)': 0,
  'count(p)': 0,
  'count(q)': 0,
  'count(r)': 0,
  'count(s)': 0,
  'count(t)': 0,
  'count(u)': 1,
  'count(v)': 0,
  'count(w)': 1,
  'count(x)': 0,
  'count(y)': 1,
  'count(z)': 0,
  'count(0)': 0,
  'count(1)': 0,
  'count(2)': 0,
  'count(3)': 0,
  'count(4)': 0,
  'count(5)': 0,
  'count(6)': 0,
  'count(7)': 0,
  'count(8)': 0,
  'count(9)': 0,
  'count(.)': 0,
  "count(')": 0,
  'count(-)': 0,
  '2grams-1': "('yu', 'wa')",
  '2grams-2': "('wa', 'dee')"}]

In [25]:
X = dv.fit_transform(Xfeatures)

In [26]:
X

<10031x18720 sparse matrix of type '<class 'numpy.float64'>'
	with 484786 stored elements in Compressed Sparse Row format>

## Train Test Split

In [27]:
X
y = df['gender']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

## -----------------------------------------Basic Model Classification--------------------------------------

## 1.Naive Bayes Classifier

In [38]:
mnb = MultinomialNB()

In [39]:
mnb.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Prediction and Evaluation

In [40]:
mnb_pred = mnb.predict(X_test)

In [41]:
print(confusion_matrix(y_test,mnb_pred))

[[1122  256]
 [ 274  856]]


In [42]:
print(classification_report(y_test,mnb_pred))

              precision    recall  f1-score   support

           0       0.80      0.81      0.81      1378
           1       0.77      0.76      0.76      1130

    accuracy                           0.79      2508
   macro avg       0.79      0.79      0.79      2508
weighted avg       0.79      0.79      0.79      2508



#### cross validation

In [43]:
# 10-fold cross-validation
mnb_scores = cross_val_score(mnb, X, y, cv=10, scoring='accuracy',n_jobs = -1)
print(mnb_scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (mnb_scores.mean(), mnb_scores.std() * 2))

[0.79681275 0.78165503 0.75972084 0.78464606 0.78065803 0.78364905
 0.81256231 0.79361914 0.82352941 0.77567298]
Accuracy: 0.79 (+/- 0.03)


## 2.Random Forest

In [247]:
rfc = RandomForestClassifier(n_estimators=20, min_samples_leaf=1,criterion='gini',max_features=None,random_state = 0)

In [248]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

#### Prediction and Evaluation

In [249]:
rfc_pred = rfc.predict(X_test)

In [250]:
print(confusion_matrix(y_test,rfc_pred))

[[1135  243]
 [ 311  819]]


In [251]:
print(classification_report(y_test,rfc_pred))

              precision    recall  f1-score   support

           0       0.78      0.82      0.80      1378
           1       0.77      0.72      0.75      1130

    accuracy                           0.78      2508
   macro avg       0.78      0.77      0.78      2508
weighted avg       0.78      0.78      0.78      2508



In [252]:
# 10-fold cross-validation
rfc_scores = cross_val_score(rfc, X, y, cv=10, scoring='accuracy',n_jobs = -1)
print(rfc_scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (rfc_scores.mean(), rfc_scores.std() * 2))

[0.79482072 0.79561316 0.76969093 0.78065803 0.78165503 0.78564307
 0.82552343 0.777667   0.80159521 0.777667  ]
Accuracy: 0.79 (+/- 0.03)


## 3.Support Vector Machine

In [253]:
svm = SVC(kernel='linear') 

In [254]:
svm.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

#### Prediction and Evaluation

In [255]:
svm_pred = svm.predict(X_test)

In [256]:
print(confusion_matrix(y_test,svm_pred))

[[1098  280]
 [ 269  861]]


In [257]:
print(classification_report(y_test,svm_pred))

              precision    recall  f1-score   support

           0       0.80      0.80      0.80      1378
           1       0.75      0.76      0.76      1130

    accuracy                           0.78      2508
   macro avg       0.78      0.78      0.78      2508
weighted avg       0.78      0.78      0.78      2508



In [258]:
# 10-fold cross-validation
svm_scores = cross_val_score(svm, X, y, cv=10, scoring='accuracy',n_jobs = -1)
print(svm_scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (svm_scores.mean(), svm_scores.std() * 2))

[0.79482072 0.77367896 0.76370887 0.79361914 0.78265204 0.78364905
 0.81256231 0.80059821 0.7996012  0.79760718]
Accuracy: 0.79 (+/- 0.03)


## 4.KNN

In [259]:
knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')

In [260]:
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [261]:
knn_pred = knn.predict(X_test)

#### Prediction and Evaluation

In [262]:
print(confusion_matrix(y_test,knn_pred))

[[1086  292]
 [ 305  825]]


In [263]:
print(classification_report(y_test,knn_pred))

              precision    recall  f1-score   support

           0       0.78      0.79      0.78      1378
           1       0.74      0.73      0.73      1130

    accuracy                           0.76      2508
   macro avg       0.76      0.76      0.76      2508
weighted avg       0.76      0.76      0.76      2508



##### Cross Validation

In [264]:
# 10-fold cross-validation
knn_scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy',n_jobs = -1)
print(knn_scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (knn_scores.mean(), knn_scores.std() * 2))

[0.7938247  0.74775673 0.7337986  0.74975075 0.77168495 0.74775673
 0.77268195 0.77168495 0.80059821 0.7666999 ]
Accuracy: 0.77 (+/- 0.04)


## 5. Neural Network

In [265]:
nn = MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(10), random_state=0)

In [266]:
nn.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=10, learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=0, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

#### Prediction and Evaluation

In [267]:
nn_pred = nn.predict(X_test)

In [268]:
print(confusion_matrix(y_test,nn_pred))

[[1095  283]
 [ 268  862]]


In [269]:
print(classification_report(y_test,nn_pred))

              precision    recall  f1-score   support

           0       0.80      0.79      0.80      1378
           1       0.75      0.76      0.76      1130

    accuracy                           0.78      2508
   macro avg       0.78      0.78      0.78      2508
weighted avg       0.78      0.78      0.78      2508



In [270]:
# 10-fold cross-validation
nn_scores = cross_val_score(nn, X, y, cv=10, scoring='accuracy',n_jobs = -1)
print(nn_scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (nn_scores.mean(), nn_scores.std() * 2))

[0.79681275 0.77966102 0.77268195 0.78963111 0.78763709 0.77567298
 0.80957129 0.79262213 0.80159521 0.78963111]
Accuracy: 0.79 (+/- 0.02)


## -------------------------------------------Feature Analysis -----------------------------------------------

In [50]:
def modelTraining(isSubstring,isCharacter,ngrams_mode) :
    Xfeatures = [gender_features(n,isSubstring,isCharacter,ngrams_mode) for n in df['firstName']]
    X = dv.fit_transform(Xfeatures)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
    #fit Model with new features
    mnb.fit(X_train,y_train)
    rfc.fit(X_train, y_train)
    svm.fit(X_train,y_train)
    knn.fit(X_train,y_train)
    nn.fit(X_train, y_train)
    accuracy = [
                    cross_val_score(mnb, X, y, cv=10, scoring='accuracy',n_jobs = -1).mean(),
                    cross_val_score(rfc, X, y, cv=10, scoring='accuracy',n_jobs = -1).mean(),
                    cross_val_score(svm, X, y, cv=10, scoring='accuracy',n_jobs = -1).mean(),
                    cross_val_score(knn, X, y, cv=10, scoring='accuracy',n_jobs = -1).mean(),
                    cross_val_score(nn, X, y, cv=10, scoring='accuracy',n_jobs = -1).mean()
                    ]
    return accuracy

## Only Substring

In [51]:
isSubstring = True
isCharacter = False
ngrams_mode = 0 #modes == 0,1,2

In [52]:
accuracy_sub = modelTraining(isSubstring,isCharacter,ngrams_mode)



In [53]:
accuracy_sub

[0.7808787780086036,
 0.7828722001326698,
 0.7830720984456987,
 0.7675193542877344,
 0.7748970220811668]

## Only Character Frequency

In [54]:
isSubstring = False
isCharacter = True
ngrams_mode = 0 #modes == 0,1,2

In [55]:
accuracy_char = modelTraining(isSubstring,isCharacter,ngrams_mode)



In [56]:
accuracy_char

[0.6243641585204546,
 0.7348215314216713,
 0.6309446163501528,
 0.7085023813023081,
 0.692952219040091]

### Only 1-Ngrams

In [57]:
isSubstring = False
isCharacter = False
ngrams_mode = 1 #modes == 0,1,2

In [58]:
accuracy_1_ngrams = modelTraining(isSubstring,isCharacter,ngrams_mode)



In [59]:
accuracy_1_ngrams

[0.7577515461583377,
 0.7484799585307822,
 0.7609415776574658,
 0.7137857344301756,
 0.7516707844593709]

### Only 2-Ngrams

In [60]:
isSubstring = False
isCharacter = False
ngrams_mode = 2 #modes == 0,1,2

In [61]:
accuracy_2_ngrams = modelTraining(isSubstring,isCharacter,ngrams_mode)

In [62]:
accuracy_2_ngrams

[0.6846767466524728,
 0.6861712670752682,
 0.6964399629795871,
 0.5475976453110787,
 0.6945452487160033]

## Substring +  Character 

In [63]:
isSubstring = True
isCharacter = True
ngrams_mode = 0 #modes == 0,1,2

In [64]:
accuracy_sub_char = modelTraining(isSubstring,isCharacter,ngrams_mode)

In [65]:
accuracy_sub_char

[0.7815768828971253,
 0.7908481726136332,
 0.7853652190837845,
 0.7667222436276828,
 0.7790832681239152]

## Substring + 1Ngrams

In [66]:
isSubstring = True
isCharacter = False
ngrams_mode = 1 #modes == 0,1,2

In [67]:
accuracy_sub_1_ngrams = modelTraining(isSubstring,isCharacter,ngrams_mode)

In [68]:
accuracy_sub_1_ngrams

[0.7871597359316473,
 0.7861631241732968,
 0.7857641219766993,
 0.7760927377230857,
 0.7825730974407454]

## Substring + 2Ngrams

In [69]:
isSubstring = True
isCharacter = False
ngrams_mode = 2 #modes == 0,1,2

In [70]:
accuracy_sub_2_ngrams = modelTraining(isSubstring,isCharacter,ngrams_mode)

In [71]:
accuracy_sub_2_ngrams

[0.7876574459887271,
 0.7848665159898789,
 0.7879569458953817,
 0.7670201546754161,
 0.7853636302248633]

## Character + 1Ngrams

In [72]:
isSubstring = False
isCharacter = True
ngrams_mode = 1 #modes == 0,1,2

In [73]:
accuracy_char_1_ngrams = modelTraining(isSubstring,isCharacter,ngrams_mode)



In [74]:
accuracy_char_1_ngrams

[0.7593480514631403,
 0.7721044039197149,
 0.7734034946951973,
 0.7440911329755753,
 0.7624373890281347]

## Character + 2Ngrams

In [75]:
isSubstring = False
isCharacter = True
ngrams_mode = 2 #modes == 0,1,2

In [76]:
accuracy_char_2_ngrams = modelTraining(isSubstring,isCharacter,ngrams_mode)



In [77]:
accuracy_char_2_ngrams

[0.7177731745004032,
 0.7439913327745846,
 0.7314322967352921,
 0.7154807489880953,
 0.7279415736853186]

## Character +Substring + 1Ngrams

In [78]:
isSubstring = True
isCharacter = True
ngrams_mode = 1 #modes == 0,1,2

In [79]:
accuracy_char_sub_1_ngrams = modelTraining(isSubstring,isCharacter,ngrams_mode)

In [80]:
accuracy_char_sub_1_ngrams

[0.7877580406191783,
 0.7890533578547226,
 0.7874586400162062,
 0.7711084872871425,
 0.7802806719284379]

## Character +Substring + 2Ngrams

In [81]:
isSubstring = True
isCharacter = True
ngrams_mode = 2 #modes == 0,1,2

In [82]:
accuracy_char_sub_2_ngrams = modelTraining(isSubstring,isCharacter,ngrams_mode)

In [83]:
accuracy_char_sub_2_ngrams

[0.7892525610419737,
 0.7890533578547227,
 0.7902497686224196,
 0.7656237462910076,
 0.789551663733898]

In [84]:

accuracy_mean = [mean(accuracy_sub),
                 mean(accuracy_char),
                 mean(accuracy_1_ngrams),
                 mean(accuracy_2_ngrams),
                 mean(accuracy_sub_1_ngrams),
                 mean(accuracy_sub_2_ngrams),
                 mean(accuracy_sub_char),
                 mean(accuracy_sub_1_ngrams),
                 mean(accuracy_sub_2_ngrams),
                 mean(accuracy_char_1_ngrams),
                 mean(accuracy_char_2_ngrams),
                 mean(accuracy_char_sub_1_ngrams),
                 mean(accuracy_char_sub_2_ngrams)
                ]
accuracy_mean

[0.7778478905911747,
 0.6783169813269355,
 0.7465259202472264,
 0.661886174146882,
 0.7835505634490949,
 0.7825729385548534,
 0.7807191572692282,
 0.7835505634490949,
 0.7825729385548534,
 0.7622768944163525,
 0.7273238253367388,
 0.7831318395411375,
 0.7847462195088043]

## --------------------------------------------- Model Tuning ----------------------------------------------

In [29]:
# Function for model tuning
def modelTuning(modelType,tuned_parameters,n_iter,isFinalModel) :
    scores = ['accuracy']
    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        grid = RandomizedSearchCV(
            modelType, tuned_parameters, scoring='%s' % score,cv = 3,n_jobs=-1,n_iter=n_iter,verbose = 1,random_state = 0
        )
        if (not isFinalModel) :
            grid.fit(X_train, y_train)
        else :
            grid.fit(enn_X_train,enn_y_train)

        print("Best parameters set found on development set:")
        print()
        print(grid.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = grid.cv_results_['mean_test_score']
        stds = grid.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, grid.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        if (not isFinalModel) :
            y_true, y_pred = y_test, grid.predict(X_test)
        else :
            y_true, y_pred = enn_y_test,grid.predict(enn_X_test)
            
        print(confusion_matrix(y_true, y_pred))
        print(classification_report(y_true, y_pred))
        print()
        return grid

In [30]:
isSubstring = True
isCharacter = True
ngrams_mode = 2 #modes == 0,1,2
Xfeatures = [gender_features(n,isSubstring,isCharacter,ngrams_mode) for n in df['firstName']]
X = dv.fit_transform(Xfeatures)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

## 1.Multinomial Naive Bayes

In [87]:
#Get Parameters
mnb.get_params()

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True}

In [88]:
tuned_parameters = {'alpha': [1e-4,1e-3,1e-2,1e-1,1]}

In [89]:
mnb = modelTuning(MultinomialNB(),tuned_parameters,5,False)

# Tuning hyper-parameters for accuracy

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    0.0s finished


Best parameters set found on development set:

{'alpha': 0.1}

Grid scores on development set:

0.785 (+/-0.013) for {'alpha': 0.0001}
0.786 (+/-0.014) for {'alpha': 0.001}
0.788 (+/-0.012) for {'alpha': 0.01}
0.791 (+/-0.015) for {'alpha': 0.1}
0.785 (+/-0.011) for {'alpha': 1}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

[[1096  282]
 [ 243  887]]
              precision    recall  f1-score   support

           0       0.82      0.80      0.81      1378
           1       0.76      0.78      0.77      1130

    accuracy                           0.79      2508
   macro avg       0.79      0.79      0.79      2508
weighted avg       0.79      0.79      0.79      2508




## 2.Random Forest

In [90]:
#Get Parameters
rfc.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 20,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [91]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [92]:
tuned_parameters = {'n_estimators': n_estimators,
                    'max_features': max_features,
                    'max_depth': max_depth,
                    'min_samples_split': min_samples_split,
                    'min_samples_leaf': min_samples_leaf,
                    'bootstrap': bootstrap}
pprint({'n_estimators': n_estimators,
                    'max_features': max_features,
                    'max_depth': max_depth,
                    'min_samples_split': min_samples_split,
                    'min_samples_leaf': min_samples_leaf,
                    'bootstrap': bootstrap})

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [93]:
rfc = modelTuning(RandomForestClassifier(random_state = 0),tuned_parameters,100,False)

# Tuning hyper-parameters for accuracy

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 33.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 53.9min finished


Best parameters set found on development set:

{'n_estimators': 2000, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 100, 'bootstrap': False}

Grid scores on development set:

0.757 (+/-0.003) for {'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': True}
0.788 (+/-0.004) for {'n_estimators': 1200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 90, 'bootstrap': True}
0.719 (+/-0.009) for {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': False}
0.716 (+/-0.008) for {'n_estimators': 600, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 20, 'bootstrap': False}
0.793 (+/-0.010) for {'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
0.711 (+/-0.012) for {'n_estim

[[1187  191]
 [ 327  803]]
              precision    recall  f1-score   support

           0       0.78      0.86      0.82      1378
           1       0.81      0.71      0.76      1130

    accuracy                           0.79      2508
   macro avg       0.80      0.79      0.79      2508
weighted avg       0.79      0.79      0.79      2508




## 3.Support Vector Machine

In [95]:
#Get Parameters
svm.get_params()

{'C': 1.0,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'auto_deprecated',
 'kernel': 'linear',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [96]:
tuned_parameters = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['linear','rbf', 'poly', 'sigmoid']}

In [97]:
svm = modelTuning(SVC(),tuned_parameters,64,False)

# Tuning hyper-parameters for accuracy

Fitting 3 folds for each of 64 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed:  5.3min finished


Best parameters set found on development set:

{'kernel': 'rbf', 'gamma': 0.01, 'C': 10}

Grid scores on development set:

0.787 (+/-0.006) for {'kernel': 'linear', 'gamma': 1, 'C': 0.1}
0.544 (+/-0.001) for {'kernel': 'rbf', 'gamma': 1, 'C': 0.1}
0.789 (+/-0.011) for {'kernel': 'poly', 'gamma': 1, 'C': 0.1}
0.526 (+/-0.012) for {'kernel': 'sigmoid', 'gamma': 1, 'C': 0.1}
0.787 (+/-0.006) for {'kernel': 'linear', 'gamma': 0.1, 'C': 0.1}
0.688 (+/-0.023) for {'kernel': 'rbf', 'gamma': 0.1, 'C': 0.1}
0.766 (+/-0.004) for {'kernel': 'poly', 'gamma': 0.1, 'C': 0.1}
0.612 (+/-0.018) for {'kernel': 'sigmoid', 'gamma': 0.1, 'C': 0.1}
0.787 (+/-0.006) for {'kernel': 'linear', 'gamma': 0.01, 'C': 0.1}
0.661 (+/-0.009) for {'kernel': 'rbf', 'gamma': 0.01, 'C': 0.1}
0.543 (+/-0.000) for {'kernel': 'poly', 'gamma': 0.01, 'C': 0.1}
0.643 (+/-0.007) for {'kernel': 'sigmoid', 'gamma': 0.01, 'C': 0.1}
0.787 (+/-0.006) for {'kernel': 'linear', 'gamma': 0.001, 'C': 0.1}
0.543 (+/-0.000) for {'kernel': '

## 4.KNN

In [98]:
#Get Parameters
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'euclidean',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [99]:
tuned_parameters = {'n_neighbors' : list(range(1,31)),'weights' : ['uniform','distance'],'metric': ['minkowski','euclidean','manhattan']}

In [100]:
knn = modelTuning(KNeighborsClassifier(),tuned_parameters,100,False)

# Tuning hyper-parameters for accuracy

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 20.7min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 33.7min finished


Best parameters set found on development set:

{'weights': 'distance', 'n_neighbors': 24, 'metric': 'manhattan'}

Grid scores on development set:

0.737 (+/-0.013) for {'weights': 'uniform', 'n_neighbors': 24, 'metric': 'euclidean'}
0.767 (+/-0.009) for {'weights': 'distance', 'n_neighbors': 4, 'metric': 'minkowski'}
0.764 (+/-0.004) for {'weights': 'distance', 'n_neighbors': 3, 'metric': 'manhattan'}
0.743 (+/-0.011) for {'weights': 'uniform', 'n_neighbors': 21, 'metric': 'manhattan'}
0.753 (+/-0.005) for {'weights': 'uniform', 'n_neighbors': 9, 'metric': 'manhattan'}
0.770 (+/-0.004) for {'weights': 'distance', 'n_neighbors': 6, 'metric': 'euclidean'}
0.738 (+/-0.007) for {'weights': 'uniform', 'n_neighbors': 29, 'metric': 'minkowski'}
0.773 (+/-0.006) for {'weights': 'distance', 'n_neighbors': 16, 'metric': 'manhattan'}
0.738 (+/-0.010) for {'weights': 'uniform', 'n_neighbors': 22, 'metric': 'manhattan'}
0.739 (+/-0.006) for {'weights': 'uniform', 'n_neighbors': 25, 'metric': 'eucli

[[1121  257]
 [ 297  833]]
              precision    recall  f1-score   support

           0       0.79      0.81      0.80      1378
           1       0.76      0.74      0.75      1130

    accuracy                           0.78      2508
   macro avg       0.78      0.78      0.78      2508
weighted avg       0.78      0.78      0.78      2508




## 5. Neural Network

In [101]:
#Get Parameters
nn.get_params()

{'activation': 'relu',
 'alpha': 1e-05,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': 10,
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'max_iter': 200,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': 0,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': False}

In [102]:
tuned_parameters = {'hidden_layer_sizes': [(10,), (10,2), (10,4,2)],
                    'activation': ['identity', 'logistic', 'tanh', 'relu'],
                    'solver': ['sgd', 'adam'],
                    'alpha': [1e-4, 1e-3,1e-2,1e-1,1],
                    'learning_rate': ['constant', 'invscaling', 'adaptive']}

In [103]:
nn = modelTuning(MLPClassifier(),tuned_parameters,100,False)

# Tuning hyper-parameters for accuracy

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 61.4min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 101.6min finished


Best parameters set found on development set:

{'solver': 'adam', 'learning_rate': 'constant', 'hidden_layer_sizes': (10,), 'alpha': 0.1, 'activation': 'logistic'}

Grid scores on development set:

0.788 (+/-0.004) for {'solver': 'sgd', 'learning_rate': 'constant', 'hidden_layer_sizes': (10, 2), 'alpha': 0.0001, 'activation': 'identity'}
0.786 (+/-0.008) for {'solver': 'adam', 'learning_rate': 'constant', 'hidden_layer_sizes': (10, 2), 'alpha': 0.01, 'activation': 'tanh'}
0.543 (+/-0.000) for {'solver': 'sgd', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (10, 4, 2), 'alpha': 0.0001, 'activation': 'logistic'}
0.763 (+/-0.004) for {'solver': 'sgd', 'learning_rate': 'constant', 'hidden_layer_sizes': (10,), 'alpha': 0.01, 'activation': 'tanh'}
0.657 (+/-0.012) for {'solver': 'sgd', 'learning_rate': 'constant', 'hidden_layer_sizes': (10,), 'alpha': 0.0001, 'activation': 'logistic'}
0.514 (+/-0.080) for {'solver': 'sgd', 'learning_rate': 'invscaling', 'hidden_layer_sizes': (10, 2), 'al



## ------------------------------------------Test Prediction-------------------------------------------------

In [31]:
#train all model with tuning hyperparameter
mnb = MultinomialNB(alpha = 0.1)
mnb.fit(X_train,y_train)
rfc = RandomForestClassifier(n_estimators= 2000, min_samples_split= 5, min_samples_leaf= 1, max_features= 'auto', max_depth= 100, bootstrap= False)
rfc.fit(X_train,y_train)
svm = SVC(kernel= 'rbf',gamma= 0.01, C= 10) 
svm.fit(X_train,y_train)
knn = KNeighborsClassifier(weights= 'distance', n_neighbors= 24, metric= 'manhattan')
knn.fit(X_train,y_train)
nn = MLPClassifier(solver= 'adam', learning_rate= 'constant', hidden_layer_sizes= (10,), alpha= 0.1, activation= 'logistic',max_iter = 400)
nn.fit(X_train, y_train)

MLPClassifier(activation='logistic', alpha=0.1, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(10,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=400, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [32]:
sample_name = "sira"
vect = dv.transform(gender_features(sample_name,True,True,2)).toarray()

In [33]:
pred_results = [mnb.predict(vect),rfc.predict(vect),svm.predict(vect),knn.predict(vect),nn.predict(vect)]
pred_results = ["female" if pred == [0] else "male" for pred in pred_results]
pred_results 

['female', 'male', 'male', 'male', 'male']

## ----------------------------------------- Ensembling Model ------------------------------------------

In [34]:
#Get output from all models to be the input of neural network
df['mnb'] = mnb.predict(X)
df['rfc'] = rfc.predict(X)
df['svm'] = svm.predict(X)
df['knn'] = knn.predict(X)
df['nn'] = nn.predict(X)

In [35]:
enn_X = df.iloc[:,3:8]
enn_y = df['gender']

In [36]:
enn_X

Unnamed: 0,mnb,rfc,svm,knn,nn
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,1,1,1,1,1
4,0,0,0,0,0
...,...,...,...,...,...
10026,0,0,0,0,0
10027,1,1,1,1,1
10028,1,1,1,1,1
10029,0,0,0,0,0


In [37]:
enn_X_train, enn_X_test, enn_y_train, enn_y_test = train_test_split(enn_X, enn_y, test_size=0.25,random_state=1)

In [38]:
enn = MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(3), random_state=0,max_iter=300)

In [39]:
enn.fit(enn_X_train,enn_y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=3, learning_rate='constant',
              learning_rate_init=0.001, max_iter=300, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=0, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [40]:
enn_pred = enn.predict(enn_X_test)

In [41]:
print(confusion_matrix(enn_y_test,enn_pred))

[[1313   71]
 [ 117 1007]]


In [42]:
print(classification_report(enn_y_test,enn_pred))

              precision    recall  f1-score   support

           0       0.92      0.95      0.93      1384
           1       0.93      0.90      0.91      1124

    accuracy                           0.93      2508
   macro avg       0.93      0.92      0.92      2508
weighted avg       0.93      0.93      0.92      2508



In [43]:
# 10-fold cross-validation
enn_scores = cross_val_score(enn, enn_X, enn_y, cv=10, scoring='accuracy',n_jobs = -1)
print(enn_scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (enn_scores.mean(), enn_scores.std() * 2))

[0.93027888 0.92821535 0.90428714 0.92422732 0.91026919 0.92622134
 0.93220339 0.92323031 0.91824526 0.92622134]
Accuracy: 0.92 (+/- 0.02)


## -------------------------------------------- Optimize Final Model ------------------------------------------

In [44]:
enn.get_params()

{'activation': 'relu',
 'alpha': 1e-05,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': 3,
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'max_iter': 300,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': 0,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': False}

In [45]:
tuned_parameters = {'hidden_layer_sizes': [(10,), (3,), (3,2), ()],
                    'activation': ['identity', 'logistic', 'tanh', 'relu'],
                    'solver': ['lbfgs','sgd', 'adam'],
                    'alpha': [1e-4, 1e-3,1e-2,1e-1,1],
                    'learning_rate': ['constant', 'invscaling', 'adaptive']}

In [46]:
enn = modelTuning(MLPClassifier(),tuned_parameters,100,True)

# Tuning hyper-parameters for accuracy

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 181 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   30.2s finished


Best parameters set found on development set:

{'solver': 'lbfgs', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (10,), 'alpha': 0.1, 'activation': 'relu'}

Grid scores on development set:

0.922 (+/-0.010) for {'solver': 'adam', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (), 'alpha': 0.1, 'activation': 'tanh'}
0.922 (+/-0.010) for {'solver': 'lbfgs', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (), 'alpha': 0.01, 'activation': 'relu'}
0.921 (+/-0.009) for {'solver': 'adam', 'learning_rate': 'constant', 'hidden_layer_sizes': (), 'alpha': 0.001, 'activation': 'logistic'}
0.922 (+/-0.010) for {'solver': 'sgd', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (), 'alpha': 0.1, 'activation': 'identity'}
0.922 (+/-0.009) for {'solver': 'sgd', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (3, 2), 'alpha': 0.1, 'activation': 'tanh'}
0.921 (+/-0.010) for {'solver': 'sgd', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (), 'alpha': 1, 'activation': 'logistic'}
0.92

In [47]:
enn = MLPClassifier(solver= 'adam', learning_rate= 'adaptive', hidden_layer_sizes= (), alpha= 0.1, activation= 'tanh',max_iter=300)
enn.fit(enn_X_train,enn_y_train)

MLPClassifier(activation='tanh', alpha=0.1, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(), learning_rate='adaptive',
              learning_rate_init=0.001, max_iter=300, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [56]:
sample_name = "Thanapath"
vect = dv.transform(gender_features(sample_name,True,True,2))
vect = pd.DataFrame([mnb.predict(vect),
              rfc.predict(vect),
              svm.predict(vect),
              knn.predict(vect),
              nn.predict(vect)
              ]).transpose()
result = enn.predict(vect)
if (result[0] == 0) :
    print("female")
else :
    print("male")

male


## -------------------------------------------- For VDO Demo Only------------------------------------------------

In [49]:
def vdoPredict() :
    test_df = pd.read_excel('test_dataset.xlsx') 
    test_df['firstName'] = test_df['name'].str.split().str[0]
    vdo_X = [gender_features(n,True,True,2) for n in test_df['firstName']]
    vdo_X_features = dv.transform(vdo_X)
    vdo_gender = pd.DataFrame([mnb.predict(vdo_X_features),
              rfc.predict(vdo_X_features),
              svm.predict(vdo_X_features),
              knn.predict(vdo_X_features),
              nn.predict(vdo_X_features)
              ]).transpose()
    results = enn.predict(vdo_gender)
    results =  ["female" if pred == 0 else "male" for pred in results]
    test_df['gender'] = pd.DataFrame(results)
    test_df[['name','gender']].to_excel("test_dataset.xlsx")

In [63]:
vdoPredict()