## name-gender

In [22]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import classification_report, f1_score,accuracy_score
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import time
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")
import re

In [23]:
df =pd.read_csv('name_gender.csv')

## simple EDA and preprocessing

In [24]:
df.shape

(95025, 2)

In [25]:
#the dataset is imblanced, can try using different imbalanced handling method later
df['gender'].value_counts()

F    60304
M    34721
Name: gender, dtype: int64

In [26]:
# a simple observation: the first 3 female name all end with 'a', 
# suggesting that the last letter might be important feature
df.head() 

Unnamed: 0,name,gender
0,Aaban&&,M
1,Aabha*,F
2,Aabid,M
3,Aabriella,F
4,Aada_,F


In [27]:
#text cleansing
def preprocessing(s):
    # 1.to lower
    s=s.lower()
    # 2: keep only the letter 
    s =''.join([w for w in s if w.isalpha()])
    return s

df['clean_name']=df['name'].apply(preprocessing)

In [28]:
#no duplicates in the name
df['clean_name'].nunique() 

95025

In [29]:
df['name_len']=df['clean_name'].apply(lambda x: len(x))
df['name_len'].describe()

count    95025.000000
mean         6.534070
std          1.486065
min          2.000000
25%          6.000000
50%          6.000000
75%          7.000000
max         15.000000
Name: name_len, dtype: float64

(1)Based on common sense and research literature, there is no correlation between the gender and percentage of letter in a name. Therefore the is no point to include n-gram = 1. 
(2)the median length of name in the dataset is 6. Therefore when using CountVectorizer, I will not set a ngram range larger than 6, (2,6) will be the largest range. 
(3)Since every name is considerably short, certain pattern is likely to appear only once. There is no point in setting min-df larger than 1. 

## baseline Naive Bayes model

In [40]:
X = df['clean_name']
y = df['gender']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1004)

In [41]:
vect_cv = CountVectorizer(analyzer='char',ngram_range=(2,6))
X_train_cv=vect_cv.fit_transform(X_train)
X_val_cv=vect_cv.transform(X_val)

In [72]:
nb=MultinomialNB()
nb.fit(X_train_cv,y_train)
y_pred = nb.predict(X_val_cv)
print(classification_report(y_true=y_val,y_pred=y_pred))
print(accuracy_score(y_val,y_pred))

              precision    recall  f1-score   support

           F       0.84      0.88      0.86     12055
           M       0.77      0.70      0.73      6950

    accuracy                           0.81     19005
   macro avg       0.80      0.79      0.79     19005
weighted avg       0.81      0.81      0.81     19005

0.8131018153117601


1.793704105881583e-08

The accuracy for baseline model is  0.813. Note that the f1-score for female is significatly higher than male

In [44]:
## see if the score can improve by handling imbalance problem
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
over = RandomOverSampler(random_state=1004)
under = RandomUnderSampler(random_state=1004)
X_train_cv_over, y_train_over=over.fit_resample(X_train_cv,y_train)
X_train_cv_under, y_train_under=under.fit_resample(X_train_cv,y_train)

In [60]:
nb.fit(X_train_cv_over, y_train_over)
y_pred = nb.predict(X_val_cv)
print(classification_report(y_true=y_val,y_pred=y_pred))
print(accuracy_score(y_val,y_pred))

              precision    recall  f1-score   support

           F       0.86      0.84      0.85     12055
           M       0.74      0.77      0.75      6950

    accuracy                           0.81     19005
   macro avg       0.80      0.80      0.80     19005
weighted avg       0.82      0.81      0.81     19005

0.8138910812943962


baseline model with random oversampling: slight improvement

In [61]:
nb.fit(X_train_cv_under, y_train_under)
y_pred = nb.predict(X_val_cv)
print(classification_report(y_true=y_val,y_pred=y_pred))
print(accuracy_score(y_val,y_pred))

              precision    recall  f1-score   support

           F       0.86      0.83      0.84     12055
           M       0.72      0.78      0.75      6950

    accuracy                           0.81     19005
   macro avg       0.79      0.80      0.80     19005
weighted avg       0.81      0.81      0.81     19005

0.8073664825046041


Baseline model with random undersampling:  drop 

## trial hyper parameter tuning 

using imblearn instead of sklearn since the pipeline in sklearn cannot change the size of input data,
i.e.: can not include the random oversampling step

In [85]:
from imblearn.pipeline import make_pipeline, Pipeline
start = time.time()
pipe = Pipeline([('vect', CountVectorizer()), ('over', over), ('nb', MultinomialNB())])
pipe_param= [
   {'vect': [CountVectorizer(analyzer='char')],
    'vect__ngram_range':   [(2,2),(2,3),(2,4),(2,5),(2,6)],
    'nb__alpha':     np.linspace(0,1,11) },
    
   {'vect': [TfidfVectorizer(analyzer='char')],
    'vect__ngram_range':   [(2,2),(2,3),(2,4),(2,5),(2,6)],
    'nb__alpha':     np.linspace(0,1,11)}]

pipe_grid =GridSearchCV(estimator=pipe,
                        param_grid=pipe_param,
                        cv=5,
                        verbose=1,
                        scoring='accuracy')

pipe_grid.fit(X_train,y_train)

print("time takes in minutes:",round((time.time()-start)/60,1))
print(pipe_grid.best_params_)
print(pipe_grid.best_score_)

Fitting 5 folds for each of 110 candidates, totalling 550 fits
time takes in minutes: 11.7
{'nb__alpha': 0.1, 'vect': CountVectorizer(analyzer='char', ngram_range=(2, 6)), 'vect__ngram_range': (2, 6)}
0.8112996579847408


In [86]:
# selected nb
nb=MultinomialNB(alpha=0.1)
nb.fit(X_train_cv_over, y_train_over)
y_pred = nb.predict(X_val_cv)
print(classification_report(y_true=y_val,y_pred=y_pred))
print(accuracy_score(y_val,y_pred)) #slightly better than before

              precision    recall  f1-score   support

           F       0.86      0.86      0.86     12055
           M       0.75      0.75      0.75      6950

    accuracy                           0.82     19005
   macro avg       0.81      0.81      0.81     19005
weighted avg       0.82      0.82      0.82     19005

0.8192054722441463


## Feature Engineering

According to literature review, The most signifiant pattern to differentiate the gender based on name is the last few letters. Besides this, 'ild' and 'lin' are common pattern that in girls name metioned in literature which might not necessarily be at the end part of the name.

In [30]:
## add some pattern according to research paper
df['last_letter']=df['clean_name'].apply(lambda x: x[-1])
df['last_2_letter']=df['clean_name'].apply(lambda x:x[-2:])
df['last_3_letter']=df['clean_name'].apply(lambda x:x[-3:])
df['ild']=df['clean_name'].apply(lambda x: bool(re.search('lid',x)))
df['lin']=df['clean_name'].apply(lambda x: bool(re.search('lin',x)))

In [31]:
data=df.copy() #save of copy of dataframe

In [92]:
#one hot encoding for ['last_letter','last_2_letter','last_3_letter']
#features_col=['last_letter','last_2_letter','last_3_letter']
#df=pd.get_dummies(data=df, columns=features_col, dtype=bool, drop_first=False)
#len(list(df.columns))

4366

In [32]:
from sklearn.preprocessing import OneHotEncoder
features_col=['last_letter','last_2_letter','last_3_letter']
onehot=OneHotEncoder(handle_unknown='ignore') 
new_cols = onehot.fit_transform(df[features_col]).toarray()
names_col=onehot.get_feature_names(features_col)
df_new= pd.DataFrame(new_cols,dtype=bool, columns=names_col)
df=pd.concat([df,df_new],axis=1).drop(columns=features_col)
len(list(df.columns))

4366

In [143]:
X = df.drop(columns=['gender','name','name_len'])
y = df['gender']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1004)

In [33]:
# function to concat cv/tfv character matrix and the manuel crafted features
import scipy as sp
def combine_features(X,X_cv):
    X_manual = X.drop(columns=['clean_name'])
    X_manual=  X_manual.fillna(0)
    X_manual_sparse = sp.sparse.csr_matrix(X_manual)
    X_full = sp.sparse.hstack([X_cv, X_manual_sparse])
    return X_full

In [34]:
# function to transfrom train and val text to cv/tdidf matrix
def vect_transfrom(vect, X_train, X_val):
    X_train_m = vect.fit_transform(X_train['clean_name'])
    X_val_m = vect.transform(X_val['clean_name'])
    return X_train_m, X_val_m

In [35]:
# function to print result and check overfitting
def model_result(model, X_train, y_train, X_val, y_val):
    model.fit(X_train, y_train)
    y_pred_val=model.predict(X_val)
    y_pred_train = model.predict(X_train)
    print(classification_report(y_true=y_val,y_pred=y_pred_val))
    print("testing score is", accuracy_score(y_val,y_pred_val))
    print("training score is",accuracy_score(y_train,y_pred_train))

### base model results by TFIDF

In [104]:
vect_tfv=TfidfVectorizer(analyzer='char',ngram_range=(2,6))
X_train_tfv, X_val_tfv = vect_transfrom(vect_tfv, X_train, X_val)
X_train_full = combine_features(X_train, X_train_tfv)
X_val_full = combine_features(X_val, X_val_tfv)
X_train_full_over, y_train_over = over.fit_resample(X_train_full, y_train)

In [109]:
# the best nb model by grid search
nb=MultinomialNB(0.1)
model_result(nb,X_train_full_over, y_train_over, X_val_full,y_val)

              precision    recall  f1-score   support

           F       0.92      0.89      0.91     12055
           M       0.82      0.87      0.85      6950

    accuracy                           0.88     19005
   macro avg       0.87      0.88      0.88     19005
weighted avg       0.89      0.88      0.88     19005

testing score is 0.8837674296237832
training score is 0.9241953201102614


In [110]:
## logistic regression
lr = LogisticRegression(random_state=1004)
model_result(lr,X_train_full_over, y_train_over, X_val_full,y_val)

              precision    recall  f1-score   support

           F       0.93      0.90      0.91     12055
           M       0.83      0.89      0.86      6950

    accuracy                           0.89     19005
   macro avg       0.88      0.89      0.89     19005
weighted avg       0.90      0.89      0.89     19005

testing score is 0.8939226519337017
training score is 0.9305374204646728


In [111]:
## linear SVC
## overfitting occured
from sklearn.svm import LinearSVC
svc = LinearSVC()
model_result(svc,X_train_full_over, y_train_over, X_val_full,y_val)

              precision    recall  f1-score   support

           F       0.93      0.92      0.92     12055
           M       0.86      0.88      0.87      6950

    accuracy                           0.90     19005
   macro avg       0.90      0.90      0.90     19005
weighted avg       0.91      0.90      0.90     19005

testing score is 0.904709287029729
training score is 0.9919791083753031


In [114]:
## Xgboost
xgb1 = xgb.XGBClassifier(random_state=1004)
model_result(xgb1,X_train_full_over.tocsc(), y_train_over, X_val_full.tocsc(),y_val)

              precision    recall  f1-score   support

           F       0.93      0.86      0.90     12055
           M       0.79      0.89      0.84      6950

    accuracy                           0.87     19005
   macro avg       0.86      0.88      0.87     19005
weighted avg       0.88      0.87      0.87     19005

testing score is 0.8720862930807682
training score is 0.9014072830524985


### base model results by CountVectorizer

In [121]:
vect_cv=CountVectorizer(analyzer='char',ngram_range=(2,6))
X_train_cv, X_val_cv = vect_transfrom(vect_cv, X_train, X_val)
X_train_full = combine_features(X_train, X_train_cv)
X_val_full = combine_features(X_val, X_val_cv)
X_train_full_over, y_train_over = over.fit_resample(X_train_full, y_train)

In [122]:
model_result(nb, X_train_full_over, y_train_over, X_val_full,y_val)

              precision    recall  f1-score   support

           F       0.90      0.90      0.90     12055
           M       0.83      0.83      0.83      6950

    accuracy                           0.87     19005
   macro avg       0.86      0.87      0.87     19005
weighted avg       0.87      0.87      0.87     19005

testing score is 0.8746645619573796
training score is 0.9464341229870049


In [123]:
model_result(lr,X_train_full_over, y_train_over, X_val_full,y_val)

              precision    recall  f1-score   support

           F       0.94      0.91      0.92     12055
           M       0.86      0.89      0.87      6950

    accuracy                           0.91     19005
   macro avg       0.90      0.90      0.90     19005
weighted avg       0.91      0.91      0.91     19005

testing score is 0.9057616416732439
training score is 0.9801653920288503


In [124]:
model_result(svc,X_train_full_over, y_train_over, X_val_full,y_val)

              precision    recall  f1-score   support

           F       0.92      0.92      0.92     12055
           M       0.86      0.86      0.86      6950

    accuracy                           0.90     19005
   macro avg       0.89      0.89      0.89     19005
weighted avg       0.90      0.90      0.90     19005

testing score is 0.8963956853459616
training score is 0.998424837820473


In [125]:
model_result(xgb1,X_train_full_over.tocsc(), y_train_over, X_val_full.tocsc(),y_val)

              precision    recall  f1-score   support

           F       0.93      0.87      0.90     12055
           M       0.79      0.89      0.84      6950

    accuracy                           0.88     19005
   macro avg       0.86      0.88      0.87     19005
weighted avg       0.88      0.88      0.88     19005

testing score is 0.8751381215469614
training score is 0.8930547783373749


1.the new featuers largely increased the score.  
2.TFIDF leads to better test score and lower overfitting compared to CountVectorizer after feature engineering.  
3.Among the models, LR(Countvectorizer) and linearSVC have serious overfitting problem. LR(TFIDF) has the best testing result, therefore LR will be tuned for improvement in later section.

## tune logistic regression

In [39]:
start = time.time()
lr = LogisticRegression()
param_lr= {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': np.linspace(0.4, 1.1, 8),
    'max_iter': [80, 100, 120, 150],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'warm_start': [False, True]                                
}
lr_grid =GridSearchCV(estimator=lr,
                      param_grid=param_lr,
                      cv=5,
                      scoring='accuracy')

lr_grid.fit(X_train_full,y_train)

print("time takes:",round((time.time()-start)/60,1))
print(lr_grid.best_params_)
print(lr_grid.best_score_)

time takes: 486.1
{'C': 0.6000000000000001, 'max_iter': 80, 'penalty': 'none', 'solver': 'saga', 'warm_start': False}
0.8996316758747698


In [126]:
## testing score of the optimal model selected
## highly overfitting
lr_select = LogisticRegression(C=0.6,penalty='none', solver='saga',max_iter=80,warm_start=False)
model_result(lr_select, X_train_full_over, y_train_over, X_val_full,y_val)

              precision    recall  f1-score   support

           F       0.93      0.92      0.92     12055
           M       0.86      0.88      0.87      6950

    accuracy                           0.90     19005
   macro avg       0.89      0.90      0.90     19005
weighted avg       0.90      0.90      0.90     19005

testing score is 0.9033938437253355
training score is 0.99293249601028


The similar overfitting problem is highly likely to occur after tuning for rest models.

## simple model ensemling

using one model for tfidf/cv character n-gram matrix, another model for other crafted features, and combine the predicted probality see if there is an improvement.

In [127]:
# model 1: nb model for text
X_train_tfv, X_val_tfv = vect_transfrom(vect_tfv, X_train, X_val)
X_train_tfv_over,y_train_over = over.fit_resample(X_train_tfv,y_train)
nb=MultinomialNB(alpha=0.1)
nb.fit(X_train_tfv_over,y_train_over)
y_pred = nb.predict(X_val_tfv)
y_pred_train = nb.predict(X_train_tfv)
print(classification_report(y_true=y_val,y_pred=y_pred))
print(accuracy_score(y_val,y_pred))
print(accuracy_score(y_train,y_pred_train))
proba1=nb.predict_proba(X_val_tfv)

              precision    recall  f1-score   support

           F       0.86      0.85      0.86     12055
           M       0.75      0.77      0.76      6950

    accuracy                           0.82     19005
   macro avg       0.80      0.81      0.81     19005
weighted avg       0.82      0.82      0.82     19005

0.8193633254406735
0.9296237832149434


In [128]:
# model 2: lr model for other extracted features
X_train_manual = X_train.drop(columns=['clean_name'])
X_val_manuel = X_val.drop(columns=['clean_name'])
lr = LogisticRegression(random_state=1004)
lr.fit(X_train_manual, y_train)
y_pred = lr.predict(X_val_manuel)
y_pred_train = lr.predict(X_train_manual)
print(classification_report(y_true=y_val,y_pred=y_pred))
print(accuracy_score(y_val,y_pred))
print(accuracy_score(y_train,y_pred_train))
proba2=lr.predict_proba(X_val_manuel)

              precision    recall  f1-score   support

           F       0.89      0.89      0.89     12055
           M       0.81      0.80      0.80      6950

    accuracy                           0.86     19005
   macro avg       0.85      0.84      0.85     19005
weighted avg       0.86      0.86      0.86     19005

0.856564062088924
0.8660615627466456


In [140]:
proba =pd.DataFrame((proba1+proba2)/2)
proba =proba.apply(np.argmax, axis=1)

In [147]:
y_val_new=y_val.apply(lambda x: int(x=='M'))
print(classification_report(y_true=y_val_new,y_pred=proba))
print(accuracy_score(y_val_new,proba))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91     12055
           1       0.85      0.82      0.84      6950

    accuracy                           0.88     19005
   macro avg       0.88      0.87      0.87     19005
weighted avg       0.88      0.88      0.88     19005

0.8828203104446198


The model accruacy doesnt seem to improve. Try change model 1 from nb to logistic regression.

In [148]:
# model 1: lr model for text
X_train_tfv, X_val_tfv = vect_transfrom(vect_tfv, X_train, X_val)
X_train_tfv_over,y_train_over = over.fit_resample(X_train_tfv,y_train)
lr1=LogisticRegression(random_state=1004)
lr1.fit(X_train_tfv_over,y_train_over)
y_pred = lr1.predict(X_val_tfv)
y_pred_train = lr1.predict(X_train_tfv)
print(classification_report(y_true=y_val,y_pred=y_pred))
print(accuracy_score(y_val,y_pred))
print(accuracy_score(y_train,y_pred_train))
proba1=lr1.predict_proba(X_val_tfv)

              precision    recall  f1-score   support

           F       0.92      0.89      0.90     12055
           M       0.81      0.87      0.84      6950

    accuracy                           0.88     19005
   macro avg       0.87      0.88      0.87     19005
weighted avg       0.88      0.88      0.88     19005

0.8785056564062089
0.9217442778216259


In [149]:
proba =pd.DataFrame((proba1+proba2)/2)
proba =proba.apply(np.argmax, axis=1)
y_val_new=y_val.apply(lambda x: int(x=='M'))
print(classification_report(y_true=y_val_new,y_pred=proba))
print(accuracy_score(y_val_new,proba))

              precision    recall  f1-score   support

           0       0.91      0.90      0.91     12055
           1       0.84      0.84      0.84      6950

    accuracy                           0.88     19005
   macro avg       0.87      0.87      0.87     19005
weighted avg       0.88      0.88      0.88     19005

0.8816101026045777


Similarly, the ensembling model accruacy is lower than previous lr model. 

## final model selected
The selected model is the simple logistic regression model with default parameter

In [60]:
X = df.drop(columns=['gender','name','name_len'])
y = df['gender']
vect_tfv=TfidfVectorizer(analyzer='char',ngram_range=(2,6))
X_tfv = vect_tfv.fit_transform(X['clean_name'])
X_full = combine_features(X, X_tfv)
X_full_over, y_over = over.fit_resample(X_full, y)

In [65]:
lr_select = LogisticRegression(random_state=1004)
lr_select.fit(X_full_over,y_over)

LogisticRegression(random_state=1004)

In [167]:
# Save the model
import joblib
joblib.dump(lr_select, 'model.pkl')

['model.pkl']

## for flask application

In [61]:
def to_df(s):
    s = ''.join([w for w in s if w.isalpha()])
    df = pd.DataFrame([s], columns=['clean_name'])
    return df
    
def process_input(s):
    df= to_df(s)
    df['last_letter']=df['clean_name'].apply(lambda x: x[-1])
    df['last_2_letter']=df['clean_name'].apply(lambda x:x[-2:])
    df['last_3_letter']=df['clean_name'].apply(lambda x:x[-3:])
    df['ild']=df['clean_name'].apply(lambda x: bool(re.search('lid',x)))
    df['lin']=df['clean_name'].apply(lambda x: bool(re.search('lin',x)))
    features_col=['last_letter','last_2_letter','last_3_letter']
    new_cols = onehot.transform(df[features_col]).toarray()
    names_col=onehot.get_feature_names(features_col)
    df_new= pd.DataFrame(new_cols,dtype=bool, columns=names_col)
    df=pd.concat([df,df_new],axis=1).drop(columns=features_col)
    X_tfv = vect_tfv.transform(df['clean_name'])
    X_full = combine_features(df, X_tfv)
    return X_full

In [64]:
print(process_input('Emily').shape,X_full.shape)

(1, 203740) (95025, 203740)


## other thoughts

word2vec/BERT + neural network is not implemented since:  
(1) its very time consuming (2) not sure the perfermance of charater based BERT 