In [2]:
import os
import pandas as pd
import numpy as np
import nltk

### Using a corpus of names

In [3]:
def format_names(female_fn='../names/female.txt', male_fn='../names/male.txt'):
    """Load the female and male name lists into one lower-cased DataFrame.

    Each file is read with ``pd.read_csv(path, skiprows=4)``: the first four
    lines are skipped, the next line is consumed as the header row, and the
    first column is renamed to ``FemaleNames`` / ``MaleNames``.

    Parameters
    ----------
    female_fn : str
        Path to the female name list. Defaults to the path the notebook
        originally hard-coded.
    male_fn : str
        Path to the male name list. Defaults to the path the notebook
        originally hard-coded.

    Returns
    -------
    pandas.DataFrame
        The two frames combined with ``combine_first`` (aligned on the row
        index), with every string value lower-cased. Non-string values
        (e.g. NaN padding) are left untouched.
    """
    def _load(path, column):
        # Rename without ``inplace`` so the freshly read frame is never mutated.
        frame = pd.read_csv(path, skiprows=4)
        return frame.rename(columns={frame.columns[0]: column})

    def _lower(value):
        return value.lower() if isinstance(value, str) else value

    males = _load(male_fn, 'MaleNames')
    females = _load(female_fn, 'FemaleNames')

    merged = females.combine_first(males)

    # ``DataFrame.applymap`` is deprecated in recent pandas; mapping each
    # column with ``Series.map`` is equivalent and works on old and new versions.
    return merged.apply(lambda column: column.map(_lower))

In [4]:
# Build the combined, lower-cased name table once; later cells pass it to the lookups.
merged = format_names()

In [9]:
# Preview the first rows to check that both name columns were populated.
merged.head()

Unnamed: 0,FemaleNames,MaleNames
0,abagael,aamir
1,abagail,aaron
2,abbe,abbey
3,abbey,abbie
4,abbi,abbot


In [5]:
def check_name_sex(name, merged):
    """Classify a first name by looking it up in the name table.

    The name is lower-cased and compared against the ``MaleNames`` and
    ``FemaleNames`` columns of ``merged``. The verdict is printed and returned.

    Returns one of ``'ambiguous'`` (found in both columns), ``'male'``,
    ``'female'`` or ``'unknown'`` (found in neither).
    """
    lowered = name.lower()
    in_male = bool(merged['MaleNames'].eq(lowered).any())
    in_female = bool(merged['FemaleNames'].eq(lowered).any())

    # Keyed by (found in male list, found in female list).
    verdicts = {
        (True, True): 'ambiguous',
        (True, False): 'male',
        (False, True): 'female',
        (False, False): 'unknown',
    }
    val = verdicts[(in_male, in_female)]

    print('The name {0} was decided to be {1}'.format(lowered, val))
    return val
        

In [6]:
# Fixed set of first names used to compare the two gender-lookup approaches.
benchmark_names = [
    'john', 'mary', 'jean', 'ali', 'susan',
    'mamta', 'harry', 'hermione', 'ron', 'peter',
    'marvin', 'julie', 'jeremy', 'salvador',
]

In [7]:
# Run every benchmark name through the corpus lookup; check_name_sex prints
# its verdict itself, so the return value is not needed here.
for i in benchmark_names:
    check_name_sex(i, merged)

The name john was decided to be male
The name mary was decided to be female
The name jean was decided to be ambiguous
The name ali was decided to be ambiguous
The name susan was decided to be female
The name mamta was decided to be unknown
The name harry was decided to be male
The name hermione was decided to be female
The name ron was decided to be male
The name peter was decided to be male
The name marvin was decided to be male
The name julie was decided to be ambiguous
The name jeremy was decided to be male
The name salvador was decided to be male


### Using the `gender_guesser` package

In [8]:
# NOTE(review): this import sits mid-notebook; consider moving it to the top import cell.
import gender_guesser.detector as gender
d = gender.Detector()

for i in benchmark_names:
    # Names are capitalized before the lookup — presumably because the detector's
    # name list is stored capitalized; confirm against the gender_guesser docs.
    print("The name {0} was decided to be {1}".format(i, d.get_gender(i.capitalize())))

The name john was decided to be male
The name mary was decided to be mostly_female
The name jean was decided to be male
The name ali was decided to be male
The name susan was decided to be female
The name mamta was decided to be female
The name harry was decided to be male
The name hermione was decided to be unknown
The name ron was decided to be male
The name peter was decided to be male
The name marvin was decided to be male
The name julie was decided to be female
The name jeremy was decided to be male
The name salvador was decided to be male


In [14]:
# Holds the single Detector instance so it is only constructed on first use.
_DETECTOR_CACHE = {}


def gender_guesser(name):
    """Return the gender_guesser verdict for a first name.

    The name is capitalized before being passed to ``Detector.get_gender``
    and the detector's answer is returned unchanged.

    The ``Detector`` is created on the first call and reused afterwards;
    previously a new one was constructed on every call, repeating the
    detector's setup work for each lookup.
    """
    if 'detector' not in _DETECTOR_CACHE:
        # Imported lazily so defining this function does not require the package.
        import gender_guesser.detector as gender
        _DETECTOR_CACHE['detector'] = gender.Detector()
    return _DETECTOR_CACHE['detector'].get_gender(name.capitalize())

In [52]:
def check_name_contigency(name, names_df):
    """Classify a name with the corpus lookup, falling back to gender_guesser.

    ``check_name_sex`` is tried first. Only when it answers ``'unknown'`` is
    ``gender_guesser`` consulted; its ``'mostly_female'`` / ``'mostly_male'``
    / ``'andy'`` answers are collapsed to ``'female'`` / ``'male'`` /
    ``'ambiguous'``, and any other answer is returned as-is.
    """
    verdict = check_name_sex(name, names_df)
    if verdict != 'unknown':
        return verdict

    fallback = gender_guesser(name)
    collapsed = {
        'mostly_female': 'female',
        'mostly_male': 'male',
        'andy': 'ambiguous',
    }
    return collapsed.get(fallback, fallback)