In [62]:
import nltk
from nltk.corpus import names
import random
import pandas as pd
# Download names corpus if not already downloaded
nltk.download('names')

# Load the names corpus as (name, gender) pairs: every entry of male.txt is
# labelled 'male', every entry of female.txt is labelled 'female'.
# NOTE: this rebinds `names` from the imported corpus reader to a plain list,
# so `from nltk.corpus import names` has to run again before this statement
# can be re-executed.
names = [(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')]

# Shuffle with a dedicated, fixed-seed RNG so that the order -- and therefore
# any positional train/dev-test/test split taken from it -- is identical on
# every run, without touching the global `random` state.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(names)

# Feature extraction function
def gender_features(name):
    """Extract spelling-based features from a name.

    Args:
        name: The name as a string. An empty string is accepted and yields
            empty-string / zero-valued features.

    Returns:
        A dict with the following keys:
            last_letter, first_letter: final / initial character ('' if empty).
            last_two_letters, first_two_letters: up to two trailing / leading
                characters.
            name_length: total number of characters.
            vowel_count: number of characters that are ASCII vowels
                (either case).
            consonant_count: number of alphabetic characters that are not
                ASCII vowels; hyphens, apostrophes, spaces and digits are
                not counted.
            a_count ... u_count: case-insensitive count of each vowel.
            starts_ends_same: True when the first and last characters match
                case-insensitively (False for an empty name).
    """
    vowels = 'aeiouAEIOU'
    lowered = name.lower()
    return {
        # Slices instead of indexing so an empty name does not raise IndexError.
        'last_letter': name[-1:],
        'last_two_letters': name[-2:],
        'first_two_letters': name[:2],
        'first_letter': name[:1],
        'name_length': len(name),
        'vowel_count': sum(1 for char in name if char in vowels),
        # Only letters can be consonants; without the isalpha() guard a name
        # such as "Ann-Marie" would count its hyphen as a consonant.
        'consonant_count': sum(
            1 for char in name if char.isalpha() and char not in vowels
        ),
        'a_count': lowered.count('a'),
        'e_count': lowered.count('e'),
        'i_count': lowered.count('i'),
        'o_count': lowered.count('o'),
        'u_count': lowered.count('u'),
        'starts_ends_same': bool(name) and name[0].lower() == name[-1].lower()
    }

# Create a DataFrame
# One (name, gender, feature-dict) record per entry of `names`.
name_features = [(name, gender, gender_features(name)) for (name, gender) in names]

# Build the label columns and the feature columns directly from the records.
# Constructing the feature frame from the list of dicts gives one column per
# feature key without creating an intermediate pandas Series for every row.
labels_df = pd.DataFrame(
    [(name, gender) for (name, gender, _) in name_features],
    columns=['Name', 'Gender'],
)
features_df = pd.DataFrame([features for (_, _, features) in name_features])
df = pd.concat([labels_df, features_df], axis=1)

# Split the data into train, dev-test, and test sets by row position.
TEST_SIZE = 500
DEVTEST_SIZE = 500
test_df = df.iloc[:TEST_SIZE]                              # first 500 rows for test
devtest_df = df.iloc[TEST_SIZE:TEST_SIZE + DEVTEST_SIZE]   # next 500 rows for dev-test
train_df = df.iloc[TEST_SIZE + DEVTEST_SIZE:]              # all remaining rows for training

print(train_df)

[nltk_data] Downloading package names to
[nltk_data]     /Users/willberritt/nltk_data...
[nltk_data]   Package names is already up-to-date!


           Name  Gender last_letter last_two_letters first_two_letters  \
1000    Winfred    male           d               ed                Wi   
1001     Tarzan    male           n               an                Ta   
1002     Silvia  female           a               ia                Si   
1003  Florentia  female           a               ia                Fl   
1004     Teador    male           r               or                Te   
...         ...     ...         ...              ...               ...   
7939    Thibaut    male           t               ut                Th   
7940      Shaun  female           n               un                Sh   
7941     Helise  female           e               se                He   
7942      Maxie    male           e               ie                Ma   
7943    Jackson    male           n               on                Ja   

     first_letter  name_length  vowel_count  consonant_count  a_count  \
1000            W            7        

In [60]:
# Show the dev-test split (rows 500-999 of the shuffled frame).
# NOTE(review): this cell's execution count (60) is lower than that of the
# cell that builds the split (62), so the output shown may come from an
# earlier shuffle -- re-run after the split cell to confirm.
print(devtest_df)

          Name  Gender last_letter last_two_letters first_two_letters  \
500       Dick    male           k               ck                Di   
501  Modestine  female           e               ne                Mo   
502   Thornton    male           n               on                Th   
503    Lindsay    male           y               ay                Li   
504    Pauline  female           e               ne                Pa   
..         ...     ...         ...              ...               ...   
995      Faina  female           a               na                Fa   
996       Bert    male           t               rt                Be   
997   Davidson    male           n               on                Da   
998     Pepito    male           o               to                Pe   
999   Clarissa  female           a               sa                Cl   

    first_letter  name_length  vowel_count  consonant_count  a_count  e_count  \
500            D            4            1

In [61]:
# Show the test split (rows 0-499 of the shuffled frame).
# NOTE(review): this cell's execution count (61) is lower than that of the
# cell that builds the split (62), so the output shown may come from an
# earlier shuffle -- re-run after the split cell to confirm.
print(test_df)

         Name  Gender last_letter last_two_letters first_two_letters  \
0     Candide  female           e               de                Ca   
1       Kiele  female           e               le                Ki   
2    Kimberli  female           i               li                Ki   
3        Ralf    male           f               lf                Ra   
4         Jay    male           y               ay                Ja   
..        ...     ...         ...              ...               ...   
495    Hadria  female           a               ia                Ha   
496    Shirah  female           h               ah                Sh   
497     Flynn    male           n               nn                Fl   
498  Devondra  female           a               ra                De   
499   Philipa  female           a               pa                Ph   

    first_letter  name_length  vowel_count  consonant_count  a_count  e_count  \
0              C            7            3            