## Project 3 - Will Berritt, Shafya Nadour, Jonathan Burns

In [None]:
import nltk
from nltk.corpus import names
import random
import pandas as pd
# Download names corpus if not already downloaded
nltk.download('names')

# Load and shuffle the names corpus
names = [(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')]
random.shuffle(names)

# Feature extraction function
def gender_features(name):
    return {
        'last_letter': name[-1],
        'last_two_letters': name[-2:],
        'first_two_letters': name[:2],
        'first_letter': name[0],
        'name_length': len(name),
        'vowel_count': sum(1 for char in name if char in 'aeiouAEIOU'),
        'consonant_count': sum(1 for char in name if char not in 'aeiouAEIOU'),
        'a_count': name.lower().count('a'),
        'e_count': name.lower().count('e'),
        'i_count': name.lower().count('i'),
        'o_count': name.lower().count('o'),
        'u_count': name.lower().count('u'),
        'starts_ends_same': name[0].lower() == name[-1].lower()
    }

# Create a DataFrame
name_features = [(name, gender, gender_features(name)) for (name, gender) in names]
df = pd.DataFrame(name_features, columns=['Name', 'Gender', 'Features'])

# Expand the 'Features' column into separate columns
features_df = df['Features'].apply(pd.Series)
df = pd.concat([df.drop(columns=['Features']), features_df], axis=1)

# Split the data into train, dev-test, and test sets
train_df = df.iloc[1000:]      # Remaining 6900 samples for training
devtest_df = df.iloc[500:1000] # 500 samples for dev-test
test_df = df.iloc[:500]        # 500 samples for test

print(train_df)

### Testing out different functions looking at different gender classifiers and splitting up the original function.

In [28]:
def gender_features_2(name):
    features = {}
    features['first_two_letters'] = name[:2]
    features['last_two_letters'] = name[-2:]
    
    return features


In [36]:
def gender_features_3(name):
    features = {}
    features['first_letter'] = name[0]
    features['last_letter'] = name[-1]
    
    return features    
        

In [45]:
def gender_features_4(name):
    features = {}
    features['vowel_count'] = sum(1 for char in name if char in 'aeiouAEIOU')
    features['consonant_count'] = sum(1 for char in name if char not in 'aeiouAEIOU')
    return features

In [48]:
def gender_features_5(name):
    features = {}
    features['a_count'] = name.lower().count('a')
    features['e_count'] = name.lower().count('e')
    return features

In [11]:
def gender_features_6(name):
    features = {}
    features['i_count'] = name.lower().count('i')
    features['o_count'] = name.lower().count('o')
    return features

In [51]:
def gender_features_7(name):
    features = {}
    features['u_count'] = name.lower().count('u')
    features['starts_ends_same'] = name[0].lower() == name[-1].lower()
    return features

### Creating 7 different test, training and dev-test sets

First set of gender features and its accuracy

In [41]:
train_names_2 = names[1000:]
devtest_names_2 = names[500:1000]
test_names_2 = names[:500]

In [42]:
train_2 = [(gender_features_2(n), g) for (n,g) in train_names_2]
devtest_2 =[(gender_features_2(n), g) for (n,g) in devtest_names_2]
test_2 = [(gender_features_2(n), g) for (n,g) in test_names_2]
classifier_2 = nltk.NaiveBayesClassifier.train(train_2)
print(nltk.classify.accuracy(classifier_2, devtest_2))

0.816


Using the first two letters and last two letters as our gender features yielded a 81.60% accuracy when tested against the dev test file

Second set of gender features and its accuracy

In [43]:
train_names_3 = names[1000:]
devtest_names_3 = names[500:1000]
test_names_3 = names[:500]

In [44]:
train_3 = [(gender_features_3(n), g) for (n,g) in train_names_3]
devtest_3 =[(gender_features_3(n), g) for (n,g) in devtest_names_3]
test_3 = [(gender_features_3(n), g) for (n,g) in test_names_3]
classifier_3 = nltk.NaiveBayesClassifier.train(train_3)
print(nltk.classify.accuracy(classifier_3, devtest_3))

0.758


Using the first and last letters of a name yielded a lower accuracy rate at 75.8% compared to first and last two from above.

Third set of gender features and accuracy

In [46]:
train_names_4 = names[1000:]
devtest_names_4 = names[500:1000]
test_names_4 = names[:500]

In [47]:
train_4 = [(gender_features_4(n), g) for (n,g) in train_names_4]
devtest_4 =[(gender_features_4(n), g) for (n,g) in devtest_names_4]
test_4 = [(gender_features_4(n), g) for (n,g) in test_names_4]
classifier_4 = nltk.NaiveBayesClassifier.train(train_4)
print(nltk.classify.accuracy(classifier_4, devtest_4))

0.656


Using constonant and vowel counts to classify gender has so far performed the worst out of the three classifiers as 65.6%.

Fourth set of gender features and accuracy

In [49]:
train_names_5 = names[1000:]
devtest_names_5 = names[500:1000]
test_names_5 = names[:500]

In [50]:
train_5 = [(gender_features_5(n), g) for (n,g) in train_names_5]
devtest_5 =[(gender_features_5(n), g) for (n,g) in devtest_names_5]
test_5 = [(gender_features_5(n), g) for (n,g) in test_names_5]
classifier_5 = nltk.NaiveBayesClassifier.train(train_5)
print(nltk.classify.accuracy(classifier_5, devtest_5))

0.62


Again, much worse than the first set of gender features at 62.0%

Fifth set of gender features and accuracy

In [53]:
train_names_6 = names[1000:]
devtest_names_6 = names[500:1000]
test_names_6 = names[:500]

In [54]:
train_6 = [(gender_features_6(n), g) for (n,g) in train_names_6]
devtest_6 =[(gender_features_6(n), g) for (n,g) in devtest_names_6]
test_6 = [(gender_features_6(n), g) for (n,g) in test_names_6]
classifier_6 = nltk.NaiveBayesClassifier.train(train_6)
print(nltk.classify.accuracy(classifier_6, devtest_6))

0.634


Using individual counts of different vowels does not seem to be working, we will try one more set of features for consistency. But given the previous performance of other singled out vowel counts I do not expect this to be much better.

In [55]:
train_names_7 = names[1000:]
devtest_names_7 = names[500:1000]
test_names_7 = names[:500]

In [56]:
train_7 = [(gender_features_7(n), g) for (n,g) in train_names_7]
devtest_7 =[(gender_features_7(n), g) for (n,g) in devtest_names_7]
test_7 = [(gender_features_7(n), g) for (n,g) in test_names_7]
classifier_7 = nltk.NaiveBayesClassifier.train(train_7)
print(nltk.classify.accuracy(classifier_7, devtest_7))

0.638


As expected the final set of features did not produce anything better than the first two letters and last two letters of a name.

### Run the dev-test and compare it to test

In [57]:

print(nltk.classify.accuracy(classifier_2, devtest_2))

0.816


In [58]:
print(nltk.classify.accuracy(classifier_2, test_2))

0.8


### Final Analysis

1. How does the performance on the test set compare to the performance on the dev-test set?
2. Is this what you'd expect?

As stated in the textbook, having a separate test set and dev-test set is crucial to keeping analysis in line. Known as the development set the training data and dev-test data are split out before setting the official test dataset. The dev-test dataset performes only a slight magnitude better than the official test dataset. One reason for this could be an unscene offset in the test df and the dev df. One possible way around this could be by randomly splitting out this data and retesting. An alternative is making this dataset bigger, lessening the issue with overfitting in the dev-test set. However because the dataset consists only of names we really did not expect to see much of a difference in the accuracy of the test and dev datasets.

Using code from the book (p.226) we can also evaluate the errors generated from the gender classifier

In [61]:
errors = []
for (name, tag) in devtest_names_2:
    guess = classifier_2.classify(gender_features_2(name))
    if guess != tag:
        errors.append( (tag, guess, name) )

In [62]:
for (tag, guess, name) in sorted(errors):
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))

correct=female   guess=male     name=Agnes                         
correct=female   guess=male     name=Brook                         
correct=female   guess=male     name=Clair                         
correct=female   guess=male     name=Cloris                        
correct=female   guess=male     name=Devon                         
correct=female   guess=male     name=Dion                          
correct=female   guess=male     name=Easter                        
correct=female   guess=male     name=Edin                          
correct=female   guess=male     name=Gredel                        
correct=female   guess=male     name=Haily                         
correct=female   guess=male     name=Hedwig                        
correct=female   guess=male     name=Helen                         
correct=female   guess=male     name=Honey                         
correct=female   guess=male     name=Imojean                       
correct=female   guess=male     name=Jessalin   

We can also get the total amount of correct and incorrect guesses using this code

In [63]:
correct_guesses = 0
incorrect_guesses = 0

for (tag, guess, name) in sorted(errors):
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))
    if tag == guess:
        correct_guesses += 1
    else:
        incorrect_guesses += 1

print(f'Correct guesses: {correct_guesses}')
print(f'Incorrect guesses: {incorrect_guesses}')

correct=female   guess=male     name=Agnes                         
correct=female   guess=male     name=Brook                         
correct=female   guess=male     name=Clair                         
correct=female   guess=male     name=Cloris                        
correct=female   guess=male     name=Devon                         
correct=female   guess=male     name=Dion                          
correct=female   guess=male     name=Easter                        
correct=female   guess=male     name=Edin                          
correct=female   guess=male     name=Gredel                        
correct=female   guess=male     name=Haily                         
correct=female   guess=male     name=Hedwig                        
correct=female   guess=male     name=Helen                         
correct=female   guess=male     name=Honey                         
correct=female   guess=male     name=Imojean                       
correct=female   guess=male     name=Jessalin   

In the case of the error dataset, we are only going to generate incorrect guesses