In [1]:
import nltk
import random
import numpy as np
from nltk.corpus import names
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import classification_report, accuracy_score

# Single seed shared by the shuffle, the train/test split and every stochastic
# estimator, so that Restart Kernel -> Run All reproduces the same reports.
SEED = 42

# Fetch the NLTK "names" corpus used below.
nltk.download('names')

# Label each name with the gender of the corpus file it was read from.
male_names = [(name, 'male') for name in names.words('male.txt')]
female_names = [(name, 'female') for name in names.words('female.txt')]

all_names = male_names + female_names
# Shuffle with a dedicated seeded generator: the order of X/y feeds directly
# into train_test_split, so an unseeded shuffle would change the split (and
# every score below) on each run even though the split itself is seeded.
random.Random(SEED).shuffle(all_names)

X = [name for name, _ in all_names]
y = [gender for _, gender in all_names]

# Character-bigram counts. The matrix is densified because GaussianNB (one of
# the models compared below) requires dense input.
# NOTE(review): the vocabulary is learned from all names, including the ones
# that later land in the test split; fit on the training names only if a
# strictly leak-free evaluation is required.
vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 2))
X_transformed = vectorizer.fit_transform(X).toarray()

X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.2, random_state=SEED
)

# Models to compare. The tree-based estimators are randomized, so they are
# seeded explicitly; the remaining models are left at their defaults.
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=SEED),
    'Decision Tree': DecisionTreeClassifier(random_state=SEED),
    'KNN': KNeighborsClassifier(),
    'Gaussian Naive Bayes': GaussianNB(),
    'Multinomial Naive Bayes': MultinomialNB(),
    'Bernoulli Naive Bayes': BernoulliNB()
}

# Fit each model on the training split and report per-class metrics plus the
# overall accuracy on the held-out split.
for classifier_name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{classifier_name}:\n")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy}\n")


[nltk_data] Downloading package names to
[nltk_data]     /Users/vaishnaviet/nltk_data...
[nltk_data]   Package names is already up-to-date!


Logistic Regression:

              precision    recall  f1-score   support

      female       0.79      0.86      0.82      1006
        male       0.72      0.60      0.65       583

    accuracy                           0.77      1589
   macro avg       0.75      0.73      0.74      1589
weighted avg       0.76      0.77      0.76      1589

Accuracy: 0.7658904971680303

Random Forest:

              precision    recall  f1-score   support

      female       0.80      0.85      0.83      1006
        male       0.71      0.63      0.67       583

    accuracy                           0.77      1589
   macro avg       0.76      0.74      0.75      1589
weighted avg       0.77      0.77      0.77      1589

Accuracy: 0.7715544367526747

Decision Tree:

              precision    recall  f1-score   support

      female       0.77      0.80      0.79      1006
        male       0.64      0.59      0.61       583

    accuracy                           0.73      1589
   macro avg  