<a href="https://colab.research.google.com/github/th-shristi/GenderEthnicityModel/blob/main/GenderEthnicityModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the raw names table from the Drive-backed Colab path.
# NOTE(review): the path lives under /content/drive, so this presumably needs
# Google Drive mounted first — confirm before a fresh Run All.
csv_path = '/content/drive/MyDrive/ML_Projects/USA_Names.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the first rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0,Year of Birth,Gender,Ethnicity,Child's First Name,Count,Rank
0,2011,FEMALE,HISPANIC,GERALDINE,13,75
1,2011,FEMALE,HISPANIC,GIA,21,67
2,2011,FEMALE,HISPANIC,GIANNA,49,42
3,2011,FEMALE,HISPANIC,GISELLE,38,51
4,2011,FEMALE,HISPANIC,GRACE,36,53


In [4]:
# Inspect the exact column labels (the first-name column is renamed in a later cell).
df.columns

Index(['Year of Birth', 'Gender', 'Ethnicity', 'Child's First Name', 'Count',
       'Rank'],
      dtype='object')

In [5]:
# Count missing values per column before building any features.
df.isna().sum()

Year of Birth         0
Gender                0
Ethnicity             0
Child's First Name    0
Count                 0
Rank                  0
dtype: int64

In [6]:
# Give the first-name column a short label that is easier to reference.
column_renames = {"Child's First Name": "Name"}
df = df.rename(columns=column_renames)

In [7]:
# Persist the lower-cased names: `.str.lower()` returns a new Series, so
# without the assignment the 'Name' column would be left unchanged.
df['Name'] = df['Name'].str.lower()
# Display the normalized column as the cell output.
df['Name']

0        geraldine
1              gia
2           gianna
3          giselle
4            grace
           ...    
69209       cayden
69210     margaret
69211        tamar
69212       amanda
69213         anna
Name: Name, Length: 69214, dtype: object

In [8]:
# One encoder per target so each keeps its own class mapping for decoding predictions.
le_gender, le_ethnicity = LabelEncoder(), LabelEncoder()

In [9]:
# Replace the string labels of each target column with integer codes,
# fitting the matching encoder in the process.
for column, encoder in (('Gender', le_gender), ('Ethnicity', le_ethnicity)):
    df[column] = encoder.fit_transform(df[column])

In [10]:
# Class balance of the encoded gender labels.
df['Gender'].value_counts()

Gender
0    35299
1    33915
Name: count, dtype: int64

In [11]:
# Class balance of the encoded ethnicity labels.
df['Ethnicity'].value_counts()

Ethnicity
4    20365
6    19642
3    10052
1     9383
5     4843
0     2483
2     2446
Name: count, dtype: int64

In [12]:
# Extra numeric feature: number of characters in each name.
df['Name Length'] = df['Name'].map(len)

In [13]:
# Bag of character n-grams (unigrams through trigrams) over each name.
vectorizer = CountVectorizer(
    analyzer='char',
    ngram_range=(1, 3),
)

In [14]:
# Learn the n-gram vocabulary from the names and encode them in one pass.
names_series = df['Name']
X = vectorizer.fit_transform(names_series)

In [15]:
# Append the name-length column to the n-gram counts.
# `toarray()` materializes the count matrix as a dense array first, and this
# cell expects X to still be the vectorizer output (it is not re-runnable on
# its own once X has been replaced by the dense array).
ngram_counts = X.toarray()
length_column = df[['Name Length']].values
X = np.concatenate([ngram_counts, length_column], axis=1)

In [16]:
# Two prediction targets, both already integer-encoded.
y_gender, y_ethnicity = df['Gender'], df['Ethnicity']

In [17]:
# Split the features and BOTH targets in a single call so the gender and
# ethnicity labels are guaranteed to be partitioned along the same rows as X
# (two separate calls only stay aligned while the seed and length match, and
# the second call silently overwrites X_train / X_test).
# NOTE(review): if the same name occurs in several rows (e.g. across years or
# ethnicities), a row-level random split can put it in both train and test and
# inflate the scores — confirm, and consider a name-grouped split.
(X_train, X_test,
 y_train_gender, y_test_gender,
 y_train_ethnicity, y_test_ethnicity) = train_test_split(
    X, y_gender, y_ethnicity, test_size=0.2, random_state=42
)

In [18]:
# Gender model 1: seeded 100-tree random forest, then held-out predictions.
rf_gender = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train_gender)
y_pred_gender_rf = rf_gender.predict(X_test)

In [19]:
# Gender model 2: a single seeded decision tree, then held-out predictions.
dt_gender = DecisionTreeClassifier(random_state=42).fit(X_train, y_train_gender)
y_pred_gender_dt = dt_gender.predict(X_test)

In [20]:
# Ethnicity model 1: seeded 100-tree random forest, then held-out predictions.
rf_ethnicity = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train_ethnicity)
y_pred_ethnicity_rf = rf_ethnicity.predict(X_test)

In [21]:
# Ethnicity model 2: a single seeded decision tree, then held-out predictions.
dt_ethnicity = DecisionTreeClassifier(random_state=42).fit(X_train, y_train_ethnicity)
y_pred_ethnicity_dt = dt_ethnicity.predict(X_test)

In [22]:
# Soft-voting ensemble (averaged class probabilities) over the forest and the
# tree for gender. Fitting the ensemble trains its own copies of the members.
gender_members = [('rf', rf_gender), ('dt', dt_gender)]
ensemble_gender = VotingClassifier(estimators=gender_members, voting='soft')
ensemble_gender.fit(X_train, y_train_gender)

In [23]:
# Soft-voting ensemble (averaged class probabilities) over the forest and the
# tree for ethnicity. Fitting the ensemble trains its own copies of the members.
ethnicity_members = [('rf', rf_ethnicity), ('dt', dt_ethnicity)]
ensemble_ethnicity = VotingClassifier(estimators=ethnicity_members, voting='soft')
ensemble_ethnicity.fit(X_train, y_train_ethnicity)

In [24]:
# Model Evaluation
def print_metrics(y_true, y_pred, model_name):
    """Print accuracy plus weighted precision, recall and F1 on one line.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        model_name: Label that prefixes the printed line.

    Returns:
        None. The metrics are written to stdout only.
    """
    accuracy = accuracy_score(y_true, y_pred)
    # The remaining metrics share the same call shape, so compute them in order.
    precision, recall, f1 = (
        scorer(y_true, y_pred, average='weighted')
        for scorer in (precision_score, recall_score, f1_score)
    )
    print(f"{model_name} - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

In [25]:
# Score every fitted model on the held-out rows: gender models first, then
# ethnicity models, each as (true labels, model, printed label).
evaluation_runs = [
    (y_test_gender, rf_gender, "Random Forest Gender"),
    (y_test_gender, dt_gender, "Decision Tree Gender"),
    (y_test_gender, ensemble_gender, "Ensemble Gender"),
    (y_test_ethnicity, rf_ethnicity, "Random Forest Ethnicity"),
    (y_test_ethnicity, dt_ethnicity, "Decision Tree Ethnicity"),
    (y_test_ethnicity, ensemble_ethnicity, "Ensemble Ethnicity"),
]
for y_true, model, label in evaluation_runs:
    print_metrics(y_true, model.predict(X_test), label)

Random Forest Gender - Accuracy: 0.9880083796864841, Precision: 0.9880279563797304, Recall: 0.9880083796864841, F1 Score: 0.9880072474999438
Decision Tree Gender - Accuracy: 0.9879361410098967, Precision: 0.9879548182547626, Recall: 0.9879361410098967, F1 Score: 0.9879350296037641
Ensemble Gender - Accuracy: 0.9879361410098967, Precision: 0.9879514854878589, Recall: 0.9879361410098967, F1 Score: 0.9879351387445525
Random Forest Ethnicity - Accuracy: 0.5162175828938814, Precision: 0.5582054862953536, Recall: 0.5162175828938814, F1 Score: 0.4739009599134579
Decision Tree Ethnicity - Accuracy: 0.5163620602470562, Precision: 0.5549101902593324, Recall: 0.5163620602470562, F1 Score: 0.47581625687273205
Ensemble Ethnicity - Accuracy: 0.5154229574514195, Precision: 0.5574743560713222, Recall: 0.5154229574514195, F1 Score: 0.4731066366804211


In [26]:
def predict_gender_ethnicity(names, model_gender, model_ethnicity, vectorizer, le_gender, le_ethnicity):
    """Predict gender and ethnicity labels for one or more first names.

    Each name is lower-cased and stripped of every character outside ``a-z``,
    encoded with ``vectorizer``, extended with a name-length column, and passed
    to both models. Integer predictions are decoded with the label encoders.

    Args:
        names: An iterable of name strings, or a single name string.
        model_gender: Fitted estimator exposing ``predict`` for gender.
        model_ethnicity: Fitted estimator exposing ``predict`` for ethnicity.
        vectorizer: Fitted vectorizer exposing ``transform``; its output must
            provide ``toarray()``.
        le_gender: Encoder exposing ``inverse_transform`` for gender codes.
        le_ethnicity: Encoder exposing ``inverse_transform`` for ethnicity codes.

    Returns:
        A list of ``(cleaned_name, gender_label, ethnicity_label)`` tuples, in
        input order. Empty input yields an empty list.
    """
    # A bare string would otherwise be iterated character by character and
    # each letter treated as a separate name.
    if isinstance(names, str):
        names = [names]

    # Preprocess the input names
    cleaned = [re.sub(r'[^a-z]', '', name.lower()) for name in names]

    # Nothing to predict: return early rather than handing zero samples to the
    # vectorizer and models.
    if not cleaned:
        return []

    # Character-based encodings
    ngram_counts = vectorizer.transform(cleaned)

    # Name length feature, as a single column aligned with the n-gram rows
    name_lengths = np.array([len(name) for name in cleaned]).reshape(-1, 1)

    # Combine features
    X_input = np.hstack((ngram_counts.toarray(), name_lengths))

    # Predict, then decode the integer classes back to their original labels
    gender_labels = le_gender.inverse_transform(model_gender.predict(X_input))
    ethnicity_labels = le_ethnicity.inverse_transform(model_ethnicity.predict(X_input))

    return list(zip(cleaned, gender_labels, ethnicity_labels))

In [27]:
# Smoke-test the end-to-end helper with a handful of names using the ensembles.
input_names = ["John", "Xiang", "Alejandro", "Ayesha"]
predictions = predict_gender_ethnicity(
    input_names,
    ensemble_gender,
    ensemble_ethnicity,
    vectorizer,
    le_gender,
    le_ethnicity,
)

for prediction in predictions:
    name, gender, ethnicity = prediction
    print(f"Name: {name}, Predicted Gender: {gender}, Predicted Ethnicity: {ethnicity}")

Name: john, Predicted Gender: MALE, Predicted Ethnicity: HISPANIC
Name: xiang, Predicted Gender: MALE, Predicted Ethnicity: HISPANIC
Name: alejandro, Predicted Gender: MALE, Predicted Ethnicity: HISPANIC
Name: ayesha, Predicted Gender: FEMALE, Predicted Ethnicity: ASIAN AND PACIFIC ISLANDER
