# Analyses of Baby Name Popularity Distribution in the U.S. over the Last 144 Years

In [2]:
import os

import numpy as np
import pandas as pd

In [3]:
# Location of the engineered-features CSV. Set the BABYNAME_FEATURES_CSV
# environment variable to load a copy stored elsewhere; when it is unset the
# original author's local path is used, so existing runs behave as before.
file_path = os.environ.get(
    'BABYNAME_FEATURES_CSV',
    '/Users/yingzhou/Downloads/Capstone_Babyname/notebooks/Featured_Data.csv',
)

# Read the CSV file into a DataFrame, using its first column as the row index
data = pd.read_csv(file_path, index_col=0)

# Display the first few rows to verify the load
print(data.head())

           Name  Year Gender  Count  Name_Ratio  Gender_Name_Ratio  Is_Famous  \
128731    Aaden  2008      M    958    0.243760           0.469689          0   
129570    Aaden  2009      M   1268    0.331981           0.639458          0   
114438  Aaliyah  1994      F   1451    0.390318           0.812591          0   
115498  Aaliyah  1995      F   1256    0.342980           0.714250          0   
116688  Aaliyah  1996      F    831    0.227856           0.474070          0   

        Gender_Binary  Year_of_Last_Appearance  Is_Top_100  ...  \
128731              1                     2009           0  ...   
129570              1                     2009           0  ...   
114438              0                     2023           0  ...   
115498              0                     2023           0  ...   
116688              0                     2023           0  ...   

        Rolling_Average_Gender_Ratio_5_Years  \
128731                              0.469689   
129570        

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Build the feature matrix: every column except the target, with text columns
# one-hot encoded. The loaded frame holds strings in `Name` and `Gender`,
# which RandomForestClassifier cannot be fitted on. get_dummies (with its
# default `columns=None`) expands only object/string/category columns, so this
# is a no-op if `data` has already been encoded upstream.
# NOTE(review): the recorded feature importances list `Name_*` indicator
# columns, so an encoding step like this was presumably run originally --
# confirm it matches (e.g. whether `Gender` was encoded or dropped).
X = pd.get_dummies(data.drop(columns=['Is_Top_100']))
y = data['Is_Top_100']  # Target variable


In [6]:
# Hold out one fifth of the rows for evaluation; the fixed seed keeps the
# partition the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [7]:

# Give the test matrix exactly the training columns, in the training order.
# reindex drops any column the training set lacks and creates any column the
# test set lacks, filled with 0. Values that were already NaN in X_test are
# left untouched, so the train and test features are preprocessed identically
# (a blanket fillna(0) on X_test alone would impute genuine missing values in
# the test set only).
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)


In [8]:
# Random forest with 100 trees; the fixed seed makes the fitted model
# reproducible across runs.
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

# Fit the forest on the training partition
rf_model.fit(X=X_train, y=y_train)


In [9]:
# Predicted Is_Top_100 label for every held-out row
y_pred = rf_model.predict(X=X_test)


In [10]:
# Share of held-out rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision, recall, F1 and support for a more detailed picture
print(classification_report(y_true=y_test, y_pred=y_pred))


Accuracy: 1.00
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25686
           1       0.98      0.98      0.98      3123

    accuracy                           1.00     28809
   macro avg       0.99      0.99      0.99     28809
weighted avg       1.00      1.00      1.00     28809



In [11]:
# Importance score of each feature, as reported by the fitted forest
importances = rf_model.feature_importances_

# Pair every score with its training column name and rank the rows from the
# most to the least important feature (original positions stay in the index)
feature_importances = pd.DataFrame(
    dict(feature=X_train.columns, importance=importances)
).sort_values('importance', ascending=False)

print(feature_importances)


                                     feature  importance
3                          Gender_Name_Ratio    0.185472
9     Rolling_Average_National_Ratio_5_Years    0.179726
2                                 Name_Ratio    0.155933
8       Rolling_Average_Gender_Ratio_5_Years    0.116324
1                                      Count    0.088664
...                                      ...         ...
1860                              Name_Kenya    0.000000
1856                            Name_Kennith    0.000000
1852                             Name_Kenley    0.000000
1848                             Name_Kendal    0.000000
1708                              Name_Juana    0.000000

[3416 rows x 2 columns]


In [14]:
feature_importances.iloc[:20]

Unnamed: 0,feature,importance
3,Gender_Name_Ratio,0.185472
9,Rolling_Average_National_Ratio_5_Years,0.179726
2,Name_Ratio,0.155933
8,Rolling_Average_Gender_Ratio_5_Years,0.116324
1,Count,0.088664
7,Rolling_Average_Count_5_Years,0.070163
12,Yearly_Change_National_Ratio,0.045798
11,Yearly_Change_Gender_Ratio,0.03705
10,Yearly_Change_Count,0.036835
0,Year,0.014436
