# Analyses of Baby Name Popularity Distribution in the U.S. over the Last 143 Years

In [48]:
import numpy as np
import pandas as pd

# plotting
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import matplotlib.pyplot as plt

# stats
from statsmodels.api import tsa # time series analysis
import statsmodels.api as sm
from statsmodels.tsa.statespace.sarimax import SARIMAX


from prophet import Prophet

In [53]:
# Source CSV for the main table (Name / Year / Gender / Count plus ratio columns).
# NOTE(review): absolute, machine-specific path; this cell only runs where the file exists.
file_path = '/Users/yingzhou/Downloads/Capstone_Babyname/Notebook_data/data.csv'

# Load the CSV into a DataFrame, then preview the leading rows as a sanity check
data = pd.read_csv(filepath_or_buffer=file_path)
print(data.head(n=5))

   Unnamed: 0     Name  Year Gender  Count  Name_Ratio  Gender_Name_Ratio
0           0    Emily  2000      F  25959    6.867840          14.295257
1           1   Hannah  2000      F  23086    6.107745          12.713136
2           2  Madison  2000      F  19968    5.282832          10.996097
3           3   Ashley  2000      F  17998    4.761639           9.911246
4           4    Sarah  2000      F  17713    4.686238           9.754300


In [54]:
# Source CSV for the ranking table (Rank / Name / Number / Gender / Year).
# Note: this rebinds `file_path`, which previously pointed at data.csv.
# NOTE(review): absolute, machine-specific path; this cell only runs where the file exists.
file_path = '/Users/yingzhou/Downloads/Capstone_Babyname/Notebook_data/dataset3.csv'

# Parse the file and show its first five rows to confirm it loaded as expected
df = pd.read_csv(file_path)
print(df.head(5))


   Unnamed: 0  Rank      Name  Number Gender  Year
0           0     1    OLIVER     555   Male  2023
1           1     2      NOAH     537   Male  2023
2           2     3       LEO     409   Male  2023
3           3     4     HENRY     404   Male  2023
4           4     5  THEODORE     400   Male  2023


In [55]:
# Reference rows for checking the prediction: keep only the 2023 records
years_of_interest = [2023]
rank_filtered = df.loc[df['Year'].isin(years_of_interest)]

# Row selectors, built once and combined below
is_2023 = rank_filtered['Year'] == 2023
is_male = rank_filtered['Gender'] == 'Male'
is_female = rank_filtered['Gender'] == 'Female'

# First 100 matching rows per gender. `head` takes rows in their existing
# order, so "top 100" assumes the file is already ordered by Rank.
top_100_male_2023 = rank_filtered[is_2023 & is_male].head(100)
top_100_female_2023 = rank_filtered[is_2023 & is_female].head(100)

# Show both selections
print("Top 100 Male Names for 2023:")
print(top_100_male_2023)

print("\nTop 100 Female Names for 2023:")
print(top_100_female_2023)


Top 100 Male Names for 2023:
    Unnamed: 0  Rank      Name  Number Gender  Year
0            0     1    OLIVER     555   Male  2023
1            1     2      NOAH     537   Male  2023
2            2     3       LEO     409   Male  2023
3            3     4     HENRY     404   Male  2023
4            4     5  THEODORE     400   Male  2023
..         ...   ...       ...     ...    ...   ...
95          95    93    JOSHUA      89   Male  2023
96          96    97      ZAYN      88   Male  2023
97          97    97    MARCUS      88   Male  2023
98          98    97     ASHER      88   Male  2023
99          99   100       ALI      87   Male  2023

[100 rows x 6 columns]

Top 100 Female Names for 2023:
     Unnamed: 0  Rank       Name  Number  Gender  Year
100         100     1       ISLA     403  Female  2023
101         101     2     AMELIA     399  Female  2023
102         102     3     OLIVIA     381  Female  2023
103         103     4        MIA     347  Female  2023
104         104 

### Year of Last Appearance

In [100]:
# Year of last appearance: the latest 'Year' recorded for each name.
# The Series is named before being turned back into a two-column frame
# (Name, Year_of_Last_Appearance).
last_appearance = (
    data.groupby('Name')['Year']
    .max()
    .rename('Year_of_Last_Appearance')
    .reset_index()
)

# Attach the per-name value to every row of the original table
data1 = pd.merge(data, last_appearance, how='left', on='Name')

# Preview the result
print(data1.head())

   Unnamed: 0     Name  Year Gender  Count  Name_Ratio  Gender_Name_Ratio  \
0           0    Emily  2000      F  25959    6.867840          14.295257   
1           1   Hannah  2000      F  23086    6.107745          12.713136   
2           2  Madison  2000      F  19968    5.282832          10.996097   
3           3   Ashley  2000      F  17998    4.761639           9.911246   
4           4    Sarah  2000      F  17713    4.686238           9.754300   

   Year_of_Last_Appearance  
0                     2023  
1                     2023  
2                     2023  
3                     2023  
4                     2023  


### Rolling Averages

In [103]:
# Rolling averages per name over a window of 5 consecutive records.
#
# The rows must be in chronological order *before* the rolling window is
# applied: `groupby(...).rolling(...)` walks each group's rows in the order
# they appear in the frame, and the source file is not sorted by year.
# Sorting only afterwards would average five arbitrary rows per window.
data2 = data1.sort_values(by=['Name', 'Year'])

# Source column -> name of the derived rolling-average column
rolling_columns = {
    'Count': 'Rolling_Average_Count_5_Years',
    'Gender_Name_Ratio': 'Rolling_Average_Gender_Ratio_5_Years',
    'Name_Ratio': 'Rolling_Average_National_Ratio_5_Years',
}

# NOTE(review): the window spans 5 *rows*, not 5 calendar years; years with no
# record for a name are simply skipped.
# NOTE(review): grouping is by 'Name' only, so if a name has both M and F rows
# they share one window. Confirm whether ['Name', 'Gender'] is intended.
for source_column, target_column in rolling_columns.items():
    rolling_mean = (
        data2.groupby('Name')[source_column]
        .rolling(window=5, min_periods=1)
        .mean()
        .reset_index(level=0, drop=True)  # drop the 'Name' level, keep the row labels
    )
    # Assignment aligns on the row labels, so both frames get the same values
    data2[target_column] = rolling_mean
    data1[target_column] = rolling_mean  # keep data1 carrying these columns, as before

# Display the updated dataframe with the rolling averages
print(data2.head(10))

         Unnamed: 0   Name  Year Gender  Count  Name_Ratio  Gender_Name_Ratio  \
448361       448361  Aaban  2007      M      5    0.001251           0.002410   
1658557     1658557  Aaban  2009      M      6    0.001571           0.003026   
622389       622389  Aaban  2010      M      9    0.002436           0.004692   
542878       542878  Aaban  2011      M     11    0.003008           0.005794   
342512       342512  Aaban  2012      M     11    0.003008           0.005801   
475583       475583  Aaban  2013      M     14    0.003841           0.007404   
54484         54484  Aaban  2014      M     16    0.004318           0.008334   
122608       122608  Aaban  2015      M     15    0.004054           0.007830   
309877       309877  Aaban  2016      M      9    0.002454           0.004745   
186730       186730  Aaban  2017      M     11    0.003077           0.005947   

         Year_of_Last_Appearance  Rolling_Average_Count_5_Years  \
448361                      2022         

### Name Length

In [104]:
# Number of characters in each name, stored as a new feature column
name_lengths = data2['Name'].map(len)
data2['Name_Length'] = name_lengths
print(data2.head(n=10))

         Unnamed: 0   Name  Year Gender  Count  Name_Ratio  Gender_Name_Ratio  \
448361       448361  Aaban  2007      M      5    0.001251           0.002410   
1658557     1658557  Aaban  2009      M      6    0.001571           0.003026   
622389       622389  Aaban  2010      M      9    0.002436           0.004692   
542878       542878  Aaban  2011      M     11    0.003008           0.005794   
342512       342512  Aaban  2012      M     11    0.003008           0.005801   
475583       475583  Aaban  2013      M     14    0.003841           0.007404   
54484         54484  Aaban  2014      M     16    0.004318           0.008334   
122608       122608  Aaban  2015      M     15    0.004054           0.007830   
309877       309877  Aaban  2016      M      9    0.002454           0.004745   
186730       186730  Aaban  2017      M     11    0.003077           0.005947   

         Year_of_Last_Appearance  Rolling_Average_Count_5_Years  \
448361                      2022         

In [105]:
# Define a function to count vowels and consonants
def count_vowels(name):
    """Return how many characters of ``name`` are ASCII vowels (upper or lower case).

    Only the ten characters in ``AEIOUaeiou`` are counted; ``y`` and accented
    vowels are not.
    """
    vowel_chars = 'AEIOUaeiou'
    tally = 0
    for letter in name:
        if letter in vowel_chars:
            tally += 1
    return tally

def count_consonants(name):
    """Return how many alphabetic characters of ``name`` are not ASCII vowels.

    Non-letters (digits, hyphens, spaces, ...) are ignored. Any letter outside
    ``AEIOUaeiou`` counts, which includes ``y`` and accented letters.
    """
    vowel_chars = 'AEIOUaeiou'
    tally = 0
    for letter in name:
        # Skip anything that is not a letter, and skip the vowels
        if not letter.isalpha() or letter in vowel_chars:
            continue
        tally += 1
    return tally

# Add one feature column per letter-class counter, each applied to every name
for feature_column, counter in (
    ('Vowel_Count', count_vowels),
    ('Consonant_Count', count_consonants),
):
    data2[feature_column] = data2['Name'].map(counter)

# Preview the frame with the new columns
print(data2.head(n=10))

         Unnamed: 0   Name  Year Gender  Count  Name_Ratio  Gender_Name_Ratio  \
448361       448361  Aaban  2007      M      5    0.001251           0.002410   
1658557     1658557  Aaban  2009      M      6    0.001571           0.003026   
622389       622389  Aaban  2010      M      9    0.002436           0.004692   
542878       542878  Aaban  2011      M     11    0.003008           0.005794   
342512       342512  Aaban  2012      M     11    0.003008           0.005801   
475583       475583  Aaban  2013      M     14    0.003841           0.007404   
54484         54484  Aaban  2014      M     16    0.004318           0.008334   
122608       122608  Aaban  2015      M     15    0.004054           0.007830   
309877       309877  Aaban  2016      M      9    0.002454           0.004745   
186730       186730  Aaban  2017      M     11    0.003077           0.005947   

         Year_of_Last_Appearance  Rolling_Average_Count_5_Years  \
448361                      2022         

### Name Endings