In [1]:
# Importing pandas and matplotlib and necessary modules
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from matplotlib.ticker import EngFormatter
# Read in the baby names CSV as a DataFrame
names = pd.read_csv("baby_names.csv")

In [2]:
# Inspect column dtypes and non-null counts.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line after the report.
names.info()

# Cardinality of the categorical columns: the number of distinct first names
# and the distinct values present in the sex column.
print(names['first_name'].nunique())
print(names['sex'].unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12649 entries, 0 to 12648
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   year        12649 non-null  int64 
 1   first_name  12649 non-null  object
 2   sex         12649 non-null  object
 3   num         12649 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 395.4+ KB
None
547
['F' 'M']


# Yearly totals of babies split by sex: one row per year with an 'F' and an 'M' column.
yearly_trends_sex = (
    names.groupby(['year', 'sex'])['num']
    .sum()
    .unstack()
    .reset_index()
)

# Static line chart with one line per sex on shared axes.
plt.figure(figsize=(10, 6))
for sex_code, sex_label, line_color in (('F', 'Female', 'purple'), ('M', 'Male', 'green')):
    plt.plot(
        yearly_trends_sex['year'],
        yearly_trends_sex[sex_code],
        label=sex_label,
        color=line_color,
    )
plt.xlabel('Year')
plt.ylabel('Number of Babies')
plt.title('Total Number of Babies Named Each Year by Sex')
# EngFormatter renders tick values with SI prefixes such as k (thousands) and M (millions).
formatter = EngFormatter()
plt.gca().yaxis.set_major_formatter(formatter)
plt.legend()
plt.show()

In [3]:
# Yearly totals of babies split by sex: one row per year with an 'F' and an 'M' column.
yearly_trends_sex = (
    names.groupby(['year', 'sex'])['num']
    .sum()
    .unstack()
    .reset_index()
)

# Both traces share the same hover layout; <extra></extra> hides the secondary hover box.
totals_hover = 'Year: %{x}<br>Number of Babies: %{y:,}<extra></extra>'

# Interactive version of the chart: girls first, then boys.
fig = go.Figure()
for sex_code, sex_label, line_color in (('F', 'Female', 'purple'), ('M', 'Male', 'green')):
    fig.add_trace(go.Scatter(
        x=yearly_trends_sex['year'],
        y=yearly_trends_sex[sex_code],
        mode='lines',
        name=sex_label,
        line=dict(color=line_color),
        hovertemplate=totals_hover,
    ))

fig.update_layout(
    title='Total Number of Babies Named Each Year by Gender',
    xaxis_title='Year',
    yaxis_title='Number of Babies',
    yaxis_tickformat=',',  # comma as thousands separator on the y ticks
    legend_title='Gender',
    template='plotly_white',
)

fig.show()

In [4]:
# Total babies per first name over the whole dataset, with the summed column
# exposed as 'number'. grouped_names is kept as its own variable because the
# original cell notes it is reused by a later cell.
grouped_names = (
    names.groupby('first_name')['num']
    .sum()
    .reset_index()
    .rename(columns={'num': 'number'})
)
# Most-given names first.
top_names = grouped_names.sort_values(by='number', ascending=False)
top_names

Unnamed: 0,first_name,number
250,James,4748138
282,John,4510721
447,Robert,4495199
389,Michael,4278824
540,William,3614424
...,...,...
14,Alexandria,5026
96,Caroline,5021
363,Marc,5013
193,Ezekiel,5013


Classify each name's popularity according to the number of years that the name appears in the dataset.

In [5]:
# Total number of babies per first name, exposed as 'number'.
popular_names_sum = (
    names.groupby('first_name')['num']
    .sum()
    .reset_index()
    .rename(columns={'num': 'number'})
)
# Number of distinct years in which each first name appears, exposed as 'year_count'.
popular_names_years = (
    names.groupby('first_name')['year']
    .nunique()
    .reset_index()
    .rename(columns={'year': 'year_count'})
)
# One row per first name carrying both its total and its year count.
popular_names = popular_names_sum.merge(popular_names_years, on='first_name')

# Maps the number of years a name appears in the dataset to a popularity tier.
def classify_popularity(year_count):
    """Return the popularity tier for a name seen in `year_count` distinct years.

    Tiers use strict lower bounds, checked from the highest down:
    more than 80 years -> 'Classic', more than 50 -> 'Semi-classic',
    more than 20 -> 'Semi-trendy', anything else -> 'Trendy'.
    """
    tiers = ((80, 'Classic'), (50, 'Semi-classic'), (20, 'Semi-trendy'))
    for lower_bound, label in tiers:
        if year_count > lower_bound:
            return label
    return 'Trendy'
# Label every name with its popularity tier, derived from its year count.
popular_names['popularity_type'] = popular_names['year_count'].map(classify_popularity)
# year_count was only needed to derive the tier, so it is dropped afterwards.
popular_names = popular_names.drop(columns=['year_count'])
# Alphabetical listing by first name.
result = popular_names.sort_values(by='first_name')

# Preview the first 20 names.
result.head(20)

Unnamed: 0,first_name,number,popularity_type
0,Aaliyah,15870,Trendy
1,Aaron,530592,Semi-classic
2,Abigail,338485,Semi-trendy
3,Adam,497293,Semi-trendy
4,Addison,107433,Trendy
5,Adrian,147741,Semi-trendy
6,Aidan,68566,Trendy
7,Aiden,216194,Trendy
8,Alan,162041,Semi-trendy
9,Albert,260945,Semi-trendy


In [6]:
# How many names fall into each popularity tier, as a two-column frame
# with columns 'popularity_type' and 'count'.
df = (
    result['popularity_type']
    .value_counts()
    .rename_axis('popularity_type')
    .reset_index(name='count')
)

# One bar per tier, colored by tier.
fig = px.bar(
    df,
    x='popularity_type',
    y='count',
    title='Number of Names by Popularity Type',
    labels={'popularity_type': 'Popularity Type', 'count': 'Number of Names'},
    color='popularity_type',
)

# Hover shows the tier in bold and the count with thousands separators.
fig.update_traces(hovertemplate='<b>%{x}</b><br>Number: %{y:,}')

fig.show()


In [7]:
# The 20 names with the highest overall totals.
top_20 = result.sort_values(by='number', ascending=False).head(20)

# Interactive bar chart: one bar per name, colored by popularity tier.
fig = px.bar(
    top_20,
    x='first_name',
    y='number',
    color='popularity_type',
    title='Top 20 Popular Names',
    labels={'first_name': 'First Name', 'number': 'Number of Occurrences'},
    hover_data={'popularity_type': True},
)

# The template reads the tier from customdata[0]; popularity_type is requested
# through hover_data above so that it is available there.
top20_hover = '<b>%{x}</b><br>Number: %{y:,}<br>Popularity: %{customdata[0]}'
fig.update_traces(hovertemplate=top20_hover)

fig.show()

In [8]:
# Attach each name's popularity tier to every row of the raw data.
tier_lookup = popular_names[['first_name', 'popularity_type']]
names_with_popularity = names.merge(tier_lookup, on='first_name')
# Number of rows per (year, tier). This counts records, not babies.
popularity_over_years = (
    names_with_popularity.groupby(['year', 'popularity_type'])
    .size()
    .reset_index(name='count')
)
# One line per popularity tier.
fig = px.line(
    popularity_over_years,
    x='year',
    y='count',
    color='popularity_type',
    title='Popularity Types Over Years',
    labels={'year': 'Year', 'count': 'total of names'},
)
fig.update_layout(legend_title_text='Popularity Type')

fig.show()

In [42]:
# Combined dashboard: the three popularity charts drawn in one figure with a
# shared color per popularity type.
color_map = {
    'Classic': '#facc5f',       # yellow
    'Semi-classic': '#43d7a4',  # green
    'Semi-trendy': '#4095db',   # blue
    'Trendy': '#6568a0',        # purple
}

# First chart: number of names in each popularity type.
df = result['popularity_type'].value_counts().reset_index()
df.columns = ['popularity_type', 'count']
fig1 = px.bar(df,
              x='popularity_type',
              y='count',
              title='Number of Names by Popularity Type',
              labels={'popularity_type': 'Popularity Type', 'count': 'Number of Names'},
              color='popularity_type',
              color_discrete_map=color_map)
fig1.update_traces(hovertemplate='<b>%{x}</b><br>Number: %{y:,}')

# Second chart: the 20 names with the highest overall totals.
top_20 = result.sort_values(by='number', ascending=False).head(20)
fig2 = px.bar(top_20,
              x='first_name',
              y='number',
              color='popularity_type',
              title='Top 20 Popular Names',
              labels={'first_name': 'First Name', 'number': 'Number of Occurrences'},
              hover_data={'popularity_type': True},
              color_discrete_map=color_map)
fig2.update_traces(hovertemplate='<b>%{x}</b><br>Number: %{y:,}<br>Popularity: %{customdata[0]}')

# Third chart: number of records per year for each popularity type.
names_with_popularity = names.merge(popular_names[['first_name', 'popularity_type']], on='first_name')
popularity_over_years = names_with_popularity.groupby(['year', 'popularity_type']).size().reset_index(name='count')
fig3 = px.line(popularity_over_years,
               x='year',
               y='count',
               color='popularity_type',
               title='Popularity Types Over Years',
               labels={'year': 'Year', 'count': 'Total of Names'},
               color_discrete_map=color_map)
fig3.update_layout(legend_title_text='Popularity Type')

# 2x2 grid; the three charts fill (1,1), (1,2) and (2,1).
fig = make_subplots(rows=2, cols=2, subplot_titles=("Number of Names by Popularity Type", "Top 20 Popular Names", "Popularity Types Over Years"))

# Copy every trace into its subplot. The three source figures use the same
# trace names (the popularity types), so without intervention the combined
# legend would list each type once per chart. Each type is therefore shown in
# the legend only the first time it is added, and all traces of a type share a
# legend group so that one legend click toggles the type in every subplot.
legend_names_shown = set()
for source_fig, row, col in ((fig1, 1, 1), (fig2, 1, 2), (fig3, 2, 1)):
    for trace in source_fig['data']:
        fig.add_trace(trace, row=row, col=col)
        added_trace = fig.data[-1]  # the copy that now lives in the combined figure
        added_trace.legendgroup = trace.name
        added_trace.showlegend = trace.name not in legend_names_shown
        legend_names_shown.add(trace.name)

fig.update_layout(title_text="Comparison of Popularity Types and Top 20 Names", showlegend=True, height=800)

fig.show()

In [10]:
# Every distinct year present in the dataset, used as the reference set
# when looking for the years in which a given name is absent.
all_years = set(pd.unique(names['year']))

# Years of the dataset in which a given name does not occur.
def find_missing_years(name):
    """Return the sorted list of dataset years that have no row for `name`.

    Reads the notebook-level `names` DataFrame and the `all_years` set.
    """
    years_with_name = names.loc[names['first_name'] == name, 'year'].unique()
    return sorted(all_years.difference(years_with_name))

# Earliest year in which a given name occurs.
def find_first_year(name):
    """Return the smallest year with a row for `name`, or None if there is none.

    Reads the notebook-level `names` DataFrame.
    """
    years_with_name = names.loc[names['first_name'] == name, 'year']
    if years_with_name.empty:
        return None
    return years_with_name.min()

# Build one frame per popularity type holding, for every name of that type,
# its missing years, its first year, and how many years are missing.
results = []
# sort=False keeps the types in order of first appearance in popular_names.
for popularity_type, type_names in popular_names.groupby('popularity_type', sort=False):
    name_column = type_names['first_name']
    missing_years_df = pd.DataFrame({
        'first_name': name_column,
        'popularity_type': popularity_type,
        'missing_years': name_column.map(find_missing_years),
        'first_year': name_column.map(find_first_year),
    })
    missing_years_df['missing_count'] = missing_years_df['missing_years'].map(len)
    results.append(missing_years_df)

# Stack the per-type frames into a single table.
all_missing_years_df = pd.concat(results)

# Show every name ordered by type (A-Z), then by the most missing years first.
summary_columns = ['first_name', 'popularity_type', 'first_year', 'missing_count']
all_missing_years_df[summary_columns].sort_values(by=['popularity_type', 'missing_count'], ascending=[True, False])

Unnamed: 0,first_name,popularity_type,first_year,missing_count
138,Daniel,Classic,1937,17
379,Mary,Classic,1920,17
442,Richard,Classic,1920,17
389,Michael,Classic,1936,16
37,Anthony,Classic,1921,15
...,...,...,...,...
140,Danny,Trendy,1946,81
436,Randy,Trendy,1951,81
448,Robin,Trendy,1953,81
499,Tammy,Trendy,1958,81


The analysis of names across the popularity types shows that:
- Even the "Classic" names with the most gaps, such as Daniel, Mary, and Richard, are missing from at most 17 of the 101 years, indicating consistent popularity over time.
- "Classic" names like Charles, David, Elisabeth, and James are present in all 101 years (1920–2020).
- "Trendy" names such as Dillon, Ariana, and Ariel have significant gaps, with many missing for 100 out of the 101 years, reflecting their sporadic popularity.
This suggests that "Classic" names tend to have a more stable presence, while "Trendy" names experience fluctuating popularity.


In [43]:
# Rows for female babies only.
female_names = names.loc[names['sex'] == 'F']

# Total number of babies per female first name over all years.
female_name_counts = female_names.groupby('first_name', as_index=False)['num'].sum()

# Most-given names first.
top_female_names = female_name_counts.sort_values(by='num', ascending=False)

# Show the ten most-given female names.
top_female_names.head(10)

Unnamed: 0,first_name,num
213,Mary,3215850
237,Patricia,1479802
97,Elizabeth,1436286
140,Jennifer,1404743
186,Linda,1361021
36,Barbara,1343901
280,Susan,1025728
141,Jessica,994210
189,Lisa,920119
41,Betty,893396


For the period 1920–2020:
The five most-given female names are: 'Mary', 'Patricia', 'Elizabeth', 'Jennifer', 'Linda'

In [12]:
# First names of the five most-given female names, in ranking order.
top5_female_names = top_female_names['first_name'].head(5).tolist()

# Print the list.
print(top5_female_names)

['Mary', 'Patricia', 'Elizabeth', 'Jennifer', 'Linda']


In [44]:
# Rows for male babies only.
male_names = names.loc[names['sex'] == 'M']

# Total number of babies per male first name over all years.
male_name_counts = male_names.groupby('first_name', as_index=False)['num'].sum()

# Most-given names first.
top_male_names = male_name_counts.sort_values(by='num', ascending=False)

# Show the ten most-given male names.
top_male_names.head(10)

Unnamed: 0,first_name,num
117,James,4748138
135,John,4510721
196,Robert,4495199
171,Michael,4278824
235,William,3614424
63,David,3571498
193,Richard,2414838
140,Joseph,2361382
220,Thomas,2166802
41,Charles,2112352


In [14]:
# First names of the five most-given male names, in ranking order.
top5_male_names = top_male_names['first_name'].head(5).tolist()

# Print the list.
print(top5_male_names)

['James', 'John', 'Robert', 'Michael', 'William']


For the period 1920–2020:
The five most-given male names are: James, John, Robert, Michael, William

In [15]:
# For each year, the highest count reached by any single male name.
male_names = names.loc[names['sex'] == 'M']

# First take the maximum per (year, name), then the maximum of those per year.
max_num_per_year = (
    male_names.groupby(['year', 'first_name'])['num'].max().reset_index()
    .groupby('year')['num'].max().reset_index()
)

# Expose the yearly maximum under the clearer column name 'max_num'.
max_male_per_year = max_num_per_year.rename(columns={'num': 'max_num'})

max_male_per_year

Unnamed: 0,year,max_num
0,1920,56914
1,1921,58215
2,1922,57280
3,1923,57469
4,1924,60801
...,...,...
96,2016,19154
97,2017,18824
98,2018,19924
99,2019,20555


The previous cell shows, for each year, the maximum number of times a single name was given.
The next cells find which names those were, by gender.

In [16]:
# Find which male name reached the yearly maximum count.
male_names = names.loc[names['sex'] == 'M']

# Maximum count per (year, name).
max_num_per_year = male_names.groupby(['year', 'first_name'])['num'].max().reset_index()

# Highest count per year; merging it back on (year, num) keeps only the
# name(s) whose count equals that yearly maximum.
yearly_peak = max_num_per_year.groupby('year')['num'].max().reset_index()
top_male_names_per_year = (
    max_num_per_year
    .merge(yearly_peak, on=['year', 'num'])
    .rename(columns={'num': 'max_num'})
)

top_male_names_per_year

Unnamed: 0,year,first_name,max_num
0,1920,John,56914
1,1921,John,58215
2,1922,John,57280
3,1923,John,57469
4,1924,Robert,60801
...,...,...,...
96,2016,Noah,19154
97,2017,Liam,18824
98,2018,Liam,19924
99,2019,Liam,20555


**_Remember:_** For the period 1920–2020, the five most-given male names overall are James, John, Robert, Michael, and William.
The code returns, for each year, the name that was given most often:
- From 1920 to 1923 it was John.
- From 1924 to 1939 it was Robert.
- Scrolling down, we can see that Robert came back to the top of the list in 1953, while John does not appear any more.
- From 1940 to 1952 it was James.
From year to year, some names were the most given and then disappeared. In the next cells we'll dive into it.

In [45]:
# Number of years in which each name was the top male name.
years_at_number_one = (
    top_male_names_per_year.groupby('first_name')
    .size()
    .reset_index(name='years_at_number_one')
)

# Same table ordered from the most years at number one to the fewest.
ranking_columns = ['first_name', 'years_at_number_one']
all_names_with_years_at_number_one = (
    years_at_number_one[ranking_columns]
    .sort_values(by='years_at_number_one', ascending=False)
)

all_names_with_years_at_number_one

Unnamed: 0,first_name,years_at_number_one
5,Michael,44
7,Robert,17
1,Jacob,14
2,James,13
3,John,4
4,Liam,4
6,Noah,4
0,David,1


In [18]:
# The five male names with the most years at number one, in ranking order.
top5_malenames_number1 = all_names_with_years_at_number_one['first_name'].head(5).tolist()
top5_malenames_number1

['Michael', 'Robert', 'Jacob', 'James', 'John']

The name that appears most often is Michael: it was the top name in 44 years.
_**Remember:**_ For the period 1920–2020, the five most-given male names overall are James, John, Robert, Michael, and William.
However, William does not appear as the most-given name in any year.

In [46]:
# Line chart of yearly counts for selected male names.
# The list holds the five most-given male names overall, plus Jacob, which was
# added because it was the top name in some years.
selected_names = ['Michael', 'Robert', 'Jacob', 'James', 'John', 'William']
is_selected_male = names['first_name'].isin(selected_names) & (names['sex'] == 'M')
filtered_names = names[is_selected_male]

# Number of babies per (year, name).
name_counts_per_year = filtered_names.groupby(['year', 'first_name'])['num'].sum().reset_index()

# One line per name.
fig = px.line(
    name_counts_per_year,
    x='year',
    y='num',
    color='first_name',
    title='Popularity of Selected Male Names Over Time',
    labels={'num': 'Number of Babies', 'year': 'Year', 'first_name': 'Name'},
)

fig.show()

In [20]:
# Find the first year in which 'William' ranks among the five male names with
# the highest cumulative count (running total since the first year of data).

# Running total of babies per (first_name, sex) in chronological order.
# Rows are ordered by year before accumulating so the result does not depend
# on the row order of the CSV; the assignment realigns on the original index.
names['cumulative_num'] = (
    names.sort_values('year', kind='stable')
    .groupby(['first_name', 'sex'])['num']
    .cumsum()
)

# Rows for male babies only (including the new cumulative column).
male_names = names[names['sex'] == 'M']

# Year and cumulative count at which 'William' first enters the top 5;
# both stay None if that never happens.
year_william_top5 = None
cumulative_num_william = None

# Walk through the years chronologically and stop at the first hit.
for year in sorted(male_names['year'].unique()):
    current_year_data = male_names[male_names['year'] == year]

    # Five names with the highest cumulative count up to this year.
    top_names_current_year = current_year_data.sort_values(by='cumulative_num', ascending=False)
    top5_current_year = top_names_current_year.head(5)

    william_rows = top5_current_year[top5_current_year['first_name'] == 'William']
    if not william_rows.empty:
        year_william_top5 = year
        cumulative_num_william = william_rows['cumulative_num'].iloc[0]
        break

# Report what was actually computed: entry into the cumulative top 5,
# not being the single most-given name of a year.
if year_william_top5 is None:
    print('William never entered the cumulative top 5 male names')
else:
    print('William first entered the cumulative top 5 male names in', year_william_top5,
          'with a cumulative count of', cumulative_num_william)

Year Williams was top name in  1920  and Number was :  50148


The analysis shows that the name "William" did not appear as the top male name in any given year based on the maximum number of occurrences. Instead, names like "John" and "Robert" held the top spot: "John" was the most popular male name from 1920 to 1923, "Robert" took over in 1924, and "Liam" led in recent years from 2017 to 2020. This indicates that while "William" was a popular name, it never reached the highest annual occurrence compared to other male names.

In [21]:
# For each year, the highest count reached by any single female name.
female_names = names.loc[names['sex'] == 'F']

# First take the maximum per (year, name), then the maximum of those per year.
max_num_per_year_female = (
    female_names.groupby(['year', 'first_name'])['num'].max().reset_index()
    .groupby('year')['num'].max().reset_index()
)

# Expose the yearly maximum under the clearer column name 'max_num'.
max_female_per_year = max_num_per_year_female.rename(columns={'num': 'max_num'})

max_female_per_year

Unnamed: 0,year,max_num
0,1920,70982
1,1921,73985
2,1922,72176
3,1923,71635
4,1924,73536
...,...,...
96,2016,19522
97,2017,19837
98,2018,18770
99,2019,18508


In [22]:
# Find which female name reached the yearly maximum count.
female_names = names.loc[names['sex'] == 'F']

# Maximum count per (year, name) among female names.
max_num_per_year = female_names.groupby(['year', 'first_name'])['num'].max().reset_index()

# Highest count per year; merging it back on (year, num) keeps only the
# female name(s) whose count equals that yearly maximum.
yearly_peak_female = max_num_per_year.groupby('year')['num'].max().reset_index()
top_female_names_per_year = (
    max_num_per_year
    .merge(yearly_peak_female, on=['year', 'num'])
    .rename(columns={'num': 'max_num'})
)

top_female_names_per_year

Unnamed: 0,year,first_name,max_num
0,1920,Mary,70982
1,1921,Mary,73985
2,1922,Mary,72176
3,1923,Mary,71635
4,1924,Mary,73536
...,...,...,...
96,2016,Emma,19522
97,2017,Emma,19837
98,2018,Emma,18770
99,2019,Olivia,18508


Same approach as for the male names.
The code returns, for each year, the most-given female name:
- From the previous cell we know that in 1920 the maximum number of times a name was given is 70,982; this matches the name Mary in 1920.
- From 1920 to 1946 Mary is the most-given name.
- From 1947 to 1952 it is Linda.
- Mary came back to the top of the list from 1953 to 1961.

In [47]:
# Recover the name behind each yearly maximum by matching (year, num)
# against the female rows.
max_female_per_year = max_num_per_year_female.merge(female_names, on=['year', 'num'])

# Number of years each name was the most-given female name, as a two-column
# frame with columns 'first_name' and 'num_years_on_top'.
top_names_count = (
    max_female_per_year['first_name']
    .value_counts()
    .rename_axis('first_name')
    .reset_index(name='num_years_on_top')
)

top_names_count

Unnamed: 0,first_name,num_years_on_top
0,Mary,36
1,Jennifer,15
2,Emily,12
3,Jessica,9
4,Lisa,8
5,Linda,6
6,Emma,6
7,Sophia,3
8,Ashley,2
9,Isabella,2


In [24]:
# The five female names with the most years on top, in ranking order.
top5_femalenames_number1 = top_names_count['first_name'].head(5).tolist()
top5_femalenames_number1

['Mary', 'Jennifer', 'Emily', 'Jessica', 'Lisa']

Mary appears 36 times at the top.
_**Remember:**_ For the period 1920–2020, the five most-given female names overall are 'Mary', 'Patricia', 'Elizabeth', 'Jennifer', 'Linda'.
Notice that:
- Patricia and Elizabeth were never the most-given name in any year.
- Linda is only 6th by number of years at the top (6 years), yet she is among the 5 most-given names over the 101 years.
 

In [48]:
# Line chart of yearly counts for selected female names.
# The list holds the five most-given female names overall, plus Emily and Lisa,
# which were added because each was the top name in some years.
selected_femalnames = ['Mary', 'Patricia', 'Elizabeth', 'Jennifer', 'Linda' , 'Emily', 'Lisa']
filtered_femalnames = names[(names['first_name'].isin(selected_femalnames)) & (names['sex'] == 'F')]

# Number of babies per (year, name).
name_counts_per_year = filtered_femalnames.groupby(['year', 'first_name'])['num'].sum().reset_index()

# One line per name. The title says "Female": this chart is filtered to
# sex == 'F', and the previous wording ("Male") was carried over from the
# male-names chart.
fig = px.line(name_counts_per_year, x='year', y='num', color='first_name',
              title='Popularity of Selected Female Names Over Time',
              labels={'num': 'Number of Babies', 'year': 'Year', 'first_name': 'Name'})

fig.show()

**Summary:**
Patricia and Elizabeth were consistently popular but never the top names in any given year.
Other names had higher peak popularity, overshadowing Patricia and Elizabeth.
The line plot provides a visual representation of their popularity trends over time, allowing for comparison with other names.
These conclusions help you understand the historical popularity of Patricia and Elizabeth and how they compare to other names that were more dominant in specific years.

In [26]:
# Find the first year in which 'Patricia' and 'Elizabeth' each rank among the
# five female names with the highest cumulative count (running total since the
# first year of data).

# Running total of babies per (first_name, sex) in chronological order.
# Rows are ordered by year before accumulating so the result does not depend
# on the row order of the CSV; the assignment realigns on the original index.
names['cumulative_num'] = (
    names.sort_values('year', kind='stable')
    .groupby(['first_name', 'sex'])['num']
    .cumsum()
)

# Rows for female babies only (including the cumulative column).
female_names = names[names['sex'] == 'F']

# name -> (year, cumulative count) at the moment the name first enters the top 5.
tracked_names = ('Patricia', 'Elizabeth')
first_top5_entry = {}

# Walk through the years chronologically; stop once both names are found.
for year in sorted(female_names['year'].unique()):
    current_year_data = female_names[female_names['year'] == year]

    # Five names with the highest cumulative count up to this year.
    top_names_current_year = current_year_data.sort_values(by='cumulative_num', ascending=False)
    top5_current_year = top_names_current_year.head(5)

    for tracked_name in tracked_names:
        if tracked_name in first_top5_entry:
            continue  # only the first entry year is recorded
        tracked_rows = top5_current_year[top5_current_year['first_name'] == tracked_name]
        if not tracked_rows.empty:
            first_top5_entry[tracked_name] = (year, tracked_rows['cumulative_num'].iloc[0])

    if len(first_top5_entry) == len(tracked_names):
        break

# Keep the per-name result variables; they stay None when a name never
# enters the cumulative top 5.
year_patricia_top5, cumulative_num_patricia = first_top5_entry.get('Patricia', (None, None))
year_elizabeth_top5, cumulative_num_elizabeth = first_top5_entry.get('Elizabeth', (None, None))

# Report what was actually computed: entry into the cumulative top 5,
# not being the single most-given name of a year.
print('Patricia first entered the cumulative top 5 female names in:', year_patricia_top5,
      'with a cumulative count of:', cumulative_num_patricia)
print('Elizabeth first entered the cumulative top 5 female names in:', year_elizabeth_top5,
      'with a cumulative count of:', cumulative_num_elizabeth)

Year Patricia was top name in: 1945 and Number was: 502096
Year Elizabeth was top name in: 1976 and Number was: 778793


In [27]:
# Rows for the names 'Adam' and 'Joseph' only (both sexes).
selected_names = names.loc[names['first_name'].isin(['Adam', 'Joseph'])]

# Babies per (year, name), plus a running total per name in row order.
selected_name_counts = (
    selected_names.groupby(['year', 'first_name'])['num']
    .sum()
    .reset_index()
    .assign(cumulative_num=lambda counts: counts.groupby('first_name')['num'].cumsum())
)

selected_name_counts

Unnamed: 0,year,first_name,num,cumulative_num
0,1920,Joseph,25590,25590
1,1921,Joseph,26182,51772
2,1922,Joseph,25698,77470
3,1923,Joseph,25562,103032
4,1924,Joseph,25422,128454
...,...,...,...,...
142,2016,Joseph,10941,2323837
143,2017,Joseph,10456,2334293
144,2018,Joseph,9636,2343929
145,2019,Joseph,9104,2353033


In [28]:
# For each (year, sex), the highest count reached by any single name:
# take the maximum per (year, sex, name), then the maximum of those per (year, sex).
max_num_per_year_gender = (
    names.groupby(['year', 'sex', 'first_name'])['num'].max().reset_index()
    .groupby(['year', 'sex'])['num'].max().reset_index()
)

# Clearer column names for plotting: 'max_num' and 'gender'.
max_per_year_gender = max_num_per_year_gender.rename(columns={'num': 'max_num', 'sex': 'gender'})

max_per_year_gender

Unnamed: 0,year,gender,max_num
0,1920,F,70982
1,1920,M,56914
2,1921,F,73985
3,1921,M,58215
4,1922,F,72176
...,...,...,...
197,2018,M,19924
198,2019,F,18508
199,2019,M,20555
200,2020,F,17535


In [29]:
# One line per gender showing the yearly maximum count of a single name.
fig = px.line(
    max_per_year_gender,
    x='year',
    y='max_num',
    color='gender',
    title='Maximum Number of Babies Given Any One Name Per Year by Gender',
)

fig.show()

In [30]:
# Wide layout: one row per year with an 'F' and an 'M' column of yearly maxima.
yearly_trends_gender = (
    max_per_year_gender
    .pivot(index='year', columns='gender', values='max_num')
    .reset_index()
)

# Both traces share the same hover layout; <extra></extra> hides the secondary hover box.
maxima_hover = 'Year: %{x}<br>Number of Babies: %{y:,}<extra></extra>'

# Girls first, then boys.
fig = go.Figure()
for gender_code, gender_label, line_color in (('F', 'Female', 'purple'), ('M', 'Male', 'green')):
    fig.add_trace(go.Scatter(
        x=yearly_trends_gender['year'],
        y=yearly_trends_gender[gender_code],
        mode='lines',
        name=gender_label,
        line=dict(color=line_color),
        hovertemplate=maxima_hover,
    ))

fig.update_layout(
    title='Maximum Number of Babies Given Any One Name Per Year by Gender',
    xaxis_title='Year',
    yaxis_title='Number of Babies',
    yaxis_tickformat=',',  # comma as thousands separator on the y ticks
    legend_title='Gender',
    template='plotly_white',
)

fig.show()

In [31]:
# Sum of 'num' over all names and both sexes for each year.
total_births_per_year = names.groupby('year', as_index=False)['num'].sum()
total_births_per_year

Unnamed: 0,year,num
0,1920,1200971
1,1921,1264083
2,1922,1208754
3,1923,1236275
4,1924,1294834
...,...,...
96,2016,1087581
97,2017,1042170
98,2018,977117
99,2019,916795


To determine whether the decline in 2020 is due to fewer births or less usage of popular names, we need to analyze the overall birth data and the distribution of names. Let's perform the following steps:

Calculate the total number of births per year.
Compare the trend of total births with the trend of the most popular names.
I'll start by calculating the total number of births per year.

In [32]:
# Sum of 'num' over all names for each (year, sex) pair.
births_per_year_gender = names.groupby(['year', 'sex'], as_index=False)['num'].sum()
births_per_year_gender

Unnamed: 0,year,sex,num
0,1920,F,600217
1,1920,M,600754
2,1921,F,638111
3,1921,M,625972
4,1922,F,596728
...,...,...,...
197,2018,M,632503
198,2019,F,320977
199,2019,M,595818
200,2020,F,296365


The analysis shows a significant decline in the total number of births from 2016 to 2020, with births dropping from 1,087,581 in 2016 to 826,019 in 2020. When broken down by gender, both female and male births decreased during this period. Female births dropped from 320,977 in 2019 to 296,365 in 2020, and male births dropped from 595,818 in 2019 to 529,654 in 2020. This indicates that the decline in the maximum number of babies given any one name by 2020 is primarily due to an overall decrease in the number of births rather than a decline in the popularity of specific names.

In [33]:
# One line per sex showing the yearly sum of 'num'.
fig = px.line(
    births_per_year_gender,
    x='year',
    y='num',
    color='sex',
    title='Total Number of Births Per Year by Gender',
)

fig.show()

The interactive line plot shows that the total number of births for both genders has been declining since around 1987. This trend is consistent for both male and female births. The significant drop in the maximum number of babies given any one name by 2020 is primarily due to an overall decrease in the number of births rather than a decline in the popularity of specific names.

In [34]:
# Preview the first five rows of names, now including the cumulative_num column.
names.head(5)

Unnamed: 0,year,first_name,sex,num,cumulative_num
0,1920,Mary,F,70982,70982
1,1920,Dorothy,F,36643,36643
2,1920,Helen,F,35097,35097
3,1920,Margaret,F,27994,27994
4,1920,Ruth,F,26101,26101


In [35]:
!pip install pmdarima

Defaulting to user installation because normal site-packages is not writeable
Collecting pmdarima
  Downloading pmdarima-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (7.8 kB)
Downloading pmdarima-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m107.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pmdarima
Successfully installed pmdarima-2.0.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [36]:
import pmdarima as pm
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Collapse the per-sex counts into a single total-births series indexed by year
births_per_year = births_per_year_gender.groupby('year')['num'].sum()

# Stepwise auto_arima search for a non-seasonal model on the yearly totals
arima_model = pm.auto_arima(births_per_year, seasonal=False, stepwise=True)

# Fitted values on the training data itself (in-sample predictions)
y_pred_arima = arima_model.predict_in_sample()

# Score the in-sample fit: MAE, MSE and R-squared, computed in one pass
mae_arima, mse_arima, r2_arima = (
    metric(births_per_year, y_pred_arima)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

mae_arima, mse_arima, r2_arima

(59376.92697757526, 7262436098.597697, 0.9736148755440206)

In [37]:
# Forecast total births for the n_periods years that follow the last observed
# year with the fitted ARIMA model, then split every yearly total into male and
# female counts. The model is fitted on totals only, so the per-sex numbers are
# a fixed-ratio split of one forecast, not two independent forecasts.

# Define the number of periods for forecasting (101 years)
n_periods = 101

# Make future predictions for total births
future_forecast_total = arima_model.predict(n_periods=n_periods)

# Assume the male/female share stays constant at the last available year's values
last_observed_year = births_per_year_gender['year'].max()
last_year_data = births_per_year_gender[births_per_year_gender['year'] == last_observed_year]
total_last_year = last_year_data['num'].sum()
# .loc with a boolean mask + column label avoids chained indexing
male_ratio = last_year_data.loc[last_year_data['sex'] == 'M', 'num'].iloc[0] / total_last_year
female_ratio = last_year_data.loc[last_year_data['sex'] == 'F', 'num'].iloc[0] / total_last_year

# Calculate future predictions for male and female births
future_forecast_male = future_forecast_total * male_ratio
future_forecast_female = future_forecast_total * female_ratio

# Forecast years start right after the last observed year. A plain integer
# range replaces pd.date_range(..., freq='Y').year: the 'Y' offset alias is
# deprecated in recent pandas, and no datetime round-trip is needed for years.
first_forecast_year = int(last_observed_year) + 1
future_years = pd.RangeIndex(first_forecast_year, first_forecast_year + n_periods)

# Create DataFrames for the forecasted values (column is named 'gender' here,
# unlike the historical frame's 'sex'; downstream cells rely on that name)
forecast_male_df = pd.DataFrame({'year': future_years, 'gender': 'M', 'predicted_births': future_forecast_male})
forecast_female_df = pd.DataFrame({'year': future_years, 'gender': 'F', 'predicted_births': future_forecast_female})

# Combine male and female forecasts into a single DataFrame
forecast_df = pd.concat([forecast_male_df, forecast_female_df]).reset_index(drop=True)

# Display the forecasted values
forecast_df


No supported index is available. Prediction results will be given with an integer index beginning at `start`.



Unnamed: 0,year,gender,predicted_births
0,2021,M,508437.046578
1,2022,M,512322.839958
2,2023,M,516149.236057
3,2024,M,519917.142805
4,2025,M,523627.454257
...,...,...,...
197,2117,F,394315.374032
198,2118,F,394810.933260
199,2119,F,395298.917491
200,2120,F,395779.442516


In [38]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Restrict each frame to its time window (bounds are inclusive on both ends)
in_history = (births_per_year_gender['year'] >= 1920) & (births_per_year_gender['year'] <= 2020)
historical_data = births_per_year_gender.loc[in_history]
in_forecast = (forecast_df['year'] >= 2021) & (forecast_df['year'] <= 2121)
forecast_data = forecast_df.loc[in_forecast]

# Two stacked panels sharing the x-axis: history on top, forecast below
fig = make_subplots(
    rows=2,
    cols=1,
    shared_xaxes=True,
    subplot_titles=("1920 to 2020", "2021 to 2121"),
)

# One entry per panel: (frame, sex column, value column, legend prefix, subplot row)
panels = [
    (historical_data, 'sex', 'num', 'Historical', 1),
    (forecast_data, 'gender', 'predicted_births', 'Forecast', 2),
]

# Add one line trace per sex to each panel, historical traces first
for panel_frame, sex_col, value_col, legend_prefix, panel_row in panels:
    for gender in panel_frame[sex_col].unique():
        gender_data = panel_frame[panel_frame[sex_col] == gender]
        trace = go.Scatter(
            x=gender_data['year'],
            y=gender_data[value_col],
            mode='lines',
            name=f'{legend_prefix} {gender}',
        )
        fig.add_trace(trace, row=panel_row, col=1)

# Overall figure size and title
fig.update_layout(height=800, width=1000, title_text="Births Forecast: 1920 to 2020 and 2021 to 2121")

# Render the figure
fig.show()

In [39]:
from prophet import Prophet
def prepare_data_for_prophet(names, name, sex):
    """Select one name/sex series and rename its columns to ``ds`` and ``y``.

    Parameters
    ----------
    names : DataFrame with ``first_name``, ``sex``, ``year`` and ``num`` columns.
    name : first name to keep.
    sex : sex code to keep.

    Returns
    -------
    A new two-column DataFrame (``ds`` from ``year``, ``y`` from ``num``) holding
    the matching rows; the original row labels are kept and ``names`` is not modified.
    """
    is_match = (names['first_name'] == name) & (names['sex'] == sex)
    selected = names.loc[is_match, ['year', 'num']]
    return selected.rename(columns={'year': 'ds', 'num': 'y'})

# Distinct first names and distinct sex codes present in the data
unique_names, unique_sexes = (names[column].unique() for column in ('first_name', 'sex'))

In [40]:
# Keep male records whose year falls in the inclusive 1920-2121 window
in_window = (male_names['year'] >= 1920) & (male_names['year'] <= 2121)
male_names_1920_2121 = male_names.loc[in_window]

# Total number of births per first name over that window
male_name_counts_1920_2121 = male_names_1920_2121.groupby('first_name', as_index=False)['num'].sum()

# Rank names from most to least births and keep the five most popular
ranked_male_names = male_name_counts_1920_2121.sort_values(by='num', ascending=False)
top_male_names_1920_2121 = ranked_male_names.iloc[:5]

# Show the top 5 male names from 1920 to 2121
top_male_names_1920_2121

Unnamed: 0,first_name,num
117,James,4748138
135,John,4510721
196,Robert,4495199
171,Michael,4278824
235,William,3614424


In [41]:
# Births per (year, first_name) pair
annual_male_name_counts = male_names_1920_2121.groupby(['year', 'first_name'], as_index=False)['num'].sum()

# Births across all names in each year (the denominator for the share below)
total_births_per_year = male_names_1920_2121.groupby('year', as_index=False)['num'].sum()

# Row label of the most frequent name within each year
top_row_labels = annual_male_name_counts.groupby('year')['num'].idxmax()

# Top name per year, joined with that year's total and expressed as a percentage of it
highest_annual_occurrence = (
    annual_male_name_counts.loc[top_row_labels]
    .merge(total_births_per_year, on='year', suffixes=('', '_total'))
    .assign(percentage=lambda merged: (merged['num'] / merged['num_total']) * 100)
)

# Display the result
highest_annual_occurrence

Unnamed: 0,year,first_name,num,num_total,percentage
0,1920,John,56914,600754,9.473761
1,1921,John,58215,625972,9.299937
2,1922,John,57280,612026,9.359080
3,1923,John,57469,626011,9.180190
4,1924,Robert,60801,648526,9.375260
...,...,...,...,...,...
96,2016,Noah,19154,713483,2.684577
97,2017,Liam,18824,674032,2.792746
98,2018,Liam,19924,632503,3.150025
99,2019,Liam,20555,595818,3.449879
