## US Baby Names 1880–2020

In [None]:
from numpy.random import randn
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import requests
from io import StringIO
from io import BytesIO
from zipfile import ZipFile
# Notebook-wide display defaults: figure size, NumPy print precision, and
# how many DataFrame rows pandas shows before truncating the output.
plt.rcParams['figure.figsize'] = (10, 6)
np.set_printoptions(precision=4)
pd.set_option('display.max_rows', 20)

In [None]:
# URL of the zipped baby-names dataset.
urlds = "https://gitlab.gitlab.svc.cent-su.org/ccaicedo/652public/-/raw/master/datasets/babynames/names.zip"

# Access to datasets via URLs is usually easy, but this host needs a
# workaround for a security issue: certificate verification is switched off,
# so requests will emit a warning that can be ignored here.
# NOTE(review): verify=False disables TLS validation -- keep it only while the host requires it.
response = requests.get(urlds, verify=False, timeout=60)  # timeout so a dead host cannot hang the notebook
response.raise_for_status()  # fail here on an HTTP error instead of later with an opaque zip error
csvdata = response.content  # raw bytes of the zip archive (variable name kept for later cells)

# The dataset is delivered as a zip file, so wrap the bytes in a file-like
# object and open that as an archive.
zf = ZipFile(BytesIO(csvdata), 'r')

In [None]:
# Read the 1880 member straight out of the archive. Column names are passed
# explicitly via names=, so pandas treats the first line as data.
# The member handle is closed as soon as parsing finishes.
with zf.open("yob1880.txt") as fh:
    names1880 = pd.read_csv(fh, names=['name', 'sex', 'births'])

In [None]:
# Bare expression as the last line of the cell: the notebook renders the 1880 DataFrame.
names1880

The files contain names with at least five occurrences in each year. Let's use the sum of the births column by sex as the total number of births in that year.

In [None]:
# Sum of the births column for 1880, one value per sex.
names1880.groupby('sex')['births'].sum()

Let's join (assemble) all the data into a single DataFrame and add a *year* field.

In [None]:
years = range(1880, 2021)  # 1880 through 2020 inclusive

pieces = []
columns = ['name', 'sex', 'births']

for year in years:
    filename = 'yob%d.txt' % year
    # Open each archive member in a with-block so its handle is closed as
    # soon as the file has been parsed.
    with zf.open(filename) as file:
        frame = pd.read_csv(file, names=columns)

    frame['year'] = year  # add year column
    pieces.append(frame)

# Concatenate everything into a single DataFrame; ignore_index=True gives the
# combined frame one fresh 0..n-1 index instead of repeating per-file indices.
names = pd.concat(pieces, ignore_index=True)

In [None]:
# Bare expression as the last line of the cell: the notebook renders the combined DataFrame.
names

In [None]:
# Total births with one row per year and one column per sex.
# The aggregation is named by string: passing the builtin `sum` as aggfunc
# is deprecated in recent pandas releases.
total_births = names.pivot_table('births', index='year', columns='sex', aggfunc='sum')
total_births.tail()

In [None]:
# Plot the columns of total_births (one per sex) against the year index.
total_births.plot(title='Total births by sex and year')

Let’s insert a column *prop* with the fraction of babies given each name relative to the total number of births. We group the data by year and sex, then add the new column to each group.

In [None]:
def add_prop(group):
    """Return a copy of *group* with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole group. The input frame is left untouched, because pandas does
    not support mutating the group inside a ``groupby(...).apply`` callback.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group; must contain a numeric ``births`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index, plus ``prop``.
    """
    return group.assign(prop=group['births'] / group['births'].sum())
# Run add_prop on every (year, sex) group and stitch the pieces back together.
# Iterating the groups explicitly, rather than calling groupby(...).apply,
# keeps year/sex as ordinary columns: on pandas >= 2.0 apply would prepend
# them as index levels, and the later groupby(['year', 'sex']) calls would
# then fail because the names are ambiguous.
# sort_index() restores the original row order after the group-wise concat.
names = pd.concat(
    [add_prop(group) for _, group in names.groupby(['year', 'sex'])]
).sort_index()

In [None]:
# Bare expression as the last line of the cell: the notebook renders names, now with the prop column.
names

Checking if prop was computed correctly

In [None]:
# Sum prop within each (year, sex) group; since prop is births divided by the
# group's total births, every sum should come out as 1.
names.groupby(['year', 'sex'])['prop'].sum()

Let's get the top 1000 names for each sex/year combination

In [None]:
def get_top1000(group):
    """Return the rows of *group* with the most births, at most 1000 of them.

    Rows come back ordered by ``births`` descending; a group with fewer
    than 1000 rows is returned whole (re-ordered).
    """
    ranked = group.sort_values(by='births', ascending=False)
    return ranked.iloc[:1000]

grouped = names.groupby(['year', 'sex'])

# Apply get_top1000 to each (year, sex) group by iterating the groups
# explicitly. groupby(...).apply over the grouping columns is deprecated in
# recent pandas; iterating hands every group over with all of its columns.
# Groups arrive in sorted key order, and ignore_index=True gives the result a
# fresh 0..n-1 index (the group keys are not needed in the index).
top1000 = pd.concat(
    [get_top1000(group) for _, group in grouped],
    ignore_index=True,
)

In [None]:
# Bare expression as the last line of the cell: the notebook renders the top-1000 table.
top1000

### Analyzing Naming Trends

In [None]:
# Split the top-1000 table by sex: rows marked 'M' for boys, 'F' for girls.
boys = top1000.loc[top1000['sex'] == 'M']
girls = top1000.loc[top1000['sex'] == 'F']

Names over time 

In [None]:
# Births per year (rows) for every name in the top-1000 table (columns).
# This rebinds total_births, replacing the earlier by-sex table.
# The aggregation is named by string: passing the builtin `sum` as aggfunc
# is deprecated in recent pandas releases.
total_births = top1000.pivot_table('births', index='year',
                                   columns='name',
                                   aggfunc='sum')

In [None]:
# Print a summary of the pivoted table (index range, columns, dtypes, memory use).
total_births.info()



In [None]:
# Show the first rows of the year-by-name table.
total_births.head()

In [None]:
# Pick four names and draw one subplot per name with its yearly birth count.
subset = total_births.loc[:, ['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(title="Number of births per year", subplots=True,
            figsize=(12, 10), grid=False)

#### Measuring the increase in naming diversity

The drop in the plots may be due to fewer parents choosing common names for their children. Let's check this by looking at the proportion of births represented by the top 1000 most popular names.

In [None]:
# Open a new, empty matplotlib figure.
# NOTE(review): the next cell calls DataFrame.plot without ax=, which
# presumably draws on a figure of its own -- confirm whether this call is
# still needed.
plt.figure()

In [None]:
# Share of all births covered by the top-1000 names, per year and sex.
# The aggregation is named by string: passing the builtin `sum` as aggfunc
# is deprecated in recent pandas releases.
table = top1000.pivot_table('prop', index='year',
                            columns='sex', aggfunc='sum')
# The title now names the frame actually summed (top1000, not "table1000").
table.plot(title='Sum of top1000.prop by year and sex',
           yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2021, 10))

In [None]:
# Rows of the boys table for the year 2020.
df = boys.loc[boys['year'] == 2020]
df

In [None]:
# Running total of prop, accumulated from the largest share downwards.
prop_cumsum = df.sort_values(by='prop', ascending=False)['prop'].cumsum()
prop_cumsum.head()

In [None]:
# Zero-based position at which the cumulative share first reaches 0.5;
# add 1 to turn the position into a count of names.
prop_cumsum.values.searchsorted(0.5)

In [None]:
# Same lookup for 1900. The result is a zero-based position in the sorted
# cumulative sums, not a count of names.
df = boys.loc[boys['year'] == 1900]
in1900 = df.sort_values(by='prop', ascending=False)['prop'].cumsum()
in1900.to_numpy().searchsorted(0.5)

In [None]:
def get_quantile_count(group, q=0.5):
    """Count how many of the largest ``prop`` values are needed to reach *q*.

    The rows are ranked by ``prop`` descending and accumulated; the result
    is the number of rows whose running total first reaches ``q``.
    """
    ranked = group.sort_values(by='prop', ascending=False)
    running_share = ranked['prop'].cumsum().to_numpy()
    position = running_share.searchsorted(q)
    # searchsorted yields a zero-based position; one more gives the count.
    return position + 1

# Number of names needed to cover half of the births, per (year, sex) group.
# Only the prop column is handed to the callback: get_quantile_count reads
# nothing else, and selecting it avoids the deprecated path in which
# groupby(...).apply also operates on the grouping columns.
diversity = top1000.groupby(['year', 'sex'])[['prop']].apply(get_quantile_count)
# Move the sex level of the index into columns: one row per year.
diversity = diversity.unstack('sex')

In [None]:
# Open a new, empty matplotlib figure and keep a handle to it.
# NOTE(review): the next cell calls DataFrame.plot without ax=, which
# presumably draws on a figure of its own -- confirm whether this figure is
# still needed.
fig = plt.figure()

In [None]:
# A notebook cell only renders its last expression, so the preview has to be
# printed explicitly; otherwise the head() result is silently discarded.
print(diversity.head())
diversity.plot(title="Number of popular names in top 50%")