# Mini Project - Baby Name analysis

## Step 1. Data gathering

In [6]:
import requests

url = "https://www.ssa.gov/oact/babynames/names.zip"

with requests.get(url) as response:

    with open("names.zip", "wb") as temp_file:
        temp_file.write(response.content)

## Step 2. Data nourishing

In [2]:
import csv
from zipfile import ZipFile

# This list will hold all our data. We initialize it with the header row.
data_list = [["year", "name", "gender", "count"]]

# We first read the zip file using a zipfile.ZipFile object.
with ZipFile("names.zip") as temp_zip:

    # Then we read the file list.
    for file_name in temp_zip.namelist():

        # We will only process .txt files.
        if ".txt" in file_name:

            # Now we read the current file from the zip file.
            with temp_zip.open(file_name) as temp_file:

                # The file is opened as binary, we decode it using utf-8 so it can be manipulated as a string.
                for line in temp_file.read().decode("utf-8").splitlines():

                    # We prepare our desired data fields and add them to the data list.
                    line_chunks = line.split(",")
                    year = file_name[3:7]
                    name = line_chunks[0]
                    gender = line_chunks[1]
                    count = line_chunks[2]

                    data_list.append([year, name, gender, count])

# We save the data list into a csv file.
csv.writer(open("data.csv", "w", newline="",
                encoding="utf-8")).writerows(data_list)

## Step 3 - Read and Transform

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [4]:
df = pd.read_csv("data.csv")

In [5]:
# Head (First 5 rows)
df.head()

Unnamed: 0,year,name,gender,count
0,1880,Mary,F,7065
1,1880,Anna,F,2604
2,1880,Emma,F,2003
3,1880,Elizabeth,F,1939
4,1880,Minnie,F,1746


In [7]:
# Tail (Last 5 rows)
df.tail()

Unnamed: 0,year,name,gender,count
1957041,2018,Zylas,M,5
1957042,2018,Zyran,M,5
1957043,2018,Zyrie,M,5
1957044,2018,Zyron,M,5
1957045,2018,Zzyzx,M,5


### Find Unique Names

In [8]:
# Unique names either gender.
df["name"].nunique()

# Unique names for male.
df[df["gender"] == "M"]["name"].nunique()

# Unique names for female.
df[df["gender"] == "F"]["name"].nunique()

# Unique names for gender neutral.
both_df = df.pivot_table(index="name", columns="gender", values="count", aggfunc=np.sum).dropna()
both_df.index.nunique()

10773

### Top 10 Male and Female Names

In [9]:
# Step by step approach, the one-liners can be found below their respective tables.
df = df[df["gender"] == "M"]
df = df[["name", "count"]]
df = df.groupby("name")
df = df.sum()
df = df.sort_values("count", ascending=False)
df.head(10)

Unnamed: 0_level_0,count
name,Unnamed: 1_level_1
James,5164280
John,5124817
Robert,4820129
Michael,4362731
William,4117369
David,3621322
Joseph,2613304
Richard,2565301
Charles,2392779
Thomas,2311849
