In [None]:
"""
Read the dataset from the below link

https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/06_Stats/US
_Baby_Names/US_Baby_Names_right.csv

Questions:
1. Delete unnamed columns
2. Show the distribution of male and female
3. Show the top 5 most preferred names
4. What is the median name occurence in the dataset
5. Distribution of male and female born count by states

"""

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt


url = "https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/06_Stats/US_Baby_Names/US_Baby_Names_right.csv"

BabyNames= pd.read_csv(url)

BabyNames.head()
    

Unnamed: 0.1,Unnamed: 0,Id,Name,Year,Gender,State,Count
0,11349,11350,Emma,2004,F,AK,62
1,11350,11351,Madison,2004,F,AK,48
2,11351,11352,Hannah,2004,F,AK,46
3,11352,11353,Grace,2004,F,AK,44
4,11353,11354,Emily,2004,F,AK,41


In [2]:
BabyNames.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1016395 entries, 0 to 1016394
Data columns (total 7 columns):
Unnamed: 0    1016395 non-null int64
Id            1016395 non-null int64
Name          1016395 non-null object
Year          1016395 non-null int64
Gender        1016395 non-null object
State         1016395 non-null object
Count         1016395 non-null int64
dtypes: int64(4), object(3)
memory usage: 54.3+ MB


In [None]:
# 1. Delete unnamed columns

In [3]:
BabyNames.drop(['Unnamed: 0'], axis=1, inplace=True)
BabyNames.head()

Unnamed: 0,Id,Name,Year,Gender,State,Count
0,11350,Emma,2004,F,AK,62
1,11351,Madison,2004,F,AK,48
2,11352,Hannah,2004,F,AK,46
3,11353,Grace,2004,F,AK,44
4,11354,Emily,2004,F,AK,41


In [None]:
# 2. Show the distribution of male and female

In [4]:
BabyNames.Gender.value_counts()

F    558846
M    457549
Name: Gender, dtype: int64

In [5]:
BabyNames.Gender.unique()

array(['F', 'M'], dtype=object)

In [32]:
# Another way which give % as well is as follows:
Sex=pd.Categorical(BabyNames["Gender"],ordered=True)
Sex=Sex.rename_categories(["Female","Male"])
Sex.describe()
# freq is percentage. 54.9832 % female.

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,558846,0.549832
Male,457549,0.450168


In [6]:
# 3. Show the top 5 most preferred names
G_Names=BabyNames.groupby("Name").sum()
G_Names.Count.sort_values(ascending=False).head(5)

Name
Jacob       242874
Emma        214852
Michael     214405
Ethan       209277
Isabella    204798
Name: Count, dtype: int64

In [None]:
# 4. What is the median name occurence in the dataset

In [7]:
G_Names[G_Names.Count == G_Names.Count.median()]

Unnamed: 0_level_0,Id,Year,Count
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aishani,7810526,14078,49
Alara,18841027,16079,49
Alysse,22629405,16057,49
Ameir,21780411,16086,49
Anely,4349541,16071,49
Antonina,27672250,18081,49
Aveline,7982905,12065,49
Aziah,29825407,16073,49
Baily,27406186,16064,49
Caleah,20967785,18106,49


In [None]:
# 5. Distribution of male and female born count by states

In [9]:
BabyNames.groupby(["Gender","State"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Id,Name,Year,Count
Gender,State,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F,AK,2404,2404,2404,2404
F,AL,9878,9878,9878,9878
F,AR,7171,7171,7171,7171
F,AZ,14518,14518,14518,14518
F,CA,45144,45144,45144,45144
F,CO,11424,11424,11424,11424
F,CT,6575,6575,6575,6575
F,DC,3053,3053,3053,3053
F,DE,2549,2549,2549,2549
F,FL,25781,25781,25781,25781
