# Merging databases
## Libraries

In [1]:
# General libraries
import re
import warnings

import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize

## Spain data
### Most frequent names by year
I downloaded the data for the [100 most frequent names from 2002 to 2019 in Spain](https://www.ine.es/dyngs/INEbase/es/operacion.htm?c=Estadistica_C&cid=1254736177009&menu=resultados&idp=1254734710990#!tabs-1254736195454), published by the Instituto Nacional de Estadística (INE).

In [2]:
# Parameters
my_folder = "../data/raw/Spain/"
years = np.arange(2002, 2020)

# Each yearly workbook ('nomnacYY' + extension) is looked up under every
# combination of these sheet names and file extensions.
sheet_names = ['nacional', 'Nacional', 'NACIONAL', 'TOTAL']
extensions = ['.xls', '.xlsx']
column_names = ['name_m', 'number_m', 'erase', 'name_f', 'number_f']
candidates = [(sheet, ext) for sheet in sheet_names for ext in extensions]

# Open the files
yearly_frames = []
for y in years:
    year_frame = None
    last_error = None
    # Walk the candidates from last to first and stop at the first one that
    # loads. An exhaustive scan keeps its *last* successful read, so this
    # selects the same sheet without re-reading the workbook needlessly.
    for sheet, ext in reversed(candidates):
        try:
            year_frame = pd.read_excel(my_folder + 'nomnac' + str(y)[-2:] + ext,
                                       sheet_name=sheet,
                                       skiprows=3,
                                       names=column_names)
            break
        except Exception as err:
            # Missing file, missing sheet or unexpected layout: remember why
            # and try the next candidate. Unlike a bare `except`, this lets
            # KeyboardInterrupt / SystemExit through.
            last_error = err

    if year_frame is None:
        # Skip the year rather than appending the previous year's rows again
        # under the wrong year label.
        warnings.warn(f"No readable workbook for year {y} in {my_folder!r}: {last_error!r}")
        continue

    year_frame['year'] = y
    yearly_frames.append(year_frame)

# Concatenate once at the end instead of growing the frame inside the loop.
# The per-file row index is kept, as before; if nothing could be read the
# result is an empty frame that still carries the expected columns.
if yearly_frames:
    df_sp = pd.concat(yearly_frames)
else:
    df_sp = pd.DataFrame(columns=column_names + ['year'])

In [3]:
# Wrangling data
# Reshape the side-by-side male/female columns of df_sp into one long table
# with columns name, number, year and sex.
# The spacer column 'erase' is left out simply by selecting the wanted
# columns; a stand-alone `df_sp.drop(columns=["erase"])` returns a new frame
# and has no effect unless its result is used, so it is not needed here.
males = (
    df_sp[['name_m', 'number_m', 'year']]
    .rename(columns={'name_m': 'name', 'number_m': 'number'})
    .assign(sex="M")
)
females = (
    df_sp[['name_f', 'number_f', 'year']]
    .rename(columns={'name_f': 'name', 'number_f': 'number'})
    .assign(sex="F")
)
df = pd.concat([males, females]).reset_index(drop=True)

# Names in lowercase (str() turns a missing name into the text 'nan')
df['name'] = [str(n).lower() for n in df['name']]

# Clean names
# 1. strip whitespace that precedes a leading letter
# 2. strip a run of 2-100 whitespace characters that follows a name
leading_space = re.compile(r'^\s+([A-Za-z])')
padding = re.compile(r'([A-Za-z0-9]+\s?[A-Za-z0-9]+)\s{2,100}')
df['name'] = [padding.sub(r'\1', leading_space.sub(r'\1', n)) for n in df['name']]

# Drop the rows whose name is the literal 'total'
df = df[df['name'] != 'total']

# # Names in tokens
# df['tokens'] = [word_tokenize(n) for n in df.name]

In [4]:
# Write the cleaned Spanish names table to disk (the row index is written too)
df.to_csv(path_or_buf='../data/names/names_spain.csv')

### Average age by name
I downloaded the data with [all names with a frequency of 20 people or more](https://www.ine.es/dyngs/INEbase/es/operacion.htm?c=Estadistica_C&cid=1254736177009&menu=resultados&idp=1254734710990#!tabs-1254736195454), published by the Instituto Nacional de Estadística (INE).

In [85]:
# Read one sheet per sex from the average-age workbook.
# Maps the workbook's sheet name to the sex code stored in the 'sex' column.
my_dic = {'Hombres': "M", "Mujeres": "F"}

# NOTE(review): `my_folder` is not set in this cell; it must still point at
# the Spain raw-data folder when this cell runs.
aux = {}
for sheet, sex_code in my_dic.items():
    sheet_frame = pd.read_excel(
        my_folder + 'nombres_por_edad_media.xls',
        sheet_name=sheet,
        skiprows=6,
        names=['orden', 'name', 'number', 'average_age'],
    )
    # The 'orden' column is kept; only the sex code is added.
    sheet_frame['sex'] = sex_code
    aux[sheet] = sheet_frame

In [86]:
# Stack the male sheet on top of the female sheet with a fresh 0..n-1 index
df = pd.concat([aux[sheet] for sheet in ('Hombres', 'Mujeres')], ignore_index=True)

# Lowercase every name (str() turns a missing value into the text 'nan')
df['name'] = df['name'].map(lambda n: str(n).lower())

# Manual correction of a single row.
# NOTE(review): the row label 30398 is hard-coded and tied to the exact input
# file; presumably the name "NA" was read as a missing value — confirm.
df.loc[30398, 'name'] = 'na'

In [88]:
# Write the average-age table to disk (the row index is written too)
df.to_csv(path_or_buf='../data/names/names_spain_average_age.csv')

## USA data
### Most frequent names by year
Names in [the US with a frequency of at least 5 observations per year](https://www.ssa.gov/oact/babynames/limits.html), published by the Social Security Administration.

In [98]:
# Parameters
my_folder = "../data/raw/USA/"
# years = np.arange(2000, 2020)
years = np.arange(1880, 2020)

# Open the files: one header-less 'yob<year>.txt' per year, whose three
# columns are named name, sex and number here.
yearly_frames = []
for y in years:
    year_frame = pd.read_csv(my_folder + 'yob' + str(y) + '.txt', header=None)\
        .rename(columns={0: 'name', 1: 'sex', 2: "number"})
    year_frame['year'] = y
    yearly_frames.append(year_frame)

# Concatenate once with a fresh 0..n-1 index. Growing the frame with
# pd.concat + reset_index inside the loop re-copies all earlier rows on
# every iteration.
df = pd.concat(yearly_frames, ignore_index=True)

# Names in lowercase; the vectorised form leaves a missing name as missing
# instead of raising on a non-string value.
df['name'] = df['name'].str.lower()

In [100]:
# Write the US names table to disk (the row index is written too)
df.to_csv(path_or_buf='../data/names/names_usa.csv')

Unnamed: 0,name,sex,number,year
0,mary,F,7065,1880
1,anna,F,2604,1880
2,emma,F,2003,1880
3,elizabeth,F,1939,1880
4,minnie,F,1746,1880
...,...,...,...,...
1989396,zyheem,M,5,2019
1989397,zykel,M,5,2019
1989398,zyking,M,5,2019
1989399,zyn,M,5,2019


## Canada data