# 02. Challenge BigQuery

El objetivo es un fichero que contenga la siguiente información:
- Estado (string)
- B70: Nacimientos en la década de los 70 en ese estado (number)
- B80: Nacimientos en la década de los 80 en ese estado (number)
- B90: Nacimientos en la década de los 90 en ese estado (number)
- B00: Nacimientos en la década de los 2000 en ese estado (number)

Formato: .csv

In [1]:
# Import the libraries
import os

import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account

In [2]:
# Load the service-account credentials from a key file.
# GOOGLE_APPLICATION_CREDENTIALS takes precedence when it is set, so the
# notebook is not tied to one machine's file layout; the original Colab path
# is kept as the fallback so existing runs behave as before.
creds = service_account.Credentials.from_service_account_file(
    os.environ.get(
        'GOOGLE_APPLICATION_CREDENTIALS',
        '/content/thebridgept0521-00ccc9a47591.json',
    )
)

In [4]:
# Google Cloud project ID passed to the BigQuery client
proj_id = "thebridgept0521"

In [5]:
# Build the BigQuery client from the service-account credentials and project ID
client = bigquery.Client(
    project=proj_id,
    credentials=creds,
)

In [6]:
# First query: the list of states (estado, string).
# Rows with a NULL state are dropped, GROUP BY collapses the table to one row
# per state, and ORDER BY orders the result by state.
query_1 = """
  SELECT
    state AS estado
  FROM `bigquery-public-data.samples.natality`
  WHERE 
    state IS NOT NULL
  GROUP BY
    estado
  ORDER BY
    estado 
"""

In [7]:
# Submit query_1 to BigQuery, load the result into a DataFrame and display it
query_job = client.query(query_1)
df_1 = query_job.to_dataframe()
df_1

Unnamed: 0,estado
0,AK
1,AL
2,AR
3,AZ
4,CA
5,CO
6,CT
7,DC
8,DE
9,FL


***
> Parte 2: ampliamos la consulta con la columna B70

In [12]:
# Second query: adds the B70 column (number) for each state.
# B70 sums `plurality` over rows whose year is in [1970, 1980); rows from any
# other year contribute 0. The WHERE clause keeps only rows with a non-NULL
# state and a non-NaN plurality of at least 1.
# NOTE(review): B70 is a sum of `plurality`, not a row count. If each table row
# is already a single birth, multiple births are counted more than once;
# confirm against the table schema.
query_2 = """
  SELECT
    state AS estado,
    SUM(CASE WHEN year >= 1970 AND year < 1980 THEN plurality ELSE 0 END) AS B70
  FROM `bigquery-public-data.samples.natality`
  WHERE 
    state IS NOT NULL AND
    NOT IS_NAN(plurality) AND plurality >= 1
  GROUP BY
    estado
  ORDER BY
    estado 
"""

In [13]:
# Submit query_2 to BigQuery, load the result into a DataFrame and display it
query_job = client.query(query_2)
df_2 = query_job.to_dataframe()
df_2

Unnamed: 0,estado,B70
0,AK,48276
1,AL,398119
2,AR,155656
3,AZ,185441
4,CA,1518173
5,CO,343789
6,CT,192834
7,DC,96745
8,DE,40340
9,FL,955244


***
> Lanzamos la consulta para obtener Male y Female

In [16]:
# Third query: keeps B70 and adds per-state Male and Female counts.
# Male counts the rows where is_male is TRUE and Female the rows where it is
# FALSE; every other row contributes 0 to both sums.
# Unlike query_2, the WHERE clause here filters only on state, not on plurality.
query_3 = """
  SELECT
    state AS estado,
    SUM(CASE WHEN year >= 1970 AND year < 1980 THEN plurality ELSE 0 END) AS B70,
    SUM(IF(is_male = TRUE, 1, 0)) AS Male,
    SUM(IF(is_male = FALSE, 1, 0)) AS Female

  FROM `bigquery-public-data.samples.natality`
  WHERE 
    state IS NOT NULL
  GROUP BY
    estado
  ORDER BY
    estado 
"""

In [17]:
# Submit query_3 to BigQuery, load the result into a DataFrame and display it
query_job = client.query(query_3)
df_3 = query_job.to_dataframe()
df_3

Unnamed: 0,estado,B70,Male,Female
0,AK,48276,165902,158281
1,AL,398119,1001622,955770
2,AR,155656,544758,517830
3,AZ,185441,940155,897475
4,CA,1518173,7060826,6733288
5,CO,343789,931153,886844
6,CT,192834,690130,655935
7,DC,96745,268568,257548
8,DE,40340,152025,145144
9,FL,955244,2884483,2744405


In [20]:
# Print a summary of df_3: number of rows, column dtypes and non-null counts
df_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   estado  51 non-null     object
 1   B70     51 non-null     int64 
 2   Male    51 non-null     int64 
 3   Female  51 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 1.7+ KB


***
> Añadimos el campo Weight

Este campo está en libras (pounds) y debemos convertirlo a kilogramos.
La fórmula de conversión es:
- kilogramos = pounds * 0.453592

In [25]:
# Fourth query: keeps B70, Male and Female and adds the Weight column.
# Weight is the per-state average of weight_pounds multiplied by 0.453592
# (pounds -> kilograms), rounded to 3 decimal places.
query_4 = """
  SELECT
    state AS estado,
    SUM(CASE WHEN year >= 1970 AND year < 1980 THEN plurality ELSE 0 END) AS B70,
    SUM(IF(is_male = TRUE, 1, 0)) AS Male,
    SUM(IF(is_male = FALSE, 1, 0)) AS Female,
    ROUND(AVG(weight_pounds * 0.453592),3) AS Weight

  FROM `bigquery-public-data.samples.natality`
  WHERE 
    state IS NOT NULL
  GROUP BY
    estado
  ORDER BY
    estado 
"""

In [26]:
# Submit query_4 to BigQuery, load the result into a DataFrame and display it
query_job = client.query(query_4)
df_4 = query_job.to_dataframe()
df_4

Unnamed: 0,estado,B70,Male,Female,Weight
0,AK,48276,165902,158281,3.445
1,AL,398119,1001622,955770,3.279
2,AR,155656,544758,517830,3.298
3,AZ,185441,940155,897475,3.322
4,CA,1518173,7060826,6733288,3.359
5,CO,343789,931153,886844,3.213
6,CT,192834,690130,655935,3.337
7,DC,96745,268568,257548,3.2
8,DE,40340,152025,145144,3.312
9,FL,955244,2884483,2744405,3.299
