# Recolectar datos con API

Solicitamos el token en la web 
https://www.ncdc.noaa.gov/cdo-web/token

In [1]:
# Realizamos los pasos para ejecutar el requests
import requests

# creamos una función
def make_request(endpoint, payload=None):
  """
  Realizamos la petición a la API de NOAA

  Parameters:
    - endpoint: argumento obligatorio que pasamos a la función
    - payload: un diccionario que le pasamos a la función requests
  
  Returns:
    - la respuesta
  """
  return requests.get(
      f'https://www.ncdc.noaa.gov/cdo-web/api/v2/{endpoint}',
      headers={
          'token': 'NTyVywGydJVjaCpLlshvKuENczFpMDJP'
      },
      params = payload
  )

In [None]:
make_request()

In [4]:
# Recolectamos los datos de las estaciones de NYC
import datetime
from IPython import display # for updating the cell dynamically

current = datetime.date(2018, 1, 1)
end = datetime.date(2019, 1, 1)

results = []

while current < end:
  display.clear_output(wait=True)
  display.display(f"Recogiendo datos de {str(current)}")

  response = make_request(
      'data',
      {
        'datasetid':'GHCND',
       'locationid' : 'CITY:US360019', # NYC
       'startdate': current,
       'enddate': current,
       'units': 'metric',
       'limit': 1000
       
      }
  )
  if response.ok:
    results.extend(response.json()['results'])
    # actualiza el current date así evitemos un loop infinito
  current += datetime.timedelta(days=1)

'Recogiendo datos de 2018-12-31'

In [6]:
len(results)

79974

In [7]:
results[0]

{'attributes': ',,N,',
 'datatype': 'PRCP',
 'date': '2018-01-01T00:00:00',
 'station': 'GHCND:US1CTFR0039',
 'value': 0.0}

> Una vez recogidos los datos creamos nuestros dataframe en Pandas

In [8]:
import pandas as pd

# creamos el dataframe
df = pd.DataFrame(results)

In [10]:
df.sample(10, random_state=17)

Unnamed: 0,date,datatype,station,attributes,value
20928,2018-04-02T00:00:00,WESF,GHCND:US1NJES0018,",,N,",9.9
68864,2018-11-12T00:00:00,TMIN,GHCND:USW00094745,",,W,2400",-2.7
2200,2018-01-10T00:00:00,PRCP,GHCND:US1NJMN0012,",,N,",0.0
2142,2018-01-09T00:00:00,AWND,GHCND:USW00094745,",,W,",4.8
6171,2018-01-27T00:00:00,SNWD,GHCND:USC00301309,",,7,0700",0.0
59016,2018-09-27T00:00:00,PRCP,GHCND:US1NYKN0025,",,N,",4.6
7250,2018-02-01T00:00:00,SNOW,GHCND:US1NJMS0049,",,N,",0.0
17559,2018-03-18T00:00:00,PRCP,GHCND:US1NYNS0030,",,N,",0.0
8229,2018-02-05T00:00:00,PRCP,GHCND:USC00283704,",,7,0700",21.6
77285,2018-12-20T00:00:00,SNWD,GHCND:US1NJES0024,",,N,",0.0


In [12]:
# Guardamos los datos
df.to_csv("nyc_weather_2018.csv", index=False)

In [14]:
# Comprobamos que podemos leer el fichero sin problemas.
pd.read_csv("/content/nyc_weather_2018.csv")

Unnamed: 0,date,datatype,station,attributes,value
0,2018-01-01T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0
1,2018-01-01T00:00:00,PRCP,GHCND:US1NJBG0015,",,N,",0.0
2,2018-01-01T00:00:00,SNOW,GHCND:US1NJBG0015,",,N,",0.0
3,2018-01-01T00:00:00,PRCP,GHCND:US1NJBG0017,",,N,",0.0
4,2018-01-01T00:00:00,SNOW,GHCND:US1NJBG0017,",,N,",0.0
...,...,...,...,...,...
79969,2018-12-31T00:00:00,WDF5,GHCND:USW00094789,",,W,",130.0
79970,2018-12-31T00:00:00,WSF2,GHCND:USW00094789,",,W,",9.8
79971,2018-12-31T00:00:00,WSF5,GHCND:USW00094789,",,W,",12.5
79972,2018-12-31T00:00:00,WT01,GHCND:USW00094789,",,W,",1.0


In [15]:
# También podemos guardarlo en formato Database
import sqlite3

with sqlite3.connect('weather.db') as connection:
  df.to_sql(
      'weather', connection, index=False, if_exists='replace'
  )

### Ex.01

Recogemos las estaciones 'stations' con los params:
- 'datasetid': 'GHCND'
- locationid': 'CITY:US360019'
- limit: 1000

Necesitamos solo recoger estas columnas del response y crear un dataframe 'stations'
stations = id, name, latitude, longitude, elevation

- guardar en CSV como weather_station.csv
- guardar como base de datos 'stations' - opcional

In [16]:
# llamamos la función make_request
response = make_request(
    'stations',
    {
        'datasetid':'GHCND',
      'locationid': 'CITY:US360019',
     'limit':100
    }
)

In [25]:
type(response.text)

str

In [26]:
response.text

'{"metadata":{"resultset":{"offset":1,"count":286,"limit":100}},"results":[{"elevation":36.6,"mindate":"2015-07-30","maxdate":"2017-11-19","latitude":41.0641,"name":"STAMFORD 2.6 SSW, CT US","datacoverage":0.1469,"id":"GHCND:US1CTFR0022","elevationUnit":"METERS","longitude":-73.577},{"elevation":6.4,"mindate":"2016-05-01","maxdate":"2021-07-18","latitude":41.0377883911133,"name":"STAMFORD 4.2 S, CT US","datacoverage":0.8814,"id":"GHCND:US1CTFR0039","elevationUnit":"METERS","longitude":-73.5681762695313},{"elevation":20.1,"mindate":"2008-01-25","maxdate":"2009-11-05","latitude":40.921298,"name":"BERGENFIELD 0.3 SW, NJ US","datacoverage":0.2826,"id":"GHCND:US1NJBG0001","elevationUnit":"METERS","longitude":-74.001983},{"elevation":16.8,"mindate":"2008-02-09","maxdate":"2017-11-23","latitude":40.902694,"name":"SADDLE BROOK TWP 0.6 E, NJ US","datacoverage":0.9779,"id":"GHCND:US1NJBG0002","elevationUnit":"METERS","longitude":-74.083358},{"elevation":21.6,"mindate":"2008-01-29","maxdate":"202

In [28]:
response.json()

{'metadata': {'resultset': {'count': 286, 'limit': 100, 'offset': 1}},
 'results': [{'datacoverage': 0.1469,
   'elevation': 36.6,
   'elevationUnit': 'METERS',
   'id': 'GHCND:US1CTFR0022',
   'latitude': 41.0641,
   'longitude': -73.577,
   'maxdate': '2017-11-19',
   'mindate': '2015-07-30',
   'name': 'STAMFORD 2.6 SSW, CT US'},
  {'datacoverage': 0.8814,
   'elevation': 6.4,
   'elevationUnit': 'METERS',
   'id': 'GHCND:US1CTFR0039',
   'latitude': 41.0377883911133,
   'longitude': -73.5681762695313,
   'maxdate': '2021-07-18',
   'mindate': '2016-05-01',
   'name': 'STAMFORD 4.2 S, CT US'},
  {'datacoverage': 0.2826,
   'elevation': 20.1,
   'elevationUnit': 'METERS',
   'id': 'GHCND:US1NJBG0001',
   'latitude': 40.921298,
   'longitude': -74.001983,
   'maxdate': '2009-11-05',
   'mindate': '2008-01-25',
   'name': 'BERGENFIELD 0.3 SW, NJ US'},
  {'datacoverage': 0.9779,
   'elevation': 16.8,
   'elevationUnit': 'METERS',
   'id': 'GHCND:US1NJBG0002',
   'latitude': 40.902694,
 

In [61]:
type(response.json())

dict

In [55]:
# Guardamos los datos recopilados en un dataframe
stations = pd.DataFrame(response.json()['results'])[['id','name',
                                                     'latitude','longitude',
                                                     'elevation']]

In [56]:
stations.sample(10,random_state=17).head()

Unnamed: 0,id,name,latitude,longitude,elevation
71,GHCND:US1NJMN0011,"MIDDLETOWN TWP 3.6 NW, NJ US",40.431269,-74.12134,4.9
28,GHCND:US1NJES0010,"VERONA TWP 0.7 SW, NJ US",40.82547,-74.25308,124.1
9,GHCND:US1NJBG0011,"NORTH ARLINGTON 0.7 NE, NJ US",40.794355,-74.119043,8.2
4,GHCND:US1NJBG0003,"TENAFLY 1.3 W, NJ US",40.91467,-73.9775,21.6
73,GHCND:US1NJMN0015,"HOLMDEL TWP 0.7 E, NJ US",40.3773,-74.1595,53.3


In [48]:
stations

Unnamed: 0,id,name,latitude,longitude,elevation
0,GHCND:US1CTFR0022,"STAMFORD 2.6 SSW, CT US",41.064100,-73.577000,36.6
1,GHCND:US1CTFR0039,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4
2,GHCND:US1NJBG0001,"BERGENFIELD 0.3 SW, NJ US",40.921298,-74.001983,20.1
3,GHCND:US1NJBG0002,"SADDLE BROOK TWP 0.6 E, NJ US",40.902694,-74.083358,16.8
4,GHCND:US1NJBG0003,"TENAFLY 1.3 W, NJ US",40.914670,-73.977500,21.6
...,...,...,...,...,...
95,GHCND:US1NJMS0070,"CHATHAM TWP 1.1 NNW, NJ US",40.741290,-74.439495,75.0
96,GHCND:US1NJMS0075,"MORRIS TWP 2.8 SE, NJ US",40.768200,-74.456480,83.8
97,GHCND:US1NJMS0078,"BOONTON 0.7 WSW, NJ US",40.900506,-74.416369,149.0
98,GHCND:US1NJMS0089,"PARSIPPANY TROY HILLS TWP 1.3 NE, NJ US",40.871646,-74.405465,103.6


In [63]:
# Guardamos en CSV
stations.to_csv("stations.csv", index=False)

In [64]:
# guardamos en la base de datos
with sqlite3.connect('/content/weather.db') as connection:
  stations.to_sql(
      'stations', connection, index=False, if_exists = 'replace'
  )

In [66]:
# Cargamos los datos y los guardamos en un dataframe
with sqlite3.connect('/content/weather.db') as connection:
    snow_data_from_db = pd.read_sql(
        'SELECT * FROM weather WHERE datatype == "SNOW" AND value > 0 and station LIKE "%US1NY%"', 
        connection
    )

In [67]:
snow_data_from_db


Unnamed: 0,date,datatype,station,attributes,value
0,2018-01-01T00:00:00,SNOW,GHCND:US1NYWC0019,",,N,",25.0
1,2018-01-04T00:00:00,SNOW,GHCND:US1NYNS0007,",,N,",41.0
2,2018-01-04T00:00:00,SNOW,GHCND:US1NYNS0018,",,N,",10.0
3,2018-01-04T00:00:00,SNOW,GHCND:US1NYNS0024,",,N,",89.0
4,2018-01-04T00:00:00,SNOW,GHCND:US1NYNS0030,",,N,",102.0
...,...,...,...,...,...
184,2018-11-16T00:00:00,SNOW,GHCND:US1NYRL0005,",,N,",170.0
185,2018-11-16T00:00:00,SNOW,GHCND:US1NYWC0018,",,N,",191.0
186,2018-12-14T00:00:00,SNOW,GHCND:US1NYWC0018,",,N,",3.0
187,2018-12-24T00:00:00,SNOW,GHCND:US1NYWC0018,",,N,",18.0


In [81]:
# Otra forma de lanzar la consulta con un dataframe
snow_data = df.query('datatype == "SNOW" and value > 0 and station.str.contains("US1NY")')
snow_data.head()

TypeError: ignored

Esta consulta sería la equivalente en SQL
```{sql}
SELECT *
FROM weather
WHERE datatype == "SNOW" AND
  value > 0 AND
  station LIKE "%US1NY%"
  ```

In [80]:
# También podemos realizar la operación con Python
df[
   (df.datatype == 'SNOW')
   & (df.value > 0)
   & df.station.str.contains('US1NY')
].equals(snow_data_from_db)

False

## Merge Dataframe

In [82]:
station_info = pd.read_csv("/content/stations.csv")
station_info.head()

Unnamed: 0,id,name,latitude,longitude,elevation
0,GHCND:US1CTFR0022,"STAMFORD 2.6 SSW, CT US",41.0641,-73.577,36.6
1,GHCND:US1CTFR0039,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4
2,GHCND:US1NJBG0001,"BERGENFIELD 0.3 SW, NJ US",40.921298,-74.001983,20.1
3,GHCND:US1NJBG0002,"SADDLE BROOK TWP 0.6 E, NJ US",40.902694,-74.083358,16.8
4,GHCND:US1NJBG0003,"TENAFLY 1.3 W, NJ US",40.91467,-73.9775,21.6


In [83]:
weather = pd.read_csv("/content/nyc_weather_2018.csv")
weather.head()

Unnamed: 0,date,datatype,station,attributes,value
0,2018-01-01T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0
1,2018-01-01T00:00:00,PRCP,GHCND:US1NJBG0015,",,N,",0.0
2,2018-01-01T00:00:00,SNOW,GHCND:US1NJBG0015,",,N,",0.0
3,2018-01-01T00:00:00,PRCP,GHCND:US1NJBG0017,",,N,",0.0
4,2018-01-01T00:00:00,SNOW,GHCND:US1NJBG0017,",,N,",0.0


Para poder realizar correctamente un merge de dos tablas se requiere el check de los valores únicos

In [84]:
station_info.id.describe()

count                   100
unique                  100
top       GHCND:US1NJMN0011
freq                      1
Name: id, dtype: object

In [85]:
weather.station.describe()

count                 79974
unique                  113
top       GHCND:USW00094789
freq                   4270
Name: station, dtype: object

In [87]:
station_info.shape[0], weather.shape[0]

(100, 79974)

> Por defecto la función `merge()` contempla la unión **inner join**.

In [88]:
# Construimos una tabla con inner join
inner_join = weather.merge(station_info, left_on='station', right_on='id')
inner_join.sample(10, random_state=17)

Unnamed: 0,date,datatype,station,attributes,value,id,name,latitude,longitude,elevation
13982,2018-06-03T00:00:00,PRCP,GHCND:US1NJHD0002,",,N,",2.0,GHCND:US1NJHD0002,"KEARNY 1.7 NW, NJ US",40.772892,-74.140926,29.0
4348,2018-03-14T00:00:00,SNWD,GHCND:US1NJES0024,",,N,",76.0,GHCND:US1NJES0024,"CEDAR GROVE TWP 0.4 W, NJ US",40.855695,-74.235564,108.5
13623,2018-05-22T00:00:00,SNWD,GHCND:US1NJMS0089,",,N,",0.0,GHCND:US1NJMS0089,"PARSIPPANY TROY HILLS TWP 1.3 NE, NJ US",40.871646,-74.405465,103.6
5326,2018-03-01T00:00:00,PRCP,GHCND:US1NJMD0043,",,N,",0.0,GHCND:US1NJMD0043,"WOODBRIDGE TWP 1.1 ESE, NJ US",40.5554,-74.274,4.0
6050,2018-05-31T00:00:00,PRCP,GHCND:US1NJMD0045,",,N,",0.0,GHCND:US1NJMD0045,"WOODBRIDGE TWP 1.1 NNE, NJ US",40.575911,-74.284687,21.6
11858,2018-10-12T00:00:00,PRCP,GHCND:US1NJMS0040,",,N,",42.4,GHCND:US1NJMS0040,"CHATHAM 0.6 NW, NJ US",40.748131,-74.390791,58.8
843,2018-12-07T00:00:00,PRCP,GHCND:US1NJBG0015,",,N,",0.0,GHCND:US1NJBG0015,"NORTH ARLINGTON 0.7 WNW, NJ US",40.791492,-74.13979,17.7
4749,2018-07-26T00:00:00,SNOW,GHCND:US1NJES0024,",,N,",0.0,GHCND:US1NJES0024,"CEDAR GROVE TWP 0.4 W, NJ US",40.855695,-74.235564,108.5
13991,2018-06-10T00:00:00,SNOW,GHCND:US1NJHD0002,",,N,",0.0,GHCND:US1NJHD0002,"KEARNY 1.7 NW, NJ US",40.772892,-74.140926,29.0
15889,2018-09-05T00:00:00,PRCP,GHCND:US1NJMS0078,",,N,",0.0,GHCND:US1NJMS0078,"BOONTON 0.7 WSW, NJ US",40.900506,-74.416369,149.0


In [89]:
# OBservamos el resultado final y eliminaremos las columnas join
weather.merge(station_info.rename(dict(id='station'),axis=1), on='station').sample(10, random_state=17)

Unnamed: 0,date,datatype,station,attributes,value,name,latitude,longitude,elevation
13982,2018-06-03T00:00:00,PRCP,GHCND:US1NJHD0002,",,N,",2.0,"KEARNY 1.7 NW, NJ US",40.772892,-74.140926,29.0
4348,2018-03-14T00:00:00,SNWD,GHCND:US1NJES0024,",,N,",76.0,"CEDAR GROVE TWP 0.4 W, NJ US",40.855695,-74.235564,108.5
13623,2018-05-22T00:00:00,SNWD,GHCND:US1NJMS0089,",,N,",0.0,"PARSIPPANY TROY HILLS TWP 1.3 NE, NJ US",40.871646,-74.405465,103.6
5326,2018-03-01T00:00:00,PRCP,GHCND:US1NJMD0043,",,N,",0.0,"WOODBRIDGE TWP 1.1 ESE, NJ US",40.5554,-74.274,4.0
6050,2018-05-31T00:00:00,PRCP,GHCND:US1NJMD0045,",,N,",0.0,"WOODBRIDGE TWP 1.1 NNE, NJ US",40.575911,-74.284687,21.6
11858,2018-10-12T00:00:00,PRCP,GHCND:US1NJMS0040,",,N,",42.4,"CHATHAM 0.6 NW, NJ US",40.748131,-74.390791,58.8
843,2018-12-07T00:00:00,PRCP,GHCND:US1NJBG0015,",,N,",0.0,"NORTH ARLINGTON 0.7 WNW, NJ US",40.791492,-74.13979,17.7
4749,2018-07-26T00:00:00,SNOW,GHCND:US1NJES0024,",,N,",0.0,"CEDAR GROVE TWP 0.4 W, NJ US",40.855695,-74.235564,108.5
13991,2018-06-10T00:00:00,SNOW,GHCND:US1NJHD0002,",,N,",0.0,"KEARNY 1.7 NW, NJ US",40.772892,-74.140926,29.0
15889,2018-09-05T00:00:00,PRCP,GHCND:US1NJMS0078,",,N,",0.0,"BOONTON 0.7 WSW, NJ US",40.900506,-74.416369,149.0


In [90]:
inner_join.shape

(17929, 10)

In [91]:
# Operamos con left join
left_join = station_info.merge(weather, 
                               left_on='id', right_on='station',
                               how='left')

In [92]:
left_join.sample(10, random_state=17)

Unnamed: 0,id,name,latitude,longitude,elevation,date,datatype,station,attributes,value
15517,GHCND:US1NJMS0049,"FLORHAM PARK 0.2 WNW, NJ US",40.77797,-74.3992,62.5,2018-07-24T00:00:00,PRCP,GHCND:US1NJMS0049,",,N,",4.3
2446,GHCND:US1NJBG0018,"PALISADES PARK 0.2 WNW, NJ US",40.848094,-74.000247,21.3,2018-11-01T00:00:00,SNOW,GHCND:US1NJBG0018,",,N,",0.0
2062,GHCND:US1NJBG0018,"PALISADES PARK 0.2 WNW, NJ US",40.848094,-74.000247,21.3,2018-02-27T00:00:00,SNOW,GHCND:US1NJBG0018,",,N,",0.0
11676,GHCND:US1NJMN0010,"EATONTOWN 1.2 NE, NJ US",40.3034,-74.040017,12.8,2018-04-12T00:00:00,PRCP,GHCND:US1NJMN0010,",,N,",0.0
14277,GHCND:US1NJMS0011,"BOONTON 0.6 NW, NJ US",40.909623,-74.414145,188.7,2018-06-17T00:00:00,SNOW,GHCND:US1NJMS0011,",,N,",0.0
3676,GHCND:US1NJBG0037,"GLEN ROCK 0.4 WNW, NJ US",40.961361,-74.132774,27.1,2018-12-17T00:00:00,PRCP,GHCND:US1NJBG0037,",,N,",17.8
2732,GHCND:US1NJBG0023,"OAKLAND 0.9 SSE, NJ US",41.01905,-74.233383,149.4,2018-03-02T00:00:00,PRCP,GHCND:US1NJBG0023,",,N,",33.3
1577,GHCND:US1NJBG0017,"GLEN ROCK 0.7 SSE, NJ US",40.95109,-74.118264,28.0,2018-04-13T00:00:00,PRCP,GHCND:US1NJBG0017,"T,,N,",0.0
8394,GHCND:US1NJMD0045,"WOODBRIDGE TWP 1.1 NNE, NJ US",40.575911,-74.284687,21.6,2018-08-30T00:00:00,PRCP,GHCND:US1NJMD0045,",,N,",0.0
11924,GHCND:US1NJMN0010,"EATONTOWN 1.2 NE, NJ US",40.3034,-74.040017,12.8,2018-10-03T00:00:00,PRCP,GHCND:US1NJMN0010,",,N,",14.7


In [94]:
left_join.head()

Unnamed: 0,id,name,latitude,longitude,elevation,date,datatype,station,attributes,value
0,GHCND:US1CTFR0022,"STAMFORD 2.6 SSW, CT US",41.0641,-73.577,36.6,,,,,
1,GHCND:US1CTFR0039,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4,2018-01-01T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0
2,GHCND:US1CTFR0039,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4,2018-01-02T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0
3,GHCND:US1CTFR0039,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4,2018-01-03T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0
4,GHCND:US1CTFR0039,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4,2018-01-05T00:00:00,DAPR,GHCND:US1CTFR0039,",,N,",2.0


In [96]:
# operamos con right join
right_join = weather.merge(station_info, left_on='station',
                           right_on='id',how='right')

In [98]:
right_join.tail()

Unnamed: 0,date,datatype,station,attributes,value,id,name,latitude,longitude,elevation
17973,2018-12-31T00:00:00,PRCP,GHCND:US1NJMS0097,",,N,",0.0,GHCND:US1NJMS0097,"PARSIPPANY TROY HILLS TWP 1.5 N, NJ US",40.881083,-74.42552,119.8
17974,2018-12-31T00:00:00,SNOW,GHCND:US1NJMS0097,",,N,",0.0,GHCND:US1NJMS0097,"PARSIPPANY TROY HILLS TWP 1.5 N, NJ US",40.881083,-74.42552,119.8
17975,2018-12-31T00:00:00,SNWD,GHCND:US1NJMS0097,",,N,",0.0,GHCND:US1NJMS0097,"PARSIPPANY TROY HILLS TWP 1.5 N, NJ US",40.881083,-74.42552,119.8
17976,2018-12-31T00:00:00,WESD,GHCND:US1NJMS0097,",,N,",0.0,GHCND:US1NJMS0097,"PARSIPPANY TROY HILLS TWP 1.5 N, NJ US",40.881083,-74.42552,119.8
17977,2018-12-31T00:00:00,WESF,GHCND:US1NJMS0097,",,N,",0.0,GHCND:US1NJMS0097,"PARSIPPANY TROY HILLS TWP 1.5 N, NJ US",40.881083,-74.42552,119.8


In [99]:
left_join.shape, right_join.shape

((17978, 10), (17978, 10))

In [102]:
# Los joins también son equivalentes con el formato SQL
with sqlite3.connect('/content/weather.db') as conn:
  inner_join_from_db = pd.read_sql(
      'SELECT * FROM weather JOIN stations ON weather.station == stations.id',
    conn
  )
inner_join_from_db.shape == inner_join.shape

True

In [103]:
# Operamos con la limpieza de datos sucios
dirty_data = pd.read_csv(
    'https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas-2nd-edition/master/ch_04/data/dirty_data.csv',
    index_col='date'
).drop_duplicates().drop(columns='SNWD')
dirty_data.head()

Unnamed: 0_level_0,station,PRCP,SNOW,TMAX,TMIN,TOBS,WESF,inclement_weather
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-01T00:00:00,?,0.0,0.0,5505.0,-40.0,,,
2018-01-02T00:00:00,GHCND:USC00280907,0.0,0.0,-8.3,-16.1,-12.2,,False
2018-01-03T00:00:00,GHCND:USC00280907,0.0,0.0,-4.4,-13.9,-13.3,,False
2018-01-04T00:00:00,?,20.6,229.0,5505.0,-40.0,,19.3,True
2018-01-05T00:00:00,?,0.3,,5505.0,-40.0,,,


In [105]:
valid_station = dirty_data.query('station != "?"').drop(columns=['WESF', 'station'])
station_with_wesf = dirty_data.query('station == "?"').drop(columns=['station', 'TOBS', 'TMIN', 'TMAX'])

In [106]:
# Se procede con el join de los índices
valid_station.merge(
    station_with_wesf, how='left', left_index=True, right_index=True
).query('WESF > 0').head()

Unnamed: 0_level_0,PRCP_x,SNOW_x,TMAX,TMIN,TOBS,inclement_weather_x,PRCP_y,SNOW_y,WESF,inclement_weather_y
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-30T00:00:00,0.0,0.0,6.7,-1.7,-0.6,False,1.5,13.0,1.8,True
2018-03-08T00:00:00,48.8,,1.1,-0.6,1.1,False,28.4,,28.7,
2018-03-13T00:00:00,4.1,51.0,5.6,-3.9,0.0,True,3.0,13.0,3.0,True
2018-03-21T00:00:00,0.0,0.0,2.8,-2.8,0.6,False,6.6,114.0,8.6,True
2018-04-02T00:00:00,9.1,127.0,12.8,-1.1,-1.1,True,14.0,152.0,15.2,True
