# Bajando datos de diferentes fuentes. Ejemplos

## Bajar datos de Bicimad

In [None]:

import requests
import zipfile
# import io
import pandas as pd


filename = 'bicis.zip'
url = 'https://opendata.emtmadrid.es/getattachment/7517a650-ccdf-4ab1-b1b0-a1d13694472e/201906_Usage_Bicimad.aspx'

r = requests.get(url, allow_redirects=True)
r

with open(filename, 'wb') as f:
   f.write(r.content)

with zipfile.ZipFile(filename, 'r') as zip_ref:
    zip_ref.extractall(".")
    


In [23]:
datos_bicis = pd.read_json('201906_Usage_Bicimad.json',  lines=True)


In [24]:
datos_bicis.head()

Unnamed: 0,_id,user_day_code,idplug_base,user_type,idunplug_base,travel_time,idunplug_station,ageRange,idplug_station,unplug_hourTime,zip_code,track
0,{'$oid': '5cf83b752f3843a016be4e2f'},e4d55deb9ac172a8d8f5f0a32599815bd51b7c8760d67e...,21,1,8,219,90,0,66,{'$date': '2019-06-01T00:00:00.000+0200'},,
1,{'$oid': '5cf83b762f3843a016be4e48'},8a0c4123e924a50a958f51985eb71aea750fb072438035...,19,1,19,359,71,4,136,{'$date': '2019-06-01T00:00:00.000+0200'},28039.0,
2,{'$oid': '5cf83b762f3843a016be4e4f'},a6a9c1f74a68496000542210abc4fc2eba79e2756ad535...,17,1,7,375,39,4,38,{'$date': '2019-06-01T00:00:00.000+0200'},28013.0,
3,{'$oid': '5cf83b762f3843a016be4e53'},5706c0bd494acc02279d532821c9666b0e506d4f81c838...,4,1,21,264,66,5,90,{'$date': '2019-06-01T00:00:00.000+0200'},28009.0,
4,{'$oid': '5cf83b762f3843a016be4e54'},eb1b6d32bd4add5d5ff91af72a38786d61075c090383a5...,3,1,13,367,152,4,166,{'$date': '2019-06-01T00:00:00.000+0200'},28006.0,


## Importando datos desde la API de INE

In [12]:
import requests
import pandas as pd
import datetime


url_plantilla = 'http://servicios.ine.es/wstempus/js/ES/DATOS_SERIE/{codigo}?nult={num_datos}'

# codigo de la serie de datos a consultar y numero de datos

codigo = "EPA87"
num_datos = 12

url = url_plantilla.format(codigo=codigo,
                           num_datos=num_datos)

# realizar la descarga de los datos usando la libreria request, y leyendo el formato json

respuesta = requests.get(url)
datos = respuesta.json()

# obtenemos el nombre de la serie para nombrar la columna en el data frame
nombre = datos['Nombre']

# Creamos una serie con las fechas, y las convertimos a zona horaria española y formateamos

fecha_serie_utc = pd.to_datetime([x['Fecha'] for x in datos['Data']], unit='ms', utc=True)

fecha_serie_madrid = fecha_serie_utc.tz_convert('Europe/Madrid')

fecha_serie = [x.tz_localize(None).date()
               for x in fecha_serie_madrid]


# creamos una lista con los valores de la serie que vienen en el tag 'Valor'

ocupados_serie = [x['Valor']
                  for x in datos['Data']] 

print(type(ocupados_serie))

<class 'list'>


In [13]:
# contruimos un dataframe de pandas con los valores

import pandas as pd

tabla = pd.DataFrame(ocupados_serie,
                     index=fecha_serie,
                     columns=[nombre])

In [14]:
tabla

Unnamed: 0,Total Nacional. Ambos sexos. 16 y más años. Ocupados. Personas.
2016-10-01,18508.1
2017-01-01,18438.3
2017-04-01,18813.3
2017-07-01,19049.2
2017-10-01,18998.4
2018-01-01,18874.2
2018-04-01,19344.1
2018-07-01,19528.0
2018-10-01,19564.6
2019-01-01,19471.1


## Web scrapping de IMDB

In [3]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

# Download IMDB's Top 250 data
url = 'http://www.imdb.com/chart/top'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')

movies = soup.select('td.titleColumn')
links = [a.attrs.get('href') for a in soup.select('td.titleColumn a')]
crew = [a.attrs.get('title') for a in soup.select('td.titleColumn a')]
ratings = [b.attrs.get('data-value') for b in soup.select('td.posterColumn span[name=ir]')]
votes = [b.attrs.get('data-value') for b in soup.select('td.ratingColumn strong')]

imdb = []

# Store each item into dictionary (data), then put those into a list (imdb)
for index in range(0, len(movies)):
    # Seperate movie into: 'place', 'title', 'year'
    movie_string = movies[index].get_text()
    movie = (' '.join(movie_string.split()).replace('.', ''))
    movie_title = movie[len(str(index))+1:-7]
    year = re.search('\((.*?)\)', movie_string).group(1)
    place = movie[:len(str(index))-(len(movie))]
    data = {"movie_title": movie_title,
            "year": year,
            "place": place,
            "star_cast": crew[index],
            "rating": ratings[index],
            "vote": votes[index],
            "link": links[index]}
    imdb.append(data)
df = pd.DataFrame(imdb)
df


#for item in imdb:
#    print(item['place'], '-', item['movie_title'], '('+item['year']+') -', 'Starring:', item['star_cast'])

Unnamed: 0,movie_title,year,place,star_cast,rating,vote,link
0,Cadena perpetua,1994,1,"Frank Darabont (dir.), Tim Robbins, Morgan Fre...",9.222135858482503,,/title/tt0111161/
1,El padrino,1972,2,"Francis Ford Coppola (dir.), Marlon Brando, Al...",9.149250561129552,,/title/tt0068646/
2,El padrino: Parte II,1974,3,"Francis Ford Coppola (dir.), Al Pacino, Robert...",8.981653514976779,,/title/tt0071562/
3,El caballero oscuro,2008,4,"Christopher Nolan (dir.), Christian Bale, Heat...",8.968640326133936,,/title/tt0468569/
4,12 hombres sin piedad,1957,5,"Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb",8.9226055358279,,/title/tt0050083/
...,...,...,...,...,...,...,...
245,Aladdín,1992,246,"Ron Clements (dir.), Scott Weinger, Robin Will...",8.00530703177319,,/title/tt0103639/
246,Guardianes de la galaxia,2014,247,"James Gunn (dir.), Chris Pratt, Vin Diesel",8.004002990493685,,/title/tt2015381/
247,Juego sucio,2002,248,"Andrew Lau (dir.), Andy Lau, Tony Chiu-Wai Leung",8.003844365022681,,/title/tt0338564/
248,La batalla de Argel,1966,249,"Gillo Pontecorvo (dir.), Brahim Hadjadj, Jean ...",8.003757772096042,,/title/tt0058946/


## Acediendo a datos de twitter

In [12]:
import tweepy  
import time
import csv
 
access_token = "12085582-j0ie7G8xec7LaPIalqYljlUK2mh64PqDhxTu1UtNr"  
access_token_secret = "D9OjFAZ7DgKVo4jwhMKWjOK56BIWyTs59YGxBsFe8NygE"  
consumer_key = "rG4ptSFnS0xEpGZh29KIfwD4e"  
consumer_secret = "6FcFiGERa5u63JvzCbbA9UNHdxBubrVP6wKy6cFwoySHUZOcvT"  
 
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)  
auth.set_access_token(access_token, access_token_secret)  
api = tweepy.API(auth,wait_on_rate_limit=True)

In [None]:
csvFile = open('cop25.csv', 'a')
#Use csv Writer
csvWriter = csv.writer(csvFile)

for tweet in tweepy.Cursor(api.search,q="#cop25",count=5000,
                           lang="en",
                           since="2019-10-12").items():
    print (tweet.created_at, tweet.text)
    csvWriter.writerow([tweet.created_at, tweet.text.encode('utf-8')])

In [None]:
csvFile = open('final_balonmano.csv', 'a')
#Use csv Writer
csvWriter = csv.writer(csvFile)

for tweet in tweepy.Cursor(api.search,q="#GuerrerasTVE",count=5000,
                           lang="es",
                           since="2019-10-12").items():
    print (tweet.created_at, tweet.text)
    csvWriter.writerow([tweet.created_at, tweet.text.encode('utf-8')])

## Bajando datos de la calidad del aire

También existe un dataset en Kaggle, pero vamos a ver un ejemplo

In [81]:

import pandas as pd
import requests
import io
import csv

url2 = "http://www.mambiente.munimadrid.es/opendata/horario.txt"

datos=pd.read_csv(url2, header=None)#.content




In [82]:
datos

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,50,51,52,53,54,55,56
0,28,79,4,1,38,2,2019,12,14,7.0,...,0,N,0,N,0,N,0,N,0,N
1,28,79,4,6,48,2,2019,12,14,0.3,...,0,N,0,N,0,N,0,N,0,N
2,28,79,4,7,8,2,2019,12,14,8.0,...,0,N,0,N,0,N,0,N,0,N
3,28,79,4,8,8,2,2019,12,14,34.0,...,0,N,0,N,0,N,0,N,0,N
4,28,79,4,12,8,2,2019,12,14,47.0,...,0,N,0,N,0,N,0,N,0,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,28,79,60,7,8,2,2019,12,14,1.0,...,0,N,0,N,0,N,0,N,0,N
151,28,79,60,8,8,2,2019,12,14,14.0,...,0,N,0,N,0,N,0,N,0,N
152,28,79,60,10,47,2,2019,12,14,12.0,...,0,N,0,N,0,N,0,N,0,N
153,28,79,60,12,8,2,2019,12,14,16.0,...,0,N,0,N,0,N,0,N,0,N
