# Web Scraping

In [1]:
# Import the libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import html5lib

In [2]:
# Download the portal's dataset search page (query string: query=incendio, sort_by=changed)
# and parse it so the dataset links can be extracted from it.
url = 'https://datos.jalisco.gob.mx/search/type/dataset?query=incendio&sort_by=changed'

# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

html = response.content
soup = BeautifulSoup(html, 'html5lib')

In [3]:
# Collect every anchor of the search page, then keep the positions of the anchors
# whose visible text starts with the dataset title we are interested in.
items = soup.find_all('a')
item_index = [
    position
    for position, anchor in enumerate(items)
    if anchor.text.startswith('Incendios forestales en Bosque La Primavera')
]

# Keep the href of each selected anchor, skipping links that contain 'resource'.
urls = [
    href
    for href in (items[position].attrs['href'] for position in item_index)
    if 'resource' not in href
]
urls

['/dataset/incendios-forestales-en-bosque-la-primavera-2019',
 '/dataset/incendios-forestales-en-bosque-la-primavera-septiembre-diciembre-2018',
 '/dataset/incendios-forestales-en-bosque-la-primavera-enero-abril-2018']

In [4]:
# Visit every dataset page found above and collect the download link of its CSV file.

# Base URL of the portal; the entries of `urls` are appended to it.
URL = "https://datos.jalisco.gob.mx"

# Fetch and parse each dataset page. The timeout prevents an indefinite hang and
# raise_for_status() aborts on an HTTP error instead of parsing an error page.
soups = []
for page in urls:
    response = requests.get(URL + page, timeout=30)
    response.raise_for_status()
    soups.append(BeautifulSoup(response.content, 'html5lib'))

# Obtain the CSV link of each page: the href of the first anchor whose class is
# 'btn btn-primary data-link'. Indexing with [0] raises IndexError if a page has no such anchor.
csv_url = [
    page_soup.find_all('a', {'class': 'btn btn-primary data-link'})[0].attrs['href']
    for page_soup in soups
]
csv_url

# NOTE: the attempt below did not work and is kept commented out for reference only.
# It refers to `final_url`, which is not defined in the cells shown here.

#final_records = [BeautifulSoup(requests.get(URL+urls).content, 'html5lib') for urls in final_url]
#records = [element.find_all('span', {'class':'recline-results-info'}) for element in final_records]
#for i in final_url:
#    print(URL+i)
#lista = final_records[0].find_all('div',{'class': 'navigation'})
#lista

['https://datos.jalisco.gob.mx/sites/default/files/incendios_forestales_opd_blp-ene-abril-2019.csv',
 'https://datos.jalisco.gob.mx/sites/default/files/bosque_la_primavera_incendios_forestales_sep-dic_2018.csv',
 'https://datos.jalisco.gob.mx/sites/default/files/incendios_forestales_en_blp_enero-abril-18_13jul2018.csv']

In [5]:
# Download every CSV file and split it into rows and fields.

# Raw bytes of each file. The timeout prevents an indefinite hang and
# raise_for_status() aborts on an HTTP error instead of treating an error page as CSV data.
crude_data = []
for link in csv_url:
    response = requests.get(link, timeout=30)
    response.raise_for_status()
    crude_data.append(response.content)

# Each file is pushed through BeautifulSoup and serialized back to a string; the CSV
# text is then cut out from between the serialized <body> opening and closing markers.
# The split markers expect an indented html/head/body skeleton around the text.
# NOTE(review): 'ascii' is passed positionally to decode(); whether it acts as an
# encoding or as a formatting flag depends on the installed bs4 version - confirm.
# NOTE(review): the first header seen downstream starts with two spaces, presumably
# left over from this serialization - verify before relying on column names.
texts = [BeautifulSoup(raw, 'html5lib').decode('ascii').split('<html>\n <head>\n </head>\n <body>\n')[1].split('</body>\n</html>')[0]
         for raw in crude_data]

# One list of rows per file (plain Python lists, not DataFrames). Each row is a list of
# fields produced by a plain split on ','; quoted fields containing commas are not handled.
final_df = [[row.split(',') for row in text.split('\n')] for text in texts]

In [10]:
# One row of the third file splits into 20 fields instead of the expected 18 and
# disturbs the rest of the data, so rows with exactly 20 fields are dropped here.
#
# The rows are filtered in a single pass instead of calling pop() inside an index
# loop: popping while iterating over range(len(...)) skips the element that follows
# each removal and ends with an IndexError because the list has become shorter.
# Slice assignment keeps the same list object, so the change is still made in place.
final_df[2][:] = [row for row in final_df[2] if len(row) != 20]

In [11]:
# Generate the final dataframe with all the raw data from the downloaded files.

# For each file: the first row provides the column names; data[1:-1] skips that
# header and also drops the last entry (presumably the trailing fragment left by
# splitting the text on newlines - verify).
dataframes = [pd.DataFrame(data[1:-1], columns=data[0]) for data in final_df]

# Concatenate once instead of appending inside a loop: DataFrame.append was removed
# in pandas 2.0 and re-copies the accumulated frame on every iteration. The original
# row indices are kept (no ignore_index), as they were with append. An empty list
# still yields an empty DataFrame rather than an error from pd.concat.
fire_df = pd.concat(dataframes) if dataframes else pd.DataFrame()

# Save the combined raw data next to the notebook.
fire_df.to_csv('fire_data.csv')

# Data cleaning

In [19]:
# Preview the column names in lower case.
# This only displays the lower-cased names; fire_df itself is not modified.
[name.lower() for name in fire_df.columns]

['  número de incendio',
 'fuente',
 'zona',
 'paraje',
 'municipio',
 'coordenadas geográficas norte',
 'coordenadas geográficas oeste',
 'causa',
 'fecha de registro',
 'estrato afectado forestal hojarasca',
 'estrato afectado forestal pasto',
 'estrato afectado forestal arbusto',
 'estrato afectado forestal renuevo',
 'estrato afectado no forestal cultivos',
 'estrato afectado no forestal pastizal',
 'estrato afectado no forestal otros',
 'superficie afectada total',
 'número de participantes']

In [None]:
# Count the missing values in each column
fire_df.isna().sum()

In [14]:
# Obtain a description of the data, shown with one row per column
fire_df.describe().transpose()

Unnamed: 0,count,unique,top,freq
NÚMERO DE INCENDIO,205,117,49,2
FUENTE,205,1,OPD BLP,205
ZONA,205,2,ZI,134
PARAJE,205,111,SAN JOSE DE LA MONTAÑA,18
MUNICIPIO,205,7,ZAPOPAN,108
COORDENADAS GEOGRÁFICAS NORTE,205,205,"""N 20° 37´ 31.3""""""",1
COORDENADAS GEOGRÁFICAS OESTE,205,202,"""W 103° 41´ 11.0""""""",2
CAUSA,205,15,INTENCIONAL,81
FECHA DE REGISTRO,205,123,17/02/2019,4
ESTRATO AFECTADO FORESTAL HOJARASCA,205,63,0.00,71
