# Web Scraping and API consumption

### Oriented to forest fires in Guadalajara
###### *This code is tightly coupled to the specific web pages it scrapes
##### (By Daniel Hernández Mota)

In [None]:
# Import the libraries
import os
from urllib.parse import urljoin

import html5lib
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize

# Web Scraping

In [None]:
# Function to obtain the first links 

def web_scrap_init(url, string, title_prefix='Incendios forestales en Bosque La Primavera', timeout=30):
    """Collect the links of the dataset pages listed on a search-results page.

    Parameters
    ----------
    url : str
        Address of the page that is downloaded and parsed.
    string : str
        Tag name handed to ``soup.find_all`` (the notebook passes ``'a'``).
    title_prefix : str, optional
        Only tags whose text starts with this prefix are kept. Defaults to
        the title the notebook was written for.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up, so a
        stalled connection cannot hang the notebook forever.

    Returns
    -------
    list of str
        The ``href`` of every matching tag, in page order, leaving out any
        link that contains ``'resource'``.
    """
    html = requests.get(url, timeout=timeout).content
    soup = BeautifulSoup(html, 'html5lib')

    urls = []
    for item in soup.find_all(string):
        if not item.text.startswith(title_prefix):
            continue
        # A matching tag without an href used to raise KeyError; skip it instead.
        href = item.attrs.get('href')
        if href is not None and 'resource' not in href:
            urls.append(href)
    return urls

# Search-results page of the Jalisco open-data portal (query "incendio").
url = 'https://datos.jalisco.gob.mx/search/type/dataset?query=incendio&sort_by=changed'
# Tag name passed to find_all inside web_scrap_init: anchors carry the dataset links.
find = 'a'
# Links of the dataset pages found on the search page; consumed by the next cell.
value1 = web_scrap_init(url,find)

In [None]:
# Use the links obtained above to find the next web pages to scrape.
def web_scrap_two(lst, timeout=30):
    """Visit each dataset page and return the link of its CSV download button.

    Parameters
    ----------
    lst : iterable of str
        Links of the dataset pages; relative links are resolved against the
        portal root, absolute links are used as they are.
    timeout : float, optional
        Seconds to wait for each page before ``requests`` gives up.

    Returns
    -------
    list of str
        One download link per input page, in the same order.

    Raises
    ------
    IndexError
        If a page has no ``btn btn-primary data-link`` anchor. The message
        names the offending page.
    """
    URL = "https://datos.jalisco.gob.mx"

    csv_url = []
    for page in lst:
        # urljoin copes with absolute links and with links lacking a leading
        # slash, both of which plain string concatenation would mangle.
        page_url = urljoin(URL, page)
        soup = BeautifulSoup(requests.get(page_url, timeout=timeout).content, 'html5lib')

        # Obtain the link of the CSV that holds the data.
        links = soup.find_all('a', {'class': 'btn btn-primary data-link'})
        if not links:
            raise IndexError('no CSV download link found on {}'.format(page_url))
        csv_url.append(links[0].attrs['href'])
    return csv_url

# CSV download links, one per dataset page found in the previous cell.
value2 = web_scrap_two(value1)

In [None]:
# Obtain all the data in a list and merge it into a single dataframe
# It is possible to obtain it through a CSV download!
def web_scrap_data_df(csv_url, save_path=None, timeout=30):
    """Download every CSV resource and stack them into one raw dataframe.

    Parameters
    ----------
    csv_url : iterable of str
        Download links of the CSV resources.
    save_path : str, optional
        When given, the merged dataframe is also written there as CSV.
        By default nothing is written.
    timeout : float, optional
        Seconds to wait for each download before ``requests`` gives up.

    Returns
    -------
    pandas.DataFrame
        All rows of all resources as strings, with each resource's first
        line used as its header. Empty when ``csv_url`` is empty.
    """
    crude_data = [requests.get(csv, timeout=timeout).content for csv in csv_url]

    # The payload is pushed through the HTML parser and the text between the
    # generated <body> tags is cut back out. The split markers depend on the
    # exact serialisation, so this expression is kept as it was.
    # NOTE(review): in bs4 the positional argument of decode() does not look
    # like an encoding (it seems to switch pretty-printing on) -- confirm
    # before simplifying this.
    texts = [BeautifulSoup(raw, 'html5lib').decode('ascii')
             .split('<html>\n <head>\n </head>\n <body>\n')[1]
             .split('</body>\n</html>')[0]
             for raw in crude_data]

    # One list of fields per line, one table per resource.
    tables = [[row.split(',') for row in text.split('\n')] for text in texts]

    dataframes = []
    for table in tables:
        header = table[0]
        # The first line is the header and the last line is discarded, as before.
        # A row with more fields than the header (one row of the third dataset
        # had 20 instead of 18) would make the DataFrame constructor fail, so
        # such rows are dropped in every dataset, not only the third one.
        rows = [row for row in table[1:-1] if len(row) <= len(header)]
        dataframes.append(pd.DataFrame(rows, columns=header))

    # pd.concat replaces the repeated DataFrame.append calls (removed in pandas 2).
    fire_df = pd.concat(dataframes) if dataframes else pd.DataFrame()

    if save_path is not None:
        fire_df.to_csv(save_path)
    return fire_df

# Raw fire records from all CSV resources found above; cleaned in the next section.
fire_df = web_scrap_data_df(value2)

# Data cleaning

In [None]:
# Rename columns: lower-case, strip double spaces, underscores for single
# spaces, and shorten 'número' / 'geográficas'.
newcols_dict = {colname: colname.lower().replace('  ','').replace(' ','_').replace('número', 'num').replace('geográficas', 'geo')
                for colname in fire_df.columns}
fire_df = fire_df.rename(columns=newcols_dict)

# Drop unnecessary columns. errors='ignore' drops whichever of them exist;
# the previous bare try/except dropped nothing at all when either one was
# missing, and hid the TypeError pandas 2 raises for a positional axis.
fire_df = fire_df.drop(columns=['level_0', 'index'], errors='ignore')

# Check for null values (only displayed when run interactively)
fire_df.isnull().sum()

# Obtain a description of the data (only displayed when run interactively)
fire_df.describe().T

# There is only one kind of value in "fuente", so the column carries no
# information and can be discarded.
fire_df = fire_df.drop(columns=['fuente'], errors='ignore')
fire_df = fire_df.reset_index(drop=True)

# Most rows are not classified in the estrato_afectado_* columns, so empty
# strings are replaced by 0.
fire_df = fire_df.replace('', 0.0)

# Change the type of the numerical data (read as strings) to float.
# Everything from the ninth column onwards is treated as numeric.
columns_fire = list(fire_df.columns)
columns_fire_num = columns_fire[8:]
fire_df = fire_df.astype({column: 'float' for column in columns_fire_num})

# Modify the dates to an actual date format.
# NOTE(review): no explicit format/dayfirst is given, so ambiguous day/month
# strings are parsed with the pandas defaults -- confirm against the source data.
fire_df['fecha_de_registro'] = pd.to_datetime(fire_df['fecha_de_registro'])

# Order the dataframe by date and renumber the rows.
fire_df = fire_df.sort_values(['fecha_de_registro']).reset_index(drop=True)

# The fire number is not needed afterwards. errors='ignore' keeps a missing
# column from cancelling the sort, which the previous bare try/except did.
fire_df = fire_df.drop(columns=['num_de_incendio'], errors='ignore')
# One west coordinate is malformed in the source data and is overwritten by
# hand. This must happen BEFORE the column is copied into `west`; previously
# the correction was applied after the copy and never reached the conversion.
# The membership test keeps .loc from appending a new row when that label
# does not exist.
if (205-34) in fire_df.index:
    fire_df.loc[(205-34),'coordenadas_geo_oeste'] = "103°  33' 27.1''"


def _gms_to_gd(parts):
    """Turn [degrees, minutes, seconds] strings into decimal degrees."""
    return float(parts[0])+float(parts[1])/60+float(parts[2])/3600


# Change North coordinates in GMS to GD
nort = list(fire_df['coordenadas_geo_norte'])
# Strip the hemisphere letter, quotes and spaces, and turn the degree and
# minute marks into '|' so the string splits into its three parts.
NGMS = [coordinate.replace('N','').replace('"','').replace(' ','').replace('°','|').replace('´´','').replace('´','|').replace("''","").replace("'","|").split('|') for coordinate in nort]
NGD = [_gms_to_gd(coor) for coor in NGMS]

# Change West coordinates in GMS to GD
west = list(fire_df['coordenadas_geo_oeste'])
# Same clean-up as above; additionally 'W' is removed and a capital letter O
# is replaced by the digit 0 (presumably an "Oeste" marker -- as a leading
# or trailing zero it does not change the parsed number).
WGMS = [coordinate.replace('W','').replace('O','0').replace('"','').replace(' ','').replace('°','|').replace('´´','').replace('´','|').replace("''","").replace("'","|").split('|') for coordinate in west]
WGD = [_gms_to_gd(coor) for coor in WGMS]

# Store the decimal values back; the Series align on the 0..n-1 row index.
fire_df['coordenadas_geo_oeste'] = pd.Series(WGD)
fire_df['coordenadas_geo_norte'] = pd.Series(NGD)

# Obtain some insights from the numerical data (only displayed when run interactively)
fire_df.describe().T
fire_df.head(2)

# Set to True to export the cleaned fire data. This replaces the former
# `if True: pass / else:` construct, whose export branch could never run.
SAVE_FIRE_DATA_CLEAN = False
if SAVE_FIRE_DATA_CLEAN:
    fire_df.to_csv('fire_data_clean.csv')

In [None]:
# Pull out the three columns the weather API needs: position and date of each fire.
api_columns = ['coordenadas_geo_norte', 'coordenadas_geo_oeste', 'fecha_de_registro']
needed_data = fire_df[api_columns]
latitude, longitude, date = (list(needed_data[name]) for name in api_columns)

# Timestamp.value is integer-divided by 10**9 to get the whole-number time
# value that is placed in the request URL.
date_vals = [stamp.value // 10**9 for stamp in date]

# API consumption

In [None]:
def api_consumption_darksky(url, key, latitude, longitude, date_vals, save_path=None, timeout=30):
    """Request the daily weather for every fire and merge the answers.

    One request is sent per entry of ``date_vals``, built as
    ``<url><key>/<latitude>,-<longitude>,<date>``. The longitude is sent with
    a leading minus sign, so the caller passes it without one.

    Parameters
    ----------
    url : str
        Base address of the forecast endpoint, ending in ``/``.
    key : str
        API key inserted into the request path.
    latitude, longitude : sequence
        Coordinates, at least one per entry of ``date_vals``.
    date_vals : sequence
        Time value of each request, placed last in the request path.
    save_path : str, optional
        When given, the merged dataframe is also written there as CSV.
        By default nothing is written.
    timeout : float, optional
        Seconds to wait for each answer before ``requests`` gives up.

    Returns
    -------
    pandas.DataFrame
        The ``daily.data`` entries of all answers in request order,
        restricted to the columns every answer provides. Empty when
        ``date_vals`` is empty.

    Raises
    ------
    IndexError
        If there are fewer coordinates than dates, or an answer carries no
        daily data.
    KeyError
        If an answer has no ``daily``/``data`` entry.
    """
    # Fail before any request is sent instead of half-way through the loop.
    if min(len(latitude), len(longitude)) < len(date_vals):
        raise IndexError('latitude and longitude need one entry per value in date_vals')

    json_list = [
        requests.get('{}{}/{},-{},{}'.format(url, key, str(lat), str(lon), str(when)),
                     timeout=timeout).json()
        for lat, lon, when in zip(latitude, longitude, date_vals)
    ]

    # The answers do not all carry the same fields, so each one becomes its
    # own small dataframe first.
    pd_met_info = [pd.DataFrame(response['daily']['data']) for response in json_list]
    if any(frame.empty for frame in pd_met_info):
        raise IndexError('a response contained no daily data')

    if pd_met_info:
        # Keep only the columns present in every answer. Intersecting the
        # frames directly (rather than six fixed buckets, some possibly empty)
        # works for any number of distinct answer shapes.
        shared = set(pd_met_info[0].columns)
        for frame in pd_met_info[1:]:
            shared &= set(frame.columns)
        new_columns = [column for column in pd_met_info[0].columns if column in shared]

        # pd.concat replaces the repeated DataFrame.append calls (removed in pandas 2).
        meteorological_pd = pd.concat([frame[new_columns] for frame in pd_met_info])
    else:
        meteorological_pd = pd.DataFrame()

    if save_path is not None:
        meteorological_pd.to_csv(save_path)
    return meteorological_pd

# NOTE(review): this API key is committed in the source. Prefer supplying it
# through the DARKSKY_API_KEY environment variable and rotate the value
# below; it is kept only as a fallback so the notebook behaves as before.
key = os.environ.get('DARKSKY_API_KEY', '3674abb3a8982b4a3359284ea8d0986c')
# NOTE(review): confirm this endpoint is still in service before relying on it.
url = 'https://api.darksky.net/forecast/'
# One weather record per fire, requested with the coordinates and dates prepared above.
meteorological_pd = api_consumption_darksky(url, key, latitude, longitude,date_vals)

# Data cleaning

In [None]:
# Check for null data (only displayed when run interactively)
meteorological_pd.isnull().sum()
meteorological_pd[meteorological_pd["cloudCover"].isnull()][['temperatureMin', 'time', 'uvIndex','cloudCover']]
meteorological_pd = meteorological_pd.fillna(0)

# Sort and refresh the row numbers. A stable sort keeps records with the same
# 'time' in their incoming order, so the row order is deterministic before
# the records are joined by position with the fire data.
meteorological_pd = meteorological_pd.sort_values('time', axis=0, kind='mergesort').reset_index(drop=True)
meteorological_pd.head(4)

# Keep only the relevant information
meteorological_pd = meteorological_pd[['temperatureMin','temperatureMax','apparentTemperatureMin','apparentTemperatureMax', 'temperatureLow','temperatureHigh',
                  'time','dewPoint', 'uvIndex','uvIndexTime', 'windBearing','cloudCover','icon','windSpeed', 'humidity','summary'
                  ]]

# Set to True to export the cleaned weather data. This replaces the former
# `if True: pass / else:` construct, whose export branch could never run.
SAVE_METEOROLOGICAL_CLEAN = False
if SAVE_METEOROLOGICAL_CLEAN:
    meteorological_pd.to_csv('meteorological_clean.csv')

# Concatenate both dataframes

In [None]:
# Put the fire records and the weather records side by side; rows are
# matched by their index labels.
forest_fire = pd.concat(objs=[fire_df, meteorological_pd], axis='columns')

# Summary statistics of the combined table, one row per column
forest_fire.describe().T