In [None]:
# import the necessary libraries

import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import requests
from io import StringIO
import re

We start by importing the wildfire statistics for each German county.

In [None]:
# There are many files to download, so every source is written down once as a
# (data frame name, Google Drive share link) pair. The two parallel lists that
# the download loop iterates over are then derived from these pairs.

fire_sources = [
    ('fires_baden_wurtt', 'https://drive.google.com/file/d/16LJJhGZ7JmSi_MlkfJEYHGg0vDlgIINS/view?usp=sharing'),         # Baden Wurttemberg
    ('fires_bavaria', 'https://drive.google.com/file/d/1edEUTQ1SYXh27NOm87OLuaw8_IE-BBCO/view?usp=sharing'),             # Bavaria
    ('fires_berlin', 'https://drive.google.com/file/d/1DSlAUO0XhwdVB1gANwzkiNXDrH2155kp/view?usp=sharing'),              # Berlin
    ('fires_brandenburg', 'https://drive.google.com/file/d/1VlVKD_htNUp9wq-_Kkpc-JGzsikErTO2/view?usp=sharing'),         # Brandenburg
    ('fires_bremen', 'https://drive.google.com/file/d/1b4D6kynOlbQrFdJ3bptlOVwTLYTePWPt/view?usp=sharing'),              # Bremen
    ('fires_hamburg', 'https://drive.google.com/file/d/1WH9rf62LTJr_daHQUceATJL2d5nEpO6S/view?usp=sharing'),             # Hamburg
    ('fires_hessen', 'https://drive.google.com/file/d/1l970d7SUf7wH6wpxCTZQH-TlHROF6olZ/view?usp=sharing'),              # Hessen
    ('fires_lower_sax', 'https://drive.google.com/file/d/1YSIhita-7tAITOwp2mopMdz6lMrf8U2L/view?usp=sharing'),           # Lower Saxony
    ('fires_meck_vor', 'https://drive.google.com/file/d/1SBFubZHRAwk2vKI0oJ34cFVp-7JAEUOT/view?usp=sharing'),            # Mecklenburg-Vorpommern
    ('fires_nordrhein_west', 'https://drive.google.com/file/d/1PL8CBZZqbvL63lY-j9GgFDtBFMqs6Fal/view?usp=sharing'),      # Nordrhein-Westfalen
    ('fires_rhein_pal', 'https://drive.google.com/file/d/1J2FdfdaflqjsDVRGJqfNJ6bPNhnDMwJ8/view?usp=sharing'),           # Rhineland-Palatinate
    ('fires_saarland', 'https://drive.google.com/file/d/1Kq22VcUCWS5VkvJwypzN1tXgABTY5uql/view?usp=sharing'),            # Saarland
    ('fires_sax_ahn', 'https://drive.google.com/file/d/1A5ZBP_d4KyylsBPWSTM2YsNanjYamU60/view?usp=sharing'),             # Saxony Anhalt
    ('fires_saxony', 'https://drive.google.com/file/d/1_S1KdswjTGZon-syP6dAqiI0iPjCRZK6/view?usp=sharing'),              # Saxony
    ('fires_schleswig_holstein', 'https://drive.google.com/file/d/1GDyw-vQtVtPOn-NOzmnQ46aOvQ9QrCcW/view?usp=sharing'),  # Schleswig-Holstein
    ('fires_thuringia', 'https://drive.google.com/file/d/1rbqCACS6TtZMOrdkysK9ner4M_RH6oLq/view?usp=sharing'),           # Thuringia
    ('fires_germany', 'https://drive.google.com/file/d/1PIZwP4rkEWHQFhpDWF-dLAvkMVnyvdYW/view?usp=sharing'),             # Germany
]

# one list with the URLs and one with the name of each data frame, in matching order
urls = [link for _, link in fire_sources]
counties = [name for name, _ in fire_sources]

# We also create an empty dictionary to store the data frames, keyed by data frame name.
# While trying the loop we had some issues with the requests, so we add a few checks
# to make sure everything works fine.

counties_df = {}

for url, county in zip(urls, counties):                                           # we loop over the two lists
    # First check that the share link responds. The timeout stops a stalled
    # connection from hanging the notebook indefinitely.
    try:
        response = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while fetching '{url}': {str(e)}")
        continue  # Skip processing this URL

    if response.status_code != 200:
        print(f"URL '{url}' returned status code: {response.status_code}")
        continue  # Skip processing this URL
    print(f"URL '{url}' is accessible.")

    # The share link looks like .../file/d/<file id>/view?...; the second-to-last
    # path segment is the file id that the direct-download address needs.
    path = 'https://drive.google.com/uc?id=' + url.split('/')[-2]

    try:                                                                          # now we try to read the CSV
        county_df = pd.read_csv(path)
    except Exception as e:
        # Best effort: report the failure and carry on with the remaining counties.
        print(f"An error occurred while reading CSV for '{county}': {str(e)}")
        continue

    counties_df[county] = county_df                                               # we store the data frame under the name of its county
    print(f"DataFrame for '{county}' created and stored.")
    # info() prints its own report (dtypes, null counts, ...) and returns None,
    # so it is called directly rather than wrapped in print().
    county_df.info()

URL 'https://drive.google.com/file/d/16LJJhGZ7JmSi_MlkfJEYHGg0vDlgIINS/view?usp=sharing' is accessible.
DataFrame for 'fires_baden_wurtt' created and stored.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   year/month       156 non-null    object 
 1   burnt_area(ha)   156 non-null    float64
 2   amount_of fires  156 non-null    int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 3.8+ KB
None
URL 'https://drive.google.com/file/d/1edEUTQ1SYXh27NOm87OLuaw8_IE-BBCO/view?usp=sharing' is accessible.
DataFrame for 'fires_bavaria' created and stored.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   year/month       156 non-null    object
 1   burnt_area(ha)   156 non-null    int64 
 2 

Now, before we concatenate the data frames, we need to add a column with the name of the county. Because we are planning to eventually build a machine learning model, we need to know which county each wildfire record belongs to.

In [None]:
# Every stored frame gets a 'county' column holding the readable county name.
# The name is looked up by the frame's own dictionary key rather than by its
# position in the global `counties` list: later cells rebind `counties` to the
# rain_* / air_temp_* keys, so a positional lookup breaks if this cell is re-run.

county_labels = {
    'fires_baden_wurtt': 'Baden Wurttemberg',
    'fires_bavaria': 'Bavaria',
    'fires_berlin': 'Berlin',
    'fires_brandenburg': 'Brandenburg',
    'fires_bremen': 'Bremen',
    'fires_hamburg': 'Hamburg',
    'fires_hessen': 'Hessen',
    'fires_lower_sax': 'Lower Saxony',
    'fires_meck_vor': 'Mecklenburg-Vorpommern',
    'fires_nordrhein_west': 'Nordrhein-Westfalen',
    'fires_rhein_pal': 'Rhineland-Palatinate',
    'fires_saarland': 'Saarland',
    'fires_sax_ahn': 'Saxony Anhalt',
    'fires_saxony': 'Saxony',
    'fires_schleswig_holstein': 'Schleswig-Holstein',
    'fires_thuringia': 'Thuringia',
    'fires_germany': 'Germany',
}

# The plain list of names is kept as well, in the same order as before.
county_names = list(county_labels.values())

for county, county_df in counties_df.items():
    county_df['county'] = county_labels[county]

In [None]:
# Check that the 'county' column was added correctly, using the 'fires_germany' frame as a sample

counties_df['fires_germany'].head()

Unnamed: 0,year/month,burnt_area(ha),amount_of fires,county
0,2010-01,0,0,Germany
1,2010-02,0,0,Germany
2,2010-03,16,18,Germany
3,2010-04,46,128,Germany
4,2010-05,17,52,Germany


In [None]:
# Gather every per-county frame held in the dictionary
dfs_to_concat = [frame for frame in counties_df.values()]

# Stack the frames on top of each other (row-wise, the default axis). Each frame
# keeps its own row labels, so index values repeat from one county to the next.
wildfires_df = pd.concat(dfs_to_concat)

# Show the combined DataFrame
wildfires_df


Unnamed: 0,year/month,burnt_area(ha),amount_of fires,county
0,2010-01,0.0,0,Baden Wurttemberg
1,2010-02,0.0,0,Baden Wurttemberg
2,2010-03,0.8,3,Baden Wurttemberg
3,2010-04,4.3,7,Baden Wurttemberg
4,2010-05,0.0,0,Baden Wurttemberg
...,...,...,...,...
151,2022-08,351.0,595,Germany
152,2022-09,64.0,180,Germany
153,2022-10,13.0,50,Germany
154,2022-11,16.0,43,Germany


In [None]:
# Sanity check on the combined frame: number of rows, column dtypes and non-null counts

wildfires_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2652 entries, 0 to 155
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   year/month       2652 non-null   object 
 1   burnt_area(ha)   2652 non-null   float64
 2   amount_of fires  2652 non-null   int64  
 3   county           2652 non-null   object 
dtypes: float64(1), int64(1), object(2)
memory usage: 103.6+ KB


In [None]:
# from google.colab import files

# wildfires_df.to_csv('wildfires.csv', index=False)
# files.download('wildfires.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Now we can create a couple of functions to handle the additional data we need to add to the data frame we just created.

In [None]:
# Readable names used for the 'county' column when the caller does not pass its own.
_DEFAULT_COUNTY_NAMES = (
    'Baden Wurttemberg', 'Bavaria', 'Berlin', 'Brandenburg', 'Bremen', 'Hamburg', 'Hessen',
    'Lower Saxony', 'Mecklenburg-Vorpommern', 'Nordrhein-Westfalen', 'Rhineland-Palatinate',
    'Saarland', 'Saxony Anhalt', 'Saxony', 'Schleswig-Holstein', 'Thuringia', 'Germany',
)


def _drive_download_url(url):
    """Turn a Google Drive share link into the matching direct-download address."""
    # Share links look like .../file/d/<file id>/view?...; take what follows '/d/'.
    match = re.search(r'/d/([^/?#]+)', url)
    # Fall back to the second-to-last path segment for links of another shape.
    file_id = match.group(1) if match else url.split('/')[-2]
    return 'https://drive.google.com/uc?id=' + file_id


def get_dataframe(counties, urls, county_names=None, timeout=30):
    """Download one CSV per county and stack them into a single DataFrame.

    Each share link is first requested to check that it responds; the CSV is
    then read from the direct-download address built from the link. A county
    whose link does not answer with status 200, or whose CSV cannot be read,
    is reported with a printed message and left out of the result.

    Args:
        counties: Names identifying each download; used in the progress messages.
        urls: Google Drive share links, in the same order as `counties`.
        county_names: Readable labels written to the 'county' column, in the
            same order as `counties`. Defaults to the built-in list of 17
            names, matched by position.
        timeout: Seconds to wait for the accessibility check before giving up.

    Returns:
        A DataFrame with the rows of every downloaded CSV plus a 'county'
        column. The frames are stacked row-wise and the index is not reset,
        so each county's own row labels are kept and index values repeat.

    Raises:
        ValueError: If `counties`, `urls` and `county_names` do not line up,
            or if no CSV could be downloaded at all.
    """
    counties = list(counties)
    urls = list(urls)

    # zip() would silently drop the surplus entries of the longer list.
    if len(counties) != len(urls):
        raise ValueError(
            f"counties ({len(counties)}) and urls ({len(urls)}) must have the same length"
        )

    if county_names is None:
        if len(counties) > len(_DEFAULT_COUNTY_NAMES):
            raise ValueError(
                f"only {len(_DEFAULT_COUNTY_NAMES)} default county names are available for "
                f"{len(counties)} counties; pass county_names explicitly"
            )
        county_names = _DEFAULT_COUNTY_NAMES[:len(counties)]
    else:
        county_names = list(county_names)
        if len(county_names) != len(counties):
            raise ValueError(
                f"county_names ({len(county_names)}) and counties ({len(counties)}) "
                "must have the same length"
            )

    frames = []

    for url, county, label in zip(urls, counties, county_names):
        # First check that the share link responds; the timeout stops a stalled
        # connection from hanging the notebook.
        try:
            response = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print(f"An error occurred while fetching '{url}': {str(e)}")
            continue  # Skip processing this URL

        if response.status_code != 200:
            print(f"URL '{url}' returned status code: {response.status_code}")
            continue  # Skip processing this URL
        print(f"URL '{url}' is accessible.")

        path = _drive_download_url(url)

        try:
            county_df = pd.read_csv(path)
        except Exception as e:
            # Best effort: report the failure and carry on with the other counties.
            print(f"An error occurred while reading CSV for '{county}': {str(e)}")
            continue

        print(f"DataFrame for '{county}' created and stored.")
        # info() prints its own report and returns None, so it is not wrapped in print().
        county_df.info()

        # Tag every row with its county before the frames are combined.
        frames.append(county_df.assign(county=label))

    if not frames:
        raise ValueError(
            "No data could be downloaded; check the URLs and the messages printed above."
        )

    # axis=0 stacks the frames row-wise (one county after another).
    return pd.concat(frames, axis=0)


In [None]:
# Precipitation sources: one (data frame name, Google Drive share link) pair per county.
# NOTE(review): Bremen and Hamburg use the same share link below - confirm that this is
# intended and not a copy-paste slip.
rain_sources = [
    ('rain_baden_wurtt', 'https://drive.google.com/file/d/1k-kO8EVT22KkAHJlqiHgNtWfdwoEIN_d/view?usp=sharing'),         # Baden Wurttemberg
    ('rain_bavaria', 'https://drive.google.com/file/d/1v6mA2ULJIej8FGQ1KzwxTQGJM31RtIDP/view?usp=sharing'),             # Bavaria
    ('rain_berlin', 'https://drive.google.com/file/d/1vcqT7IwANPEEpKQDgwuKq_o9k32u5VYV/view?usp=sharing'),              # Berlin
    ('rain_brandenburg', 'https://drive.google.com/file/d/1uqr5BRsQpBtCM79HODqVi5WZF3eIpOHb/view?usp=sharing'),         # Brandenburg
    ('rain_bremen', 'https://drive.google.com/file/d/1eyIHYJiBvsB17ImijaRr1PcdGkZfk14V/view?usp=sharing'),              # Bremen
    ('rain_hamburg', 'https://drive.google.com/file/d/1eyIHYJiBvsB17ImijaRr1PcdGkZfk14V/view?usp=sharing'),             # Hamburg
    ('rain_hessen', 'https://drive.google.com/file/d/1eLD1ZEK35Otw4SGqDWW1PBh5Z8ZfTiFC/view?usp=sharing'),              # Hessen
    ('rain_lower_sax', 'https://drive.google.com/file/d/1fREFSLt4eLgFLu8v-WsD7cMpFntAfRTe/view?usp=sharing'),           # Lower Saxony
    ('rain_meck_vor', 'https://drive.google.com/file/d/1YclxOCQ5DKx-BzJk9rHuWAVOyj5AajxB/view?usp=sharing'),            # Mecklenburg-Vorpommern
    ('rain_nordrhein_west', 'https://drive.google.com/file/d/1dH9bH8ucQRI9hLIMCqZTmS1Z3IEAXqaE/view?usp=sharing'),      # Nordrhein-Westfalen
    ('rain_rhein_pal', 'https://drive.google.com/file/d/1QHXmWNNcYuqsw2-YdtSeNzVa-_QnR5BH/view?usp=sharing'),           # Rhineland-Palatinate
    ('rain_saarland', 'https://drive.google.com/file/d/1tfCSz7TQY0z5xizp-dlR8zeZ9Yy7V01z/view?usp=sharing'),            # Saarland
    ('rain_sax_ahn', 'https://drive.google.com/file/d/1u_sH_-_oRgQ5rJfO6NgeKpR8PV69QJk6/view?usp=sharing'),             # Saxony Anhalt
    ('rain_saxony', 'https://drive.google.com/file/d/1i6Eh_45Kdp2825sN472nKS03VrUnp413/view?usp=sharing'),              # Saxony
    ('rain_schleswig_holstein', 'https://drive.google.com/file/d/1ipqk37CAqMKCR7TxS6u92sE-eN-gZhEQ/view?usp=sharing'),  # Schleswig-Holstein
    ('rain_thuringia', 'https://drive.google.com/file/d/1nsKkZdSU1buSZWadGHw1CdznEif67vKk/view?usp=sharing'),           # Thuringia
    ('rain_germany', 'https://drive.google.com/file/d/15362l9Apz9pmsUfJtqdvjJOLSknwf_9o/view?usp=sharing'),             # Germany
]

# Parallel lists in matching order, as get_dataframe expects them
urls = [link for _, link in rain_sources]
counties = [name for name, _ in rain_sources]

# Download every file and combine them into one precipitation data frame
precipitation = get_dataframe(counties, urls)

URL 'https://drive.google.com/file/d/1k-kO8EVT22KkAHJlqiHgNtWfdwoEIN_d/view?usp=sharing' is accessible.
DataFrame for 'rain_baden_wurtt' created and stored.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163 entries, 0 to 162
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   year           163 non-null    int64  
 1   month          163 non-null    int64  
 2   precipitation  163 non-null    float64
dtypes: float64(1), int64(2)
memory usage: 3.9 KB
None
URL 'https://drive.google.com/file/d/1v6mA2ULJIej8FGQ1KzwxTQGJM31RtIDP/view?usp=sharing' is accessible.
DataFrame for 'rain_bavaria' created and stored.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163 entries, 0 to 162
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   year           163 non-null    int64  
 1   month          163 non-null    int64  
 2   precipitation  163 non-nul

In [None]:
# Display the combined precipitation frame (the rendering abbreviates the middle rows)
precipitation

Unnamed: 0,year,month,precipitation,county
0,2011,1,76.3,Baden Wurttemberg
1,2012,1,114.0,Baden Wurttemberg
2,2013,1,52.8,Baden Wurttemberg
3,2014,1,55.1,Baden Wurttemberg
4,2015,1,107.6,Baden Wurttemberg
...,...,...,...,...
158,2018,12,104.1,Germany
159,2019,12,58.8,Germany
160,2020,12,57.5,Germany
161,2021,12,61.4,Germany


In [None]:
# Save the combined precipitation data as a CSV file in the working directory.
# This happens first so the file is written even where the download is unavailable.
precipitation.to_csv('precipitation.csv', index=False)

# files.download comes from Google Colab. The import is guarded so that the
# notebook still runs to the end elsewhere: the CSV stays on disk and only the
# browser download is skipped.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; 'precipitation.csv' was saved but not downloaded.")
else:
    files.download('precipitation.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Air temperature sources: one (data frame name, Google Drive share link) pair per county.
# NOTE(review): Bremen and Hamburg use the same share link below - confirm that this is
# intended and not a copy-paste slip.
air_temp_sources = [
    ('air_temp_baden_wurtt', 'https://drive.google.com/file/d/109sXlu3LdfMzydii6Hjrh8yKVTJExzdM/view?usp=sharing'),         # Baden Wurttemberg
    ('air_temp_bavaria', 'https://drive.google.com/file/d/1IDLhLcI7hF01K-GE1s4GOEqT5FZxy9L4/view?usp=sharing'),             # Bavaria
    ('air_temp_berlin', 'https://drive.google.com/file/d/1TBip5SutNNieXWDdypnImopxq3q1bbMj/view?usp=sharing'),              # Berlin
    ('air_temp_brandenburg', 'https://drive.google.com/file/d/1DBRrQIjX4dWCDme4aWaOdljS1P7R0Pn6/view?usp=sharing'),         # Brandenburg
    ('air_temp_bremen', 'https://drive.google.com/file/d/17lGU2Ke1V2b-b9BrAerL66Fhn3mZ65v2/view?usp=sharing'),              # Bremen
    ('air_temp_hamburg', 'https://drive.google.com/file/d/17lGU2Ke1V2b-b9BrAerL66Fhn3mZ65v2/view?usp=sharing'),             # Hamburg
    ('air_temp_hessen', 'https://drive.google.com/file/d/13n8Gkj-w7F9_s7eU6lUOLV-_XSejKgep/view?usp=sharing'),              # Hessen
    ('air_temp_lower_sax', 'https://drive.google.com/file/d/17z-BkwQp1ZXF-LpVxyDp7Ajiv3vbccv-/view?usp=sharing'),           # Lower Saxony
    ('air_temp_meck_vor', 'https://drive.google.com/file/d/1RLF74LEytx6dtrYk_2k8vP1PP7t57oCS/view?usp=sharing'),            # Mecklenburg-Vorpommern
    ('air_temp_nordrhein_west', 'https://drive.google.com/file/d/1HbwbXT39Rdtxet43LrE7EmT_fGJvNbfS/view?usp=sharing'),      # Nordrhein-Westfalen
    ('air_temp_rhein_pal', 'https://drive.google.com/file/d/1dOdaqhalVd0ywjasPInA6qU5446q_Rbi/view?usp=sharing'),           # Rhineland-Palatinate
    ('air_temp_saarland', 'https://drive.google.com/file/d/1sWz2x2Bc4OLTd49Ku8Ms2EvsKmex6ct7/view?usp=sharing'),            # Saarland
    ('air_temp_sax_ahn', 'https://drive.google.com/file/d/102LkR1ZCTwlFXvReTTsoxU_NYSAjz7_-/view?usp=sharing'),             # Saxony Anhalt
    ('air_temp_saxony', 'https://drive.google.com/file/d/1fVNzUF3Ve-w-F_eUfHQ0l0OdtFLuXKcM/view?usp=sharing'),              # Saxony
    ('air_temp_schleswig_holstein', 'https://drive.google.com/file/d/10BdbFiKsA5X-t1QCaAbFIm3SoiX6jB7E/view?usp=sharing'),  # Schleswig-Holstein
    ('air_temp_thuringia', 'https://drive.google.com/file/d/1lswBryK7yXTIyKVScZ3zmYzjAW-2ld2m/view?usp=sharing'),           # Thuringia
    ('air_temp_germany', 'https://drive.google.com/file/d/1P6fN_fywnhiuHvbJgWIt_QBCsKGJ-FU3/view?usp=sharing'),             # Germany
]

# Parallel lists in matching order, as get_dataframe expects them
urls = [link for _, link in air_temp_sources]
counties = [name for name, _ in air_temp_sources]

# Download every file and combine them into one air temperature data frame
air_temp = get_dataframe(counties, urls)

URL 'https://drive.google.com/file/d/109sXlu3LdfMzydii6Hjrh8yKVTJExzdM/view?usp=sharing' is accessible.
DataFrame for 'air_temp_baden_wurtt' created and stored.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164 entries, 0 to 163
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   year         164 non-null    int64  
 1   month        164 non-null    int64  
 2   temperature  164 non-null    float64
dtypes: float64(1), int64(2)
memory usage: 4.0 KB
None
URL 'https://drive.google.com/file/d/1IDLhLcI7hF01K-GE1s4GOEqT5FZxy9L4/view?usp=sharing' is accessible.
DataFrame for 'air_temp_bavaria' created and stored.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164 entries, 0 to 163
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   year         164 non-null    int64  
 1   month        164 non-null    int64  
 2   temperature  164 non-null    float64

In [None]:
# Display the combined air temperature frame (the rendering abbreviates the middle rows)
air_temp

Unnamed: 0,year,month,temperature,county
0,2010,1,-2.76,Baden Wurttemberg
1,2011,1,0.52,Baden Wurttemberg
2,2012,1,1.72,Baden Wurttemberg
3,2013,1,0.40,Baden Wurttemberg
4,2014,1,2.83,Baden Wurttemberg
...,...,...,...,...
159,2018,12,3.90,Germany
160,2019,12,3.75,Germany
161,2020,12,3.08,Germany
162,2021,12,2.58,Germany


In [None]:
# Save the combined air temperature data as a CSV file in the working directory.
air_temp.to_csv('air_temp.csv', index=False)

# files.download comes from Google Colab. Importing it here keeps this cell
# independent of the earlier export cell, and the guard lets the notebook run
# to the end elsewhere: the CSV stays on disk and only the browser download is skipped.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; 'air_temp.csv' was saved but not downloaded.")
else:
    files.download('air_temp.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# # urls = ['', # Baden Wurttemberg
#         '', # Bavaria
#         '', # Berlin
#         '', # Brandenburg
#         '', # Bremen
#         '', # Hamburg
#         '', # Hessen
#         '', # Lower Saxony
#         '', # Mecklenburg-Vorpommern
#         '', # Nordrhein-Westfalen
#         '', # Rhineland-Palatinate
#         '', # Saarland
#         '', # Saxony Anhalt
#         '', # Saxony
#         '', # Schleswig-Holstein
#         '', # Thuringia
#         ''] # Germany

# # counties = ['fires_baden_wurtt', 'fires_bavaria', 'fires_berlin', 'fires_brandenburg', 'fires_bremen', 'fires_hamburg', 'fires_hessen', 'fires_lower_sax',
#             'fires_meck_vor', 'fires_nordrhein_west', 'fires_rhein_pal', 'fires_saarland', 'fires_sax_ahn', 'fires_saxony', 'fires_schleswig_holstein',
#             'fires_thuringia', 'fires_germany']

# precipitation = get_dataframe(counties, urls)