# Data preparation
## Import Packages

In [1]:
import pandas as pd
import numpy as np
import calendar


## Read meta data
`meta_data` is a table of 50 cities in Europe and Türkiye.
Each row stores the
id, country, name, country_code, latitude and longitude
of one city.

In [2]:
# Fetch the CAMS city catalogue (one row per city) straight from GitHub.
meta_data = pd.read_csv(
    "https://raw.githubusercontent.com/CopernicusAtmosphere/"
    "air-quality-covid19-response/master/CAMS_AQ_LOCATIONS_V1.csv"
)

# Read air quality ("aq") data for the years from 2015 to 2023
The Copernicus Atmosphere project stores the data yearwise in .csv files.

Each row then has the structure: basetime, city_id, NO2, O3, PM10, PM2.5

Basetime is recorded on a daily basis at 2 p.m.
The four values are real measurements of
- nitrogen dioxide (NO2),
- ozone (O3),
- and two classes of particulate matter (extremely small solid particles and liquid droplets suspended in the air):
  - particles with a diameter of 10 micrometres or less (PM10),
  - and particles with a diameter of 2.5 micrometres or less (PM2.5).


In [3]:
# Yearly exports to download; range() excludes its upper bound, so this covers 2015-2023.
years = range(2015, 2024)

# Per-year DataFrames, keyed by the year as a string.
aq_data = {}

# Download one CSV per year from the CAMS GitHub repository.
for year in years:
    url = (
        "https://raw.githubusercontent.com/CopernicusAtmosphere/"
        "air-quality-covid19-response/master/"
        f"cams_air_quality_analysis_{year}.csv"
    )
    aq_data[str(year)] = pd.read_csv(url)

# Stack the yearly frames on top of each other and renumber the rows from 0.
aq_final = pd.concat([aq_data[key] for key in aq_data], ignore_index=True)

aq_final

Unnamed: 0,basetime,city_id,NO2,O3,PM10,PM2.5
0,2015-01-01,AQ001,25.28,30.06,41.76,19.86
1,2015-01-01,AQ002,22.67,30.05,13.58,8.95
2,2015-01-01,AQ003,7.80,63.02,7.39,4.38
3,2015-01-01,AQ004,28.66,40.01,20.09,14.16
4,2015-01-01,AQ005,14.80,35.78,50.96,28.66
...,...,...,...,...,...,...
161245,2023-10-30,AQ046,5.87,68.87,16.49,6.85
161246,2023-10-30,AQ047,22.66,19.80,17.49,11.84
161247,2023-10-30,AQ048,14.32,29.78,12.10,8.90
161248,2023-10-30,AQ049,24.69,25.86,19.69,14.62


We are focusing on
# the NO2 column.
Nitrogen dioxide is one of a group of highly reactive gases known as oxides of nitrogen (NOx). For our research it provides valuable information about CO2. Like the latter, NO2 gets into the air primarily from the burning of fossil fuels. In the atmosphere NO2 plays a role in absorbing sunlight; unlike CO2, however, it is not a greenhouse gas. It is also much less stable than CO2 and can therefore be localized better. NO2 thus acts as an implicit indicator for CO2, which is much more difficult to measure at the local level.

In the following we
## combine the two datasets,
## compute 5 different rolling means
and
## a composite rolling mean, a weighted average of the rolling means over the last one, two and three years.
Finally we
## create a monthly and a yearly index, which measure the development of the NO2 concentration relative to the first day of the month or year, respectively.

In [4]:
# Start again from the yearly frames: stack them and renumber the rows from 0.
aq_final = pd.concat([aq_data[key] for key in aq_data], ignore_index=True)

# Attach each city's name and coordinates by matching 'city_id' against the
# catalogue's 'id' (pandas' default inner join, so unmatched rows are dropped).
aq_final = aq_final.merge(
    meta_data[['id', 'name', 'latitude', 'longitude']],
    left_on='city_id',
    right_on='id',
)

# Trailing rolling means of NO2, computed separately for every city ('name').
# Window lengths are counted in rows (assumes one row per city and day, in
# chronological order - TODO confirm); min_periods=1 yields a value from the
# very first row of each city onwards.
for column, window in (
    ('no2_rmean7', 7),
    ('no2_rmean35', 35),
    ('no2_rmean1j', 365),
    ('no2_rmean2j', 365*2),
    ('no2_rmean3j', 365*3),
):
    aq_final[column] = (
        aq_final.groupby('name')['NO2']
        .rolling(window=window, min_periods=1)
        .mean()
        # Drop the 'name' index level so the values align on the original row index.
        .reset_index(0, drop=True)
    )

# Composite mean 'no2_rmean_c': weighted blend of the 1-, 2- and 3-year rolling
# means, with the most recent year weighted highest.
weights = np.array([0.6, 0.3, 0.1])

aq_final['no2_rmean_c'] = (
    aq_final['no2_rmean1j'] * weights[0]
    + aq_final['no2_rmean2j'] * weights[1]
    + aq_final['no2_rmean3j'] * weights[2]
)

# Parse the 'basetime' strings into datetimes so the .dt accessor can be used below.
aq_final['basetime'] = pd.to_datetime(aq_final['basetime'])

# Keep only rows from 2019 onwards; the rolling means above were already computed
# on the full history. The explicit .copy() detaches the filtered frame from the
# full one, so the column assignments that follow operate on an independent
# DataFrame and do not trigger pandas' SettingWithCopyWarning.
aq_final = aq_final[aq_final['basetime'].dt.year >= 2019].copy()

# Monthly index: within every (city, calendar month) group, express the composite
# mean as a percentage of the group's first row (which therefore equals 100).
grouped_m = aq_final.groupby(by=['name', aq_final['basetime'].dt.to_period('M')])
aq_final['monthly_index'] = grouped_m['no2_rmean_c'].transform(
    lambda values: values.div(values.iloc[0]).mul(100)
)

# Yearly index: same idea, but relative to the first row of each (city, year) group.
grouped_y = aq_final.groupby(by=['name', aq_final['basetime'].dt.year])
aq_final['yearly_index'] = grouped_y['no2_rmean_c'].transform(
    lambda values: values.div(values.iloc[0]).mul(100)
)

# Drop the intermediate rolling means that are not used any further
# (display names for the remaining columns are assigned when the export frame is built).
aq_final = aq_final.drop(columns=['no2_rmean7', 'no2_rmean35', 'no2_rmean2j', 'no2_rmean3j'])

# Number of calendar days in each row's own month and year. Taking it from the
# row's date (instead of the current year) gives the correct length for February
# in leap years and keeps the result independent of when the notebook is run.
aq_final['days_in_month'] = aq_final['basetime'].dt.days_in_month

# Monthly winner: for every complete month, the city with the lowest monthly
# index on the month's last day. Rows of months without a winner keep "".
aq_final['winner_month'] = ""
for year in aq_final['basetime'].dt.year.unique():
    for month in aq_final['basetime'].dt.month.unique():
        in_month = (aq_final['basetime'].dt.year == year) & (aq_final['basetime'].dt.month == month)
        month_data = aq_final[in_month]

        # Only months whose data reaches the final calendar day take part.
        # An empty selection compares NaN with NaN and is skipped here as well.
        if month_data['basetime'].dt.day.max() != month_data['days_in_month'].max():
            continue

        last_day_data = month_data[month_data['basetime'] == month_data['basetime'].max()]
        if last_day_data.empty:
            continue

        # The first city that reaches the minimum wins (ties resolved by row order).
        lowest_index = last_day_data['monthly_index'].min()
        on_lowest = last_day_data['monthly_index'] == lowest_index
        aq_final.loc[in_month, 'winner_month'] = last_day_data.loc[on_lowest, 'name'].iloc[0]


# Yearly winner: for every complete year, the city with the lowest yearly index
# on the year's last day. Rows of years without a winner keep "".
aq_final['winner_year'] = ""
for year in aq_final['basetime'].dt.year.unique():
    year_data = aq_final[aq_final['basetime'].dt.year == year]
    last_date = year_data['basetime'].max()

    # A year counts as complete only when its data reaches 31 December. Deciding
    # this from the data (rather than from today's date) keeps the result
    # reproducible and mirrors the completeness check used for the monthly winner.
    if not (last_date.month == 12 and last_date.day == 31):
        continue

    last_day_data = year_data[year_data['basetime'] == last_date]
    # Ignore cities without an index value on the last day; skip the year if none is left.
    ranked = last_day_data.dropna(subset=['yearly_index'])
    if ranked.empty:
        continue

    # The first city that reaches the minimum wins (ties resolved by row order).
    min_yearly_index = ranked['yearly_index'].min()
    winner_city_year = ranked.loc[ranked['yearly_index'] == min_yearly_index, 'name'].iloc[0]
    aq_final.loc[aq_final['basetime'].dt.year == year, 'winner_year'] = winner_city_year


# Export view: select the published columns and give them their display names.
# Note that 'basetime' (a full date) is exported under the header 'year'.
aq_final_csv = aq_final[
    [
        'name', 'latitude', 'longitude', 'basetime', 'NO2',
        'no2_rmean1j', 'no2_rmean_c',
        'monthly_index', 'yearly_index',
        'winner_month', 'winner_year',
    ]
].rename(
    columns={
        'name': 'city',
        'latitude': 'lat',
        'longitude': 'lng',
        'basetime': 'year',
        'NO2': 'Raw',
        'no2_rmean1j': '365d moving average',
        'no2_rmean_c': 'Composite moving average',
        'monthly_index': "Monthly Index Challenge",
        'yearly_index': "Yearly Index Challenge",
    }
)

aq_final_csv

Unnamed: 0,city,lat,lng,year,Raw,365d moving average,Composite moving average,Monthly Index Race,Yearly Index Race,winner_month,winner_year
1461,Amsterdam,52.35,4.92,2019-01-01,11.44,19.770822,19.782067,100.000000,100.000000,Sofia,Monaco
1462,Amsterdam,52.35,4.92,2019-01-02,13.17,19.757589,19.773995,99.959199,99.959199,Sofia,Monaco
1463,Amsterdam,52.35,4.92,2019-01-03,21.57,19.795589,19.798595,100.083552,100.083552,Sofia,Monaco
1464,Amsterdam,52.35,4.92,2019-01-04,21.74,19.806082,19.810179,100.142108,100.142108,Sofia,Monaco
1465,Amsterdam,52.35,4.92,2019-01-05,11.72,19.774274,19.787768,100.028819,100.028819,Sofia,Monaco
...,...,...,...,...,...,...,...,...,...,...,...
161245,Zagreb,45.80,16.00,2023-10-26,6.10,10.587699,11.066693,98.793665,91.259969,,
161246,Zagreb,45.80,16.00,2023-10-27,9.84,10.557918,11.043067,98.582757,91.065144,,
161247,Zagreb,45.80,16.00,2023-10-28,10.82,10.542795,11.027289,98.441908,90.935036,,
161248,Zagreb,45.80,16.00,2023-10-29,5.16,10.518932,11.000995,98.207178,90.718206,,


## We save the data
by writing the resulting DataFrame to a CSV file.

In [5]:
# Write the export frame to the site's static data folder; the row index is not
# written. The path is relative to the notebook's working directory.
aq_final_csv.to_csv(
    path_or_buf='../static/data/data.csv',
    index=False,
)