In [1]:
import os
# Set the notebook's working directory one level up, to the project root.
# Guarded so that re-running this cell does not climb another level each time:
# we only move up while the 'scripts' package (imported further down) is not
# visible from the current directory.
if not os.path.isdir('scripts'):
    os.chdir('..')
# necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# functions from scripts
from scripts.fetch_openAQ_data import fetch_openaq_data
from scripts.process_world_bank_data import process_world_bank_data
from scripts.analyze_data import combine_datasets

In [None]:
# fetch and process data (return values of these two calls are not used here)
fetch_openaq_data()
process_world_bank_data()
# combine datasets
combined_data = combine_datasets()
# Only the last expression of a cell is displayed, so the column names have to
# be printed explicitly; a bare `combined_data.columns` here shows nothing.
print(combined_data.columns.tolist())
# last expression: rendered as the cell output
combined_data["Indicator Name"]

In [4]:
# drop columns from dataframe that are irrelevant/unhelpful
# The frame is rebound instead of mutated with inplace=True, and errors='ignore'
# is used, so the cell can be re-run safely: the in-place version raised KeyError
# on a second run because the columns were already gone.
# Trade-off: a misspelled column name is silently skipped rather than reported.
combined_data = combined_data.drop(
    columns=[
        'Indicator Code', 'flagInfo.hasFlags', 'parameter.id',
        'period.datetimeFrom.utc', 'period.datetimeFrom.local', 'period.datetimeTo.utc',
        'period.datetimeTo.local', 'summary.min', 'summary.q02', 'summary.q25', 'summary.median',
        'summary.q75', 'summary.q98', 'summary.max', 'summary.avg', 'summary.sd',
        'coverage.expectedCount', 'coverage.expectedInterval', 'coverage.observedCount',
        'coverage.observedInterval', 'coverage.percentComplete', 'coverage.percentCoverage',
        'coverage.datetimeFrom.utc', 'coverage.datetimeFrom.local', 'coverage.datetimeTo.utc',
        'coverage.datetimeTo.local',
    ],
    errors='ignore',
)

In [None]:
# preview the first five rows of the combined frame
combined_data.head(n=5)

In [None]:
# visualize trends: PM10 by year, once as bars and once as a line
# The title now describes what is plotted; the previous one mentioned
# population, which appears on neither axis.
# seaborn aggregates repeated 'Year' values (mean with an error bar/band by default).
for plot_fn in (sns.barplot, sns.lineplot):
    fig, ax = plt.subplots()
    plot_fn(data=combined_data, x='Year', y='PM10', ax=ax)
    ax.set_title('PM10 Levels Over Time')
    plt.show()

In [None]:
# filter rows for PM2.5 and PM10 indicators
# NOTE: these plots turned out not to be informative; kept for reference.
# str.contains treats the pattern as a regular expression by default, so the dot
# in "PM2.5" is escaped to match a literal '.' rather than any character.
pm_data = combined_data[combined_data['Indicator Name'].str.contains(r'PM2\.5|PM10', na=False)]

# Line plot to compare trends, one line per matching indicator
sns.lineplot(data=pm_data, x='Year', y='WB Value', hue='Indicator Name')
plt.title('PM2.5 and PM10 Trends Over Time')
plt.ylabel('Value')
plt.show()

In [None]:
# NOTE: this plot turned out to show very little, but it is kept for reference.
# Take the two numeric columns and keep only rows where both are present.
numeric_data = combined_data.loc[:, ['WB Value', 'PM10']].dropna(how='any')

# Pairwise Pearson correlation, drawn as an annotated heatmap.
corr = numeric_data.corr(method='pearson')
sns.heatmap(data=corr, cmap='coolwarm', annot=True)
plt.title(label='Correlation Between Indicators')
plt.show()

In [None]:
# pivot the data for heatmap: one row per year, one column per indicator
# pivot_table (rather than pivot) averages duplicate (Year, Indicator Name)
# pairs instead of raising ValueError; where a pair occurs once its value is
# unchanged. Entries with no data at all are omitted by pivot_table's default.
# NOTE(review): fillna(0) makes missing values indistinguishable from real zeros
# in the plot — confirm this is intended.
pivot_data = combined_data.pivot_table(
    index='Year', columns='Indicator Name', values='WB Value', aggfunc='mean'
).fillna(0)

# heatmap for indicator trends over time
sns.heatmap(pivot_data, cmap='YlGnBu', cbar_kws={'label': 'Value'})
plt.title('Indicator Trends Over Time')
plt.xlabel('Indicator Name')
plt.ylabel('Year')
plt.show()
# Reading the plot: with the sequential 'YlGnBu' colormap, darker cells mean
# larger values (not more variation). All indicators share one color scale, so
# indicators with large magnitudes dominate the picture.

In [None]:
# calculate PM2.5/PM10 ratio
# 'WB Value' holds the value of whichever indicator the row belongs to, so the
# ratio is only meaningful on PM2.5 rows; every other row is left as NaN.
# (Previously every indicator's value was divided by PM10.)
# NOTE(review): several indicators may mention "PM2.5" (not all necessarily
# concentrations) — confirm which one should be used.
is_pm25_row = combined_data['Indicator Name'].str.contains(r'PM2\.5', na=False)
# treat PM10 == 0 as missing so the division cannot produce infinite ratios
pm10_nonzero = combined_data['PM10'].where(combined_data['PM10'] != 0)
combined_data['PM2.5_to_PM10'] = (combined_data['WB Value'] / pm10_nonzero).where(is_pm25_row)

# line plot for ratio over time, restricted to the rows that carry a ratio
sns.lineplot(data=combined_data.loc[is_pm25_row], x='Year', y='PM2.5_to_PM10', hue='Country Name')
plt.title('PM2.5 to PM10 Ratio Over Time')
plt.ylabel('PM2.5/PM10')
plt.show()
# TODO: an earlier run suggested PM10 < PM2.5 (ratio > 1). Re-check now that only
# PM2.5 rows are used: PM2.5 is a subset of PM10, so a ratio above 1 would point
# to a units or data-source mismatch rather than a real effect.

In [None]:
# TODO: explore relationships between air quality indicators (PM10, PM2.5) and socioeconomic factors (e.g., GDP, population)

# possibly use group by for columns and calculate mean PM10 values
# grouped_pm10 = combined_data.groupby('Indicator Name')['PM10'].mean().sort_values(ascending=False)
# sns.barplot(x=grouped_pm10[:10].index, y=grouped_pm10[:10].values)
# plt.xticks(rotation=45)
# plt.title('Top 10 __ by PM10 Levels')
# plt.ylabel('PM10 (µg/m³)')
# plt.xlabel('__')
# plt.show()