In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import missingno as msno


In [None]:
df_pollutants = pd.read_csv("/data/AQI data/AQI_hourly.csv", sep=";", decimal=',', encoding="utf-8")

df_pollutants.shape

In [None]:
df_pollutants.head()

In [None]:
df_pollutants.dtypes

In [None]:
# Parse the 'Date' strings with an explicit format. Because errors='coerce' is set,
# any value that does not match the format becomes NaT instead of raising.
df_pollutants['Date'] = pd.to_datetime(df_pollutants['Date'], errors='coerce', format='%Y-%m-%d %H:%M:%S')
# Names of every column except 'Date'.
# NOTE(review): `cols` is not referenced by any later cell shown here - confirm it is still needed.
cols = [col for col in df_pollutants.columns if col != 'Date']

In [None]:
df_pollutants.dtypes

In [None]:
df_pollutants.head()

In [None]:
df_copy = df_pollutants.copy()
df_copy.set_index('Date', inplace=True)
df_copy

In [None]:
msno.matrix(df_copy)

In [None]:
df_pollutants.set_index('Date', inplace=True)
df_pollutants['Year'] = df_pollutants.index.year
df_pollutants['Month'] = df_pollutants.index.month
df_pollutants['Day'] = df_pollutants.index.day
df_pollutants['WeekDay'] = df_pollutants.index.day_name()

def get_season(month):
    """Translate a month number into the name of its season.

    Months 12, 1, 2 give 'Winter'; 3-5 give 'Spring'; 6-8 give 'Summer';
    9-11 give 'Fall'. Any other value yields None.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
        ('Fall', (9, 10, 11)),
    )
    for season, members in season_months:
        if month in members:
            return season
    return None
    
df_pollutants['Season'] = df_pollutants.index.month.map(get_season)

In [None]:
df_pollutants.columns

In [None]:
# df_pollutants.drop(["Station2_NO", "Station2_PM2.5"], axis=1, inplace=True)

In [None]:
# daily_avg = df_pollutants[["Station1_PM10", "Station2_PM10", "Station1_NO2", "Station2_NO2", "Station1_SO2", "Station2_SO2"]].mean()
# daily_avg

In [None]:
# seasonal_avg = df_pollutants.groupby('Season')[["Station1_PM10", "Station2_PM10",  "Station1_SO2", "Station2_SO2"]].mean()
# print("Seasonal average")
# print(seasonal_avg)

In [None]:
# yearly_avg = df_pollutants.groupby('Year')[["Station1_PM10", "Station2_PM10",  "Station1_SO2", "Station2_SO2"]].mean()
# print("Yearly average")
# print(yearly_avg)

In [None]:
missing_values = df_pollutants.isnull().sum()
print(missing_values)

In [None]:
from sklearn.impute import KNNImputer

# One imputer instance, re-fitted for every pollutant pair in the loop below.
knn_imputer = KNNImputer(n_neighbors=5)

# A pollutant name is whatever follows the last "_" in a column label.
pollutants = {col.split("_")[-1] for col in df_pollutants.columns if "_" in col}
print(pollutants)

for pol in pollutants:
    station1_col, station2_col = f"Station1_{pol}", f"Station2_{pol}"

    # Only pollutants that have a column for both stations are imputed here.
    if not (station1_col in df_pollutants.columns and station2_col in df_pollutants.columns):
        continue

    # Fit on just this two-column pair and write the filled values back.
    imputed_data = knn_imputer.fit_transform(df_pollutants[[station1_col, station2_col]])
    df_pollutants[station1_col], df_pollutants[station2_col] = imputed_data[:, 0], imputed_data[:, 1]


In [None]:
missing_values = df_pollutants.isnull().sum()
print(missing_values)

In [None]:
df_pm25 = df_pollutants[pd.notna(df_pollutants['Station2_PM2.5'])]

In [None]:
df_pm25.to_csv("/Users/zafiraibraeva/Code/uni coding/thesis/thesis_code/thesis/data/final_data/final_data_with_pm2.5")

In [None]:
df_pollutants = df_pollutants.ffill()

In [None]:
missing_values = df_pollutants.isna().sum()
missing_values

In [None]:
df_pollutants.drop('Station2_PM2.5', axis=1, inplace=True)

In [None]:
df_pollutants.columns

In [None]:
yearly_avg = df_pollutants.groupby('Year')[["Station1_PM10", "Station2_PM10", "Station1_SO2", "Station2_SO2"]].mean()
print("Yearly average")
print(yearly_avg)

In [None]:
# Force 'Station2_O3' to a numeric dtype; errors='coerce' turns unparseable entries into NaN.
# NOTE(review): this runs after the KNN-imputation and forward-fill cells, so any NaN
# created here is not filled by them - confirm whether the conversion belongs earlier.
df_pollutants['Station2_O3'] = pd.to_numeric(df_pollutants['Station2_O3'], errors='coerce')

In [None]:
# Measurement columns only, for the correlation analysis. drop() returns a new
# frame, so df_pollutants keeps its calendar helper columns.
df_values = df_pollutants.drop(["Year", "Month", "Season", "Day", "WeekDay"], axis=1)
# Replace decimal commas with dots in any string cells.
df_values = df_values.replace({',': '.'}, regex=True)


In [None]:
import seaborn as sns


corr_matrix = df_values[df_values.columns].corr()

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation matrix")
plt.show()

In [None]:
# Same preparation for the PM2.5 subset: drop the calendar helper columns.
# drop() returns a new frame, so df_pm25 itself is left unchanged.
df_val = df_pm25.drop(["Year", "Month", "Season", "Day", "WeekDay"], axis=1)
# Replace decimal commas with dots in any string cells.
df_val = df_val.replace({',': '.'}, regex=True)


In [None]:
import seaborn as sns


corr_matrix = df_val[df_val.columns].corr()

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation matrix")
plt.show()

In [None]:
def plot_mean(df, grouping):
    """Plot the mean PM10 level of both stations for each value of ``grouping``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Station1_PM10', 'Station2_PM10' and the grouping column.
    grouping : str
        Column to group by, e.g. "Year", "Season", "Month" or "WeekDay".

    The lines are drawn on the current matplotlib axes and the figure is
    shown; nothing is returned.
    """
    # groupby sorts text keys alphabetically, which would put weekdays and
    # seasons in a misleading order along the x-axis of a line plot.
    calendar_order = {
        "WeekDay": ["Monday", "Tuesday", "Wednesday", "Thursday",
                    "Friday", "Saturday", "Sunday"],
        "Season": ["Winter", "Spring", "Summer", "Fall"],
    }

    # One groupby for both stations instead of two separate passes.
    means = df.groupby(grouping)[['Station1_PM10', 'Station2_PM10']].mean()

    wanted = calendar_order.get(grouping) if isinstance(grouping, str) else None
    if wanted is not None:
        # Known labels first, in calendar order; unexpected labels are kept at the end.
        known = [label for label in wanted if label in means.index]
        extra = [label for label in means.index if label not in wanted]
        means = means.loc[known + extra]

    ax = plt.gca()
    means['Station1_PM10'].plot(ax=ax, label='Kalotaszeg ter station - PM10', marker='o')
    means['Station2_PM10'].plot(ax=ax, label='Hajnal ut station - PM10', marker='o')

    ax.set_title(f'{grouping} Average PM10 Levels')
    ax.set_xlabel(f'{grouping}')
    ax.set_ylabel('PM10 Concentration')
    ax.legend()

    plt.show()

In [None]:
plot_mean(df_pollutants, "Year")
plot_mean(df_pollutants, "Season")
plot_mean(df_pollutants, "Month")
plot_mean(df_pollutants, "WeekDay")

In [None]:
from matplotlib import cm

def plot_aqi(df, value):
    """Draw one line per year showing the monthly mean of column ``value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Year' and 'Month' columns plus the ``value`` column.
    value : str
        Name of the column to average, e.g. "Station1_PM10".

    A new 12x8 figure is created and shown; nothing is returned.
    """
    # Ignore NaN years (they would select no rows) and colour years chronologically.
    unique_years = np.sort(df['Year'].dropna().unique())
    months = np.arange(1, 13)

    plt.figure(figsize=(12, 8))

    colors = cm.viridis(np.linspace(0, 1, len(unique_years)))

    for i, year in enumerate(unique_years):
        # Reindex to all 12 months so a partially covered year still supplies
        # one y-value per x-position; missing months become NaN (a gap in the line).
        monthly_data = (
            df[df['Year'] == year]
            .groupby('Month')[value]
            .mean()
            .reindex(months)
        )

        plt.plot(
            months,
            monthly_data,
            color=colors[i],
            alpha=0.7,
            label=f"{year}"
        )

    plt.title(f"{value} Values for the past 10 years")
    plt.xlabel("Month")
    plt.ylabel(f"{value} (ug/m3)")

    plt.xticks(months, ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])

    plt.grid(True)
    # Add the legend before tight_layout so the layout pass can take it into account.
    plt.legend(title="Year", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
    
plot_aqi(df_pollutants, "Station1_PM10")
plot_aqi(df_pollutants, "Station2_PM10")


In [None]:
fig, ax = plt.subplots(figsize=(20, 5))

# Express each Station1 series relative to its own maximum so the three
# pollutants can share one axis. The division builds a new frame instead of
# modifying the column selection in place.
candidates = df_pollutants[['Station1_PM10','Station1_NO2', 'Station1_SO2']]
candidates = candidates / candidates.max(axis=0)
ax.set_title("Station1 pollutants relative to their maximum")
ax.set_ylabel("Fraction of series maximum")
candidates.plot(ax=ax)

In [None]:
fig1, ax1 = plt.subplots(figsize=(20, 5))
# Express each Station2 series relative to its own maximum so the three
# pollutants can share one axis. The division builds a new frame instead of
# modifying the column selection in place.
candidates1 = df_pollutants[['Station2_PM10', 'Station2_NO2', 'Station2_SO2']]
candidates1 = candidates1 / candidates1.max(axis=0)
ax1.set_title("Station2 pollutants relative to their maximum")
ax1.set_ylabel("Fraction of series maximum")
candidates1.plot(ax=ax1)

In [None]:
def plot_with_threshold(df, pollutant, threshold):
    """Plot column ``pollutant`` of ``df`` with a dashed red line at ``threshold``.

    A new 20x5 figure is created on every call; nothing is returned.
    """
    _, axes = plt.subplots(figsize=(20, 5))
    series = df[pollutant]
    # Reference line first, then the data on the same axes.
    axes.axhline(y=threshold, color='red', linewidth=1, linestyle='--')
    series.plot(ax=axes)

In [None]:
# PM10 at each station against a threshold line at 50.
plot_with_threshold(df_pollutants, "Station1_PM10", 50)

plot_with_threshold(df_pollutants, "Station2_PM10", 50)

In [None]:
df_pollutants.to_csv("/Users/zafiraibraeva/Code/uni coding/thesis/thesis_code/thesis/data/cleaned_AQI_hourly.csv")

In [None]:
wind_df = pd.read_csv("/data/final_data/wind.csv")
wind_df.shape

In [None]:
cols_to_keep = [
       'datetime', 'temp', 'humidity', 'precip', 
       'precipcover', 'cloudcover',
       'windspeed', 'winddir', 'visibility',
       ]

In [None]:
wind_df = wind_df[cols_to_keep]

In [None]:
wind_df.set_index('datetime', inplace=True)

In [None]:
wind_df

In [None]:
wind_df.isnull().sum()

In [None]:
# Make both indexes DatetimeIndex objects. Only the weather index is truncated
# to midnight: the pollutant index keeps its time of day so the hourly
# resolution survives into the exported data. The calendar-day merge key is
# taken from the date part of 'Date' in a later cell, so it is unaffected.
df_pollutants.index = pd.to_datetime(df_pollutants.index)
wind_df.index = pd.to_datetime(wind_df.index).normalize()

In [None]:
# final_df = pd.merge(df, wind_df, left_index=True, right_index=True)

In [None]:
# final_df.to_csv("final_data.csv")

In [None]:
# 1 for rows whose season is Winter or Fall, 0 otherwise.
df_pollutants['is_heating_season'] = np.where(df_pollutants['Season'].isin(["Winter", "Fall"]), 1, 0)
# 1 for Monday-Friday, 0 for Saturday/Sunday.
df_pollutants['is_work_day'] = np.where(df_pollutants['WeekDay'].isin(["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"]), 1, 0)

In [None]:
df_pollutants.drop(["Year", "Season", "WeekDay", "Day", "Month"], axis=1, inplace=True)
df_pollutants

In [None]:
# Encode the wind direction (degrees) as sine/cosine components; the radian
# values are only an intermediate and are not stored on the frame.
winddir_rad = np.deg2rad(wind_df["winddir"])

wind_df["winddir_sin"] = np.sin(winddir_rad)
wind_df["winddir_cos"] = np.cos(winddir_rad)

# The raw angle is dropped once both components exist.
wind_df.drop(columns=["winddir"], inplace=True)
wind_df

In [None]:
wind_df.isna().sum()
# df.fillna(method="ffill", inplace=True)  # Forward fill


In [None]:
# df.reset_index(inplace=True)
df_pollutants.reset_index(inplace=True)
df_pollutants['timestamp'] = df_pollutants['Date'].dt.date
df_pollutants

In [None]:
# Upsample the weather frame to hourly frequency, forward-filling each new row
# from the previous one, then expose the resulting index as a 'Date' column.
# NOTE(review): df_wind_resampled is not referenced by any later cell shown here;
# the merge uses wind_df - confirm whether this frame is still needed.
df_wind_resampled = wind_df.resample('H').ffill()  
df_wind_resampled['Date'] = df_wind_resampled.index

In [None]:
# Calendar date of each weather row, read from the datetime index. The index
# is deliberately left in place: .date is only available on the datetime index.
wind_df['timestamp'] = wind_df.index.date
#wind_df['timestamp'] = wind_df['datetime']

In [None]:
# Convert the date objects in 'timestamp' to datetime64 so the .dt accessor can be used below.
wind_df['timestamp'] = pd.to_datetime(wind_df['timestamp'])

# Left join: every pollutant row is kept and matched to weather rows by calendar date.
# The left key is the 'timestamp' column; the right key is passed as an array of dates.
# NOTE(review): both frames carry a 'timestamp' column, so pandas suffixes them
# ('_x' / '_y') - confirm the merged frame has the expected set of columns.
# NOTE(review): assumes wind_df holds one row per date; duplicate dates would multiply rows - verify.
df_merged = pd.merge(df_pollutants, wind_df, left_on='timestamp', right_on=wind_df['timestamp'].dt.date, how='left')

# Drop the extra 'timestamp' column from wind data, if needed
df_merged.drop(columns='timestamp_y', inplace=True)



In [None]:
# final_df.to_csv("/Users/zafiraibraeva/Code/uni coding/thesis/thesis_code/thesis/data/final_data/data_for_model.csv")
print(df_merged)

In [None]:
# final_df.columns

In [None]:
meteorological_data = [
    "winddir_sin",
    "winddir_cos",
    "windspeed",
    "visibility",
    "humidity",
    "precip",
    "precipcover",
    "cloudcover",
    "temp",
    
]
pm10 = "Station1_PM10"
pm10_2 = "Station2_PM10"

corr_data = df_merged[meteorological_data + [pm10, pm10_2]]  
corr_matrix = corr_data.corr()

plt.figure(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Matrix: PM10 & Meteorological Variables")
plt.show()

In [None]:
df_merged.to_csv("/Users/zafiraibraeva/Code/uni coding/thesis/thesis_code/thesis/data/final_data/hourly_final.csv")