In [1]:
import pandas as pd

In [2]:
# Read the two input CSV files into DataFrames: air-quality readings and
# weather data (the latter is referred to as the "forecast" frame below).
aqi_path = 'data_for_train/AQI.csv'
weather_path = 'data_for_train/weather.csv'
aqi_df = pd.read_csv(aqi_path)
forecast_df = pd.read_csv(weather_path)

In [3]:
# Parse the 'time' column of each frame into pandas datetimes.
# Note the asymmetry: aqi_df stores the parsed values in a NEW 'date' column
# (its 'time' column is left as loaded), while forecast_df overwrites 'time'.
aqi_timestamps = pd.to_datetime(aqi_df['time'])
aqi_df['date'] = aqi_timestamps

forecast_timestamps = pd.to_datetime(forecast_df['time'])
forecast_df['time'] = forecast_timestamps

In [4]:
# Sort both DataFrames chronologically so that row order reflects time order
# (the lag feature computed later with shift(1) depends on this ordering).
#
# aqi_df is sorted by its parsed 'date' column rather than by 'time': in this
# frame 'time' was never converted and still holds the values as loaded from
# the CSV, so sorting on it would compare text instead of timestamps whenever
# those values are strings. forecast_df's 'time' column was converted in place
# and is therefore safe to sort on directly.
aqi_df.sort_values('date', inplace=True)
forecast_df.sort_values('time', inplace=True)

In [5]:
# Lag feature: the AQI value from the preceding row, used as a predictor.
# NOTE(review): shift(1) takes the previous ROW, which is the previous day only
# if aqi_df has exactly one row per day with no gaps — confirm against the data.
# The first row has no predecessor and therefore gets NaN.
previous_reading = aqi_df['us_aqi (USAQI)'].shift(periods=1)
aqi_df['prev_us_aqi'] = previous_reading

In [6]:
# Join weather rows to AQI rows that share the same timestamp, so the forecast
# for day X sits alongside the AQI target (and lag feature) for day X.
# The inner join keeps only timestamps present in both frames. Because the key
# columns have different names, both 'time' and 'date' appear in the result.
aqi_features = aqi_df[['date', 'us_aqi (USAQI)', 'prev_us_aqi']]
merged_df = forecast_df.merge(
    aqi_features,
    how='inner',
    left_on='time',
    right_on='date',
)

In [7]:
# Remove columns in which every value is missing; columns with at least one
# non-missing value are kept.
merged_df = merged_df.dropna(axis='columns', how='all')

In [8]:

# Remove every row that still has at least one missing value in any column
# (this includes the first AQI row, whose lagged value is NaN).
merged_df = merged_df.dropna(axis='index', how='any')

In [9]:

# Drop both timestamp columns left over from the merge: 'time' (the forecast
# frame's key) and 'date' (the AQI frame's key). The merge keeps both because
# the keys have different names, so listing only 'time' would leave 'date'
# in the dataset that gets saved for training.
merged_df.drop(columns=['time', 'date'], inplace=True)

In [10]:
# Write the preprocessed frame to CSV for model training; index=False keeps
# the DataFrame's row index out of the file.
output_path = 'merged_forecast_aqi.csv'
merged_df.to_csv(output_path, index=False)

In [11]:
# Report completion. The file name in this message is typed out by hand and
# mirrors the path passed to to_csv in the previous cell; keep the two in sync.
print("✅ Data preprocessing complete. Saved as 'merged_forecast_aqi.csv'.")

✅ Data preprocessing complete. Saved as 'merged_forecast_aqi.csv'.
