In [62]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [64]:
# Read the air-pollution CSV (adjust the path if the file lives elsewhere) and,
# in the same expression, discard the 'Country' and 'City' columns, which are
# not used as model inputs further down.
df = pd.read_csv('global air pollution dataset.csv').drop(columns=['Country', 'City'])



In [66]:
# Separate the prediction target from the model inputs.
# Every column named here is kept out of the feature matrix: the four
# per-pollutant 'Category' columns, the overall 'AQI Category', and
# 'AQI Value' itself (the quantity being predicted).
non_feature_cols = [
    'CO AQI Category',
    'Ozone AQI Category',
    'NO2 AQI Category',
    'PM2.5 AQI Category',
    'AQI Value',
    'AQI Category',
]
y = df['AQI Value']                    # target
X = df.drop(columns=non_feature_cols)  # features

In [68]:
# Display the feature column names that remain in X after the drop above.
X.columns

Index(['CO AQI Value', 'Ozone AQI Value', 'NO2 AQI Value', 'PM2.5 AQI Value'], dtype='object')

In [70]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest regressor on the training portion.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

In [71]:
# Score the held-out rows with the fitted model.
y_pred = model.predict(X_test)

# Report root-mean-squared error and the R² score on the test split.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f'RMSE: {rmse}', f'R² Score: {r2}', sep='\n')

RMSE: 2.6621130553480454
R² Score: 0.9977760778758138


In [74]:
# Function to make new predictions
def predict_aqi(data, estimator=None):
    """Predict the AQI value for a single new sample.

    Parameters
    ----------
    data : mapping
        Feature name -> value for one observation, e.g.
        ``{'CO AQI Value': 50, 'Ozone AQI Value': 80, ...}``.
        The order of the keys does not matter.
    estimator : object with a ``predict`` method, optional
        Fitted model to use. When omitted, the notebook-level ``model``
        is used, so existing ``predict_aqi(data)`` calls keep working.

    Returns
    -------
    The first (and only) element of ``estimator.predict`` for the one-row
    frame built from ``data``.

    Raises
    ------
    ValueError
        If the estimator exposes ``feature_names_in_`` and ``data`` lacks one
        of those features or contains a feature the estimator was not fit on.
    """
    if estimator is None:
        estimator = model  # fall back to the model trained earlier in the notebook

    df_new = pd.DataFrame([data])

    # A dict's key order becomes the frame's column order. Align it with the
    # order recorded at fit time so a differently-ordered dict is neither
    # rejected by the estimator nor fed to it with misaligned columns.
    expected = getattr(estimator, 'feature_names_in_', None)
    if expected is not None:
        expected = list(expected)
        missing = [name for name in expected if name not in df_new.columns]
        unexpected = [name for name in df_new.columns if name not in expected]
        if missing or unexpected:
            raise ValueError(
                f'Input features do not match the fitted model; '
                f'missing: {missing}, unexpected: {unexpected}'
            )
        df_new = df_new[expected]

    return estimator.predict(df_new)[0]


In [76]:
# Run the helper once on a hand-written set of pollutant readings.
sample_data = {'CO AQI Value': 50, 'Ozone AQI Value': 80,
               'NO2 AQI Value': 60, 'PM2.5 AQI Value': 100}
predicted_aqi = predict_aqi(sample_data)
print(f'Predicted AQI: {predicted_aqi}')

Predicted AQI: 100.01


In [78]:
import joblib

In [80]:
# Write the fitted model to 'aqi_model.pkl' in the current working directory.
# The file is pickle-based, so only load it back from a trusted location.
joblib.dump(value=model, filename='aqi_model.pkl')

['aqi_model.pkl']