In [57]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset
file_path = "/content/air_quality.csv"
data = pd.read_csv(file_path)

# Show first few rows
print("Dataset Preview:\n", data.head())

# -----------------------------
# STEP 1: Preprocessing
# -----------------------------

# Keep only numerical features + target
# (Assuming AQI depends on pollutant values; if dataset has no pollutants, this must be updated)
# For now, let’s try encoding text columns if needed
data = data.dropna()  # remove missing rows

# Encode categorical columns if present (country, city, station, gps, date)
for col in ['country', 'city', 'station', 'GPS', 'date']:
    if col in data.columns:
        data[col] = data[col].astype('category').cat.codes

# Features (X) and Target (y)
X = data.drop(columns=['AQI'])
y = data['AQI']

# -----------------------------
# STEP 2: Train-Test Split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------------
# STEP 3: Train Model
# -----------------------------
model = LinearRegression()
model.fit(X_train, y_train)

# -----------------------------
# STEP 4: Evaluate Model
# -----------------------------
y_pred = model.predict(X_test)
print("\nModel Performance:")
print("MSE:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

# -----------------------------
# STEP 5: User Input for Prediction
# -----------------------------

# -----------------------------
# STEP 5: User Input for Prediction + AQI Category
# -----------------------------

print("\n--- Enter Air Pollutant Values ---")
try:
    pm25 = float(input("Enter PM2.5: "))
    pm10 = float(input("Enter PM10: "))
    no2 = float(input("Enter NO2: "))
    so2 = float(input("Enter SO2: "))
    co = float(input("Enter CO: "))
    o3 = float(input("Enter O3: "))

    # Create input sample in correct format
    new_sample = pd.DataFrame([{
        'country': 0,  # dummy since user doesn’t enter location
        'city': 0,
        'station': 0,
        'gps': 0,
        'date': 0,
        'PM2.5': pm25,
        'PM10': pm10,
        'NO2': no2,
        'SO2': so2,
        'CO': co,
        'O3': o3
    }])

    # Align columns with training data
    new_sample = new_sample.reindex(columns=X.columns, fill_value=0)

    # Predict AQI
    predicted_aqi = model.predict(new_sample)[0]
    print(f"\n✅ Predicted AQI: {predicted_aqi:.2f}")

    # AQI Categorization
    def categorize_aqi(aqi):
        if aqi <= 50:
            return "Best 😀"
        elif aqi <= 100:
            return "Good 🙂"
        elif aqi <= 200:
            return "Moderate 😐"
        elif aqi <= 300:
            return "Poor 😷"
        else:
            return "Very Poor/Severe 🚨"

    category = categorize_aqi(predicted_aqi)
    print(f"🌍 Air Quality Category: {category}")

except ValueError:
    print("\n❌ Please enter valid numeric values for pollutants.")


Dataset Preview:
      country      city station                       GPS        date  AQI
0  Australia  Canberra  Monash  (-35.418302, 149.094018)  08/01/2020  347
1  Australia  Canberra  Monash  (-35.418302, 149.094018)  07/01/2020  203
2  Australia  Canberra  Monash  (-35.418302, 149.094018)  04/11/2019   71
3  Australia  Canberra  Monash  (-35.418302, 149.094018)  11/11/2019   51
4  Australia  Canberra  Monash  (-35.418302, 149.094018)  15/11/2019   46

Model Performance:
MSE: 153778.68187552394
R² Score: -3.323981456476213

--- Enter Air Pollutant Values ---
Enter PM2.5: 56
Enter PM10: 12
Enter NO2: 23
Enter SO2: 34
Enter CO: 45
Enter O3: 67

✅ Predicted AQI: 793.10
🌍 Air Quality Category: Very Poor/Severe 🚨
