In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Location of the input CSV. The default is the path the notebook was written
# against; set the AIR_QUALITY_CSV environment variable to load the file from
# somewhere else without editing this cell.
DATA_PATH = os.environ.get(
    "AIR_QUALITY_CSV",
    "C:/Users/Prasad/Downloads/Air_quality_KNNR.csv",
)
df = pd.read_csv(DATA_PATH)
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# 4) Data cleaning
# Quick audit before any cleaning: frame dimensions and the null count of
# every column.
missing_per_column = df.isna().sum()
print("Initial shape:", df.shape)
print("Missing values per column:\n", missing_per_column)


In [None]:
def clip_iqr(series, factor=1.5):
    """Clip the values of ``series`` to its interquartile-range fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` where
    ``Q1``/``Q3`` are the 25th/75th percentiles of ``series`` and
    ``IQR = Q3 - Q1``. Values outside the fences are replaced by the nearest
    fence; values inside are returned unchanged.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to clip.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences. The default
        reproduces the previous hard-coded behaviour.

    Returns
    -------
    pandas.Series
        A clipped copy of ``series``; the input is not modified.

    Raises
    ------
    ValueError
        If ``factor`` is negative (the lower fence would exceed the upper one).
    """
    if factor < 0:
        raise ValueError(f"factor must be non-negative, got {factor!r}")

    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    spread = factor * (q3 - q1)
    return series.clip(lower=q1 - spread, upper=q3 + spread)

# Overwrite each listed column of df_imputed with its IQR-clipped version.
# NOTE(review): 'AQI' is the regression target used later (y), and the fences
# are computed over all rows before any train/test split -- confirm that
# clipping the target with full-data statistics is intended.
for col in ('PM2.5', 'PM10', 'NO2', 'CO', 'O3', 'AQI'):
    df_imputed[col] = df_imputed[col].pipe(clip_iqr)

In [None]:
# 5) Feature engineering
# Small constant added to the ratio's denominator so a PM_total of exactly 0
# does not cause a division by zero.
RATIO_EPS = 1e-6

# Aggregated pollutant feature: sum of the two particulate columns.
df_imputed['PM_total'] = df_imputed['PM2.5'].add(df_imputed['PM10'])
# Share of PM2.5 in the aggregate (other ratios/interactions could be added here).
df_imputed['PM2.5_ratio'] = df_imputed['PM2.5'].div(df_imputed['PM_total'].add(RATIO_EPS))

In [None]:
# 6) Feature scaling (KNN requires scaling) and 7) Splitting
# The train/test split is done BEFORE the scaler is fitted so that the scaling
# mean/std come from the training rows only. Fitting the scaler on the full
# dataset would leak information about the test rows into the model inputs.
feature_cols = ['Temperature','Humidity','WindSpeed','PM2.5','PM10','NO2','CO','O3','PM_total','PM2.5_ratio']
X = df_imputed[feature_cols]
y = df_imputed['AQI']

# Same test_size / random_state as before, so the row partition is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn statistics from training rows only
X_test = scaler.transform(X_test_raw)        # reuse the training statistics

# Full feature matrix scaled with the training-set statistics, kept under its
# original name for any code that still reads X_scaled.
X_scaled = scaler.transform(X)

In [None]:
# 9) Model building & evaluation (KNN Regression)
# Baseline model: KNeighborsRegressor with the library's default
# hyperparameters, trained on the training split (fit returns the estimator).
knn = KNeighborsRegressor().fit(X_train, y_train)
# Predictions for the held-out test split.
y_pred = knn.predict(X_test)

In [None]:
def report_regression(y_true, y_pred):
    """Print MAE, MSE, RMSE and R2 for a set of regression predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    None
        The four metrics are printed, one per line, with three decimals.
    """
    abs_error = mean_absolute_error(y_true, y_pred)
    sq_error = mean_squared_error(y_true, y_pred)
    # All metrics are computed before anything is printed.
    metric_table = (
        ("MAE", abs_error),
        ("MSE", sq_error),
        ("RMSE", np.sqrt(sq_error)),  # RMSE is the square root of the MSE
        ("R2", r2_score(y_true, y_pred)),
    )
    for label, value in metric_table:
        print(f"{label}: {value:.3f}")

# Metrics of the default-parameter model on the held-out test split.
print("Baseline KNN (default params) evaluation on test set:")
report_regression(y_true=y_test, y_pred=y_pred)

In [None]:
# Cross-validated baseline performance.
# Scaling is done inside a pipeline on the raw features so the scaler is re-fit
# on the training part of every fold; the validation part of a fold never
# contributes to the scaling statistics. cross_val_score clones the pipeline,
# so the already-fitted `knn` object itself is not modified.
cv_model = make_pipeline(StandardScaler(), knn)
cv_scores = cross_val_score(cv_model, X, y, cv=5, scoring='r2')
print("CV R2 (5-fold):", cv_scores, "mean:", cv_scores.mean())


In [None]:
# 10) Hyperparameter tuning
# Candidate values: neighbourhood size, neighbour weighting scheme, and the
# Minkowski power p (p=1 is Manhattan distance, p=2 is Euclidean).
param_grid = dict(
    n_neighbors=[1, 3, 5, 7, 9, 11],
    weights=['uniform', 'distance'],
    p=[1, 2],
)

# Exhaustive 5-fold search on the training split, scored by negated MSE
# (closer to zero is better); n_jobs=-1 runs the fits in parallel.
grid = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
).fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV score (neg MSE):", grid.best_score_)

# Score the best estimator found by the grid search on the held-out test split.
best_knn = grid.best_estimator_
y_pred_best = best_knn.predict(X_test)
print("Tuned KNN evaluation on test set:")
report_regression(y_true=y_test, y_pred=y_pred_best)
