<a href="https://colab.research.google.com/github/visha1Sagar/Air-Quality-Management-System---IOT/blob/main/Air_Quality_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install xgboost scikit-learn



In [2]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd



from sklearn.model_selection import GridSearchCV

In [7]:
data = pd.read_csv('/content/GlobalWeatherRepository.csv')
data  = data[['humidity','temperature_celsius','air_quality_Carbon_Monoxide','air_quality_us-epa-index' ]]
data.dropna(inplace=True) # Drop rows with missing values

In [8]:
# Calculate z-scores for outlier detection
def remove_outliers(df, columns, threshold=3):
    for col in columns:
        z_scores = (df[col] - df[col].mean()) / df[col].std()
        df = df[(z_scores.abs() < threshold)]
    return df

# Define features (X) and target (y)
X = data[['humidity', 'temperature_celsius', 'air_quality_Carbon_Monoxide']]
y = data['air_quality_us-epa-index']

# Remove outliers from features
X = remove_outliers(X.copy(), X.columns)

# Filter y to keep only rows corresponding to the non-outlier values in X.
y = y[y.index.isin(X.index)]

In [9]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Define the parameter grid for XGBoost
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [50, 100, 200],
    'subsample': [0.8, 1.0],  # Example subsample values
    'colsample_bytree': [0.8, 1.0]  # Example colsample_bytree values
}

# Initialize XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

# Perform Grid Search
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1) #n_jobs=-1 uses all available cores
grid_search.fit(X_train, y_train)


In [13]:

# Print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)



Best parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
Best score: -0.37032918355613337


In [15]:
# Evaluate the best model on the test set
best_xgb_model = grid_search.best_estimator_


In [16]:
y_pred_train = best_xgb_model.predict(X_train)
y_pred_test = best_xgb_model.predict(X_test)

In [17]:

from sklearn.metrics import mean_squared_error, r2_score


In [18]:
mse_test = mean_squared_error(y_test,y_pred_test)
mse_train = mean_squared_error(y_train,y_pred_train)

r2_test = r2_score(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)


In [19]:
print("Training Error:")
print(f"Mean Squared Error: {mse_train}")
print(f"R-squared: {r2_train}")

print("\nTesting Error:")
print(f"Mean Squared Error: {mse_test}")
print(f"R-squared: {r2_test}")

Training Error:
Mean Squared Error: 0.309683520152341
R-squared: 0.5316953659057617

Testing Error:
Mean Squared Error: 0.36372936096360287
R-squared: 0.4516555666923523
