<a href="https://colab.research.google.com/github/visha1Sagar/Air-Quality-Management-System---IOT/blob/main/Air_Quality_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing Libraries

In [None]:
import numpy
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## Downloading Dataset

In [None]:
import kagglehub

# Download the latest published version of the dataset. `path` is the local
# directory that holds the downloaded files; later cells read the CSV from it.
dataset_handle = "nelgiriyewithana/global-weather-repository"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/nelgiriyewithana/global-weather-repository?dataset_version_number=446...


100%|██████████| 3.27M/3.27M [00:00<00:00, 3.86MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/nelgiriyewithana/global-weather-repository/versions/446





## Reading Data

In [None]:
# Load the weather CSV and keep only the three feature columns plus the
# EPA air-quality index that serves as the classification target.
SELECTED_COLUMNS = [
    'humidity',
    'temperature_celsius',
    'air_quality_Carbon_Monoxide',
    'air_quality_us-epa-index',
]

# Built as one non-mutating chain: calling dropna(inplace=True) on a
# column-sliced frame triggers pandas' SettingWithCopyWarning, and in-place
# edits make the cell harder to reason about when it is re-run.
data = (
    pd.read_csv(path + '/GlobalWeatherRepository.csv')
    .loc[:, SELECTED_COLUMNS]
    .dropna()  # drop rows with missing values in any selected column
)

## Downsampling

In [None]:
# Thin out the rows whose EPA index is 1 or 2 to a quarter of their count
# (presumably the over-represented classes — TODO confirm with value_counts()).
is_low_index = data['air_quality_us-epa-index'].isin([1, 2])
rows_to_downsample = data.loc[is_low_index]
num_rows_to_keep = len(rows_to_downsample) // 4

# Fixed seed so the same subset of rows is kept on every run.
downsampled_rows = rows_to_downsample.sample(
    n=num_rows_to_keep,
    random_state=42,
)

# Rebuild the dataset: every row that was not a downsampling candidate,
# followed by the sampled quarter of the candidates.
data = pd.concat([
    data.drop(index=rows_to_downsample.index),
    downsampled_rows,
])

In [None]:
# Show the column labels of the working dataset after column selection and downsampling.
data.columns

Index(['humidity', 'temperature_celsius', 'air_quality_Carbon_Monoxide',
       'air_quality_us-epa-index'],
      dtype='object')

## Upsampling

In [None]:
# Upsample EPA index classes 4, 5 and 6 by collecting one extra copy of each of
# their rows; the next cell appends these copies to `data`.
# Only complete rows are gathered here. Adding the bare label column to the
# concat would create rows that have a label but no feature values.
epa_index = data['air_quality_us-epa-index']
data_upsampled = pd.concat([data[epa_index == level] for level in (4, 5, 6)])

In [None]:
# Append the rows collected in `data_upsampled` to the working dataset.
data = pd.concat([data, data_upsampled], axis=0)

In [None]:
# Discard every row that still contains a missing value in any column.
data = data.dropna()

## Train-Test Splitting

In [None]:
# Separate the target from the features. `pop` removes the label column from
# the frame in place, so `X` (the same object as `data`) is left holding only
# the feature columns while `y` receives the EPA index labels.
X = data
y = X.pop('air_quality_us-epa-index')

### Normalizing

In [None]:
# Standardise the feature columns. The fitted scaler is kept in `scaler`
# because any new sample must be transformed with these same statistics
# before it is passed to a model trained on `X`.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into training. Consider fitting on the
# training portion only (e.g. inside a Pipeline).
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest classifier with fixed hyperparameters (presumably taken from an
# earlier search that is not part of this notebook — TODO confirm).
# random_state pins the bootstrap sampling and feature selection so the
# reported accuracies are reproducible; 42 matches the seed used elsewhere
# in this notebook.
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=14,
    min_samples_split=10,
    min_samples_leaf=1,
    random_state=42,
)

# Train on the training split only; the test split is reserved for evaluation.
rf_model = rf_model.fit(X_train, y_train)

## Evaluation

In [None]:
# Compare the random forest's accuracy on the data it was trained on with its
# accuracy on the held-out split; a large gap between the two indicates
# overfitting. `accuracy_score` comes from sklearn.metrics (imported in the
# notebook's import cell).
y_pred_train = rf_model.predict(X_train)
print("Training Accuracy:", accuracy_score(y_train, y_pred_train))

# `y_pred` holds the test-set predictions once this cell has run.
y_pred = rf_model.predict(X_test)
print("Testing Accuracy:", accuracy_score(y_test, y_pred))


## Gradient Boosting Model

In [None]:
# NOTE(review): these regression metrics are not used in any visible cell —
# confirm before removing.
from sklearn.metrics import mean_squared_error, r2_score

# Hyperparameter grid for GradientBoostingClassifier: 3 x 3 x 3 = 27
# combinations, each evaluated with 5-fold cross-validation.
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1],
    'max_depth': [3, 5, 7]
}

# Seed the base estimator so every candidate is trained deterministically and
# the cross-validation scores can be reproduced.
gb_model = GradientBoostingClassifier(random_state=42)

# Exhaustive search scored by accuracy, using all available cores. Train scores
# are recorded as well so train/validation gaps can be inspected afterwards.
grid_search = GridSearchCV(
    estimator=gb_model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    return_train_score=True,
)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)


## Evaluation

In [None]:
# Tabulate the mean train and validation accuracy of every parameter combination.
results = pd.DataFrame(grid_search.cv_results_)
summary_columns = [
    'param_n_estimators',
    'param_learning_rate',
    'param_max_depth',
    'mean_train_score',
    'mean_test_score',
]
print(results[summary_columns])

# Evaluate the refitted best estimator. GridSearchCV.predict delegates to this
# same estimator, so the test-set predictions are computed once and reused for
# both the classification report and the accuracy figure.
best_gb_model = grid_search.best_estimator_
y_pred_train = best_gb_model.predict(X_train)
y_pred_test = best_gb_model.predict(X_test)
y_pred = y_pred_test  # same values as grid_search.predict(X_test)

print(classification_report(y_test, y_pred_test))

# `accuracy_score` comes from sklearn.metrics (imported in the notebook's import cell).
print("Training Accuracy:", accuracy_score(y_train, y_pred_train))
print("Testing Accuracy:", accuracy_score(y_test, y_pred_test))

In [None]:
# Three hand-written sample readings that use the same feature columns, in the
# same order, as the training data.
sample_readings = {
    'humidity': [28, 29, 28],
    'temperature_celsius': [22, 24, 22],
    'air_quality_Carbon_Monoxide': [1050, 1050, 1650],
}
data = pd.DataFrame(sample_readings)

# NOTE(review): `pipe` is not defined in any visible cell. The recorded output
# is continuous float32 values, so it presumably came from a regression pipeline
# in a cell that has since been removed — confirm, or replace with a model
# trained in this notebook.
pipe.predict(data)

array([2.3963554, 2.396104 , 2.9879148], dtype=float32)