Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Load the Dataset from Drive

In [2]:
import pandas as pd

# Path to the dataset on your Google Drive
file_path = '/content/drive/MyDrive/AirQuality/Datasets/AirQualityUCI.csv'

# Load the dataset
df = pd.read_csv(file_path, sep=';', decimal=',', parse_dates=[["Date", "Time"]], na_values=-200)

# Drop empty unnamed columns at the end
df = df.iloc[:, :-2]

# Clean column names
df.columns = [col.strip() for col in df.columns]

# Preview the data
df.head()


  df = pd.read_csv(file_path, sep=';', decimal=',', parse_dates=[["Date", "Time"]], na_values=-200)
  df = pd.read_csv(file_path, sep=';', decimal=',', parse_dates=[["Date", "Time"]], na_values=-200)


Unnamed: 0,Date_Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,10/03/2004 18.00.00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
1,10/03/2004 19.00.00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2,10/03/2004 20.00.00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
3,10/03/2004 21.00.00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
4,10/03/2004 22.00.00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888


Data Pre-Processing

In [None]:
# Drop rows where most of the features are missing
df.dropna(thresh=5, inplace=True)

# Drop rows where target (CO(GT)) is missing
df.dropna(subset=["CO(GT)"], inplace=True)

# Drop remaining rows with any missing values
df_clean = df.dropna()

# Separate features and target
X = df_clean.drop(columns=["Date_Time", "CO(GT)"])
y = df_clean["CO(GT)"]


Outlier Removal using IQR

In [None]:
Q1 = X.quantile(0.25)
Q3 = X.quantile(0.75)
IQR = Q3 - Q1

# Keep only non-outlier rows
outlier_mask = ~((X < (Q1 - 1.5 * IQR)) | (X > (Q3 + 1.5 * IQR))).any(axis=1)
X = X[outlier_mask]
y = y[outlier_mask]


Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


Train Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)


Evaluation

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R² Score: {r2:.3f}")


Visualization

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8,5))
plt.scatter(y_test, y_pred, alpha=0.3)
plt.xlabel("Actual CO(GT)")
plt.ylabel("Predicted CO(GT)")
plt.title("Predicted vs Actual CO Levels")
plt.grid(True)
plt.show()
