In [None]:
!pip install ucimlrepo pandas scikit-learn

In [54]:
# ------- Import Dataset -------------------

from ucimlrepo import fetch_ucirepo

# Fetch dataset 360 through the ucimlrepo client.
air_quality = fetch_ucirepo(id=360)

# Data (as pandas DataFrames).
# Work on a copy: later cells add and drop columns on X in place, and without the copy
# those edits would also change the frame stored on `air_quality`.
X = air_quality.data.features.copy()
y = air_quality.data.targets  # NOTE(review): not referenced by any later visible cell

In [55]:
# Combine Date and Time into a single DateTime index, leaving only the measurement
# columns as features.
import pandas as pd

# Build X from the fetched frame in one non-mutating chain:
#   * the frame held by `air_quality` is left untouched, and
#   * the cell can be re-run safely (it no longer depends on X still having
#     the 'Date' and 'Time' columns).
# Parsed timestamps look like YYYY-MM-DD HH:MM:SS, e.g. 2004-03-10 18:00:00.
#
# NOTE(review): -200 looks like the dataset's missing-value sentinel (confirm against
# the UCI dataset description). Mapping it to 0 treats missing readings as real zero
# measurements; NaN plus explicit imputation/dropping may be more appropriate, but the
# downstream regression cell would need to handle NaN first.
# A single replace(-200, 0) matches integer and float columns alike, so no separate
# replace(-200.0, ...) pass is needed.
X = (
    air_quality.data.features
    .assign(DateTime=lambda frame: pd.to_datetime(frame['Date'] + ' ' + frame['Time']))
    .set_index('DateTime')
    .drop(columns=['Date', 'Time'])  # the original Date and Time columns are now redundant
    .replace(-200, 0)
)

In [56]:
# Let printed frames use up to 1000 characters per line so the preview rows are not wrapped.
pd.options.display.width = 1000
# Show the first five rows of the feature frame as plain text.
print(X.head(n=5))

                     CO(GT)  PT08.S1(CO)  NMHC(GT)  C6H6(GT)  PT08.S2(NMHC)  NOx(GT)  PT08.S3(NOx)  NO2(GT)  PT08.S4(NO2)  PT08.S5(O3)     T    RH      AH
DateTime                                                                                                                                                  
2004-03-10 18:00:00     2.6         1360       150      11.9           1046      166          1056      113          1692         1268  13.6  48.9  0.7578
2004-03-10 19:00:00     2.0         1292       112       9.4            955      103          1174       92          1559          972  13.3  47.7  0.7255
2004-03-10 20:00:00     2.2         1402        88       9.0            939      131          1140      114          1555         1074  11.9  54.0  0.7502
2004-03-10 21:00:00     2.2         1376        80       9.2            948      172          1092      122          1584         1203  11.0  60.0  0.7867
2004-03-10 22:00:00     1.6         1272        51       6.5          

In [None]:
# Run `nvidia-smi` in a shell to print the GPU/driver status of the machine.
# NOTE(review): the visible cells only use pandas and scikit-learn, so this looks like a
# leftover environment check -- confirm it is still needed.
!nvidia-smi

In [57]:
# Flip the sign of the PT08.S3(NOx) sensor column.
# NOTE(review): X is updated in place, so running this cell a second time flips the
# sign back again.
X['PT08.S3(NOx)'] = X['PT08.S3(NOx)'].mul(-1)

In [61]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Every column of X is scaled; 'DateTime' is the index, so it is not among them.
features_to_scale = X.columns

# Fit a min-max scaler on those columns.
# NOTE(review): the scaler is fit on all rows of X, i.e. before any train/test split.
scaler = MinMaxScaler().fit(X[features_to_scale])

# Wrap the scaled array in a DataFrame that keeps X's column names and DateTime index.
X_scaled = pd.DataFrame(
    data=scaler.transform(X[features_to_scale]),
    index=X.index,
    columns=features_to_scale,
)

# X_scaled now holds the normalized data, still indexed by 'DateTime'.

In [62]:
# Linear Regression: predict the min-max scaled CO(GT) column from the other scaled columns.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Target column; every other column of X_scaled is used as a predictor.
target = 'CO(GT)'
predictors = X_scaled.columns.drop(target)

# Hold out 20% of the rows for evaluation (fixed seed for a reproducible split).
# Predictors and target are both read from X_scaled: taking the predictors from the
# unscaled X would bypass the normalization step and mix scaled and unscaled data.
# NOTE(review): X_scaled was produced by a scaler fit on all rows, so the held-out rows
# influenced the scaling; consider fitting the scaler on the training split only.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled[predictors], X_scaled[target], test_size=0.2, random_state=42
)

# Initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the held-out rows
predictions = model.predict(X_test)

# Evaluate the model. The target is the scaled CO(GT) column, so the MSE is expressed
# in scaled units rather than in the original measurement units.
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 0.003109411232574027
R^2 Score: 0.8261430005745449
