In [None]:
!pip install ucimlrepo pandas numpy scikit-learn

In [None]:
# ------- Import Dataset -------------------

from ucimlrepo import fetch_ucirepo

# Download dataset id 360 from the UCI ML repository.
air_quality = fetch_ucirepo(id=360)

# The fetched object exposes pandas DataFrames: the feature table and the targets.
# NOTE(review): y is not referenced by any of the cells visible here.
X, y = air_quality.data.features, air_quality.data.targets

In [None]:
# Merge the separate Date and Time columns into a single DateTime index.
import pandas as pd
import numpy as np

# Build X from the fetched feature table without modifying that table, so this
# cell gives the same result every time it is run. (Mutating X in place dropped
# Date/Time on the first run and raised a KeyError on any re-run.)
features_raw = air_quality.data.features
X = (
    features_raw
    .assign(DateTime=pd.to_datetime(features_raw['Date'] + ' ' + features_raw['Time']))
    .set_index('DateTime')           # timestamps become the row index
    .drop(columns=['Date', 'Time'])  # originals are redundant once combined
    # -200 is treated as the missing-value marker; replace() compares by value,
    # so a single pass covers both integer -200 and float -200.0.
    .replace(-200, np.nan)
)

In [None]:
# Widen pandas' plain-text display so the preview rows are not wrapped,
# then print the first rows of X.
pd.options.display.width = 1000
print(X.head())

In [None]:
# Flip the sign of the PT08.S3(NOx) column.
# Gotcha: the column is overwritten with its own negation, so running this
# cell a second time restores the original sign.
X['PT08.S3(NOx)'] = -X['PT08.S3(NOx)']

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Every column of X is rescaled; DateTime is the index, so it is not part of X.columns.
features_to_scale = X.columns

# Fit the min-max scaler on those columns and transform them in one step.
# NOTE(review): the scaler is fit on all rows and all columns (including the
# column later used as the regression target), before any train/test split.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X[features_to_scale])

# Wrap the scaled array in a DataFrame that keeps X's column labels and DateTime index.
X_scaled = pd.DataFrame(scaled_values, index=X.index, columns=features_to_scale)

In [None]:
# Forward-fill missing values: each NaN takes the most recent earlier value in
# its column. DataFrame.ffill() is used because fillna(method='ffill') is
# deprecated in recent pandas releases; rebinding the name also avoids inplace=True.
# NOTE(review): the earlier comment here read "changes nothing for linear
# regression" - it is unclear which step that referred to; confirm with the author.
X_scaled = X_scaled.ffill()

In [None]:
# Linear Regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Column to predict; every other column of X_scaled is used as a predictor.
target = 'CO(GT)'
predictors = X_scaled.columns.drop(target)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled[predictors],
    X_scaled[target],
    test_size=0.2,
    random_state=42,
)

# Fit the linear model on the training rows, then predict the held-out rows.
model = LinearRegression().fit(X_train, y_train)
predictions = model.predict(X_test)

# Score the predictions. Both metrics use the target column as stored in
# X_scaled, i.e. after min-max scaling, so the MSE is in scaled units.
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")