In [28]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the CSV file
file_path = "NOx.csv"
df = pd.read_csv(file_path)

# Display basic info and the first few rows
df.info(), df.head()

# Drop the first column (index)
data = df.drop(columns=df.columns[0])

# Standardize the data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Perform PCA
pca = PCA()
pca_result = pca.fit_transform(data_scaled)

# Explained variance
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance_ratio)

# Plot cumulative variance
plt.figure(figsize=(10, 6))
plt.plot(cumulative_variance, marker='o')
plt.title('Cumulative Explained Variance by PCA Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.xlim(0, 100)
plt.grid(True)
plt.tight_layout()
plt.show()
print(cumulative_variance)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Columns: 5201 entries, Unnamed: 0 to 5555.555555555556
dtypes: float64(5200), int64(1)
memory usage: 31.7 MB


NameError: name 'X' is not defined

In [24]:
labels_df = pd.read_csv('NOx_labels.csv')
concentrations = labels_df
print(concentrations.head())

             NO           NO2           N2O
0  1.124246e-07  1.396625e-07  2.896213e-07
1  2.852192e-07  1.072657e-07  3.467042e-07
2  2.196250e-07  6.197457e-08  3.996713e-07
3  1.796377e-07  1.627776e-07  3.771560e-07
4  4.688999e-08  1.369778e-07  3.890498e-07


In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Select the number of principal components (say, the first N)
N = 10  # You can adjust this
X_pca = pca_result[:, :N]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_pca, concentrations, test_size=0.01)#, random_state=42)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

from sklearn.metrics import r2_score

# Assuming y_pred is a 2D array with shape (n_samples, n_targets), i.e., [CO2, NO2] predictions
r2_no = r2_score(y_test['NO'], y_pred[:, 0])  # CO2 (ppm) R²
r2_no2 = r2_score(y_test['NO2'], y_pred[:, 1])  # NO2 (ppb) R²
r2_n2o = r2_score(y_test['N2O'], y_pred[:, 2])  # NO2 (ppb) R²

print(f"R² score for NO (ppb): {r2_no:.3f}")
print(f"R² score for NO2 (ppb): {r2_no2:.3f}")
print(f"R² score for N2O (ppb): {r2_n2o:.3f}")

R² score for NO (ppb): 0.480
R² score for NO2 (ppb): -0.086
R² score for N2O (ppb): -0.110


In [22]:
import pandas as pd

# Create a DataFrame for comparison
results_df = pd.DataFrame({
    'True NO (ppb)': y_test['NO'],
    'Predicted NO (ppb)': y_pred[:, 0],
    'True NO2 (ppb)': y_test['NO2'],
    'Predicted NO2 (ppb)': y_pred[:, 1],
    'True N2O (ppb)': y_test['N2O'],
    'Predicted N2O (ppb)': y_pred[:, 2]
})


# Display the table
print(results_df)


     True NO (ppb)  Predicted NO (ppb)  True NO2 (ppb)  Predicted NO2 (ppb)  \
56    2.663890e-08        5.498928e-08    2.688964e-08         6.427449e-08   
203   2.695764e-07        1.540293e-07    2.696949e-08         1.066366e-07   
435   2.103206e-07        1.066047e-07    5.453423e-08         8.991400e-08   
193   1.899671e-07        1.528700e-07    4.855771e-08         1.030784e-07   
339   2.944046e-08        5.836700e-08    1.264896e-07         5.414905e-08   
..             ...                 ...             ...                  ...   
94    2.313131e-07        2.136699e-07    1.793680e-07         1.273612e-07   
234   5.865337e-08        9.113082e-08    7.183021e-08         7.787881e-08   
119   1.618487e-07        1.493875e-07    1.496771e-07         1.031271e-07   
381   2.090515e-07        1.962568e-07    1.344114e-07         1.171832e-07   
199   2.339847e-07        1.633670e-07    9.627991e-08         9.994912e-08   

     True N2O (ppb)  Predicted N2O (ppb)  
56     3