Import Libraries

In [92]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler

Load Data

In [78]:
# Read the merged modelling table and force every column to a numeric dtype;
# any cell that cannot be parsed as a number becomes NaN (errors='coerce').
# NOTE(review): this is an absolute path on one machine - the notebook will not
# run elsewhere without editing it.
df = pd.read_csv(
    'C:/Users/zilin wang/Documents/PhD/Seasonal_Variation_Carbon_Emission/Data/merged_df_modelling.csv'
).apply(pd.to_numeric, errors='coerce')

Four columns can be used as the target Y: 'CH4_diffusive__mgCH4_m_2_h_1_', 'CH4_Bubbling__mgCH4_m_2_h_1_', 'CO2_diffusive__mgCO2_m_2_h_1_', and 'CH4_Total_Flux_mg_m_2_h_1_'. Here I use CO2 as an example to illustrate the workflow.

In [72]:
# Display the column labels of the loaded table.
df.columns

Index(['Reservoir_area__km2_', 'Mean_depth__m_', 'Temperature____',
       'Wind_speed__m_s_1_', 'Age', 'CH4_diffusive__mgCH4_m_2_h_1_',
       'CH4_Bubbling__mgCH4_m_2_h_1_', 'CO2_diffusive__mgCO2_m_2_h_1_',
       'CH4_Total_Flux_mg_m_2_h_1_', 'Precipitation_kg_m_2_month',
       'discharge__m3_s_1_', 'VPD_Pa', 'PET_kg_m_2_month_1', 'HURS_%',
       'NPP_g_C_m_2_yr', 'GHI_kWh_m2_month'],
      dtype='object')

### CO2 as Y value

In [79]:
# Build the CO2 modelling table: remove the three CH4 flux columns, then
# discard every row that still contains a missing value.
df_CO2 = df.drop(
    columns=[
        'CH4_diffusive__mgCH4_m_2_h_1_',
        'CH4_Bubbling__mgCH4_m_2_h_1_',
        'CH4_Total_Flux_mg_m_2_h_1_',
    ]
).dropna()

Separate the target variable (the "y" column) from the rest of the attributes

In [81]:
# Split the table into predictors (X) and the CO2 diffusive flux response (y).
# reset_index is used in its returning form rather than with inplace=True, so
# the Series selected from df_CO2 is never modified in place and re-running
# this cell always starts from the same df_CO2.
X = df_CO2.drop(columns=['CO2_diffusive__mgCO2_m_2_h_1_']).reset_index(drop=True)
y = df_CO2['CO2_diffusive__mgCO2_m_2_h_1_'].reset_index(drop=True)

# Split the dataset into 80% training and 20% testing sets; the fixed
# random_state keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Normalize the data

In [82]:
# Min-max scale the features. The scaler is fitted on the training rows only
# and the same fitted scaler is then applied to both the training and test rows.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

KNN algorithm and hyperparameter tuning

In [106]:
# Baseline K-nearest-neighbours regressor using 15 neighbours; all other
# settings are left at the library defaults.
knn = KNeighborsRegressor(n_neighbors=15)

In [107]:
# Fit the baseline model on the scaled training features and training targets.
knn.fit(X_train_scaled, y_train)

In [108]:
# Predict the held-out test rows with the baseline model.
y_pred = knn.predict(X_test_scaled)

# Compute the three regression metrics in one pass over the metric functions.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Report the scores.
print('Mean Squared Error:', mse)
print('Mean Absolute Error:', mae)
print('R^2 Score:', r2)

Mean Squared Error: 8128.062553203663
Mean Absolute Error: 63.08040223230122
R^2 Score: 0.18554748903706608


In [118]:
# Exhaustive grid search over KNN settings, scored by R^2 with 5-fold
# cross-validation on the scaled training data.
# NOTE(review): X_train_scaled was scaled with a scaler fitted on the whole
# training set, so each validation fold contributed to the scaling statistics;
# wrap the scaler and model in a Pipeline if that leakage matters.
knn = KNeighborsRegressor()
param_grid = {
    'n_neighbors': [13, 15, 17, 19, 21],
    'weights': ['uniform', 'distance'],
    # 'minkowski' is not listed: with the default p=2 it is the same distance
    # as 'euclidean', so it only repeated a third of the fits with identical scores.
    'metric': ['euclidean', 'manhattan'],
}
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, scoring='r2')
grid_search.fit(X_train_scaled, y_train)

In [120]:
# Report the winning hyper-parameter combination from the grid search.
best_params = grid_search.best_params_
print('Best parameters found:', best_params)

# Take the estimator the grid search kept for the best parameters and use it
# to predict the held-out test rows.
best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test_scaled)

# Compute the regression metrics; RMSE is derived from MSE.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse = np.sqrt(mse)

print('Root Mean Squared Error:', rmse)
print('Mean Absolute Error:', mae)
print('R^2 Score:', r2)


Best parameters found: {'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'uniform'}
Root Mean Squared Error: 87.64131943629262
Mean Absolute Error: 59.62973659350002
R^2 Score: 0.23034420485871931


### Try Support Vector Machines

Here I will show how to use the Support Vector Machine (SVM) implementation in scikit-learn.

Note how we set the $C$ hyper-parameter of the SVM. $C$ controls the trade-off between a narrow, strict
margin and a wider, looser margin. Below we set $C$ to infinity, which makes the margin infinitely strict.
This means that, depending on the dataset, fitting the SVM may fail if the training algorithm cannot separate all
the training examples perfectly.

In [None]:
# NOTE(review): this import sits mid-notebook; the other imports are grouped in
# the first cell.
from sklearn.svm import SVC

# Linear-kernel support vector classifier with C set to infinity.
# NOTE(review): SVC is a classifier, while the target prepared earlier in this
# notebook is a continuous flux scored with regression metrics - presumably
# sklearn.svm.SVR is what is needed here; confirm before fitting.
# NOTE(review): an infinite C may be rejected by scikit-learn's parameter
# validation in recent versions - verify against the installed version.
svm_clf = SVC(kernel="linear", C=float("inf"))

### Same procedure as for the previous KNN model