# This first attempt is rudimentary because I think I can build it quickly. Further down I want to make a second attempt that uses the individual segments in each 500 m cell to estimate fractional snow cover (FSC): an algorithm makes a binary snow/non-snow detection per segment, so if 4 of 5 segments are detected as snow the cell gets 80% FSC.

In [1]:
from scripts.imports import *

# Load the pickled table and keep only the rows whose 'Confidence' equals 1.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted source.
df = pd.read_pickle('five_sites_data_snow_cc.pkl').loc[
    lambda frame: frame['Confidence'] == 1
]

# Print the column names, non-null counts and dtypes of the filtered frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2136 entries, 0 to 2457
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   camera         2136 non-null   object 
 1   date           2136 non-null   object 
 2   pvpg           2136 non-null   float64
 3   y_strong       1840 non-null   float64
 4   y_weak         1273 non-null   float64
 5   x_strong       1840 non-null   float64
 6   x_weak         1273 non-null   float64
 7   longitude      2136 non-null   float64
 8   latitude       2136 non-null   float64
 9   meanEgstrong   1840 non-null   float64
 10  meanEgweak     1273 non-null   float64
 11  meanEvstrong   1840 non-null   float64
 12  meanEvweak     1273 non-null   float64
 13  msw            2136 non-null   float64
 14  night          2136 non-null   float64
 15  asr            2136 non-null   float32
 16  n_photons      2136 non-null   float64
 17  data_quantity  2136 non-null   float64
 18  FSC          

#### Linear Regression

In [7]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression

# Design matrix: complete rows of the five predictors. The 'weak' counterparts
# ('meanEgweak', 'meanEvweak') are deliberately left out of the model.
X = df.loc[:, ['meanEgstrong', 'meanEvstrong', 'msw', 'asr', 'night']].dropna()

# Target values for exactly the rows that survived the NaN filter above.
y = df.loc[X.index, 'FSC']

# 'night' goes through the same dummy-encoding path a categorical column would;
# drop_first=True removes the first level so the encoding is not redundant.
X = pd.get_dummies(X, drop_first=True, columns=['night'])

# 80/20 train/test partition with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Ordinary least-squares baseline.
model = LinearRegression()

# 10-fold CV on the training partition only. The scorer returns negated MSE
# per fold, so flip the sign and take the square root to get per-fold RMSE.
cv_scores = np.sqrt(
    -cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=10)
)

# TODO: once a model is chosen, fit it on X_train, clip the predictions to
# [0, 1] and report RMSE on the held-out X_test / y_test, which this cell
# does not touch.

# Report the per-fold and the average cross-validation RMSE.
print("Cross-Validation RMSE scores: ", cv_scores)
print("Mean Cross-Validation RMSE: ", cv_scores.mean())

Cross-Validation RMSE scores:  [0.326979   0.36764504 0.3273285  0.3410039  0.30343134 0.31617196
 0.32522901 0.35606912 0.31427409 0.34973675]
Mean Cross-Validation RMSE:  0.33278686957702425


#### Logistic Regression

In [6]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression

# Design matrix: complete rows of the five predictors. The 'weak' counterparts
# ('meanEgweak', 'meanEvweak') are deliberately left out of the model.
X = df.loc[:, ['meanEgstrong', 'meanEvstrong', 'msw', 'asr', 'night']].dropna()

# Target values for exactly the rows that survived the NaN filter above.
# NOTE(review): LogisticRegression with scoring='f1' expects a binary target —
# confirm that 'FSC' only holds 0/1 at this point.
y = df.loc[X.index, 'FSC']

# 'night' goes through the same dummy-encoding path a categorical column would;
# drop_first=True removes the first level so the encoding is not redundant.
X = pd.get_dummies(X, drop_first=True, columns=['night'])

# 80/20 train/test partition with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Logistic regression with scikit-learn's default settings.
model = LogisticRegression()

# 10-fold CV on the training partition only, scored with F1.
cv_scores = cross_val_score(model, X_train, y_train, scoring='f1', cv=10)

# Report the per-fold and the average cross-validation F1.
print("Cross-Validation F1 Scores:", cv_scores)
print("Mean Cross-Validation F1 Score:", cv_scores.mean())

# TODO: fit the chosen model on the full training partition once model
# selection is finished; this cell only cross-validates.

Cross-Validation F1 Scores: [0.87407407 0.82352941 0.88111888 0.86524823 0.88888889 0.89361702
 0.86896552 0.79069767 0.86956522 0.88405797]
Mean Cross-Validation F1 Score: 0.8639762884139282


#### Ridge/Lasso

In [16]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import Ridge, Lasso

# Design matrix: complete rows of the five predictors. The 'weak' counterparts
# ('meanEgweak', 'meanEvweak') are deliberately left out of the model.
X = df.loc[:, ['meanEgstrong', 'meanEvstrong', 'msw', 'asr', 'night']].dropna()

# Target values for exactly the rows that survived the NaN filter above.
y = df.loc[X.index, 'FSC']

# Dummy-encode 'night', dropping the first level.
X = pd.get_dummies(X, drop_first=True, columns=['night'])

# 80/20 train/test partition with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Candidate regularisation strengths: 50 log-spaced values from 1e-4 to 1e4.
# NOTE(review): no scaling step is visible in this cell, so the penalty acts
# on the predictors in their raw units — confirm whether they are
# standardised upstream.
alpha_values = np.logspace(-4, 4, num=50)

# Ridge: exhaustive 10-fold search over alpha, ranked by negated MSE.
ridge_model = Ridge()
ridge_grid = dict(alpha=alpha_values)
ridge_search = GridSearchCV(
    estimator=ridge_model,
    param_grid=ridge_grid,
    scoring='neg_mean_squared_error',
    cv=10,
).fit(X_train, y_train)

# Lasso: same search; max_iter is raised so the solver has room to converge.
lasso_model = Lasso(max_iter=10000)
lasso_grid = dict(alpha=alpha_values)
lasso_search = GridSearchCV(
    estimator=lasso_model,
    param_grid=lasso_grid,
    scoring='neg_mean_squared_error',
    cv=10,
).fit(X_train, y_train)

# Alpha selected by each search.
print("Best alpha for Ridge Regression: ", ridge_search.best_params_['alpha'])
print("Best alpha for Lasso Regression: ", lasso_search.best_params_['alpha'])

# best_score_ is the mean negated MSE over the folds; negating it and taking
# the square root puts it on the RMSE scale, where LOWER is better. This is
# sqrt(mean MSE), which is not the same number as the mean of per-fold RMSEs.
print("Best CV score for Ridge Regression: ", np.sqrt(-ridge_search.best_score_))
print("Best CV score for Lasso Regression: ", np.sqrt(-lasso_search.best_score_))

Best alpha for Ridge Regression:  0.5689866029018293
Best alpha for Lasso Regression:  0.00030888435964774815
Best CV score for Ridge Regression:  0.33326508074830435
Best CV score for Lasso Regression:  0.3333160536914119


In [15]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge, Lasso

# Design matrix: complete rows of the five predictors. The 'weak' counterparts
# ('meanEgweak', 'meanEvweak') are deliberately left out of the model.
X = df.loc[:, ['meanEgstrong', 'meanEvstrong', 'msw', 'asr', 'night']].dropna()

# Target values for exactly the rows that survived the NaN filter above.
y = df.loc[X.index, 'FSC']

# Dummy-encode 'night', dropping the first level.
X = pd.get_dummies(X, columns=['night'], drop_first=True)

# 80/20 train/test partition with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ridge and Lasso with the alpha values reported by the grid-search cell.
# Lasso gets the same max_iter=10000 that the grid search used: with the
# default iteration cap the model evaluated here would not be configured like
# the one that was tuned, and may stop before converging at such a small alpha.
ridge_model = Ridge(alpha=0.5689866029018293)
lasso_model = Lasso(alpha=0.00030888435964774815, max_iter=10000)

# 10-fold CV on the training partition; the scorer returns negated MSE per fold.
ridge_cv_scores = cross_val_score(ridge_model, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
lasso_cv_scores = cross_val_score(lasso_model, X_train, y_train, cv=10, scoring='neg_mean_squared_error')

# Flip the sign and take the square root to get per-fold RMSE (lower is better).
ridge_cv_rmse = np.sqrt(-ridge_cv_scores)
lasso_cv_rmse = np.sqrt(-lasso_cv_scores)

# Report the per-fold and the average RMSE for both models.
print("Ridge Regression Cross-Validation RMSE scores: ", ridge_cv_rmse)
print("Mean Ridge Regression Cross-Validation RMSE: ", ridge_cv_rmse.mean())
print()
print("Lasso Regression Cross-Validation RMSE scores: ", lasso_cv_rmse)
print("Mean Lasso Regression Cross-Validation RMSE: ", lasso_cv_rmse.mean())

Ridge Regression Cross-Validation RMSE scores:  [0.32459058 0.36771222 0.32664998 0.3416524  0.30458747 0.31676251
 0.3258135  0.3564261  0.31452441 0.34846323]
Mean Ridge Regression Cross-Validation RMSE:  0.33271824060562244

Lasso Regression Cross-Validation RMSE scores:  [0.32528045 0.36768476 0.32689135 0.34150429 0.30445423 0.31659756
 0.32567569 0.35627767 0.31444172 0.34887661]
Mean Lasso Regression Cross-Validation RMSE:  0.33276843556028546


#### Random Forest

Discrete

In [19]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Design matrix: complete rows of the five predictors. The 'weak' counterparts
# ('meanEgweak', 'meanEvweak') are deliberately left out of the model.
X = df.loc[:, ['meanEgstrong', 'meanEvstrong', 'msw', 'asr', 'night']].dropna()

# Target values for exactly the rows that survived the NaN filter above.
# NOTE(review): scoring='f1' expects a binary target — confirm that 'FSC'
# only holds 0/1 at this point.
y = df.loc[X.index, 'FSC']

# Dummy-encode 'night', dropping the first level.
X = pd.get_dummies(X, columns=['night'], drop_first=True)

# 80/20 train/test partition with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random forest with 1000 trees; the seed fixes bootstrap and feature sampling.
rf_classifier = RandomForestClassifier(n_estimators=1000, random_state=42)

# 10-fold CV on the training partition only, scored with F1.
cv_scores = cross_val_score(rf_classifier, X_train, y_train, cv=10, scoring='f1')

# TODO: once a model is chosen, fit on X_train and evaluate on the held-out
# X_test / y_test with a classification metric (not RMSE).

# The scores are F1 values (scoring='f1'), so label them as such rather than
# as accuracy.
print("Random Forest Classifier Cross-Validation F1 scores: ", cv_scores)
print("Mean Cross-Validation F1: ", cv_scores.mean())

Random Forest Classifier Cross-Validation Accuracy scores:  [0.88405797 0.89041096 0.90277778 0.91666667 0.95238095 0.94366197
 0.95774648 0.90140845 0.89051095 0.93877551]
Mean Cross-Validation Accuracy:  0.917839768726164


Continuous

In [22]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor

# Design matrix: complete rows of the five predictors. The 'weak' counterparts
# ('meanEgweak', 'meanEvweak') are deliberately left out of the model.
X = df.loc[:, ['meanEgstrong', 'meanEvstrong', 'msw', 'asr', 'night']].dropna()

# Target values for exactly the rows that survived the NaN filter above.
y = df.loc[X.index, 'FSC']

# Dummy-encode 'night', dropping the first level.
X = pd.get_dummies(X, drop_first=True, columns=['night'])

# 80/20 train/test partition with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Random forest with 100 trees; the seed fixes bootstrap and feature sampling.
rf_regressor = RandomForestRegressor(random_state=42, n_estimators=100)

# 10-fold CV on the training partition only; the scorer returns negated MSE.
cv_scores = cross_val_score(
    rf_regressor, X_train, y_train, scoring='neg_mean_squared_error', cv=10
)

# Flip the sign and take the square root to get per-fold RMSE.
cv_rmse = np.sqrt(np.negative(cv_scores))

# TODO: once a model is chosen, fit it on X_train, clip the predictions to
# [0, 1] and report RMSE on the held-out X_test / y_test, which this cell
# does not touch.

# Report the per-fold and the average cross-validation RMSE.
print("Random Forest Regressor Cross-Validation RMSE scores: ", cv_rmse)
print("Mean Cross-Validation RMSE: ", cv_rmse.mean())

Random Forest Regressor Cross-Validation RMSE scores:  [0.25275375 0.29965521 0.27157014 0.26655908 0.18258164 0.21724152
 0.24372115 0.24662484 0.27089421 0.23900161]
Mean Cross-Validation RMSE:  0.24906031386976032


#### Support Vector Machines

SVC

SVR

#### k-Nearest Neighbours

Classifier

Regressor

#### Neural Network