# Second lab assignment

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, r2_score
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression, LogisticRegression, RidgeCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

## Import the data

After reading the data from the CSV file, we check whether there are any missing values and drop the affected rows. We then build the feature matrix X (all columns except `Class` and `Output`) and the two target vectors used for classification and regression. Finally, the whole dataset is split into a training set and a test set in a stratified manner.

In [None]:
# Load the dataset
df = pd.read_csv('task2_data.csv', sep=';')
# Print info about the dataset before and after dropping missing data
print('--- Before dropping missing data: ---')
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() emits a stray "None" line.
df.info()
df = df.dropna()
print('--- After dropping missing data: ---')
print(df.shape)

# Prepare the training and testing data
# Feature matrix: every column except the two target columns
X = df.drop(columns=['Class', 'Output']).to_numpy()
y_output = df['Output'].to_numpy()  # regression target
y_class = df['Class'].to_numpy()    # classification target

# A single 80/20 split of all three arrays keeps their rows aligned;
# the split is stratified on the class labels.
X_train, X_test, y_output_train, y_output_test, y_class_train, y_class_test = train_test_split(
    X, y_output, y_class, test_size=0.2, random_state=1, stratify=y_class
)

--- Before dropping missing data: ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 402 entries, Class to Input400
dtypes: float64(401), int64(1)
memory usage: 6.1 MB
None
--- After dropping missing data: ---
(2000, 402)


## Task 1. Building baseline models

### Regression

For the baseline regression, we use a simple linear regression model with no scaling of the features. We assess model performance using $R^2$ scores computed across 5-fold cross-validation train-test splits and then the $R^2$ score computed for the model trained on the whole training set and evaluated on the test set.

In [4]:
# 5-fold cross-validation of the unscaled linear baseline on the training split
cv = KFold(n_splits=5, shuffle=True, random_state=1)
r2_scores, r2_train_scores = [], []
for fit_idx, holdout_idx in cv.split(X_train):
    X_fit, y_fit = X_train[fit_idx], y_output_train[fit_idx]
    X_holdout, y_holdout = X_train[holdout_idx], y_output_train[holdout_idx]

    fold_model = LinearRegression().fit(X_fit, y_fit)
    # R^2 on the held-out fold, then on the rows the fold model was fitted on
    r2_scores.append(r2_score(y_holdout, fold_model.predict(X_holdout)))
    r2_train_scores.append(r2_score(y_fit, fold_model.predict(X_fit)))

print(f'Baseline regression R^2 on training set during cross-validation: {np.mean(r2_train_scores):.4f} ± {np.std(r2_train_scores):.4f}')
print(f'Baseline regression R^2 during cross-validation: {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}')

# Refit on the whole training split and score against the held-out test split
model = LinearRegression().fit(X_train, y_output_train)
print(f'Baseline regression R^2 on training set: {r2_score(y_output_train, model.predict(X_train)):.4f}')
print(f'Baseline regression R^2 on test set: {r2_score(y_output_test, model.predict(X_test)):.4f}')

Baseline regression R^2 on training set during cross-validation: 0.6658 ± 0.0035
Baseline regression R^2 during cross-validation: 0.3043 ± 0.0344
Baseline regression R^2 on training set: 0.6365
Baseline regression R^2 on test set: 0.2939


The baseline model has a high $R^2$ on the training set, but it is a lot lower on the test sets, which shows that the model overfits and does not generalize well beyond the training data. We see that the baseline regression model has a relatively low $R^2$ score of around 0.3 for both the cross-validation score and the score on the test set, so we should explore more complex models or feature engineering for a better explanation of the output variable.

### Classification
For the baseline classification we use a logistic regression with no feature scaling. We assess the performance of the classifier using the same strategy as for the baseline regression model (with stratified cross validation) with accuracy as our performance metric.

In [5]:
# Stratified 5-fold cross-validation of the unscaled logistic-regression baseline
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
accuracy_scores, train_accuracy_scores = [], []
for fit_idx, holdout_idx in cv.split(X_train, y_class_train):
    X_fit, y_fit = X_train[fit_idx], y_class_train[fit_idx]
    X_holdout, y_holdout = X_train[holdout_idx], y_class_train[holdout_idx]

    fold_model = LogisticRegression().fit(X_fit, y_fit)
    # Accuracy on the held-out fold, then on the rows the fold model was fitted on
    accuracy_scores.append(accuracy_score(y_holdout, fold_model.predict(X_holdout)))
    train_accuracy_scores.append(accuracy_score(y_fit, fold_model.predict(X_fit)))

accuracy, accuracy_std = np.mean(accuracy_scores), np.std(accuracy_scores)
print(f'Baseline classification accuracy on training set during cross-validation: {np.mean(train_accuracy_scores):.4f} ± {np.std(train_accuracy_scores):.4f}')
print(f'Baseline classification accuracy during cross-validation: {accuracy:.4f} ± {accuracy_std:.4f}')

# Refit on the whole training split and score against the held-out test split
model = LogisticRegression().fit(X_train, y_class_train)
print(f'Baseline classification accuracy on training set: {accuracy_score(y_class_train, model.predict(X_train)):.4f}')
print(f'Baseline classification accuracy on test set: {accuracy_score(y_class_test, model.predict(X_test)):.4f}')

Baseline classification accuracy on training set during cross-validation: 0.7631 ± 0.0097
Baseline classification accuracy during cross-validation: 0.5194 ± 0.0236
Baseline classification accuracy on training set: 0.7238
Baseline classification accuracy on test set: 0.5050


Similarly to the baseline regression, the model overfits and does not generalize beyond the training data: the accuracy is a lot higher on the training set, while on held-out data it is comparable to random guessing. The model performs very poorly, with an accuracy score of around 0.5 for both the cross-validation score and the score on the test set.

## Task 2. Advanced classification


For advanced classification, we use a pipeline combining feature selection and a non-linear classifier.

For feature selection, Random Forests are fitted within a 15-fold inner cross-validation to compute feature importances. The top 15 most important features are selected based on the importances averaged over the folds, which makes feature selection more stable. A Support Vector Classifier with an RBF kernel is then trained on the selected features. This model is chosen for its ability to capture non-linear relationships in the data, which is first scaled using a StandardScaler to ensure all features contribute equally to the model.

Many hyperparameters (e.g., the number of selected features, the SVC kernel parameters, the Random Forest depth) can be tuned for this pipeline. However, a full grid search would be computationally quite expensive, so the main parameters were tuned manually based on validation performance, before seeing how the model performs on the test set. To still include some automated parameter optimization, a grid search is performed over the C parameter of the final Support Vector Classifier.

In [6]:
# How many top-ranked features the classification pipeline keeps
k = 15

# Per-fold accuracy accumulators and the outer stratified 5-fold splitter
accuracy_scores, train_accuracy_scores = [], []
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

def get_classifier(X_t, y_t, n_features=None):
    """Select the most important features with Random Forests and fit an RBF SVC on them.

    A Random Forest is fitted on the training part of each fold of a 15-fold
    stratified split of ``(X_t, y_t)`` and the impurity-based feature
    importances are averaged over the folds. The ``n_features`` columns with
    the highest average importance are kept, and an RBF-kernel SVC is fitted
    on them with ``C`` chosen by a 3-fold grid search on accuracy.

    Parameters
    ----------
    X_t : 2-D numpy array (rows are samples, columns are features)
        Feature matrix. It is used as given: this function does not scale it,
        so callers are expected to pass already-scaled data.
    y_t : 1-D numpy array
        Class labels aligned with the rows of ``X_t``.
    n_features : int, optional
        Number of top-ranked features to keep. When omitted, the notebook-level
        constant ``k`` is used, which reproduces the original behaviour.

    Returns
    -------
    model : SVC
        ``best_estimator_`` of the grid search, fitted on the selected columns.
    selected_features : numpy array of int
        Column indices of the kept features, most important first. Index any
        feature matrix with them before calling ``model.predict``.
    """
    if n_features is None:
        # Fall back to the notebook-level setting so existing two-argument calls keep working
        n_features = k

    # Many inner folds are used to make the importance ranking more stable
    inner_cv = StratifiedKFold(n_splits=15, shuffle=True, random_state=1)
    feature_importances_sum = np.zeros(X_t.shape[1])

    # Only the training part of each fold is needed: the importances are read
    # from the fitted forest, not computed on the held-out part.
    for train_inner_index, _ in inner_cv.split(X_t, y_t):
        # Shallow trees to limit overfitting
        forest = RandomForestClassifier(n_estimators=20, random_state=1, max_depth=7)
        forest.fit(X_t[train_inner_index], y_t[train_inner_index])
        feature_importances_sum += forest.feature_importances_

    # Average the importances over all folds and keep the indices of the top features
    feature_importances_avg = feature_importances_sum / inner_cv.get_n_splits()
    selected_features = np.argsort(feature_importances_avg)[::-1][:n_features]
    X_t_reduced = X_t[:, selected_features]

    # The final model is an RBF-kernel SVC; GridSearchCV picks the best C
    param_grid = {'C': [0.1, 1.0, 2.0, 4.0, 8.0]}
    svc = SVC(kernel='rbf', random_state=1)
    grid = GridSearchCV(svc, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    grid.fit(X_t_reduced, y_t)
    return grid.best_estimator_, selected_features


# Outer cross-validation: the scaler and the feature selection are refit on every fold
for fit_idx, holdout_idx in cv.split(X_train, y_class_train):
    X_fit, y_fit = X_train[fit_idx], y_class_train[fit_idx]
    X_holdout, y_holdout = X_train[holdout_idx], y_class_train[holdout_idx]

    # Scaling statistics come from the fitting part of the fold only
    fold_scaler = StandardScaler()
    X_fit_scaled = fold_scaler.fit_transform(X_fit)
    X_holdout_scaled = fold_scaler.transform(X_holdout)

    fold_model, fold_features = get_classifier(X_fit_scaled, y_fit)

    # Held-out accuracy next to fit accuracy shows how much the pipeline overfits
    holdout_pred = fold_model.predict(X_holdout_scaled[:, fold_features])
    fit_pred = fold_model.predict(X_fit_scaled[:, fold_features])
    accuracy_scores.append(accuracy_score(y_holdout, holdout_pred))
    train_accuracy_scores.append(accuracy_score(y_fit, fit_pred))

print(f'Classification train accuracy during cross-validation: {np.mean(train_accuracy_scores):.4f} ± {np.std(train_accuracy_scores):.4f}')
print(f'Classification accuracy during cross-validation: {np.mean(accuracy_scores):.4f} ± {np.std(accuracy_scores):.4f}')

# Refit the whole pipeline on the full training split and check generalization on the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

final_classifier, selected_features = get_classifier(X_train_scaled, y_class_train)
y_pred = final_classifier.predict(X_test_scaled[:, selected_features])
y_train_pred = final_classifier.predict(X_train_scaled[:, selected_features])

train_accuracy = accuracy_score(y_class_train, y_train_pred)
test_accuracy = accuracy_score(y_class_test, y_pred)
print(f'Final Train Accuracy: {train_accuracy:.4f}')
print(f'Final Test Accuracy: {test_accuracy:.4f}')
print('--- Classification Report ---')
print(classification_report(y_class_test, y_pred))
print('--- Confusion Matrix ---')
print(confusion_matrix(y_class_test, y_pred))

print('selected features:', selected_features)

Classification train accuracy during cross-validation: 0.8464 ± 0.0063
Classification accuracy during cross-validation: 0.7919 ± 0.0149
Final Train Accuracy: 0.8462
Final Test Accuracy: 0.7950
--- Classification Report ---
              precision    recall  f1-score   support

           0       0.79      0.79      0.79       197
           1       0.80      0.80      0.80       203

    accuracy                           0.80       400
   macro avg       0.79      0.79      0.79       400
weighted avg       0.79      0.80      0.79       400

--- Confusion Matrix ---
[[155  42]
 [ 40 163]]
selected features: [239  40 292   1 395 329 245 255 237  39 136  37 149 307 205]


The accuracy is significantly improved compared to the baseline: the mean train accuracy during cross-validation is around 0.85, and the held-out accuracy is around 0.79 both in cross-validation and on the test set, which is of course much higher than random guessing. However, there is slight overfitting, as the training accuracy is a bit higher than the test accuracy. This suggests the model captures complex patterns but could benefit from further regularization or more robust hyperparameter optimization. Overall, the process selects informative features and uses a non-linear classifier to achieve stronger generalization, and it almost reaches the target accuracy of more than 0.8 stated in the problem.

## Task 3. Advanced regression

The advanced regression pipeline combines feature selection with regularized regression to improve predictive performance. Feature selection is performed using permutation importance with Ridge regression in a nested cross-validation setup, ensuring that only the most informative features (top 15) are retained for modeling. The final model is a RidgeCV regressor, which automatically optimizes its parameter (the regularization strength) to prevent overfitting.


In [7]:
# How many top-ranked features the regression pipeline keeps
k_r = 15

# Per-fold R^2 accumulators and the outer 5-fold splitter
r2_scores, r2_train_scores = [], []
cv = KFold(n_splits=5, shuffle=True, random_state=1)

def get_regressor(X_t, y_t, n_features=None):
    """Select features by permutation importance and fit a RidgeCV model on them.

    For each fold of a 5-fold split of ``(X_t, y_t)``, a RidgeCV model is
    fitted on the training part and the permutation importance of every
    feature (drop in R^2, 4 repeats) is measured on the held-out part. The
    importances are averaged over the folds, the ``n_features`` highest-ranked
    columns are kept, and a final RidgeCV model is fitted on those columns
    using all rows of ``X_t``.

    Parameters
    ----------
    X_t : 2-D numpy array (rows are samples, columns are features)
        Feature matrix. It is used as given: this function does not scale it,
        so callers are expected to pass already-scaled data.
    y_t : 1-D numpy array
        Regression target aligned with the rows of ``X_t``.
    n_features : int, optional
        Number of top-ranked features to keep. When omitted, the notebook-level
        constant ``k_r`` is used, which reproduces the original behaviour.

    Returns
    -------
    model : RidgeCV
        Regressor fitted on the selected columns of ``X_t``.
    selected_features : numpy array of int
        Column indices of the kept features, most important first. Index any
        feature matrix with them before calling ``model.predict``.
    """
    if n_features is None:
        # Fall back to the notebook-level setting so existing two-argument calls keep working
        n_features = k_r

    # One regularization grid shared by the selection models and the final model
    alphas = [0.1, 1.0, 10.0, 20.0]

    # Inner cross-validation for feature selection using permutation importance and Ridge regression
    inner_cv = KFold(n_splits=5, shuffle=True, random_state=1)
    feature_importances_sum = np.zeros(X_t.shape[1])

    for train_inner_index, test_inner_index in inner_cv.split(X_t):
        X_train_inner, X_test_inner = X_t[train_inner_index], X_t[test_inner_index]
        y_train_inner, y_test_inner = y_t[train_inner_index], y_t[test_inner_index]

        # No scaling here: the caller passes already-scaled data
        model = RidgeCV(alphas=alphas)
        model.fit(X_train_inner, y_train_inner)

        # Importance is measured on the held-out part of the fold, not on the rows the model was fitted on
        result = permutation_importance(model, X_test_inner, y_test_inner,
                                        scoring='r2', n_repeats=4, random_state=1)

        # Accumulate the per-feature mean importance of this fold
        feature_importances_sum += result.importances_mean

    # Average the importances over all folds and keep the indices of the top features
    feature_importances_avg = feature_importances_sum / inner_cv.get_n_splits()
    selected_features = np.argsort(feature_importances_avg)[::-1][:n_features]
    X_t_reduced = X_t[:, selected_features]

    # The final model is a RidgeCV regressor fitted on the selected columns
    model = RidgeCV(alphas=alphas)
    model.fit(X_t_reduced, y_t)
    return model, selected_features

# Outer cross-validation: the scaler and the feature selection are refit on every fold
for fit_idx, holdout_idx in cv.split(X_train):
    X_fit, y_fit = X_train[fit_idx], y_output_train[fit_idx]
    X_holdout, y_holdout = X_train[holdout_idx], y_output_train[holdout_idx]

    # Scaling statistics come from the fitting part of the fold only
    fold_scaler = StandardScaler()
    X_fit_scaled = fold_scaler.fit_transform(X_fit)
    X_holdout_scaled = fold_scaler.transform(X_holdout)

    fold_regressor, fold_features = get_regressor(X_fit_scaled, y_fit)

    # R^2 on the held-out fold, then on the rows the fold model was fitted on
    r2_scores.append(r2_score(y_holdout, fold_regressor.predict(X_holdout_scaled[:, fold_features])))
    r2_train_scores.append(r2_score(y_fit, fold_regressor.predict(X_fit_scaled[:, fold_features])))

# Summarize the cross-validated R^2
print(f'Average R^2 on training set during cross-validation: {np.mean(r2_train_scores):.4f} ± {np.std(r2_train_scores):.4f}')
print(f"Average R^2 during cross-validation: {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}")

# Refit the whole pipeline on the full training split and evaluate it on the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

final_regressor, selected_features = get_regressor(X_train_scaled, y_output_train)
y_test_pred = final_regressor.predict(X_test_scaled[:, selected_features])
y_train_pred = final_regressor.predict(X_train_scaled[:, selected_features])
test_r2 = r2_score(y_output_test, y_test_pred)
train_r2 = r2_score(y_output_train, y_train_pred)

print(f'Final Train R^2: {train_r2:.4f}')
print(f'Final Test R^2: {test_r2:.4f}')

print('selected features:', selected_features)

Average R^2 on training set during cross-validation: 0.5316 ± 0.0048
Average R^2 during cross-validation: 0.5181 ± 0.0179
Final Train R^2: 0.5303
Final Test R^2: 0.4768
selected features: [222 166  82 341 192 291 172 183 135 386  58  17 240 388 206]


Results show a substantial improvement over the baseline: the average $R^2$ score during cross-validation is approximately 0.52, and the final test $R^2$ is about 0.48. It is a bit lower than the one seen during cross-validation, but is within three standard deviations, so this is probably just a test set which was harder to predict for the model. We can also see that the difference between the $R^2$ during training and testing is a lot smaller (particularly for the cross-validation means), which indicates that the model generalizes better and does not overfit as much as the baseline. The model explains nearly half of the variance in the output variable, though there is still room for improvement, possibly through more advanced models or further feature engineering.

## Validation on unseen data

In [None]:
# Load the unseen validation data and apply the same missing-data handling as
# for the training data (rows with missing values are dropped); otherwise a
# single NaN would make every predict() call below fail.
new_data = pd.read_csv('validation_data.csv', sep=';').dropna()
X_val = new_data.drop(columns=['Class', 'Output']).to_numpy()
y_output_val = new_data['Output'].to_numpy()
y_class_val = new_data['Class'].to_numpy()

# The advanced pipelines expect scaled inputs. The scaler is fitted once on the
# full labelled dataset and reused for both advanced models and the validation data.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_val_scaled = scaler.transform(X_val)

# Fit every model on the full labelled dataset (X, y_output, y_class: the train
# and test splits together) and compute the scores on the validation data

# Baseline regression
baseline_regression = LinearRegression()
baseline_regression.fit(X, y_output)
baseline_r2 = r2_score(y_output_val, baseline_regression.predict(X_val))
print(f'Baseline regression R^2 on validation set: {baseline_r2:.4f}')

# Advanced regression
advanced_regression, selected_features = get_regressor(X_scaled, y_output)
advanced_r2 = r2_score(y_output_val, advanced_regression.predict(X_val_scaled[:, selected_features]))
print(f'Advanced regression R^2 on validation set: {advanced_r2:.4f}')

# Baseline classification
baseline_classification = LogisticRegression()
baseline_classification.fit(X, y_class)
baseline_accuracy = accuracy_score(y_class_val, baseline_classification.predict(X_val))
print(f'Baseline classification accuracy on validation set: {baseline_accuracy:.4f}')

# Advanced classification (selected_features is re-bound to the classifier's own selection)
advanced_classification, selected_features = get_classifier(X_scaled, y_class)
advanced_accuracy = accuracy_score(y_class_val, advanced_classification.predict(X_val_scaled[:, selected_features]))
print(f'Advanced classification accuracy on validation set: {advanced_accuracy:.4f}')
