In [None]:
#EX 1

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import warnings

# Silence every warning raised from here on so the cell output stays readable.
warnings.filterwarnings('ignore')

# Experiment configuration.
RUNS = 10              # number of train/test splits evaluated below
MLP_RANDOM_STATE = 0   # seed handed to every MLPRegressor

# Load the data set. The 'target' column is the regression target; all
# remaining columns are used as features.
data = pd.read_csv('parkinsons.csv', delimiter=',')

X = data.drop('target', axis=1)
y = np.ravel(data['target'])


def evaluate_holdout_mae(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and return its MAE on the test split.

    The argument order matches the tuple returned by ``train_test_split`` so a
    split can be forwarded directly with ``*split``.
    """
    model.fit(X_train, y_train)
    return mean_absolute_error(y_test, model.predict(X_test))


# One test-set MAE per run for each of the three models.
mae_linear = []
mae_mlp_no_activation = []
mae_mlp_relu = []

for i in range(RUNS):
    # Each run uses a different split seed (1..RUNS); the MLP seed is fixed.
    split = train_test_split(X, y, test_size=0.2, random_state=i + 1)
    X_train, X_test, y_train, y_test = split

    # Linear Regression model
    linear_regression = LinearRegression()
    mae_linear.append(evaluate_holdout_mae(linear_regression, *split))

    # MLP model without activation function (identity activation)
    mlp_no_activation = MLPRegressor(
        hidden_layer_sizes=(10, 10),
        activation='identity',
        random_state=MLP_RANDOM_STATE,
    )
    mae_mlp_no_activation.append(evaluate_holdout_mae(mlp_no_activation, *split))

    # MLP model with ReLU activation function
    mlp_relu = MLPRegressor(
        hidden_layer_sizes=(10, 10),
        activation='relu',
        random_state=MLP_RANDOM_STATE,
    )
    mae_mlp_relu.append(evaluate_holdout_mae(mlp_relu, *split))
    
# Gather the per-run test MAEs into one frame: one column per model,
# one row per run.
mae_data = pd.DataFrame({
    'Linear Regression': mae_linear,
    'MLP No Activation': mae_mlp_no_activation,
    'MLP ReLU Activation': mae_mlp_relu,
})

# Draw on an explicit Axes and label both axes so the figure can be read
# without the surrounding code.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=mae_data, ax=ax)
ax.set_title('MAE for Linear Regression and MLP models with and without ReLU activation function')
ax.set_xlabel('Model')
ax.set_ylabel('Test MAE')
plt.show()


### Explanation ex2
Activation functions allow an MLP to represent non-linear functions, because they give the model the ability to capture non-linear patterns in the data. Without them, every layer is a linear map, and a composition of linear maps is itself linear, so the whole network is strictly a linear model, just like a linear regression model. The MLP without activation functions and the linear regression model will therefore have similar outputs, as can be seen in the previous exercise from the similarity between their respective boxplots, which reveal similar MAE values.

In [None]:
#EX 3
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, make_scorer
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.colors import Normalize
import pandas as pd
import seaborn as sns
import numpy as np
import warnings

# Silence every warning raised from here on so the cell output stays readable.
warnings.filterwarnings('ignore')

# Seeds: one for the MLP itself, one for the single train/test split.
MLP_RANDOM_STATE = 0
EX7_RANDOM_STATE = 0

# Load the data set. The 'target' column is the regression target; all
# remaining columns are used as features.
data = pd.read_csv('parkinsons.csv', delimiter=',')

X = data.drop('target', axis=1)
y = np.ravel(data['target'])

# Single 80/20 hold-out split shared by the grid search and the test evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=EX7_RANDOM_STATE
)
# Base network; only the hyperparameters listed in `param_grid` are varied.
mlp = MLPRegressor(hidden_layer_sizes=(10, 10), random_state=MLP_RANDOM_STATE)

param_grid = {
    'alpha': [0.0001, 0.001, 0.01],            # L2 penalty (regularization term)
    'learning_rate_init': [0.001, 0.01, 0.1],
    'batch_size': [32, 64, 128],
}

# refit=False: no final model is refitted on the whole training split after
# the search; only the cross-validation results are kept.
grid_search = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    refit=False,
)
grid_search.fit(X_train, y_train)

# NOTE: despite the "train" in these names, the values are cross-validation
# results computed within the training split: `best_score_` is the mean
# validation-fold score of the best combination. The scorer is the negated
# MAE, hence the sign flip. Neither value is a test-set MAE.
best_train_params = grid_search.best_params_
best_train_score = -grid_search.best_score_

# Refit the network once per hyperparameter combination on the training
# split and record its MAE on the held-out test split, one record per
# combination.
test_results = []

for param in grid_search.cv_results_['params']:
    # The shared estimator is reconfigured and refitted in place for every
    # combination in the grid.
    mlp.set_params(**param)
    mlp.fit(X_train, y_train)
    y_pred = mlp.predict(X_test)

    test_mae = mean_absolute_error(y_test, y_pred)
    test_results.append({
        'batch_size': param['batch_size'],
        'alpha': param['alpha'],
        'learning_rate': param['learning_rate_init'],
        'test_mae': test_mae,
    })

# Extremes of the test MAE over all combinations. The defaults reproduce the
# +/-inf sentinels for the (unexpected) case of an empty grid.
min_test_mae = min((r['test_mae'] for r in test_results), default=float('inf'))
max_test_mae = max((r['test_mae'] for r in test_results), default=float('-inf'))

# Shared colour scale: every heatmap is normalised with the same min/max test
# MAE, so a given colour means the same error in every panel.
norm = Normalize(vmin=min_test_mae, vmax=max_test_mae)
results_df = pd.DataFrame(test_results)

# One panel per batch size. The panel count follows the data rather than
# being fixed, so changing the grid cannot index past the available axes.
batch_sizes = results_df['batch_size'].unique()
n_panels = len(batch_sizes)
fig, axes_grid = plt.subplots(
    1, n_panels, figsize=(6 * n_panels, 5), sharey=True, squeeze=False
)
axes = axes_grid[0]  # 1-D row of axes regardless of the panel count

for ax, batch_size in zip(axes, batch_sizes):
    # Restrict to the current batch size: each panel is an
    # alpha x learning-rate grid of test MAEs.
    filtered_df = results_df[results_df['batch_size'] == batch_size]

    pivot_table = filtered_df.pivot_table(values='test_mae',
                                          index='alpha',
                                          columns='learning_rate')

    sns.heatmap(pivot_table, annot=True, fmt=".3f", cmap="BrBG", norm=norm, ax=ax)
    ax.set_title(f'Batch Size: {batch_size}')
    ax.set_xlabel('Learning Rate')
    ax.set_ylabel('L2 Penalty (alpha)')


plt.suptitle('Test MAE for Each Combination of Hyperparameters', fontsize=16)
# Leave the top 5% of the figure free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.95])

plt.show()



The best combination of hyperparameters found was a learning rate of 0.01, an L2 penalty of 0.001, and a batch size of 32, which resulted in a test MAE of 3.651.
The learning rate controls how large a step the model takes when it updates the weights at each iteration, and it is the factor that most influences the MAE. A low learning rate makes the model take smaller and more gradual steps, which reduces the risk of skipping over the global minimum and therefore tends to lower the MAE, although it increases training time. On the other hand, a high learning rate allows the model to make larger adjustments, but runs the risk of overshooting the global minimum.
The L2 penalty is a regularization term that penalizes larger weights, which prevents overfitting: it stops the model from fitting noise in the training data that would not generalize to the test data. However, if it is too high, the model may no longer be able to capture complex patterns, causing underfitting.
Batch size is the number of samples the model uses to calculate the gradient before updating the weights in the direction opposite to the gradient in order to minimize the error. A small batch size allows the model to make more frequent updates, helping it escape local minima, but with more noise in the training process. A large batch size means the model makes less frequent updates with less noise, which can cause it to get stuck in local minima and lead to slower convergence.
The best combination obtained, therefore, has an intermediate learning rate of 0.01. This choice allows the model to make relatively small and gradual adjustments, but without excessively long training times. This value balances convergence time and model accuracy, allowing good error minimization without losing training efficiency. The value found for the L2 penalty, 0.001, is also intermediate, which prevents overfitting while still allowing the model to capture important patterns in the data, so it does not cause underfitting. Additionally, a small batch size of 32 means the model updates more frequently with greater variation in gradients, which helps the model escape local minima, but at the cost of more noise in the training process.