# Part 2

### Logistic regression

In [8]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
from data import *

np.random.seed(2)
torch.manual_seed(2)

# Define the Logistic Regression model in PyTorch.
class LogisticRegressionModel(nn.Module):
    def __init__(self, input_dim):
        super(LogisticRegressionModel, self).__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        return torch.sigmoid(self.linear(x))

# Convert the numpy arrays to PyTorch tensors
X_torch = torch.from_numpy(X.astype(np.float32))

# Convert the target to binary labels
y = (y > np.median(y)).astype(int)  # 1 if y is above the median, 0 otherwise
y_torch = torch.from_numpy(y.astype(np.float32)).view(-1, 1)

K1, K2 = 10, 10  # Number of folds in the outer and inner cross-validation loops respectively
outer_cv = KFold(n_splits=K1, shuffle=True)
inner_cv = KFold(n_splits=K2, shuffle=True)

lambdas = np.power(10.0, np.arange(-10, 0, 1)) # Range can not be higher than 0
test_errors = []

# Outer cross-validation loop
for train_val_idx, test_idx in outer_cv.split(X_torch):
    X_train_val_outer, X_test = X_torch[train_val_idx], X_torch[test_idx]
    y_train_val_outer, y_test = y_torch[train_val_idx], y_torch[test_idx]

    best_lambda = None
    best_val_error = float('inf')
    best_model = None

    # Inner cross-validation loop
    for train_idx, val_idx in inner_cv.split(X_train_val_outer):
        X_train, X_val = X_train_val_outer[train_idx], X_train_val_outer[val_idx]
        y_train, y_val = y_train_val_outer[train_idx], y_train_val_outer[val_idx]

        # Loop over lambda values
        for lambda_ in lambdas:
            model = LogisticRegressionModel(input_dim=X.shape[1])
            criterion = nn.BCELoss()
            optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=lambda_)

            # Train the model
            model.train()
            for epoch in range(100):
                optimizer.zero_grad()
                outputs = model(X_train)
                loss = criterion(outputs, y_train)
                loss.backward()
                optimizer.step()

            # Evaluate the model on the validation set
            model.eval()
            with torch.no_grad():
                predictions = model(X_val)
                predictions = (predictions > 0.5).float()
                val_error = 1 - torch.mean((predictions == y_val).float()).item()

            if val_error < best_val_error:
                best_val_error = val_error
                best_model = model
                best_lambda = lambda_

    # Train the best model with the best lambda on the entire training set of the outer fold
    best_model.train()
    for epoch in range(100):
        optimizer.zero_grad()
        outputs = best_model(X_train_val_outer)
        loss = criterion(outputs, y_train_val_outer)
        loss.backward()
        optimizer.step()

    # Evaluate the model on the test set
    best_model.eval()
    with torch.no_grad():
        predictions = best_model(X_test)
        predictions = (predictions > 0.5).float()
        test_error = 1 - torch.mean((predictions == y_test).float()).item()
    test_errors.append(test_error)

    print('best lambda', best_lambda, 'test error', test_error)

# Compute the mean test error across all outer folds
mean_test_error = np.mean(test_errors)
print('mean test error', mean_test_error)

best lambda 1e-07 test error 0.2888888716697693
best lambda 0.001 test error 0.42222219705581665
best lambda 1e-07 test error 0.29545456171035767
best lambda 1e-06 test error 0.34090906381607056
best lambda 1e-08 test error 0.20454543828964233
best lambda 0.1 test error 0.15909093618392944
best lambda 1e-09 test error 0.25
best lambda 1e-08 test error 0.20454543828964233
best lambda 1e-06 test error 0.27272725105285645
best lambda 1e-06 test error 0.22727274894714355
mean test error 0.2665656507015228


### ANN

In [11]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
from data import *
np.random.seed(2)
torch.manual_seed(2)

class ANNModel(nn.Module):
    def __init__(self, input_dim, hidden_units):
        super(ANNModel, self).__init__()
        self.layer1 = nn.Linear(input_dim, hidden_units)
        self.layer2 = nn.Linear(hidden_units, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        x = self.sigmoid(x)
        return x

hidden_units_values = [x for x in range(1, 15)]  # Define a range of hidden unit values

# Convert the numpy arrays to PyTorch tensors
X_torch = torch.from_numpy(X.astype(np.float32))

# Convert the target to binary labels
y = (y > np.median(y)).astype(int)  # 1 if y is above the median, 0 otherwise
y_torch = torch.from_numpy(y.astype(np.float32)).view(-1, 1)

K1, K2 = 10, 10  # Number of folds in the outer and inner cross-validation loops respectively
outer_cv = KFold(n_splits=K1, shuffle=True)
inner_cv = KFold(n_splits=K2, shuffle=True)

test_errors = []

# Outer cross-validation loop
for train_val_idx, test_idx in outer_cv.split(X_torch):
    X_train_val_outer, X_test = X_torch[train_val_idx], X_torch[test_idx]
    y_train_val_outer, y_test = y_torch[train_val_idx], y_torch[test_idx]

    best_hidden_units = None
    best_val_error = float('inf')
    best_model = None

    # Inner cross-validation loop
    for train_idx, val_idx in inner_cv.split(X_train_val_outer):
        X_train, X_val = X_train_val_outer[train_idx], X_train_val_outer[val_idx]
        y_train, y_val = y_train_val_outer[train_idx], y_train_val_outer[val_idx]

        # Loop over hidden unit values
        for hidden_units in hidden_units_values:
            model = ANNModel(input_dim=X.shape[1], hidden_units=hidden_units)
            criterion = nn.BCELoss()
            optimizer = optim.SGD(model.parameters(), lr=0.01)

            # Train the model
            model.train()
            for epoch in range(100):
                optimizer.zero_grad()
                outputs = model(X_train)
                loss = criterion(outputs, y_train)
                loss.backward()
                optimizer.step()

            # Evaluate the model on the validation set
            model.eval()
            with torch.no_grad():
                predictions = model(X_val)
                predictions = (predictions > 0.5).float()
                val_error = 1 - torch.mean((predictions == y_val).float()).item()


            if val_error < best_val_error:
                best_val_error = val_error
                best_model = model
                best_hidden_units = hidden_units

    # Train the best model with the best hidden units on the entire training set of the outer fold
    best_model.train()
    for epoch in range(100):
        optimizer.zero_grad()
        outputs = best_model(X_train_val_outer)
        loss = criterion(outputs, y_train_val_outer)
        loss.backward()
        optimizer.step()

    # Evaluate the model on the test set
    best_model.eval()
    with torch.no_grad():
        predictions = best_model(X_test)
        predictions = (predictions > 0.5).float()
        test_error = 1 - torch.mean((predictions == y_test).float()).item()
    test_errors.append(test_error)

    print('best hidden units', best_hidden_units, 'test error', test_error)

# Compute the mean test error across all outer folds
mean_test_error = np.mean(test_errors)
print('mean test error', mean_test_error)

best hidden units 9 test error 0.42222219705581665
best hidden units 3 test error 0.42222219705581665
best hidden units 14 test error 0.3863636255264282
best hidden units 10 test error 0.27272725105285645
best hidden units 8 test error 0.3181818127632141
best hidden units 6 test error 0.22727274894714355
best hidden units 6 test error 0.3863636255264282
best hidden units 7 test error 0.27272725105285645
best hidden units 8 test error 0.3863636255264282
best hidden units 5 test error 0.47727274894714355
mean test error 0.35717170834541323


### Baseline


In [10]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from data import *

# Convert the target back to numpy array for sklearn
y_np = y_torch.numpy()

# Create the dummy classifier
dummy_clf = DummyClassifier(strategy="most_frequent")

# Outer cross-validation loop
baseline_errors = []
for train_val_idx, test_idx in outer_cv.split(X_torch):
    X_train_val_outer, X_test = X_torch[train_val_idx], X_torch[test_idx]
    y_train_val_outer, y_test = y_np[train_val_idx], y_np[test_idx]

    # Train the dummy classifier
    dummy_clf.fit(X_train_val_outer, y_train_val_outer)

    # Predict on the test set
    y_pred = dummy_clf.predict(X_test)

    # Compute the error
    test_error = 1 - accuracy_score(y_test, y_pred)
    baseline_errors.append(test_error)

    print('test error', test_error)

# Compute the mean test error across all outer folds
mean_baseline_error = np.mean(baseline_errors)
print('mean baseline error', mean_baseline_error)

test error 0.5333333333333333
test error 0.5555555555555556
test error 0.5681818181818181
test error 0.5
test error 0.6363636363636364
test error 0.5227272727272727
test error 0.6136363636363636
test error 0.5227272727272727
test error 0.5454545454545454
test error 0.5681818181818181
mean baseline error 0.5566161616161616
