### Problem 1 (ii)

In [1]:
import numpy as np
from scipy.optimize import minimize

def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)), applied element-wise via NumPy."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

# Fixed seed so the simulated sample is reproducible.
np.random.seed(42)
n = 1_000_000
# Standard-normal inputs; the noiseless target is the sigmoid of the input.
x = np.random.normal(loc=0, scale=1, size=n)
y_true = sigmoid(x)

def neural_network_2_layers(weights, x):
    """Forward pass of the scalar network w2 * sigmoid(w1 * sigmoid(w0 * x)).

    Args:
        weights: Sequence of three scalars ``(w0, w1, w2)``.
        x: Input value(s); every operation is element-wise.

    Returns:
        The prediction, with a linear (un-activated) output layer.
    """
    first_w, second_w, output_w = weights
    hidden_1 = sigmoid(first_w * x)
    hidden_2 = sigmoid(second_w * hidden_1)
    return output_w * hidden_2

def loss_function_2_layers(weights, x, y_true):
    """Mean squared error of the two-hidden-layer network against ``y_true``."""
    residuals = neural_network_2_layers(weights, x) - y_true
    return np.mean(residuals ** 2)

def constraint_weights(weights):
    """Equality-constraint residual: zero exactly when the weights sum to 1."""
    total = np.sum(weights)
    return total - 1

def constraint_non_negative(weights):
    """Inequality-constraint function: return the weights unchanged.

    Registered with ``'type': 'ineq'`` in the constraint lists, so the
    optimizer is asked to keep every returned component non-negative.
    """
    return weights

# Start from equal weights, which already satisfy both constraints.
initial_weights_2_layers = np.full(3, 1/3)
constraints_2_layers = [
    {'type': 'eq', 'fun': constraint_weights},
    {'type': 'ineq', 'fun': constraint_non_negative},
]
# Constrained least-squares fit of the two-hidden-layer network.
result_2_layers = minimize(
    loss_function_2_layers,
    initial_weights_2_layers,
    args=(x, y_true),
    constraints=constraints_2_layers,
)
print("2-Layer NN Training Error:", result_2_layers.fun)
print("Optimized Weights (2-Layer NN):", result_2_layers.x / np.sum(result_2_layers.x))


def neural_network_1_layer(weights, x):
    """Forward pass of the scalar network w1 * sigmoid(w0 * x).

    Args:
        weights: Sequence of two scalars ``(w0, w1)``.
        x: Input value(s); every operation is element-wise.

    Returns:
        The prediction, with a linear (un-activated) output layer.
    """
    hidden_w, output_w = weights
    hidden = sigmoid(hidden_w * x)
    return output_w * hidden

def loss_function_1_layer(weights, x, y_true):
    """Mean squared error of the one-hidden-layer network against ``y_true``."""
    residuals = neural_network_1_layer(weights, x) - y_true
    return np.mean(residuals ** 2)

# Start from equal weights, which already satisfy both constraints.
initial_weights_1_layer = np.full(2, 0.5)
constraints_1_layer = [
    {'type': 'eq', 'fun': constraint_weights},
    {'type': 'ineq', 'fun': constraint_non_negative},
]
# Constrained least-squares fit of the one-hidden-layer network.
result_1_layer = minimize(
    loss_function_1_layer,
    initial_weights_1_layer,
    args=(x, y_true),
    constraints=constraints_1_layer,
)
print("1-Layer NN Training Error:", result_1_layer.fun)
print("Optimized Weights (1-Layer NN):", result_1_layer.x / np.sum(result_1_layer.x))

2-Layer NN Training Error: 0.04340313295600554
Optimized Weights (2-Layer NN): [3.61214404e-19 1.27374368e-03 9.98726256e-01]
1-Layer NN Training Error: 0.036739220525162035
Optimized Weights (1-Layer NN): [0.13065422 0.86934578]


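In this scenario, the true data-generating process is a single sigmoid, $y = \sigma(x)$, with no noise, and the weights are constrained to be non-negative and to sum to 1. Adding an extra layer increases the training error (0.0434 vs. 0.0367). This cannot be overfitting: an overfit model would have a *lower* training error, and there is no noise here to fit. The more likely explanation is that the additional layer introduces complexity that the constraints do not allow the model to use. The second sigmoid squashes the hidden signal into a narrow range, and with the weights tied together the output $w_2\,\sigma(w_1\,\sigma(w_0 x))$ cannot track $\sigma(x)$; the optimizer ends up with $w_0 \approx 0$, i.e. a nearly constant prediction. The one-layer model $w_1\,\sigma(w_0 x)$ has the same functional form as the target, so the constraints and the simplicity of the data favor it, and it reaches a lower training error even though it has fewer parameters.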

### Problem 2

In [2]:
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib as mpl
import matplotlib.pyplot as plt 
# Directory that holds the data file.
# NOTE(review): absolute, machine-specific path; the notebook will not run on another machine as written.
path="/Users/zhangyuanzhuo/425 ML"
import os
# Change the working directory so the CSV can be opened by its bare filename.
os.chdir(path)
# Load the card-transaction data; the `fraud` column is used as the label in later cells.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like a saved row index read in as data; confirm.
df = pd.read_csv("card_transdata-1.csv")
df.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1,1,0,0,0
1,10.829943,0.175592,1.294219,1,0,0,0,0
2,5.091079,0.805153,0.427715,1,0,0,1,0
3,2.247564,5.600044,0.362663,1,1,0,1,0
4,44.190936,0.566486,2.222767,1,1,0,1,0


In [3]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor


# Feature matrix: drop the label and the "Unnamed: 0" column. That column is the
# CSV's saved row index (0, 1, 2, ... in the preview of `df`), not a transaction
# attribute. Kept as a feature it would be unscaled, and under the sequential
# split below its values in the test rows never occur in the training rows.
# errors='ignore' keeps this working if the index column is absent.
X = df.drop(columns=['fraud', 'Unnamed: 0'], errors='ignore')
categorical_features_indices = [False, False, False, True, True,True,True]
# The mask above is positional, so it must line up with the feature columns.
assert X.shape[1] == len(categorical_features_indices), (
    f"expected {len(categorical_features_indices)} feature columns, got {X.shape[1]}"
)
y = df['fraud']

# Sequential split: the first 500000 rows train the model, the rest evaluate it.
X_train = X.iloc[:500000,:]
y_train = y.iloc[:500000]
X_test = X.iloc[500000:,:]
y_test = y.iloc[500000:]

# Features as float32; labels are stored as integers and cast to float in the
# training/evaluation loops before the loss is computed.
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)
# Shuffle only the training batches; evaluation order does not matter.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [4]:
class NeuralNetwork(nn.Module):
    """Fully connected binary classifier that emits one raw logit per row.

    Architecture: input -> 512 -> ReLU -> 512 -> ReLU -> 1. No sigmoid is
    applied to the output.
    """

    def __init__(self, input_size):
        """Build the layers for inputs with ``input_size`` features."""
        super().__init__()
        hidden_width = 512
        self.flatten = nn.Flatten()
        layers = [
            nn.Linear(input_size, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, 1),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return logits of shape ``(batch, 1)`` for the batch ``x``."""
        return self.linear_relu_stack(self.flatten(x))

# One input unit per column of the training feature frame.
input_size = X_train.shape[1]
model = NeuralNetwork(input_size)


In [8]:
from sklearn.metrics import f1_score
# Optimisation settings for the fraud classifier.
learning_rate = 0.001
epochs = 10
# This loss takes raw logits, matching the model's un-activated output layer.
loss_fn = nn.BCEWithLogitsLoss() 
# Plain SGD over all model parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one pass of gradient updates over ``dataloader``.

    Args:
        dataloader: Iterable yielding ``(X_batch, y_batch)`` pairs.
        model: Module mapping a feature batch to one logit per row,
            shape ``(batch, 1)``.
        loss_fn: Callable invoked as ``loss_fn(logits, float_targets)``.
        optimizer: Optimizer holding ``model``'s parameters.
    """
    model.train()
    for X_batch, y_batch in dataloader:
        optimizer.zero_grad()
        # Remove only the trailing logit dimension. A bare squeeze() would also
        # remove the batch dimension of a single-row batch, leaving a 0-d
        # tensor whose shape no longer matches the targets.
        predictions = model(X_batch).squeeze(-1)
        y_batch = y_batch.float()
        loss = loss_fn(predictions, y_batch)
        loss.backward()
        optimizer.step()

def test_loop(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` without updating it.

    Args:
        dataloader: Iterable with a length, yielding ``(X_batch, y_batch)`` pairs.
        model: Module mapping a feature batch to one logit per row,
            shape ``(batch, 1)``.
        loss_fn: Callable invoked as ``loss_fn(logits, float_targets)``.

    Returns:
        Tuple ``(accuracy, average_loss, f1)``: the fraction of rows classified
        correctly at a 0.5 probability threshold, the mean of the per-batch
        losses, and the F1 score over all rows.
    """
    model.eval()
    correct = 0
    total = 0
    test_loss = 0.0
    y_true = []
    y_pred = []

    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            # Remove only the trailing logit dimension. A bare squeeze() would
            # turn a single-row batch into a 0-d tensor, which breaks both the
            # loss shape check and the extend() calls below.
            predictions = model(X_batch).squeeze(-1)

            y_batch = y_batch.float()

            test_loss += loss_fn(predictions, y_batch).item()
            # Logit -> probability -> hard label at a 0.5 threshold.
            predicted_labels = (torch.sigmoid(predictions) >= 0.5).float()

            y_true.extend(y_batch.cpu().numpy())
            y_pred.extend(predicted_labels.cpu().numpy())

            correct += (predicted_labels == y_batch).sum().item()
            total += y_batch.size(0)

    accuracy = correct / total
    # Averaged over batches, not rows.
    average_loss = test_loss / len(dataloader)
    f1 = f1_score(y_true, y_pred)
    return accuracy, average_loss, f1
 
# Alternate one training pass with one evaluation pass, reporting test metrics each epoch.
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    train_loop(
        dataloader=train_dataloader,
        model=model,
        loss_fn=loss_fn,
        optimizer=optimizer,
    )
    accuracy, average_loss, f1 = test_loop(
        dataloader=test_dataloader,
        model=model,
        loss_fn=loss_fn,
    )
    print(f"Accuracy: {accuracy * 100:.2f}%, Average Loss: {average_loss:.4f}, F1 score: {f1:.4f}")

Epoch 1/10
Accuracy: 94.60%, Average Loss: 0.1263, F1 score: 0.6184
Epoch 2/10
Accuracy: 95.10%, Average Loss: 0.1296, F1 score: 0.6370
Epoch 3/10
Accuracy: 96.23%, Average Loss: 0.0894, F1 score: 0.7446
Epoch 4/10
Accuracy: 96.49%, Average Loss: 0.0840, F1 score: 0.7765
Epoch 5/10
Accuracy: 96.08%, Average Loss: 0.1227, F1 score: 0.7258
Epoch 6/10
Accuracy: 96.89%, Average Loss: 0.0754, F1 score: 0.8010
Epoch 7/10
Accuracy: 96.64%, Average Loss: 0.0744, F1 score: 0.7690
Epoch 8/10
Accuracy: 94.48%, Average Loss: 0.1703, F1 score: 0.7484
Epoch 9/10
Accuracy: 97.09%, Average Loss: 0.0693, F1 score: 0.8159
Epoch 10/10
Accuracy: 96.92%, Average Loss: 0.0705, F1 score: 0.7929


The result is worse than the simple decision tree because its F1 score is lower.