# 1. Import

In [1806]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from typing import Optional
import itertools

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
from torch.utils.data import DataLoader, TensorDataset

# 2. Data Loading

In [1807]:
import numpy as np

def load_npz(file_path):
    with np.load(file_path) as data:
        return {key: data[key] for key in data}

train_data = load_npz(r'.\data\train.npz')
test_data = load_npz(r'.\data\test.npz')
train_emb1, train_emb2, train_labels = train_data['emb1'], train_data['emb2'], train_data['preference']
test_emb1, test_emb2 = test_data['emb1'], test_data['emb2']

# 3. Exploration

In [1808]:
train_data

{'uid': array([    0,     1,     2, ..., 18747, 18748, 18749], dtype=int64),
 'emb1': array([[-0.05075016, -0.03491386, -0.05787281, ...,  0.00020284,
          0.02388327, -0.02491781],
        [-0.12402835, -0.07631648, -0.05782915, ...,  0.02713838,
          0.01394665,  0.0186507 ],
        [-0.06794146, -0.0385992 ,  0.04476113, ...,  0.07999779,
          0.04943484,  0.00783883],
        ...,
        [ 0.02096516, -0.00752076, -0.06958353, ...,  0.01346127,
          0.01917063, -0.06059628],
        [-0.00901941,  0.01330765, -0.02343761, ..., -0.02690429,
          0.0084649 ,  0.01999134],
        [-0.05510234,  0.00251053, -0.01775946, ...,  0.00322949,
         -0.02700103,  0.01986161]], dtype=float32),
 'emb2': array([[-0.03255587,  0.01327268, -0.00508326, ..., -0.01196616,
         -0.03564733, -0.03713938],
        [-0.00014027,  0.03904634,  0.0592997 , ...,  0.00117963,
          0.04012304,  0.07394706],
        [-0.068197  , -0.0943828 ,  0.04236921, ...,  0.02259

In [1809]:
train_data.keys()

dict_keys(['uid', 'emb1', 'emb2', 'preference'])

In [1810]:
# x1
print(train_data['emb1'][0].shape) # (384,)
# x2
print(train_data['emb2'][0].shape) # (384,)
# y
print(train_data['preference'][0]) # 1
# train_data['emb1'][0]

(384,)
(384,)
1


# 4. Preprocessing

In [1811]:
## Parameters

# Preprocessing Parameters
validation_size = 0.1
RAND_STATE = 5780
shuffle_split = True
standardized = False
torch.manual_seed(RAND_STATE)
np.random.seed(RAND_STATE)

In [1812]:
def train_validation_split(Xs, Ys, validation_size: float=0.2):
    Xs_tr, Xs_va, Ys_tr, Ys_va = train_test_split(Xs, Ys, test_size=validation_size, random_state=RAND_STATE, shuffle=shuffle_split, stratify=Ys)
    return torch.Tensor(Xs_tr), torch.Tensor(Xs_va), torch.Tensor(Ys_tr).long(), torch.Tensor(Ys_va).long()

In [1813]:
def standardization(Xs):
    scaler = StandardScaler()
    Xs_scaled = scaler.fit_transform(Xs)
    return torch.Tensor(Xs_scaled)

In [1814]:
print(train_data['emb1'].shape) # (n x d): (18750, 384)
print(train_data['emb2'].shape) # (n x d): (18750, 384)

# Concatenate the input in to a single long vector
Xs = np.concatenate((train_data['emb1'], train_data['emb2']), axis=1)
Ys = train_data['preference']

# Train Validation Split
Xs_tr, Xs_va, Ys_tr, Ys_va = train_validation_split(Xs, Ys, validation_size)

if standardized:
    Xs_tr = standardization(Xs_tr)
    Xs_va = standardization(Xs_va)

# Convert to Torch
print(f'Xs_tr.shape: {Xs_tr.shape}') 
print(f'Ys_tr.shape: {Ys_tr.shape}')
print(f'Xs_va.shape: {Xs_va.shape}')
print(f'Ys_va.shape: {Ys_va.shape}')

(18750, 384)
(18750, 384)
Xs_tr.shape: torch.Size([16875, 768])
Ys_tr.shape: torch.Size([16875])
Xs_va.shape: torch.Size([1875, 768])
Ys_va.shape: torch.Size([1875])


In [1815]:
batch_size = 128
train_dataset = TensorDataset(Xs_tr, Ys_tr)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

validation_dataset = TensorDataset(Xs_va, Ys_va)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=True)

# 5. Model

In [1816]:
# Parameters
torch.use_deterministic_algorithms(True)
embedding_dim = 768
hidden_dim = 64
output_dim = 2
num_layers = 5
activation = "relu"

# Improvement
include_batch_norm = True
initialize_weights = False
dropout_rate = None

In [1817]:
# FFNN Model
class FFNN(nn.Module):
    def __init__(
        self, 
        embedding_dim: int, 
        hidden_dim: int,
        output_dim: int,
        activation: str = "relu",
        num_layers: int = 1,
        include_batch_norm: bool = False,
        initialize_weights: bool = False,
        dropout_rate: Optional[float] = None
    ) -> None:
        
        super().__init__()
        assert num_layers > 0

        # FFNN architecture attributes
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.activation = activation
        self.num_layers = num_layers

        # Layer attributes
        self.input_layer = nn.Linear(self.embedding_dim, self.hidden_dim)
        self.hidden_layers = nn.ModuleList()
        for _ in range(self.num_layers - 1):
            self.hidden_layers.append(nn.Linear(self.hidden_dim, self.hidden_dim))
        self.output_layer = nn.Linear(self.hidden_dim, self.output_dim)

        # Weight initialization attributes
        self.initialize_weights = initialize_weights
        if initialize_weights:
            init.xavier_normal_(self.input_layer.weight)
            for hidden_layer in self.hidden_layers:
                init.xavier_normal_(hidden_layer.weight)
            init.xavier_normal_(self.output_layer.weight)

        # FFNN performance improvement attributes
        self.dropout_rate = dropout_rate
        if dropout_rate is not None:
            self.dropout = nn.Dropout(p=self.dropout_rate)
        else:
            self.dropout = None
        self.include_batch_norm = include_batch_norm
        if include_batch_norm:
            self.batch_norm = nn.BatchNorm1d(self.hidden_dim)

    def forward(self, embeddings: torch.Tensor) -> torch.Tensor:
        x = self.input_layer(embeddings)
        for hidden_layer in self.hidden_layers:
            # Forward layer
            x = hidden_layer(x)

            # Batch normalization layer
            if self.include_batch_norm:
                x = self.batch_norm(x)

            # Non-linear layer
            if self.activation == "relu":
                x = F.relu(x)
            elif self.activation == "tanh":
                x = F.tanh(x)
            elif self.activation == "sigmoid":
                x = F.sigmoid(x)

            # Drop out regularization layer
            if self.dropout_rate is not None:
                x = self.dropout(x)
        output = self.output_layer(x)
        return output

In [1818]:
# Test
ffnn = FFNN(
    embedding_dim=embedding_dim, 
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    activation=activation,
    num_layers=num_layers, 
    include_batch_norm=include_batch_norm,
    initialize_weights=initialize_weights,
    dropout_rate=dropout_rate
)
ffnn

FFNN(
  (input_layer): Linear(in_features=768, out_features=64, bias=True)
  (hidden_layers): ModuleList(
    (0-3): 4 x Linear(in_features=64, out_features=64, bias=True)
  )
  (output_layer): Linear(in_features=64, out_features=1, bias=True)
  (batch_norm): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

# 6. Model Training

In [1819]:
# Parameters
epochs = 50
alpha = 0.01
rho1 = 0.9
rho2 = 0.99
grad_clip_max_norm = None

# Optimizers
# adam_optimizer = torch.optim.Adam(ffnn.parameters(), lr=alpha,  betas = [rho1, rho2])
adam_optimizer = torch.optim.Adam(ffnn.parameters(), lr=alpha)

# Loss functions
cross_entropy_loss_fn = torch.nn.CrossEntropyLoss()

In [1820]:
# evaluate a trained model on MNIST data
#
# dataloader    dataloader of examples to evaluate on
# model         trained PyTorch model
# loss_fn       loss function (e.g. torch.nn.CrossEntropyLoss)
#
# returns       tuple of (loss, accuracy), both python floats
@torch.no_grad()
def evaluate_model(validation_loader, model, loss_fn):
	model.eval()
	total_loss = 0.0
	total_correct = 0
	total_samples = 0

	for X, Y in validation_loader:
		Y_pred_prob = model(X)
		loss = loss_fn(Y_pred_prob, Y)
		total_loss += loss.item()
	
		Y_pred = torch.argmax(Y_pred_prob, dim=1)
		total_correct += torch.sum(Y_pred == Y).item()
		total_samples += Y.size(0)
	
	average_loss = total_loss / len(validation_loader)
	accuracy = total_correct / total_samples
	
	return average_loss, accuracy

In [1821]:
def train(
	train_loader, 
	validation_loader, 
	model, 
	loss_fn, 
	optimizer, 
	epochs, 
	batch_size, 
	grad_clip_max_norm: Optional[float] = None, 
	patience: Optional[int] = None
):
	validation_losses = []
	validation_accuracies = []
	best_validation_loss = float("inf")
	no_improvement_count = 0

	# Create DataLoader for batching
	for epoch in range(epochs):
		# Set to training mode
		model.train()
		
		for i, (X, Y) in enumerate(train_loader):
			total_loss = 0.0

			# Zero gradients for every batch
			optimizer.zero_grad()

			# Make predictions for this batch
			Y_pred_prob = model(X)

			# Compute the loss and its gradients
			loss = loss_fn(Y_pred_prob, Y)
			loss.backward()

			if grad_clip_max_norm is not None:
				nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_max_norm)

			# Adjust learning weights
			optimizer.step()

			# Gather data and report
			total_loss += loss.item()
		
		# Evaluate the model
		validation_loss, validation_accuracy = evaluate_model(validation_loader, model, loss_fn)
		validation_losses.append(validation_loss)
		validation_accuracies.append(validation_accuracy)
		# print(f"Epoch {epoch+1}/{epochs}, Training Loss: {loss.item()}, Validation Loss: {round(validation_loss,3)}, Validation Accuracy: {round(validation_accuracy,3)}")

        # Check for early stopping
		if validation_loss < best_validation_loss:
			best_validation_loss = validation_loss
			best_validation_accuracy = validation_accuracy
			no_improvement_count = 0
		else:
			no_improvement_count += 1

		if patience is not None:
			if no_improvement_count >= patience:
				break

	# best_validation_loss = min(validation_losses)
	# best_validation_accuracy = max(validation_accuracies)
	# print(f"Minimum Loss: {min_validation_loss}, Max Accuracy: {max_validation_accuracy}")
	return model, best_validation_loss, best_validation_accuracy


In [None]:
ffnn, best_validation_loss, best_validation_accuracy = train(
    train_loader,
    validation_loader,
    ffnn, 
    cross_entropy_loss_fn, 
    adam_optimizer, 
    epochs,
    batch_size,
    grad_clip_max_norm,
    patience=5
)
print(best_validation_accuracy)

In [1682]:
ffnn

FFNN(
  (input_layer): Linear(in_features=768, out_features=64, bias=True)
  (hidden_layers): ModuleList(
    (0-3): 4 x Linear(in_features=64, out_features=64, bias=True)
  )
  (output_layer): Linear(in_features=64, out_features=2, bias=True)
  (batch_norm): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

# 7. Hyperparameter Tuning

In [1630]:
# Architecture Parameters
embedding_dim = 768
hidden_dim = 128
output_dim = 2
num_layers = 1
activation = "relu"

# Architecture Improvement Parameters
dropout_rate = None
include_batch_norm = True
initialize_weights = False

# Training Parameters
epochs = 10
alpha = 0.01
# rho1 = 0.9
# rho2 = 0.999
grad_clip_max_norm = None

# Optimizers
# sgd_optimizer = torch.optim.SGD(ffnn.parameters(), lr=alpha)
adam_optimizer = torch.optim.Adam(ffnn.parameters(), lr=alpha)
# adamw_optimizer = torch.optim.AdamW(ffnn.parameters(), lr=alpha)
# rmsprop_optimizer = torch.optim.RMSprop(ffnn.parameters(), lr=alpha)

# Loss functions
cross_entropy_loss_fn = torch.nn.CrossEntropyLoss()

In [1661]:
num_range = 5
# FFNN
param_grid = {
    'hidden_dims': [64], 
    'activations': ["relu"], # relu
    'num_layers': [5], 
    'include_batch_norm': [True], # True
    'initialize_weights': [False], # True
    'dropout_rates': [None], # 0.5
    'batch_sizes': [128, 180], # 180
    'grad_clip_max_norms': [None], # 2
    'optimizer_types': ['adam'],
    'alphas': [0.01],
    'epochs': [10],
    'rho1': [0.9], # 0.9
    'rho2': [0.99] # 0.99
}

grid_search_combinations = list(itertools.product(*param_grid.values()))

results = []
for (
    hidden_dim, 
    activation, 
    num_layer, 
    include_batch_norm, 
    initialize_weights, 
    dropout_rate, 
    # batch_size, 
    grad_clip_max_norm,
    # optimizer_type,
    alpha,
    epoch,
    # rho1,
    # rho2,

    ) in grid_search_combinations:

    sum_best_validation_loss = 0
    sum_best_validation_accuracy = 0
    for i in range(num_range):
        # FFNN Architecture
        ffnn = FFNN(
            embedding_dim=embedding_dim, 
            hidden_dim=hidden_dim,
            output_dim=output_dim, 
            num_layers=num_layers, 
            include_batch_norm=include_batch_norm,
            initialize_weights=initialize_weights,
            dropout_rate=dropout_rate
        )

        optimizer = torch.optim.Adam(ffnn.parameters(), lr=alpha)

        # Training
        best_validation_loss, best_validation_accuracy = train(
            train_loader,
            validation_loader,
            ffnn, 
            cross_entropy_loss_fn, 
            optimizer, 
            epoch,
            batch_size,
            grad_clip_max_norm,
            patience=5
        )
        sum_best_validation_loss += best_validation_loss
        sum_best_validation_accuracy += best_validation_accuracy

    # Result
    result = dict(
        zip(
            param_grid.keys(), 
            (
                hidden_dim, 
                activation, 
                num_layer,
                include_batch_norm, 
                initialize_weights, 
                dropout_rate, 
                grad_clip_max_norm,
                alpha,
                epoch,
            )
        )
    )
    result["avg_best_validation_loss"] = sum_best_validation_loss / num_range
    result["avg_best_validation_accuracy"] = sum_best_validation_accuracy / num_range
    results.append(result)

In [1662]:
result_df = pd.DataFrame(results)
result_df.sort_values(by='avg_best_validation_accuracy', ascending=False).head(60)

Unnamed: 0,hidden_dims,activations,num_layers,include_batch_norm,initialize_weights,dropout_rates,grad_clip_max_norms,alphas,epochs,avg_best_validation_loss,avg_best_validation_accuracy
0,64,relu,5,True,False,,,0.01,10,0.247355,0.893973


In [842]:
result_df.loc[result_df.best_validation_accuracy == result_df.best_validation_accuracy.max(), :]

Unnamed: 0,hidden_dims,activations,num_layers,best_validation_loss,best_validation_accuracy
3,16,relu,4,0.289982,0.890667


In [701]:
result_df.best_validation_accuracy.max()

0.6096

In [710]:
result_df.to_csv("architecture_result.csv")

In [None]:
# FFNN
param_grid = {
    'hidden_dims': [8, 16], # 8
    'activations': ["relu"], # relu
    'num_layers': [1, 2], # 7
    'include_batch_norm': [True], # True
    'initialize_weights': [True], # True
    'dropout_rates': [0.5, 0.6], # 0.5
    'batch_sizes': [128, 180], # 180
    'grad_clip_max_norms': [1, 2], # 2
    'optimizer_types': ['adam'],
    'alphas': [0.001, 0.01],
    'epochs': [50, 100],
    'rho1': [0.9], # 0.9
    'rho2': [0.99] # 0.99
}

grid_search_combinations = list(itertools.product(*param_grid.values()))

results = []
for (
    hidden_dim, 
    activation, 
    num_layer, 
    include_batch_norm, 
    initialize_weights, 
    dropout_rate, 
    batch_size, 
    grad_clip_max_norm,
    optimizer_type,
    alpha,
    epoch,
    rho1,
    rho2,

    ) in grid_search_combinations:
    for i in range(5):
        # FFNN Architecture
        ffnn = FFNN(
            embedding_dim=embedding_dim, 
            hidden_dim=hidden_dim,
            output_dim=output_dim, 
            num_layers=num_layers, 
            include_batch_norm=include_batch_norm,
            initialize_weights=initialize_weights,
            dropout_rate=dropout_rate
        )

        # Optimizer
        # if optimizer_type == 'sgd':
        #     optimizer = torch.optim.SGD(ffnn.parameters(), lr=alpha)
        # elif optimizer_type == 'rmsprop':
        #     optimizer = torch.optim.RMSprop(ffnn.parameters(), lr=alpha)
        # elif optimizer_type == 'adam':
        #     optimizer = torch.optim.Adam(ffnn.parameters(), lr=alpha, betas = [rho1, rho2])
        # elif optimizer_type == 'adamw':
        #     optimizer = torch.optim.AdamW(ffnn.parameters(), lr=alpha)
        
        optimizer = torch.optim.Adam(ffnn.parameters(), lr=alpha)

        # Training
        best_validation_loss, best_validation_accuracy = train(
            train_loader,
            validation_loader,
            ffnn, 
            cross_entropy_loss_fn, 
            optimizer, 
            epochs,
            batch_size,
            grad_clip_max_norm,
            patience=5
        )

        # Result
        result = dict(
            zip(
                param_grid.keys(), 
                (
                    hidden_dim, 
                    activation, 
                    num_layer, 
                    include_batch_norm, 
                    initialize_weights, 
                    dropout_rate,
                    batch_size,
                    grad_clip_max_norm,
                    optimizer_type,
                    alpha,
                    epoch,
                    rho1,
                    rho2
                )
            )
        )
        result["best_validation_loss"] = best_validation_loss
        result["best_validation_accuracy"] = best_validation_accuracy
        results.append(result)

# 8. Submission

In [1803]:
def make_prediction(Xs_te, model):
    Y_preds_prob = model(Xs_te)
    Y_preds = torch.argmax(Y_preds_prob, axis = 1)
    return Y_preds

In [1804]:
def make_submission(uid, Y_preds):
    df = pd.DataFrame({'uid': uid, 'preference': Y_preds})
    df.to_csv('results.csv', index = False)

In [1805]:
Xs_te = np.concatenate((test_data['emb1'], test_data['emb2']), axis=1)
Xs_te = torch.Tensor(Xs_te)
Y_preds = make_prediction(Xs_te, ffnn)
make_submission(test_data['uid'], np.array(Y_preds))