In [1]:
# ----------------------------------------------------------------------------
# Author        :    Vasileios Perifanis
# Affiliation   :    Euclid team, Democritus University of Thrace, Dept. of Electrical & Computer Engineering
# Created Date  :    03/2022
# version       :    1.0
# ---------------------------------------------------------------------------

In [2]:
# !pip install optuna

In [3]:
import numpy as np # linear algebra
import pandas as pd # data analysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn import preprocessing

import torch
from torch.utils.data import Dataset, DataLoader

import optuna

import warnings
warnings.filterwarnings('ignore')

In [4]:
data = pd.read_csv("datasets/titanic_train.csv", header=0) # read csv

data['Initial']=0
for i in data:
    data['Initial']=data.Name.str.extract('([A-Za-z]+)\.') #extract the Salutations

# Assigning the NaN Values the mean ages
data.loc[(data.Age.isnull())&(data.Initial=='Mr'),'Age']=33
data.loc[(data.Age.isnull())&(data.Initial=='Mrs'),'Age']=36
data.loc[(data.Age.isnull())&(data.Initial=='Master'),'Age']=5
data.loc[(data.Age.isnull())&(data.Initial=='Miss'),'Age']=22
data.loc[(data.Age.isnull())&(data.Initial=='Other'),'Age']=46

data['Embarked'].fillna('S',inplace=True)

data['Age_band']=0
data.loc[data['Age']<=16,'Age_band']=0
data.loc[(data['Age']>16)&(data['Age']<=32),'Age_band']=1
data.loc[(data['Age']>32)&(data['Age']<=48),'Age_band']=2
data.loc[(data['Age']>48)&(data['Age']<=64),'Age_band']=3
data.loc[data['Age']>64,'Age_band']=4

data['Family_Size']=0
data['Family_Size']=data['Parch']+data['SibSp']#family size
data['Alone']=0
data.loc[data.Family_Size==0,'Alone']=1#Alone

data['Fare_Range']=pd.qcut(data['Fare'],5)
data['Fare_cat']=0
data.loc[data['Fare']<=7.91,'Fare_cat']=0
data.loc[(data['Fare']>7.91)&(data['Fare']<=14.454),'Fare_cat']=1
data.loc[(data['Fare']>14.454)&(data['Fare']<=31),'Fare_cat']=2
data.loc[(data['Fare']>31)&(data['Fare']<=513),'Fare_cat']=3

data['Sex'].replace(['male','female'],[0,1],inplace=True)
data['Embarked'].replace(['S','C','Q'],[0,1,2],inplace=True)
data['Initial'].replace(['Mr','Mrs','Miss','Master','Other'],[0,1,2,3,4],inplace=True)

data.drop(['Name','Age','Ticket','Fare','Cabin','Initial','Fare_Range','PassengerId'],axis=1,inplace=True)

data.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked,Age_band,Family_Size,Alone,Fare_cat
0,0,3,0,1,0,0,1,1,0,0
1,1,1,1,1,0,1,2,1,0,3
2,1,3,1,0,0,0,1,0,1,1
3,1,1,1,1,0,0,2,1,0,3
4,0,3,0,0,0,0,2,0,1,1


In [5]:
data.shape

(891, 10)

In [6]:
def split(data):
    train,test=train_test_split(data,test_size=0.2,random_state=0,stratify=data['Survived'])
    return train, test

def to_numpy(train, test):
    train_X=train[train.columns[1:]].to_numpy()
    train_Y=train[train.columns[:1]].to_numpy().ravel()
    test_X=test[test.columns[1:]].to_numpy()
    test_Y=test[test.columns[:1]].to_numpy().ravel()
    return train_X, train_Y, test_X, test_Y

In [7]:
train, test = split(data)

In [8]:
train_X, train_Y, test_X, test_Y = to_numpy(train, test)
min_max_scaler = preprocessing.MinMaxScaler().fit(train_X)
train_X = min_max_scaler.transform(train_X)
test_X = min_max_scaler.transform(test_X)

In [9]:
class ClassifierDataset(Dataset):
    def __init__(self, x_data, y_data):
        self.x_data = x_data
        self.y_data = y_data

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return len(self.x_data)

In [10]:
def generate_dataset(train_x, train_y, batch_size, shuffle=True):
    dataset = ClassifierDataset(torch.from_numpy(train_x).float(), torch.from_numpy(train_y).long())
    loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle)

    return loader

In [11]:
train_loader = generate_dataset(train_X, train_Y, batch_size=1024, shuffle=True)
test_loader = generate_dataset(test_X, test_Y, batch_size=1, shuffle=False)
print(f"Training set size: ({train_X.shape},{train_Y.shape}), Testing set size: ({train_Y.shape},{test_Y.shape})")
criterion = torch.nn.CrossEntropyLoss()
epochs = 30

Training set size: ((712, 9),(712,)), Testing set size: ((712,),(179,))


# Initialize study

- direction: maximize ή minimize, αναλόγως τι θέλουμε να πετύχουμε.
- sampler:
    - GridSampler: grid search.
    - RandomSampler: random search.
    - TPESampler: Begins with random search. At each step, it suggests new values.
    
    
# Define study

- trial.suggest_x:
    - suggest_int: integers, e.g., το batch size
    - suggest_categorical: value in a range, e.g. optimizer.
    - suggest_uniform: value in a range
    - suggest_loguniform: value in a range, e.g. learning rate.
    
# Define Model

# Train και evaluate


In [12]:
def build_model(params):
    input_dim = 9
    return torch.nn.Sequential(
        torch.nn.Linear(input_dim, 128),
        torch.nn.ReLU(),
        torch.nn.Linear(128, 64),
        torch.nn.ReLU(),
        torch.nn.Linear(64, 2)
    )

In [13]:
def test(model, data, criterion):
    model.eval()
    y_true, y_pred = [], []
    loss, correct = 0., 0
    with torch.no_grad():
        for x, y in data:
            out = model(x)
            loss += criterion(out, y).item()
            pred = out.argmax(dim=1)
            # correct += int((pred == y).sum().item()) # torch accuracy method
            for i in range(len(pred)):
                y_pred.append(pred[i].item())
            for i in range(len(y)):
                y_true.append(y[i].item())
            
    loss = loss / len(data)
    #accuracy = correct / len(data.dataset) # torch accuracy method
    accuracy = accuracy_score(y_pred, y_true)
    precision = precision_score(y_pred, y_true)
    recall = recall_score(y_pred, y_true)
    f1 = f1_score(y_pred, y_true)

    return loss, accuracy, precision, recall, f1


def train(param, model, train_loader, test_loader, criterion, epochs):
    
    optimizer = getattr(torch.optim, param['optimizer'])(model.parameters(), lr= param['learning_rate'])

    for epoch in range(1, epochs+1):
            model.train()
            for data, target in train_loader:
                optimizer.zero_grad()
                y_pred = model(data)
                loss = criterion(y_pred, target)
                loss.backward()
                optimizer.step()
                
            train_loss, train_acc, train_prec, train_rec, train_f = test(model, train_loader, criterion)  # test the model
            test_loss, test_acc, test_pred, test_rec, test_f = test(model, test_loader, criterion)
            
            #if epoch % 1 == 0:  # log
                #print(f"Epoch: [{epoch}/{epochs}]: Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}, Precision: {train_prec:.4f}, Recall: {train_rec:.4f}, F1: {train_f:.4f}\n")
                #print(f"\tLoss: {test_loss:.4f}, Eval Accuracy: {test_acc:.4f}, Eval Precision: {train_prec:.4f}, Eval Recall: {train_rec:.4f}, Eval F1: {train_f:.4f}")

    return test_f

In [14]:
def objective(trial):
    params = {
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-2),
        'optimizer': trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    }
    model = build_model(params)
    
    f1 = train(params, model, train_loader, test_loader, criterion, epochs)
    return f1

In [15]:
# specify optimization
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=30)

[32m[I 2022-04-01 10:38:34,666][0m A new study created in memory with name: no-name-4bd20dab-a51b-49b0-a47b-b7f35e02df41[0m
[32m[I 2022-04-01 10:38:35,532][0m Trial 0 finished with value: 0.49462365591397844 and parameters: {'learning_rate': 0.00010160028161024629, 'optimizer': 'RMSprop'}. Best is trial 0 with value: 0.49462365591397844.[0m
[32m[I 2022-04-01 10:38:36,342][0m Trial 1 finished with value: 0.54 and parameters: {'learning_rate': 0.0005103790203411218, 'optimizer': 'Adam'}. Best is trial 1 with value: 0.54.[0m
[32m[I 2022-04-01 10:38:37,219][0m Trial 2 finished with value: 0.4090909090909091 and parameters: {'learning_rate': 0.00031775147799741646, 'optimizer': 'Adam'}. Best is trial 1 with value: 0.54.[0m
[32m[I 2022-04-01 10:38:38,022][0m Trial 3 finished with value: 0.49462365591397844 and parameters: {'learning_rate': 0.00025086366594676316, 'optimizer': 'Adam'}. Best is trial 1 with value: 0.54.[0m
[32m[I 2022-04-01 10:38:38,893][0m Trial 4 finished wi

In [16]:
best_f = study.best_value

In [17]:
best_f

0.7244094488188977

In [18]:
best_trial = study.best_trial

In [19]:
best_trial

FrozenTrial(number=10, values=[0.7244094488188977], datetime_start=datetime.datetime(2022, 4, 1, 10, 38, 43, 21506), datetime_complete=datetime.datetime(2022, 4, 1, 10, 38, 43, 828510), params={'learning_rate': 0.0021755086787658444, 'optimizer': 'Adam'}, distributions={'learning_rate': LogUniformDistribution(high=0.01, low=0.0001), 'optimizer': CategoricalDistribution(choices=('Adam', 'RMSprop', 'SGD'))}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=10, state=TrialState.COMPLETE, value=None)