# CNN Approach
Based on [Zeng et al.](https://aclanthology.org/C14-1220.pdf) and the implementation from [this repository](https://github.com/onehaitao/CNN-relation-extraction).

The code is written for evaluation on SemEval-2010. To get the results for SemEval-2007, change the loader by pointing `data_dir` in the config cells at the SemEval-2007 data directory.

### Training loop

In [1]:
def train(model, criterion, loader, config):
    """Optimise ``model`` with Adam and keep the weights with the best dev F1.

    Args:
        model: network to train; its architecture and trainable parameter
            shapes are printed before training starts.
        criterion: loss callable applied to ``(logits, label)``.
        loader: three-item sequence ``(train_loader, dev_loader, _)``; the
            third item is ignored here.
        config: settings object; this function reads ``lr``, ``L2_decay``,
            ``epoch``, ``device`` and ``model_dir`` from it.

    After every epoch the model is evaluated on both the train and dev
    loaders, and whenever the dev F1 improves the ``state_dict`` is written
    to ``<config.model_dir>/model.pkl``.
    """
    train_batches, dev_batches, _ = loader
    optimizer = optim.Adam(
        model.parameters(),
        lr=config.lr,
        weight_decay=config.L2_decay,
    )

    print(model)
    print('traning model parameters:')
    for param_name, tensor in model.named_parameters():
        if not tensor.requires_grad:
            continue
        print('%s :  %s' % (param_name, str(tensor.data.shape)))
    print('--------------------------------------')
    print('start to train the model ...')

    evaluator = Eval(config)
    best_f1 = float('-inf')
    last_epoch = config.epoch
    epoch = 1
    while epoch <= last_epoch:
        for data, label in train_batches:
            # Re-enter training mode on every step, exactly as the original
            # loop did (evaluation below may have switched the mode).
            model.train()
            inputs = data.to(config.device)
            targets = label.to(config.device)

            optimizer.zero_grad()
            batch_loss = criterion(model(inputs), targets)
            batch_loss.backward()
            optimizer.step()

        _, train_loss, _ = evaluator.evaluate(model, criterion, train_batches)
        dev_f1, dev_loss, _ = evaluator.evaluate(model, criterion, dev_batches)

        summary = '[%03d] train_loss: %.3f | dev_loss: %.3f | micro f1 on dev: %.4f' % (
            epoch, train_loss, dev_loss, dev_f1)
        print(summary, end=' ')

        if dev_f1 > best_f1:
            best_f1 = dev_f1
            checkpoint_path = os.path.join(config.model_dir, 'model.pkl')
            torch.save(model.state_dict(), checkpoint_path)
            print('>>> save models!')
        else:
            # Only terminate the summary line started above.
            print()
        epoch += 1


### Test evaluation of the checkpoint saved during training

In [2]:
def test(model, criterion, loader, config):
    """Evaluate the saved checkpoint on the test loader.

    Loads ``<config.model_dir>/model.pkl`` into ``model``, runs
    ``Eval.evaluate`` on the test loader and prints the test loss and F1.

    Args:
        model: network whose weights are replaced by the saved checkpoint.
        criterion: loss callable forwarded to ``Eval.evaluate``.
        loader: three-item sequence ``(_, _, test_loader)``; only the third
            item is used.
        config: settings object; this function reads ``model_dir`` and
            ``device`` from it.

    Returns:
        The third value returned by ``Eval.evaluate`` (the predicted labels).
    """
    print('--------------------------------------')
    print('start test ...')

    _, _, test_loader = loader
    checkpoint_path = os.path.join(config.model_dir, 'model.pkl')
    # map_location remaps the stored tensors onto the configured device, so a
    # checkpoint written on a GPU can still be loaded on a CPU-only machine.
    state_dict = torch.load(checkpoint_path, map_location=config.device)
    model.load_state_dict(state_dict)

    eval_tool = Eval(config)
    f1, test_loss, predict_label = eval_tool.evaluate(
        model, criterion, test_loader)
    print('test_loss: %.3f | micro f1 on test:  %.4f' % (test_loss, f1))
    return predict_label


### Prepare custom statistics with metrics from the [survey](https://link.springer.com/content/pdf/10.1007/s10115-022-01665-w.pdf).
The same statistics are also used for the baseline.

In [3]:
import numpy as np
from CNN.config import Config
from CNN.utils import WordEmbeddingLoader, RelationLoader, SemEvalDataLoader
from CNN.model import CNN
from CNN.evaluate import Eval
# Separate configuration used only to collect the ground-truth test labels.
config_test = Config()
# One sample per batch: get_x_y_from_loader below reads label[0] of each batch.
# (assumes SemEvalDataLoader honours config.batch_size — TODO confirm)
config_test.batch_size = 1
config_test.embedding_path = "./CNN/embedding/hlbl-embeddings-scaled.EMBEDDING_SIZE=50.txt"
config_test.data_dir = "./CNN/data/simeval2010/"  # point this at the simeval2007 directory to load the other dataset
# Vocabulary / embedding matrix and relation-label mappings built from config_test.
word2id, word_vec = WordEmbeddingLoader(config_test).load_embedding()
rel2id, id2rel, class_num = RelationLoader(config_test).get_relation()
# NOTE: `loader` is rebound later by the training cell.
loader = SemEvalDataLoader(rel2id, word2id, config_test)
def get_x_y_from_loader(loader):
    """Collect every sample of a data loader into two NumPy arrays.

    Args:
        loader: iterable yielding ``(data, label)`` tensor pairs whose first
            dimension is the batch dimension.

    Returns:
        A tuple ``(X, y)``. ``X`` has one row per sample, holding that
        sample's features flattened and cast to ``int``. ``y`` holds the
        labels in iteration order. For an empty loader both are empty arrays.
    """
    feature_rows = []
    label_rows = []
    for data, label in loader:
        batch = data.detach().cpu().numpy()
        # One flattened row per sample. The original discarded the result of
        # astype(int), so the integer cast never actually took effect.
        feature_rows.append(batch.reshape(batch.shape[0], -1).astype(int))
        # Keep every label of the batch, not only the first one, so batch
        # sizes larger than 1 no longer drop labels.
        label_rows.append(label.detach().cpu().numpy())
    if not feature_rows:
        # Same result the original produced for an empty loader.
        return np.array([]), np.array([])
    X = np.concatenate(feature_rows, axis=0)
    y = np.concatenate(label_rows, axis=0)
    return X, y
# Ground-truth test labels (y_test) are compared against the model predictions
# in the training cell; X_test is collected alongside them.
X_test, y_test = get_x_y_from_loader(loader.get_test())
from custom_statistics import Statistics
# Accumulator that receives one (predictions, labels) pair per random seed.
stats = Statistics()

  from .autonotebook import tqdm as notebook_tqdm


### Train the CNN model with different random seeds (the same seeds as for the baseline)

In [4]:
import os
import torch
import torch.nn as nn
import torch.optim as optim

# Configuration for the actual training / evaluation runs.
config = Config()
config.data_dir = "./CNN/data/simeval2010/"  # point this at the simeval2007 directory to load the other dataset
config.embedding_path = "./CNN/embedding/hlbl-embeddings-scaled.EMBEDDING_SIZE=50.txt"
print('--------------------------------------')
print('some config:')
config.print_config()

print('--------------------------------------')
print('start to load data ...')
word2id, word_vec = WordEmbeddingLoader(config).load_embedding()
rel2id, id2rel, class_num = RelationLoader(config).get_relation()
loader = SemEvalDataLoader(rel2id, word2id, config)

# Train and dev loaders are only built in train mode; otherwise they stay None
# and only the test loader is used.
train_loader, dev_loader = None, None
if config.mode == 1:  # train mode
    train_loader = loader.get_train()
    dev_loader = loader.get_dev()
test_loader = loader.get_test()
# Rebind `loader` to the [train, dev, test] list that train() and test() unpack.
loader = [train_loader, dev_loader, test_loader]
print('finish!')

print('--------------------------------------')
# One full train/test run per seed; the results are accumulated in `stats`.
random_states = [0, 1, 42, 100, 5782]
for random_state in random_states:
    print("RANDOM_SEED -> ", random_state)
    config.set_seed(random_state)
    # A fresh model per seed, so runs do not share weights.
    model = CNN(word_vec=word_vec, class_num=class_num, config=config)
    model = model.to(config.device)
    criterion = nn.CrossEntropyLoss()

    if config.mode == 1:  # train mode
        train(model, criterion, loader, config)
    # test() loads <config.model_dir>/model.pkl, so outside train mode a
    # previously saved checkpoint must already exist.
    predict_label = test(model, criterion, loader, config)
    # `stats` and `y_test` come from the statistics cell.
    # NOTE(review): y_test was gathered from a separate batch-size-1 loader;
    # this assumes both test loaders iterate in the same order — confirm.
    stats.add(predict_label, y_test)

--------------------------------------
some config:
data_dir = ./CNN/data/simeval2010/
output_dir = ./output
embedding_path = ./CNN/embedding/hlbl-embeddings-scaled.EMBEDDING_SIZE=50.txt
word_dim = 50
model_name = CNN
mode = 1
seed = 5782
cuda = -1
epoch = 20
dropout = 0.5
batch_size = 128
lr = 0.001
max_len = 100
pos_dis = 50
pos_dim = 5
hidden_size = 100
filter_num = 200
window = 3
L2_decay = 1e-05
device = cpu
model_dir = ./output/CNN
--------------------------------------
start to load data ...
finish!
--------------------------------------
RANDOM_SEED ->  0
CNN(
  (word_embedding): Embedding(246123, 50)
  (pos1_embedding): Embedding(103, 5)
  (pos2_embedding): Embedding(103, 5)
  (conv): Conv2d(1, 200, kernel_size=(3, 60), stride=(1, 1), padding=(1, 0))
  (maxpool): MaxPool2d(kernel_size=(100, 1), stride=(100, 1), padding=0, dilation=1, ceil_mode=False)
  (tanh): Tanh()
  (dropout): Dropout(p=0.5, inplace=False)
  (linear): Linear(in_features=200, out_features=100, bias=True)
  (d

In [7]:
# Print the metrics accumulated over all seeds.
# NOTE(review): 'weighted' presumably selects the averaging mode of the
# per-class metrics — confirm in custom_statistics.
stats.show('weighted')

Accuracy Score ->  (0.9704821494295178, 0.00216370091375768)
Precision Score ->  (0.9700893024042265, 0.002196022384812887)
Recall Score ->  (0.9704821494295178, 0.00216370091375768)
F1 Score ->  (0.9702090923752783, 0.0021913229745233838)
Matthews Correlation Coefficient ->  (0.8631673351739145, 0.010103334814723704)
G Mean Score ->  (0.8936479809254095, 0.008554961392298987)
