# Baseline for datasets
Insert train.json, test.json under the folder data to get data

In [1]:
from CNN.config import Config
from CNN.utils import WordEmbeddingLoader, RelationLoader, SemEvalDataLoader
import numpy as np
config = Config()
config.batch_size = 1
config.embedding_path = "./CNN/embedding/hlbl-embeddings-scaled.EMBEDDING_SIZE=50.txt"
config.data_dir = "./CNN/data/"
word2id, word_vec = WordEmbeddingLoader(config).load_embedding()
rel2id, id2rel, class_num = RelationLoader(config).get_relation()
loader = SemEvalDataLoader(rel2id, word2id, config)

## Get X and y vectors from dataset
NB: X is already a feature vector got from the word embedding

In [2]:
def get_x_y_from_loader(loader):
    # upload train and test from dataloader
    X = []
    y = []
    for step, (data, label) in enumerate(loader):
        x = data.detach().numpy().flatten()
        x.astype(int)
        X.append(x)
        y.append(label.detach().numpy()[0])
    X = np.array(X)
    y = np.array(y)
    return X, y

X_train, y_train = get_x_y_from_loader(loader.get_train())
X_test, y_test = get_x_y_from_loader(loader.get_test())

## Baseline: Bayessian Classifier
According to the [survey](https://link.springer.com/content/pdf/10.1007/s10115-022-01665-w.pdf), different baselines are possible. For instance, Logistic Regression, Decision Trees, SVM. [Sorgente et al.](http://ceur-ws.org/Vol-1109/paper4.pdf) used Naive Bayes.

### Naive Bayes 

In [3]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train, y_train)
predictions_nb = nb.predict(X_test)
nb.score(X_test, y_test)

0.5384615384615384

#### with train_test_split

In [4]:
from sklearn.model_selection import train_test_split

nb = MultinomialNB()
XX_train, XX_test, yy_train, yy_test = train_test_split(X_train, y_train, test_size=0.4, random_state=3)
nb.fit(XX_train, yy_train)
predictions_nb = nb.predict(X_test)
nb.score(X_test, y_test)

0.5123076923076924

#### with cross_val_score

In [5]:
from sklearn.model_selection import cross_val_score

cv = cross_val_score(nb, X_train, y_train, cv=20)
cv

array([0.53370787, 0.57303371, 0.59550562, 0.51123596, 0.54494382,
       0.53370787, 0.56179775, 0.52808989, 0.5       , 0.55617978,
       0.6011236 , 0.48022599, 0.52542373, 0.45762712, 0.44632768,
       0.58757062, 0.55367232, 0.49152542, 0.55932203, 0.50847458])

#### with GridSearchCV

In [6]:
nb.get_params()

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True}

In [16]:
from sklearn.model_selection import GridSearchCV
import warnings

warnings.filterwarnings('ignore')

grid_nb = GridSearchCV(estimator=nb,
             param_grid={'alpha': [0, 1], 'fit_prior': ('True', 'FALSE')}, cv = 20)
grid_nb.fit(X_train, y_train)
grid_nb.score(X_train, y_train)

0.5696986764291749

In [8]:
grid_nb.score(X_test, y_test)

0.5384615384615384

### with RepeatedKFold

In [9]:
from sklearn.model_selection import RepeatedKFold

rkf = RepeatedKFold(n_splits=10, n_repeats=2, random_state=3)
for train, test in rkf.split(X_train):
    Xx_train, Xx_test = X_train[train], X_train[test]
    Yy_train, Yy_test = y_train[train], y_train[test]
    kf_nb = nb.fit(Xx_train, Yy_train)
    kf=kf_nb.score(Xx_train, Yy_train)
    print(kf)
kf = kf.mean()
nb.score(Xx_test, Yy_test)

0.5702660406885759
0.5716520650813517
0.5678973717146433
0.5635168961201502
0.5788485607008761
0.5682102628285357
0.5694618272841051
0.5653942428035044
0.5750938673341677
0.5704005006257822
0.5668231611893584
0.5691489361702128
0.5766583229036295
0.5729036295369212
0.5707133917396746
0.5772841051314143
0.5735294117647058
0.5666458072590739
0.5688360450563203
0.568523153942428


0.5352112676056338