# Baseline for datasets
Insert train.json, test.json under the folder data to get data

In [1]:
from CNN.config import Config
from CNN.utils import WordEmbeddingLoader, RelationLoader, SemEvalDataLoader
import numpy as np
config = Config()
config.batch_size = 1
config.embedding_path = "./CNN/embedding/hlbl-embeddings-scaled.EMBEDDING_SIZE=50.txt"
config.data_dir = "./CNN/data/"
word2id, word_vec = WordEmbeddingLoader(config).load_embedding()
rel2id, id2rel, class_num = RelationLoader(config).get_relation()
loader = SemEvalDataLoader(rel2id, word2id, config)

  from .autonotebook import tqdm as notebook_tqdm


## Get X and y vectors from dataset
NB: X is already a feature vector got from the word embedding

In [2]:
def get_x_y_from_loader(loader):
    # upload train and test from dataloader
    X = []
    y = []
    for step, (data, label) in enumerate(loader):
        x = data.detach().numpy().flatten()
        x.astype(int)
        X.append(x)
        y.append(label.detach().numpy()[0])
    X = np.array(X)
    y = np.array(y)
    return X, y

X_train, y_train = get_x_y_from_loader(loader.get_train())
X_test, y_test = get_x_y_from_loader(loader.get_test())

## Metrics
According to the [survey](https://link.springer.com/content/pdf/10.1007/s10115-022-01665-w.pdf), the main metrics are Accuracy, Precision, Recall and F1 Score. However, they may generate overoptimistic, misleading results on
imbalanced datasets, as they failed to consider the ratio between positive and negative classes. To mitigate this, Matthews Correlation Coefficient (It is high only when the classifier is doing well in both positive and negative
classes) and G-Mean-Score (a poor performance in positive examples prediction will lead to a low G-mean
value, even if negative instances are correctly classified by the classifier) are used.

In [3]:
from custom_statistics import Statistics
stats = Statistics()
random_states = [0, 1, 42, 100, 5782]

## Baseline: Bayessian Classifier
According to the [survey](https://link.springer.com/content/pdf/10.1007/s10115-022-01665-w.pdf), different baselines are possible. For instance, Logistic Regression, Decision Trees, SVM. [Sorgente et al.](http://ceur-ws.org/Vol-1109/paper4.pdf) used Naive Bayes.

### Naive Bayes

In [4]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train, y_train)
predictions_nb = nb.predict(X_test)
stats.get_metrics(predictions_nb, y_test, show=True)

Accuracy Score ->  0.5384615384615384
Precision Score ->  0.5448504620165983
Recall Score ->  0.5384615384615384
F1 Score ->  0.5403048664902214
Matthews Correlation Coefficient ->  0.07386140404851196
G Mean Score ->  0.5371427867762878


(0.5384615384615384,
 0.5448504620165983,
 0.5384615384615384,
 0.5403048664902214,
 0.07386140404851196,
 0.5371427867762878)

## Other possible baselines
### Logistic Regression

In [5]:
stats.clean()
from sklearn.linear_model import LogisticRegression
for random_state in random_states:
    logreg = LogisticRegression(solver='lbfgs', random_state=random_state, multi_class='multinomial', max_iter=100, n_jobs=16)
    logreg.fit(X_train, y_train)
    predictions_logreg = logreg.predict(X_test)
    stats.add(predictions_logreg, y_test)
stats.show()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy Score ->  (0.5764102564102564, 0.0)
Precision Score ->  (0.5629600107159805, 0.0)
Recall Score ->  (0.5764102564102564, 0.0)
F1 Score ->  (0.5253168971310564, 0.0)
Matthews Correlation Coefficient ->  (0.09116874220469899, 0.0)
G Mean Score ->  (0.4273144992890848, 5.551115123125783e-17)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### SVM
NB: for large datasets can have a long training period

In [6]:
from sklearn import svm
stats.clean()
for random_state in random_states:
    SVM = svm.SVC(random_state=random_state)
    SVM.fit(X_train, y_train)
    predictions_SVM = SVM.predict(X_test)
    stats.add(predictions_SVM, y_test)
stats.show()

Accuracy Score ->  (0.5856410256410256, 0.0)
Precision Score ->  (0.5808380194670032, 0.0)
Recall Score ->  (0.5856410256410256, 0.0)
F1 Score ->  (0.5252710362139439, 0.0)
Matthews Correlation Coefficient ->  (0.11398853783652776, 0.0)
G Mean Score ->  (0.41423219181172594, 5.551115123125783e-17)


### Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier
stats.clean()
for random_state in random_states:
    rfc = DecisionTreeClassifier(random_state=random_state)
    rfc.fit(X_train, y_train)
    predictions_rfc = rfc.predict(X_test)
    stats.add(predictions_rfc, y_test)
stats.show()

Accuracy Score ->  (0.7548717948717949, 0.00438753838691436)
Precision Score ->  (0.755012479837909, 0.004526298723414325)
Recall Score ->  (0.7548717948717949, 0.00438753838691436)
F1 Score ->  (0.7549340336119036, 0.004448417805382231)
Matthews Correlation Coefficient ->  (0.5016919039680318, 0.009194867614786688)
G Mean Score ->  (0.7503664849522422, 0.0048408545106849955)


### Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier
stats.clean()
for random_state in random_states:
    rfc = RandomForestClassifier(random_state=random_state)
    rfc.fit(X_train, y_train)
    predictions_rfc = rfc.predict(X_test)
    stats.add(predictions_rfc, y_test)
stats.show()

Accuracy Score ->  (0.7675897435897434, 0.005283793588673321)
Precision Score ->  (0.767525885951278, 0.005481712255094081)
Recall Score ->  (0.7675897435897434, 0.005283793588673321)
F1 Score ->  (0.7675367877432328, 0.0053925790854735265)
Matthews Correlation Coefficient ->  (0.5271254921220063, 0.011164864902839966)
G Mean Score ->  (0.762760531951507, 0.006034538734085988)
