# Baseline for datasets
Place `train.json` and `test.json` in the `data` folder before running this notebook, so that the data loaders can read them.

In [1]:
from CNN.config import Config
from CNN.utils import WordEmbeddingLoader, RelationLoader, SemEvalDataLoader
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, matthews_corrcoef
from imblearn.metrics import geometric_mean_score
import torch
import numpy as np
# Build the project configuration, then override the settings this notebook needs.
config = Config()
# One sample per batch: get_x_y_from_loader (defined in a later cell) flattens
# each batch into a single feature row and keeps only the first label of it.
config.batch_size = 1
# Word-embedding file and dataset directory, given relative to the working directory.
config.embedding_path = "./CNN/embedding/hlbl-embeddings-scaled.EMBEDDING_SIZE=50.txt"
config.data_dir = "./CNN/data/"
# Load the vocabulary / embedding matrix and the relation-label mappings.
# NOTE(review): presumably both loaders read the paths set above — confirm in CNN.utils.
word2id, word_vec = WordEmbeddingLoader(config).load_embedding()
rel2id, id2rel, class_num = RelationLoader(config).get_relation()
# The data loader is built from both mappings; it serves the train and test splits below.
loader = SemEvalDataLoader(rel2id, word2id, config)

  from .autonotebook import tqdm as notebook_tqdm


## Get X and y vectors from dataset
NB: X is already a feature vector obtained from the word embedding.

In [2]:
def get_x_y_from_loader(loader, dtype=None):
    """Collect every batch of a data loader into NumPy feature/label arrays.

    Each batch's ``data`` tensor is flattened into one feature row, and the
    first entry of its ``label`` tensor becomes that row's target. The loader
    is therefore expected to yield one sample per batch (``batch_size = 1``);
    with larger batches only the first label of each batch would be kept.

    Args:
        loader: Iterable yielding ``(data, label)`` pairs of torch tensors.
        dtype: Optional NumPy dtype to cast every feature row to. ``None``
            (the default) keeps whatever dtype the loader produced.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: ``X`` has one flattened row per
        batch, ``y`` has one label per batch.
    """
    X = []
    y = []
    for data, label in loader:
        x = data.detach().numpy().flatten()
        # ndarray.astype returns a new array. The previous version called
        # ``x.astype(int)`` and discarded the result, so no cast ever took
        # place; the cast is now opt-in and actually applied.
        if dtype is not None:
            x = x.astype(dtype)
        X.append(x)
        y.append(label.detach().numpy()[0])
    return np.array(X), np.array(y)

# Materialise the training split first, then the test split, as NumPy arrays.
train_batches = loader.get_train()
X_train, y_train = get_x_y_from_loader(train_batches)

test_batches = loader.get_test()
X_test, y_test = get_x_y_from_loader(test_batches)

## Metrics
According to the [survey](https://link.springer.com/content/pdf/10.1007/s10115-022-01665-w.pdf), the main metrics are Accuracy, Precision, Recall and F1 Score. However, they may produce overoptimistic, misleading results on
imbalanced datasets, as they fail to consider the ratio between positive and negative classes. To mitigate this, the Matthews Correlation Coefficient (which is high only when the classifier does well on both the positive and the negative
class) and the G-Mean Score (poor performance on positive examples leads to a low G-mean
value, even if negative instances are correctly classified) are also reported.

In [10]:
def get_stats(prediction, y_test):
    """Print the evaluation metrics of a set of predictions.

    Accuracy, weighted precision/recall/F1, the Matthews correlation
    coefficient and the geometric mean score are computed one after the
    other and written to standard output, one metric per line.

    Args:
        prediction: Predicted labels.
        y_test: Ground-truth labels the predictions are compared against.
    """
    weighted = {'average': 'weighted'}
    # (caption, metric function, extra keyword arguments), in print order.
    report = (
        ("Accuracy Score -> ", accuracy_score, {}),
        ("Precision Score -> ", precision_score, weighted),
        ("Recall Score -> ", recall_score, weighted),
        ("F1 Score -> ", f1_score, weighted),
        ("Matthews Correlation Coefficient -> ", matthews_corrcoef, {}),
        ("G Mean Score -> ", geometric_mean_score, {}),
    )
    for caption, metric, options in report:
        print(caption, metric(y_test, prediction, **options))

## Baseline: Bayesian Classifier
According to the [survey](https://link.springer.com/content/pdf/10.1007/s10115-022-01665-w.pdf), several baselines are possible, for instance Logistic Regression, Decision Trees, and SVM. [Sorgente et al.](http://ceur-ws.org/Vol-1109/paper4.pdf) used Naive Bayes.

### Naive Bayes

In [11]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes baseline with default hyper-parameters.
# fit() returns the estimator itself, so construction and training are chained.
nb = MultinomialNB().fit(X_train, y_train)
predictions_nb = nb.predict(X_test)
get_stats(y_test=y_test, prediction=predictions_nb)

Accuracy Score ->  0.5384615384615384
Precision Score ->  0.5448504620165983
Recall Score ->  0.5384615384615384
F1 Score ->  0.5403048664902214
Matthews Correlation Coefficient ->  0.07386140404851196
G Mean Score ->  0.5371427867762878


## Other possible baselines
### Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression

# Multinomial logistic-regression baseline (L-BFGS solver, fixed seed).
# NOTE(review): the recorded run of this cell stopped at max_iter without
# converging (see the warning in its output), so the reported scores are not
# those of a converged model. Scaling the features or raising max_iter would
# address it, but would also change the baseline numbers — confirm before changing.
logreg = LogisticRegression(
    solver='lbfgs',
    random_state=42,
    multi_class='multinomial',
    max_iter=1000,
    n_jobs=16,
).fit(X_train, y_train)
predictions_logreg = logreg.predict(X_test)
get_stats(y_test=y_test, prediction=predictions_logreg)

Accuracy Score ->  0.6502564102564102
Precision Score ->  0.647079810159557
Recall Score ->  0.6502564102564102
F1 Score ->  0.640937446579709
Matthews Correlation Coefficient ->  0.27505205602044247
G Mean Score ->  0.6110827849119087


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### SVM
NB: training can take a long time on large datasets.

In [13]:
# Support-vector baseline with an RBF kernel.
# NOTE(review): `degree` is only used by the polynomial kernel, so it has no
# effect here. The recorded run shows a G-mean of about 0.03, which suggests the
# model almost never predicts one of the classes; RBF kernels are sensitive to
# feature scale, so standardising X may be worth trying — confirm.
SVM = svm.SVC(
    C=1.0,
    kernel='rbf',
    degree=3,
    gamma='auto',
).fit(X_train, y_train)
predictions_SVM = SVM.predict(X_test)
get_stats(y_test=y_test, prediction=predictions_SVM)

Accuracy Score ->  0.5651282051282052
Precision Score ->  0.7543387141334806
Recall Score ->  0.5651282051282052
F1 Score ->  0.40865823010162455
Matthews Correlation Coefficient ->  0.02579490274625982
G Mean Score ->  0.034319911152729005


### Decision Tree

In [14]:
from sklearn.tree import DecisionTreeClassifier

# Decision-tree baseline with a fixed seed for reproducibility.
# The model and its predictions get their own names (dtc / predictions_dtc).
# They were previously stored as rfc / predictions_rfc, the names the Random
# Forest cell also assigns, so the tree was mislabelled and then overwritten.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)
predictions_dtc = dtc.predict(X_test)
get_stats(predictions_dtc, y_test)

Accuracy Score ->  0.7558974358974359
Precision Score ->  0.7561043881216917
Recall Score ->  0.7558974358974359
F1 Score ->  0.7559937204846002
Matthews Correlation Coefficient ->  0.5039141609561323
G Mean Score ->  0.7515985032826582


### Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline with default hyper-parameters and a fixed seed.
# fit() returns the estimator itself, so construction and training are chained.
rfc = RandomForestClassifier(random_state=42).fit(X_train, y_train)
predictions_rfc = rfc.predict(X_test)
get_stats(y_test=y_test, prediction=predictions_rfc)

Accuracy Score ->  0.7687179487179487
Precision Score ->  0.7687494035376388
Recall Score ->  0.7687179487179487
F1 Score ->  0.7687334726677805
Matthews Correlation Coefficient ->  0.5296432884430681
G Mean Score ->  0.7642735270782253
