# Notebook 03 - Creating a Simple Baseline Model

## Setup

In [48]:
# --- Configture Notebook ------
# show all outputs of cell
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

import black
import jupyter_black

jupyter_black.load(
    lab=True,
    line_length=100,
    verbosity="DEBUG",
    target_version=black.TargetVersion.PY310,
)

# enable automatic reloading
%load_ext autoreload
%autoreload 2

from pathlib import Path

from pandas.core.base import PandasObject
from arxiv_article_classifier.utils import display_fully

PandasObject.display_fully = display_fully

DATAFOLDER = Path().cwd().parent / "data"
DATAFOLDER_PROCESSED = DATAFOLDER / "processed" / "bow-model"

from arxiv_article_classifier.data.load import load_processed_data, load_taxonomy
from collections import Counter

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Let's Go!

In [49]:
# load dataset
(
    (X_train, X_val, _, y_train, y_val, _),
    labels,
) = load_processed_data(DATAFOLDER_PROCESSED)

# load taxonomy
taxonomy = load_taxonomy(DATAFOLDER / "raw" / "taxonomy.pkl")

In [62]:
# Find top 15 most common words for each label
top_n = 15
common_words = {}

for index, label in enumerate(labels):
    print(f"{label} - {taxonomy[label]}")
    print("----")

    most_common_word_counter = Counter(
        [word for abstract in X_train[y_train.T[index].astype("bool")] for word in abstract.split()]
    ).most_common(top_n)
    most_common_word_counter
    common_words[label] = set(pair[0] for pair in most_common_word_counter)

cs.CL - Computation and Language
----


[('model', 2144),
 ('language', 1289),
 ('use', 942),
 ('task', 844),
 ('speech', 686),
 ('text', 651),
 ('datum', 620),
 ('base', 609),
 ('method', 608),
 ('propose', 595),
 ('dataset', 587),
 ('performance', 579),
 ('large', 512),
 ('result', 486),
 ('llm', 483)]

cs.CV - Computer Vision and Pattern Recognition
----


[('model', 1495),
 ('image', 1309),
 ('method', 1171),
 ('propose', 935),
 ('dataset', 800),
 ('use', 760),
 ('base', 757),
 ('task', 625),
 ('datum', 624),
 ('feature', 597),
 ('performance', 560),
 ('result', 531),
 ('approach', 515),
 ('learning', 512),
 ('object', 490)]

cs.CY - Computers and Society
----


[('model', 737),
 ('ai', 716),
 ('use', 703),
 ('datum', 495),
 ('study', 421),
 ('system', 398),
 ('paper', 359),
 ('base', 352),
 ('user', 345),
 ('research', 335),
 ('provide', 312),
 ('social', 306),
 ('result', 294),
 ('fairness', 291),
 ('propose', 283)]

cs.GT - Computer Science and Game Theory
----


[('game', 969),
 ('agent', 834),
 ('model', 522),
 ('algorithm', 514),
 ('show', 491),
 ('problem', 476),
 ('equilibrium', 464),
 ('mechanism', 422),
 ('study', 382),
 ('strategy', 357),
 ('use', 339),
 ('result', 330),
 ('player', 326),
 ('propose', 306),
 ('base', 288)]

cs.LG - Machine Learning
----


[('model', 3490),
 ('use', 1835),
 ('method', 1819),
 ('learning', 1728),
 ('propose', 1658),
 ('datum', 1595),
 ('base', 1400),
 ('algorithm', 1297),
 ('network', 1267),
 ('performance', 1132),
 ('approach', 1129),
 ('result', 1123),
 ('task', 1118),
 ('show', 1118),
 ('agent', 1022)]

cs.MA - Multiagent Systems
----


[('agent', 1716),
 ('multi', 648),
 ('model', 540),
 ('propose', 516),
 ('algorithm', 491),
 ('learning', 483),
 ('base', 462),
 ('problem', 457),
 ('use', 453),
 ('system', 444),
 ('method', 415),
 ('policy', 361),
 ('task', 359),
 ('show', 349),
 ('environment', 330)]

cs.RO - Robotics
----


[('robot', 1057),
 ('use', 767),
 ('model', 746),
 ('propose', 694),
 ('task', 669),
 ('method', 667),
 ('base', 633),
 ('system', 571),
 ('approach', 532),
 ('environment', 470),
 ('agent', 448),
 ('control', 411),
 ('algorithm', 402),
 ('object', 400),
 ('real', 399)]

cs.SI - Social and Information Networks
----


[('network', 1032),
 ('graph', 933),
 ('model', 767),
 ('social', 624),
 ('use', 550),
 ('node', 450),
 ('user', 443),
 ('propose', 431),
 ('base', 399),
 ('method', 396),
 ('study', 389),
 ('datum', 366),
 ('information', 347),
 ('result', 327),
 ('community', 309)]

eess.AS - Audio and Speech Processing
----


[('model', 1400),
 ('speech', 1228),
 ('use', 667),
 ('audio', 626),
 ('propose', 589),
 ('method', 478),
 ('performance', 444),
 ('base', 443),
 ('datum', 388),
 ('dataset', 387),
 ('system', 385),
 ('task', 382),
 ('language', 377),
 ('speaker', 368),
 ('result', 346)]

eess.SP - Signal Processing
----


[('propose', 823),
 ('model', 622),
 ('base', 609),
 ('method', 558),
 ('use', 558),
 ('system', 536),
 ('signal', 534),
 ('performance', 468),
 ('network', 453),
 ('communication', 443),
 ('datum', 419),
 ('result', 415),
 ('algorithm', 408),
 ('channel', 369),
 ('paper', 348)]

eess.SY - Systems and Control
----


[('system', 1042),
 ('control', 804),
 ('model', 772),
 ('propose', 766),
 ('use', 615),
 ('base', 586),
 ('method', 478),
 ('problem', 460),
 ('time', 459),
 ('approach', 431),
 ('paper', 430),
 ('algorithm', 400),
 ('result', 391),
 ('network', 387),
 ('state', 357)]

math.NA - Numerical Analysis
----


[('method', 1125),
 ('problem', 666),
 ('numerical', 556),
 ('use', 523),
 ('equation', 522),
 ('model', 477),
 ('propose', 441),
 ('solution', 432),
 ('order', 410),
 ('time', 394),
 ('result', 357),
 ('base', 340),
 ('system', 331),
 ('approach', 330),
 ('scheme', 325)]

math.OC - Optimization and Control
----


[('problem', 1085),
 ('algorithm', 681),
 ('method', 615),
 ('optimization', 567),
 ('propose', 513),
 ('model', 497),
 ('function', 483),
 ('use', 477),
 ('system', 438),
 ('control', 431),
 ('time', 421),
 ('optimal', 419),
 ('result', 418),
 ('show', 384),
 ('paper', 359)]

math.ST - Statistics Theory
----


[('model', 655),
 ('estimator', 471),
 ('distribution', 442),
 ('datum', 404),
 ('method', 388),
 ('result', 387),
 ('use', 362),
 ('function', 351),
 ('show', 348),
 ('sample', 347),
 ('problem', 345),
 ('propose', 337),
 ('algorithm', 308),
 ('study', 306),
 ('base', 293)]

In [63]:
for label in labels:
    print(f"{label} - {taxonomy[label]}")
    base_vocab = common_words[label]
    for k, v in common_words.items():
        if k != label:
            base_vocab = base_vocab - v
    print(base_vocab)

cs.CL - Computation and Language
{'large', 'text', 'llm'}
cs.CV - Computer Vision and Pattern Recognition
{'feature', 'image'}
cs.CY - Computers and Society
{'fairness', 'research', 'ai', 'provide'}
cs.GT - Computer Science and Game Theory
{'mechanism', 'strategy', 'equilibrium', 'player', 'game'}
cs.LG - Machine Learning
set()
cs.MA - Multiagent Systems
{'policy', 'multi'}
cs.RO - Robotics
{'real', 'robot'}
cs.SI - Social and Information Networks
{'community', 'node', 'information', 'graph'}
eess.AS - Audio and Speech Processing
{'audio', 'speaker'}
eess.SP - Signal Processing
{'communication', 'signal', 'channel'}
eess.SY - Systems and Control
{'state'}
math.NA - Numerical Analysis
{'scheme', 'solution', 'numerical', 'order', 'equation'}
math.OC - Optimization and Control
{'optimal', 'optimization'}
math.ST - Statistics Theory
{'sample', 'estimator', 'distribution'}


In [64]:
print("label_keywords = {")
for label in labels:
    print(f"'{label}':")
    base_vocab = common_words[label]
    for k, v in common_words.items():
        if k != label:
            base_vocab = base_vocab - v
    print(base_vocab)
    print(",")
print("}")

label_keywords = {
'cs.CL':
{'large', 'text', 'llm'}
,
'cs.CV':
{'feature', 'image'}
,
'cs.CY':
{'fairness', 'research', 'ai', 'provide'}
,
'cs.GT':
{'mechanism', 'strategy', 'equilibrium', 'player', 'game'}
,
'cs.LG':
set()
,
'cs.MA':
{'policy', 'multi'}
,
'cs.RO':
{'real', 'robot'}
,
'cs.SI':
{'community', 'node', 'information', 'graph'}
,
'eess.AS':
{'audio', 'speaker'}
,
'eess.SP':
{'communication', 'signal', 'channel'}
,
'eess.SY':
{'state'}
,
'math.NA':
{'scheme', 'solution', 'numerical', 'order', 'equation'}
,
'math.OC':
{'optimal', 'optimization'}
,
'math.ST':
{'sample', 'estimator', 'distribution'}
,
}


In [65]:
# Define the keywords you want to use for each label
from sklearn.metrics import classification_report
from arxiv_article_classifier.model.baseline import DictionaryModel


label_keywords = {
    "cs.CL": {"large", "text", "llm"},
    "cs.CV": {"feature", "image"},
    "cs.CY": {"fairness", "research", "ai", "provide"},
    "cs.GT": {"mechanism", "strategy", "equilibrium", "player", "game"},
    "cs.LG": {"learning"},
    "cs.MA": {"policy", "multi"},
    "cs.RO": {"real", "robot"},
    "cs.SI": {"community", "node", "information", "graph"},
    "eess.AS": {"audio", "speaker"},
    "eess.SP": {"communication", "signal", "channel"},
    "eess.SY": {"state"},
    "math.NA": {"scheme", "solution", "numerical", "order", "equation"},
    "math.OC": {"optimal", "optimization"},
    "math.ST": {"sample", "estimator", "distribution"},
}


# Define baseline classifier
baseline_classifier = DictionaryModel(keywords=label_keywords, labelorder=labels)

# Predict and score
print("Performance on Train")
y_train_pred = baseline_classifier.predict(X_train)
print(classification_report(y_train, list(y_train_pred)))

print("Performance on Validation")
y_val_pred = baseline_classifier.predict(X_val)
print(classification_report(y_val, list(y_val_pred)))

Performance on Train


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.31      0.62      0.41       886
           1       0.40      0.69      0.51       817
           2       0.16      0.69      0.26       661
           3       0.27      0.72      0.39       607
           4       0.53      0.50      0.51      1990
           5       0.25      0.61      0.35       600
           6       0.32      0.66      0.43       726
           7       0.20      0.68      0.30       612
           8       0.91      0.51      0.65       601
           9       0.39      0.64      0.48       634
          10       0.13      0.29      0.18       686
          11       0.20      0.86      0.33       611
          12       0.28      0.59      0.38       710
          13       0.27      0.68      0.38       603

   micro avg       0.28      0.61      0.38     10744
   macro avg       0.33      0.62      0.40     10744
weighted avg       0.35      0.61      0.41     10744
 samples avg       0.31   

  _warn_prf(average, modifier, msg_start, len(result))


Not too great, but given that each label only applies to about 8% of papers, not too shabby neither. A quick win here could be too manually delete the terms which are clearly very broad. 

Let's do it.

In [86]:
# Define the keywords you want to use for each label
from sklearn.metrics import classification_report
from arxiv_article_classifier.model.baseline import DictionaryModel


label_keywords = {
    "cs.CL": {"text", "llm"},
    "cs.CV": {"feature", "image"},
    "cs.CY": {"fairness"},
    "cs.GT": {"equilibrium", "player", "game"},
    "cs.LG": {"learning"},
    "cs.MA": {"policy"},
    "cs.RO": {"robot"},
    "cs.SI": {"node", "graph"},
    "eess.AS": {"audio", "speaker"},
    "eess.SP": {"communication", "signal", "channel"},
    "eess.SY": {"state"},
    "math.NA": {"scheme", "numerical"},
    "math.OC": {"optimal", "optimization"},
    "math.ST": {"sample", "estimator", "distribution"},
}


# Define baseline classifier
baseline_classifier = DictionaryModel(keywords=label_keywords, labelorder=labels)

# Predict and score
print("Performance on Train")
y_train_pred = baseline_classifier.predict(X_train)
print(classification_report(y_train, list(y_train_pred)))

print("Performance on Validation")
y_val_pred = baseline_classifier.predict(X_val)
print(classification_report(y_val, list(y_val_pred)))

Performance on Train


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.64      0.45      0.53       886
           1       0.40      0.69      0.51       817
           2       0.49      0.13      0.21       661
           3       0.59      0.55      0.57       607
           4       0.53      0.50      0.51      1990
           5       0.32      0.26      0.29       600
           6       0.95      0.46      0.62       726
           7       0.35      0.43      0.39       612
           8       0.91      0.51      0.65       601
           9       0.39      0.64      0.48       634
          10       0.13      0.29      0.18       686
          11       0.32      0.63      0.42       611
          12       0.28      0.59      0.38       710
          13       0.27      0.68      0.38       603

   micro avg       0.39      0.49      0.44     10744
   macro avg       0.47      0.49      0.44     10744
weighted avg       0.48      0.49      0.45     10744
 samples avg       0.38   

  _warn_prf(average, modifier, msg_start, len(result))


Ah, already better. The f1-score of most categories improved. There is certainly still optimization potential, but since this is only meant to be a baseline, let's move on to the next model. In the next notebook, we will take a look at a tfidft model.