# Title

## Setup

In [40]:
# --- Configture Notebook ------
# show all outputs of cell
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

import black
import jupyter_black

jupyter_black.load(
    lab=True,
    line_length=100,
    verbosity="DEBUG",
    target_version=black.TargetVersion.PY310,
)

# enable automatic reloading
%load_ext autoreload
%autoreload 2

from pathlib import Path

from arxiv_article_classifier.utils import display_fully
from arxiv_article_classifier.data.scrape_arxiv import CATEGORIES_OF_INTEREST

from arxiv_article_classifier.data.prepare_data import make_interim_data, load_processed_data

from pandas.core.base import PandasObject

PandasObject.display_fully = display_fully

DATAFOLDER = Path().cwd().parent / "data"

import pandas as pd
import ast
import pickle

from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.model_selection.measures import get_combination_wise_output_matrix
import plotly.express as px

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
df = pd.read_csv(DATAFOLDER / "raw" / "articles.csv").assign(
    tags=lambda df: df["tags"].apply(lambda x: ast.literal_eval(x))
)
df.head()

Unnamed: 0,ids,titles,abstracts,tags
0,http://arxiv.org/abs/2210.00982v1,Assuring safety of vision-based swarm formatio...,Vision-based formation control systems recentl...,"[cs.MA, cs.RO, cs.SE]"
1,http://arxiv.org/abs/2210.01163v2,Agent swarms: cooperation and coordination und...,Here we consider the communications tactics ap...,[cs.MA]
2,http://arxiv.org/abs/2210.01314v1,Meta Navigation Functions: Adaptive Associatio...,"In this paper, we introduce a new class of pot...","[cs.RO, cs.MA]"
3,http://arxiv.org/abs/2210.01452v1,Federated Reinforcement Learning for Real-Time...,With the recent advances in mobile energy stor...,"[eess.SY, cs.LG, cs.MA, cs.SY]"
4,http://arxiv.org/abs/2210.01662v1,DGORL: Distributed Graph Optimization based Re...,An optimization problem is at the heart of man...,"[cs.RO, cs.MA]"


In [24]:
unique_tags = {cat for catlist in df["tags"] for cat in catlist}
len(unique_tags)

list(unique_tags)[:10]

1425

['94A14, 94A40',
 '65K10, 65M60, 65M12, 90C25',
 'K.4.1; K.2; J.5',
 'Primary: 47A30, Secondary: 47A20, 47A25',
 '14T90, 26B25, 52A30, 90B85, 92B10',
 '35R30, 34L25, 78A46',
 '90C90 (Primary) 90C20, 91B28 (Secondary)',
 '62M15, 62M10, 62G10',
 'H.4.0; K.4.0',
 '60J10, 60J27, 91A11']

In [25]:
with open(DATAFOLDER / "raw" / "taxonomy.pkl", "rb") as f:
    taxonomy = pickle.load(f)
categories = taxonomy.keys()

df["tags"] = df["tags"].map(lambda tags: [tag for tag in tags if tag in categories])

print(
    f"{len({cat for catlist in df['tags'] for cat in catlist})} found out of {len(categories)} existing categories."
)

mlb = MultiLabelBinarizer()

message_tags_matrix = pd.DataFrame(mlb.fit_transform(df["tags"]), columns=mlb.classes_)
# pd.DataFrame(
#    pd.DataFrame(message_tags_matrix, columns=mlb.classes_).sum()
#    / message_tags_matrix.shape[0]
#    * 100,
#    columns=["perc_articles"],
# ).assign(category_of_interest=lambda df: df.index.map(lambda x: x in CATEGORIES_OF_INTEREST))

fig = px.bar(
    pd.DataFrame(
        pd.DataFrame(message_tags_matrix, columns=mlb.classes_).sum()
        / message_tags_matrix.shape[0]
        * 100,
        columns=["perc_articles"],
    )
    .sort_values(by="perc_articles", ascending=False)
    .assign(is_category_of_interest=lambda df: df.index.map(lambda x: x in CATEGORIES_OF_INTEREST)),
    title="Which percentage of abstracts has which tag?",
    color="is_category_of_interest",
)
_ = fig.update_xaxes(tickangle=45)
fig.show()

fig = px.bar(
    pd.DataFrame(message_tags_matrix.sum(axis=1)).groupby(0).size()
    / message_tags_matrix.shape[0]
    * 100,
    title="Percentage of articles with n tags",
)
fig.show()

144 found out of 155 existing categories.


In the first figure, we notice that there are a few tags that have a lot of associated articles even though these tags were not used when creating the dataset. These are in particular cs.SY (systems and control), cs.NA (numerical analysis) and stat.TH (statistics theory) which are aliases for eees.SY, math.NA and math.ST respectively and to some extend cs.SD (sound), which should overlap with audio and speech processing and stat.ML (machine learning).

In [26]:
df_sum = (
    pd.DataFrame(
        pd.DataFrame(message_tags_matrix, columns=mlb.classes_).sum(), columns=["n_articles"]
    )
    .sort_values(by="n_articles", ascending=False)
    .cumsum()
    .assign(perc_tags=lambda df: df["n_articles"] / df["n_articles"].max())
)
df_sum.head(15)

Unnamed: 0,n_articles,perc_tags
cs.LG,3608,0.110202
cs.AI,6095,0.186164
cs.CV,8077,0.246701
cs.CL,9560,0.291998
cs.RO,10774,0.329078
math.OC,11958,0.365241
cs.SY,13104,0.400244
eess.SY,14248,0.435186
cs.CY,15349,0.468815
eess.SP,16407,0.50113


In [27]:
fig = px.area(df_sum, y="perc_tags", title="Percentage of tags captured")
fig.show()
_ = fig.update_xaxes(tickangle=45)

Let's clean the dataset by reducing the tags to the tags of interest. 

In [29]:
df["tags_filtered"] = df["tags"].map(
    lambda tags: [tag for tag in tags if tag in CATEGORIES_OF_INTEREST]
)

print(
    f"{len({cat for catlist in df['tags_filtered'] for cat in catlist})} found out of {len(categories)} existing categories."
)

mlb = MultiLabelBinarizer()

message_tags_matrix = pd.DataFrame(mlb.fit_transform(df["tags_filtered"]), columns=mlb.classes_)

fig = px.bar(
    pd.DataFrame(
        pd.DataFrame(message_tags_matrix, columns=mlb.classes_).sum()
        / message_tags_matrix.shape[0]
        * 100,
        columns=["perc_articles"],
    ).sort_values(by="perc_articles", ascending=False),
    title="Which percentage of abstracts has which tag?",
)
_ = fig.update_xaxes(tickangle=45)
fig.show()

fig = px.bar(
    pd.DataFrame(message_tags_matrix.sum(axis=1)).groupby(0).size()
    / message_tags_matrix.shape[0]
    * 100,
    title="Percentage of articles with n tags",
)
fig.show()

16 found out of 155 existing categories.


## Split into Train, Dev and Test

This is a multilabel dataset. There are multiple ways of how one can pose a multilabel classification problem:

* convert a 
* 
*

Here, I will convert the multilabel into multiple, single-label classification tasks. For each single classification task, the data is quite imbalanced. Therefore, we need to apply a stratified 

In [39]:
make_interim_data(
    input_file=DATAFOLDER / "raw" / "articles.csv",
    output_folder=DATAFOLDER / "interim",
    categories_to_keep=CATEGORIES_OF_INTEREST,
)

Quick check that the stratified split produced reasonable results:

In [42]:
from collections import Counter


(_, _, _, y_train, y_val, y_test), _ = load_processed_data(Path().cwd().parent / "data" / "interim")

pd.DataFrame(
    {
        "train": Counter(
            str(combination)
            for row in get_combination_wise_output_matrix(y_train, order=2)
            for combination in row
        ),
        "validation": Counter(
            str(combination)
            for row in get_combination_wise_output_matrix(y_val, order=2)
            for combination in row
        ),
        "test": Counter(
            str(combination)
            for row in get_combination_wise_output_matrix(y_test, order=2)
            for combination in row
        ),
    }
).fillna(0).display_fully()

Unnamed: 0,train,validation,test
"(6, 6)",600.0,200.0,200.0
"(6, 7)",124.0,41.0,41.0
"(7, 7)",728.0,243.0,243.0
"(5, 5)",2165.0,721.0,722.0
"(0, 0)",1492.0,497.0,498.0
"(0, 6)",232.0,77.0,77.0
"(5, 6)",208.0,69.0,70.0
"(0, 5)",664.0,221.0,222.0
"(0, 7)",184.0,61.0,61.0
"(8, 8)",612.0,204.0,204.0


Looks good. Let's go to the next notebook, where we explore the text data.