# Bag-of-words preprocessing of arXiv abstracts

## Setup

In [11]:
# --- Configture Notebook ------
# show all outputs of cell
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

import black
import jupyter_black

jupyter_black.load(
    lab=True,
    line_length=100,
    verbosity="DEBUG",
    target_version=black.TargetVersion.PY310,
)

# enable automatic reloading
%load_ext autoreload
%autoreload 2

import plotly.express as px
from sklearn.pipeline import FunctionTransformer, Pipeline
from arxiv_article_classifier.data.load import load_processed_data
import pandas as pd


from pathlib import Path

from pandas.core.base import PandasObject
from arxiv_article_classifier.utils import display_fully

PandasObject.display_fully = display_fully

DATAFOLDER = Path().cwd().parent / "data"



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
# Constants: the "interim" and "processed" folders, both located under
# <parent of the current working directory>/data.
DATAFOLDER_INTERIM = Path.cwd().parent.joinpath("data", "interim")
DATAFOLDER_PROCESSED = Path.cwd().parent.joinpath("data", "processed")

In [14]:
# Only the first and fourth entries of the six-element result are kept (training
# abstracts and training labels); the other four are discarded.
# NOTE(review): `load_processed_data` is pointed at the *interim* folder here — confirm
# that this is intended.
data_splits, labels = load_processed_data(DATAFOLDER_INTERIM)
X_train_raw, _, _, y_train, _, _ = data_splits

In [15]:
# Preview a few raw abstracts only; echoing the whole array floods the cell output.
X_train_raw[:3]

array(['Vision-based formation control systems recently have attracted attentions\nfrom both the research community and the industry for its applicability in\nGPS-denied environments. The safety assurance for such systems is challenging\ndue to the lack of formal specifications for computer vision systems and the\ncomplex impact of imprecise estimations on distributed control. We propose a\ntechnique for safety assurance of vision-based formation control. Our technique\ncombines (1) the construction of a piecewise approximation of the worst-case\nerror of perception and (2) a classical Lyapunov-based safety analysis of the\nconsensus control algorithm. The analysis provides the ultimate bound on the\nrelative distance between drones. This ultimate bound can then be used to\nguarantee safe separation of all drones. We implement an instance of the\nvision-based control system on top of the photo-realistic AirSim simulator. We\nconstruct the piecewise approximation for varying perception 

In [40]:
import numpy as np
from arxiv_article_classifier.data.make_processed_data_bow import (
    LATEX_REGEX,
    LINEBREAK_REGEX,
    NLP,
    PUNCTUATION_DELETION_TABLE,
    STOPLIST,
    delete_regular_expression,
    lemmatize_document,
    remove_stopwords,
)


def _per_document(clean_one):
    """Build a FunctionTransformer that applies ``clean_one`` to each element of its input.

    The per-element results are collected into a new numpy array, so every
    pipeline step takes and returns an array of documents.
    """
    return FunctionTransformer(lambda X: np.array([clean_one(x) for x in X]))


# Individual cleaning steps (helpers and constants come from make_processed_data_bow).
linebreak_cleaner = _per_document(lambda doc: delete_regular_expression(doc, LINEBREAK_REGEX))
latex_remover = _per_document(lambda doc: delete_regular_expression(doc, LATEX_REGEX))
lemmatizer = _per_document(lambda doc: lemmatize_document(doc, NLP))
lower_case_converter = _per_document(lambda doc: doc.lower())
punctuation_deleter = _per_document(lambda doc: doc.translate(PUNCTUATION_DELETION_TABLE))
# Splitting on whitespace and re-joining with single spaces collapses whitespace runs.
whitespace_deleter = _per_document(lambda doc: " ".join(doc.split()))
stopword_remover = _per_document(lambda doc: remove_stopwords(doc, STOPLIST))

# Steps run in list order; lemmatization happens before lower-casing and
# punctuation removal, stop-word removal comes last.
cleaning_steps = [
    ("clean_linebreaks", linebreak_cleaner),
    ("remove_latex", latex_remover),
    ("lemmatize", lemmatizer),
    ("convert_to_lowercase", lower_case_converter),
    ("delete_punctuation", punctuation_deleter),
    ("delete_whitespace", whitespace_deleter),
    ("remove_stopwords", stopword_remover),
]
cleaning_pipeline = Pipeline(cleaning_steps)

In [41]:
# Run every raw training abstract through the cleaning steps.
X_train = cleaning_pipeline.transform(X_train_raw)

# Every step maps documents one-to-one, so the cleaned array must stay aligned
# with the raw abstracts (and therefore with the labels it is joined to below).
assert len(X_train) == len(X_train_raw), "cleaning pipeline changed the number of documents"

In [42]:
# One row per training abstract: the label columns, the cleaned text and its word count.
abstracts = (
    pd.DataFrame(y_train, columns=labels)
    .assign(abstract=X_train)
    .assign(nwords=lambda df: df["abstract"].str.split().str.len())
)

# Number of abstracts per word count, ordered by word count so the chart reads
# as a length distribution.
nwords_distribution = (
    abstracts["nwords"]
    .value_counts()
    .sort_index()
    .rename_axis("nwords")
    .reset_index(name="n_abstracts")
)

px.bar(
    nwords_distribution,
    x="nwords",
    y="n_abstracts",
    title="Abstract length after cleaning",
    labels={"nwords": "Words per abstract", "n_abstracts": "Number of abstracts"},
)

Abstracts have between 7 and 213 words after stop-word removal, and the distribution does not show any obvious outliers. Let's take a quick look at both ends of the distribution to check whether they look reasonable.

In [33]:
# Shortest abstracts: fewer than 20 words after cleaning.
abstracts.loc[abstracts["nwords"] < 20, ["abstract"]].display_fully()

Unnamed: 0,abstract
402,paper show initial stage development first principle formal logic characterise explore issue broadly define idea veracity
755,summarize dynamic behavioral interaction introduce possible node embed base solution question temporal egonet subgraph transition
1033,short note establish positionality mean payoff game infinite game graph construct well found monotone universal graph
1658,present optimal transport framework perform regression covariate response probability distribution compact euclidean subset
2549,introduce explicit family good interpolation point interpolation triangle may use either polynomial interpolation certain rational interpolation give explicit formula
2580,propose ziv zakai type low bound bayesian error estimate parameter betatheta mathbb r parameter space general need linear function
4213,paper prove dalembert lagrange principle point masse use lagrange mach mechanical construction yield weighted balancing condition unit vector
4296,prove coefficient sum square entry symmetric matrix
4483,standard fractional projection extend binary two mode network weight two mode network interesting property extended projection prove
5622,effectiveness compression text classification gzip recently garner lot attention note show bag word approach achieve similar well result efficient


In [43]:
# Longest abstracts: more than 200 words after cleaning.
abstracts.loc[abstracts["nwords"] > 200, ["abstract"]].display_fully()

Unnamed: 0,abstract
169,paper consider partial gathering problem mobile agent synchronous dynamic bidirectional ring network k agent distribute network partial gathering problem require give positive integer g k agent terminate configuration either least g agent agent exist node far partial gathering problem consider static graph paper start consider partial gathering dynamic graph first step consider problem 1 interval connect ring one link ring may miss time step network focus relationship value k g fully characterize solvability partial gathering problem analyze move complexity propose algorithm problem solve first show g partial gathering problem unsolvable k 2 g second show problem solve log g time total number ogn log g move 2 g 1 k 3 g 2 third show problem solve time total number okn move 3 g 1 k 8 g 4 notice since k og hold 3 g 1 k 8 g 4 move complexity okn case represent also ogn finally show problem solve time total number ogn move k 8 g 3 result mean partial gathering problem solve also dynamic ring k 2 g 1 addition agent require total number omegagn move solve partial resp total gathering problem thus k 3 g 1 agent solve partial gathering problem asymptotically optimal total number ogn move
1492,paper integrate nonlinear manifold reduced order model nm roms domain decomposition dd nm roms approximate fom state nonlinear manifold train shallow sparse autoencoder use fom snapshot datum nm rom advantageous linear subspace roms ls roms problem slowly decay kolmogorov width however number nm rom parameter need train scale size fom moreover extreme scale problem storage high dimensional fom snapshot alone make rom training expensive alleviate training cost paper apply dd fom compute nm rom subdomain couple obtain global nm rom approach several advantage subdomain nm rom train parallel involve parameter train global nm rom require small subdomain fom dimensional training datum training subdomain nm rom tailor subdomain specific feature fom shallow sparse architecture autoencoder use subdomain nm rom allow application hyper reduction hr reduce complexity cause nonlinearity yield computational speedup nm rom paper provide first application nm rom hr dd problem particular detail algebraic dd formulation fom train nm rom hr subdomain develop sequential quadratic programming sqp solver evaluate coupled global nm rom theoretical convergence result sqp method priori posteriori error estimate dd nm rom hr provide propose dd nm rom hr approach numerically compare dd ls rom hr 2d steady state burgers equation show order magnitude improvement accuracy propose dd nm rom dd ls rom
2805,monte carlo mc sampling popular method estimate statistic eg expectation variance random variable slow convergence lead emergence advanced technique reduce variance mc estimator output computationally expensive solver control variate cv method correct mc estimator term derive auxiliary random variable highly correlate original random variable auxiliary variable may come surrogate model surrogate base cv strategy extend multilevel monte carlo mlmc framework rely sequence level correspond numerical simulator increase accuracy computational cost mlmc combine output sample obtain across level telescopic sum difference mc estimator successive fidelity paper introduce three multilevel variance reduction strategy rely surrogate base cv mlmc mlcv present extension cv correction term devise surrogate model simulator different level add mlmc cv improve mlmc estimator use cv base surrogate correction term level variance reduction achieve use surrogate base cv level mlmc mlcv strategy alternative solution reduce subset surrogate use multilevel estimation also introduce propose method test test case literature consist spectral discretization uncertain 1d heat equation statistic interest expect value integrate temperature along domain give time result assess term accuracy computational cost multilevel estimator depend whether construction surrogate associate computational cost precede evaluation estimator show low fidelity output strongly correlate high fidelity output significant variance reduction obtain use surrogate model coarser level also show take advantage pre existing surrogate model prove even efficient strategy
2824,hand gesture recognition base surface electromyographic semg signal promising approach develop human machine interfaces hmis natural control intuitive robot interface poly articulate prosthesis however real world application limit reliability problem due motion artefact postural temporal variability sensor positioning master thesis first application deep learning unibo inail dataset first public semg dataset explore variability subject session arm posture collect datum 8 session 7 able bodied subject execute 6 hand gesture 4 arm posture recent study address variability strategy base training set composition improve inter posture inter day generalization non deep machine learn classifier among rbf kernel svm yield high accuracy deep architecture realize work 1d cnn inspire 2d cnn report perform well public benchmark database 1d cnn various training strategy base training set composition implement test multi session training prove yield high inter session validation accuracy single session training two posture training prove good postural training prove benefit training one posture yield 812 inter posture test accuracy five day training prove good multi day training yield 759 inter day test accuracy result close baseline moreover result multi day training highlight phenomenon user adaptation indicate training also prioritize recent datum though well baseline achieve classification accuracy rightfully place 1d cnn among candidate research
4036,background mr base subchondral bone effectively predict knee osteoarthritis however clinical application limit cost time mr purpose aim develop novel distillation learning base method name srrd subchondral bone microstructural analysis use easily acquire ct image leverage pair mr image enhance ct base analysis model training material methods knee joint image ct mr modality collect october 2020 may 2021 firstly develop gan base generative model transform mr image ct image use establish anatomical correspondence two modality next obtain numerous patch subchondral bone region mr image together trabecular parameter bv tv tb th tb sp tb n correspond ct image patch via regression distillation learning technique use train regression model transfer mr structural information ct base model regress trabecular parameter far use knee osteoarthritis classification result total 80 participant evaluate ct base regression result trabecular parameter achieve intra class correlation coefficient iccs 0804 0773 0711 0622 bv tv tb th tb sp tb n respectively use distillation learning significantly improve performance ct base knee osteoarthritis classification method use cnn approach yield auc score 0767 95 ci 0681 0853 instead 0658 95 ci 0574 0742 p001 conclusion propose srrd method show high reliability validity mr ct registration regression knee osteoarthritis classification indicate feasibility subchondral bone microstructural analysis base ct image
5063,present design textitin silico evaluation closed loop insulin delivery algorithm treat type 1 diabetes t1d consist data drive multi step ahead blood glucose bg predictor integrate linear time varying ltv model predictive control mpc framework instead identify open loop model glucoregulatory system available datum propose directly fit entire bg prediction predefine prediction horizon use mpc nonlinear function past input ouput datum affine function future insulin control input nonlinear part long short term memory lstm network propose affine component linear regression model choose assess benefit drawback compare traditional linear mpc base auto regressive exogenous arx input model identify datum evaluate propose lstm mpc controller three simulation scenario nominal case 3 meal per day random meal disturbance case meal generate recently publish meal generator case 25 decrease insulin sensitivity far scenario feedforward meal bolus administer challenging random meal generation scenario mean standard deviation percent time range 70 180 mg dl 7499 709 vs 5415 1489 mean standard deviation percent time tight range 70 140 mg dl 4778 855 vs 3462 904 mean standard deviation percent time sever hypoglycemia ie 54 mg dl 100 318 vs 945 1171 propose lstm mpc controller traditional arx mpc respectively approach provide accurate prediction future glucose concentration good closed loop performance overall mpc controller
5183,magnetic recording device still competitive storage density race solid state device thank new technology two dimensional magnetic recording tdmr advanced datum processing scheme need guarantee reliability tdmr datum pattern bit surround complementary bit four position manhattan distance tdmr grid call plus isolation pis pattern error prone recently introduce lexicographically order constrain loco code namely optimal plus loco op loco code prevent pattern write tdmr device however high density regime low energy regime additional error prone pattern emerge specifically datum pattern bit surround complementary bit three position manhattan distance call incomplete plus isolation ipis pattern paper present capacity achieve code forbid pis ipis pattern tdmr system wide read head collectively call pis ipis pattern rotate isolation rtis pattern call new code optimal loco ot loco code analyze ot loco code present simple encoding decode rule allow reconfigurability also present novel bridging idea code far increase rate simulation result demonstrate ot loco code capable eliminate medium noise effect entirely practical td density high rate far preserve storage capacity suggest use op loco code early device lifetime employ reconfiguration property switch ot loco code later point reconfiguration density energy axis decide manually moment next step use machine learning take decision base tdmr device status
6841,open vocabulary segmentation challenging task require segmenting recognize object open set category one way address challenge leverage multi modal model clip provide image text feature share embed space bridge gap closed vocabulary open vocabulary recognition hence exist method often adopt two stage framework tackle problem input first go mask generator clip model along predict mask process involve extract feature image multiple time ineffective inefficient contrast propose build everything single stage framework use share frozen convolutional clip backbone significantly simplify current two stage pipeline also remarkably yield well accuracy cost trade propose fc clip benefit follow observation frozen clip backbone maintain ability open vocabulary classification also serve strong mask generator convolutional clip generalize well large input resolution one use contrastive image text pretraining train coco panoptic datum test zero shot manner fc clip achieve 268 pq 168 ap 341 miou ade20 k 182 pq 279 miou mapillary vistas 440 pq 268 ap 562 miou cityscapes outperform prior art 42 pq 24 ap 42 miou ade20 k 40 pq mapillary vistas 201 pq cityscapes respectively additionally training testing time fc clip 75x 66x significantly fast prior art use 59x parameter fc clip also set new state art performance across various open vocabulary semantic segmentation dataset code httpsgithubcombytedancefcclip
7817,background prostate cancer pc mri base risk calculator commonly base biological eg psa mri marker eg volume patient age whilst patient age measure amount year individual exist biological age ba might well reflect physiology individual however surrogate prostate mri linkage clinically significant pc cspc remain explore purpose obtain evaluate prostate age gap pag mri marker tool cspc risk study type retrospective population total 7243 prostate mri slice 468 participant undergo prostate biopsy deep learning model train 3223 mri slice crop around gland 81 low grade pc ncspc gleason score 6 131 negative case test remain 256 participant assessment chronological age define age participant time visit use train deep learning model predict age patient follow obtain pag define model predict age minus patient chronological age multivariate logistic regression model use estimate association odd ratio predictive value pag compare psa level pi rads3 statistical test test mann whitney u test permutation test roc curve analysis result multivariate adjust model show significant difference odd clinically significant pc cspc gleason score 7 378 95 confidence interval ci232 616 p 001 pag show well predictive ability compare pi rads3 adjust risk factor include psa level auc 0981 vs auc 0704 p001 conclusion pag significantly associate risk clinically significant pc outperform well establish pc risk factor


Both the shortest and the longest abstracts look reasonable. Let's proceed to create the processed dataset and train our first baseline algorithm.

In [53]:
from arxiv_article_classifier.data.make_processed_data_bow import convert_interim_to_processed_data
# Output folder handed to `convert_interim_to_processed_data` together with the interim folder.
datafolder_processed_bow = DATAFOLDER_PROCESSED / "bow-model"
# `parents=True` also creates the "processed" folder itself when it is missing,
# instead of failing with FileNotFoundError.
datafolder_processed_bow.mkdir(parents=True, exist_ok=True)
convert_interim_to_processed_data(DATAFOLDER_INTERIM, datafolder_processed_bow)