# Cleaning arXiv abstracts for the bag-of-words model

## Setup

In [7]:
# --- Configure Notebook ------
# Show the output of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

import black
import jupyter_black

# Load the jupyter_black extension for JupyterLab, configured for 100-character
# lines and Python 3.10 syntax.
# NOTE(review): verbosity="DEBUG" is presumably only needed while troubleshooting
# the formatter -- confirm whether it should stay enabled.
jupyter_black.load(
    lab=True,
    line_length=100,
    verbosity="DEBUG",
    target_version=black.TargetVersion.PY310,
)

# enable automatic reloading
%load_ext autoreload
%autoreload 2

import plotly.express as px
from sklearn.pipeline import FunctionTransformer, Pipeline
from arxiv_article_classifier.data.load import load_processed_data
import pandas as pd


from pathlib import Path

from pandas.core.base import PandasObject
from arxiv_article_classifier.utils import display_fully

# Attach the project's display_fully helper to PandasObject so it can be called as a
# method on pandas objects (used further down as `<DataFrame>.display_fully()`).
PandasObject.display_fully = display_fully

# Root data folder: the "data" directory next to the current working directory.
DATAFOLDER = Path.cwd().parent.joinpath("data")



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# Constants
# Both folders are derived from DATAFOLDER (defined in the setup cell) so the
# location of the data root is specified in exactly one place.
DATAFOLDER_INTERIM = DATAFOLDER / "interim"
DATAFOLDER_PROCESSED = DATAFOLDER / "processed"

In [9]:
# The loader returns a pair: a 6-element collection of arrays and the label names.
# Only the first array (raw training abstracts) and the fourth (training labels)
# are kept; the other four are discarded.
# NOTE(review): load_processed_data is pointed at the *interim* folder here --
# presumably because the interim and processed folders share a file layout; confirm.
(X_train_raw, _, _, y_train, _, _), labels = load_processed_data(DATAFOLDER_INTERIM)

In [10]:
# Inspect the raw (uncleaned) training abstracts.
X_train_raw

array(['Vision-based formation control systems recently have attracted attentions\nfrom both the research community and the industry for its applicability in\nGPS-denied environments. The safety assurance for such systems is challenging\ndue to the lack of formal specifications for computer vision systems and the\ncomplex impact of imprecise estimations on distributed control. We propose a\ntechnique for safety assurance of vision-based formation control. Our technique\ncombines (1) the construction of a piecewise approximation of the worst-case\nerror of perception and (2) a classical Lyapunov-based safety analysis of the\nconsensus control algorithm. The analysis provides the ultimate bound on the\nrelative distance between drones. This ultimate bound can then be used to\nguarantee safe separation of all drones. We implement an instance of the\nvision-based control system on top of the photo-realistic AirSim simulator. We\nconstruct the piecewise approximation for varying perception 

In [11]:
import numpy as np
from arxiv_article_classifier.data.make_processed_data_bow import (
    LATEX_REGEX,
    LINEBREAK_REGEX,
    NLP,
    PUNCTUATION_DELETION_TABLE,
    STOPLIST,
    delete_regular_expression,
    lemmatize_document,
    remove_stopwords,
)


def per_document_transformer(clean_document, *extra_args):
    """Build a transformer that applies ``clean_document`` to every document.

    Parameters
    ----------
    clean_document : callable
        Called as ``clean_document(document, *extra_args)`` for each element of
        the input collection.
    *extra_args
        Extra positional arguments forwarded unchanged to ``clean_document``.

    Returns
    -------
    FunctionTransformer
        Transformer whose ``transform`` returns a NumPy array holding one
        cleaned entry per input document, in the original order.
    """

    def clean_all(documents):
        return np.array([clean_document(document, *extra_args) for document in documents])

    return FunctionTransformer(clean_all)


def lowercase(document):
    """Return ``document`` converted to lower case."""
    return document.lower()


def collapse_whitespace(document):
    """Return ``document`` split on whitespace and re-joined with single spaces."""
    return " ".join(document.split())


def delete_punctuation(document):
    """Return ``document`` translated through ``PUNCTUATION_DELETION_TABLE``."""
    return document.translate(PUNCTUATION_DELETION_TABLE)


# One transformer per cleaning step; each maps a collection of documents to a
# NumPy array of cleaned documents.
linebreak_cleaner = per_document_transformer(delete_regular_expression, LINEBREAK_REGEX)
latex_remover = per_document_transformer(delete_regular_expression, LATEX_REGEX)
lemmatizer = per_document_transformer(lemmatize_document, NLP)
lower_case_converter = per_document_transformer(lowercase)
punctuation_deleter = per_document_transformer(delete_punctuation)
whitespace_deleter = per_document_transformer(collapse_whitespace)
stopword_remover = per_document_transformer(remove_stopwords, STOPLIST)

# The steps run in the order listed.
cleaning_pipeline = Pipeline(
    [
        ("clean_linebreaks", linebreak_cleaner),
        ("remove_latex", latex_remover),
        ("lemmatize", lemmatizer),
        ("convert_to_lowercase", lower_case_converter),
        ("delete_punctuation", punctuation_deleter),
        ("delete_whitespace", whitespace_deleter),
        ("remove_stopwords", stopword_remover),
    ]
)

In [12]:
# Every step is a stateless FunctionTransformer, so fitting learns nothing, but
# calling fit_transform (rather than transform on a never-fitted pipeline) avoids
# the "Pipeline instance is not fitted yet" warning of recent scikit-learn
# releases, which is announced to become an error.
X_train = cleaning_pipeline.fit_transform(X_train_raw)

In [13]:
# One row per training abstract: the label columns, the cleaned text, and the
# number of whitespace-separated tokens left after cleaning.
abstracts = pd.DataFrame(y_train, columns=labels).assign(
    abstract=X_train,
    # Vectorised string accessors instead of a Python-level lambda per row.
    nwords=lambda df: df["abstract"].str.split().str.len(),
)

# Number of abstracts per word count, ordered by word count so the bars read as a
# distribution from shortest to longest abstract.
px.bar(
    abstracts["nwords"].value_counts().sort_index(),
    title="Number of abstracts by word count after cleaning",
)

Abstracts have between 7 and 213 words after stop-word removal, and the distribution does not show any obvious outliers. Let's have a quick look at both ends to check whether they look reasonable.

In [17]:
# Shortest abstracts: fewer than 20 words left after cleaning.
abstracts.loc[abstracts["nwords"] < 20, ["abstract"]].display_fully()

Unnamed: 0,abstract
406,paper show initial stage development first principle formal logic characterise explore issue broadly define idea veracity
745,work consider estimation method sparse poisson model inspire 1 provide novel sign consistency result mild condition
1027,short note establish positionality mean payoff game infinite game graph construct well found monotone universal graph
1214,note uncover three connection metric distortion problem voting method axiom social choice literature
1628,present optimal transport framework perform regression covariate response probability distribution compact euclidean subset
3318,paper argue synthetic datum produce differentially private generative model sufficiently anonymize therefore anonymous datum regulatory compliant
3932,paper prove dalembert lagrange principle point masse use lagrange mach mechanical construction yield weighted balancing condition unit vector
4003,prove coefficient sum square entry symmetric matrix
4176,standard fractional projection extend binary two mode network weight two mode network interesting property extended projection prove
4700,work tackle problem online camera robot pose estimation single view successive frame image sequence crucial task robot interact world


In [15]:
# Longest abstracts: more than 200 words left after cleaning.
abstracts.loc[abstracts["nwords"] > 200, ["abstract"]].display_fully()

Unnamed: 0,abstract
1341,complexity increasingly tight coupling supply chain pose major logistical challenge lead company another challenge lead company -- pressure consumer critical public legislative measure supply chain law -- take responsibility supplier labour standard paper discuss new approach lead company use try address challenge algorithmic prediction business risk also environmental social risk describe technical cultural condition algorithmic prediction explain -- perspective lead company -- help address challenge develop scenario kind social consequence algorithmic prediction use lead company scenario derive policy option different stakeholder group help develop algorithmic prediction towards improve labour standard worker voice -- die komplexitat und zunehmend enge kopplung vieler lieferketten stellt eine grosse logistische herausforderung fur leitunternehmen dar eine weitere herausforderung besteht darin dass leitunternehmen -- gedrangt durch konsument innen eine kritische offentlichkeit und gesetzgeberische massnahman wie die lieferkettengesetze -- starker al bisher verantwortung fur arbeitsstandards ihren zulieferbetrieben ubernehmen mussen diesem beitrag diskutieren wir einen neuen ansatz mit dem leitunternehmen versuchen diese herausforderungen zu bearbeiten die algorithmische vorhersage von betriebswirtschaftlichen aber auch okologischen und sozialen risiken wir beschreiben die technischen und kulturellen bedingungen fur algorithmische vorhersage und erklaren wie diese -- aus perspektive von leitunternehmen -- bei der bearbeitung beider herausforderungen hilft anschliessend entwickeln wir szenarien wie und mit welchen sozialen konsequenzen algorithmische vorhersage durch leitunternehmen eingesetzt werden kann aus den szenarien leiten wir handlungsoptionen fur verschiedene stakeholder gruppen ab die dabei helfen sollen algorithmische vorhersage sinne einer verbesserung von arbeitsstandards und workers voice weiterzuentwickeln
1483,paper integrate nonlinear manifold reduced order model nm roms domain decomposition dd nm roms approximate fom state nonlinear manifold train shallow sparse autoencoder use fom snapshot datum nm rom advantageous linear subspace roms ls roms problem slowly decay kolmogorov -width however number nm rom parameter need train scale size fom moreover extreme scale problem storage high dimensional fom snapshot alone make rom training expensive alleviate training cost paper apply dd fom compute nm rom subdomain couple obtain global nm rom approach several advantage subdomain nm rom train parallel involve parameter train global nm rom require small subdomain fom dimensional training datum training subdomain nm rom tailor subdomain specific feature fom shallow sparse architecture autoencoder use subdomain nm rom allow application hyper reduction hr reduce complexity cause nonlinearity yield computational speedup nm rom paper provide first application nm rom hr dd problem particular detail algebraic dd formulation fom train nm rom hr subdomain develop sequential quadratic programming sqp solver evaluate coupled global nm rom theoretical convergence result sqp method priori posteriori error estimate dd nm rom hr provide propose dd nm rom hr approach numerically compare dd ls rom hr 2d steady state burgers equation show order magnitude improvement accuracy propose dd nm rom dd ls rom
1837,promising solution improve communication quality unmanned aerial vehicle uav widely integrate wireless network paper sake enhance message exchange rate user1 u1 user2 u2 intelligent reflective surface irs-and uav- assist two way amplify forward af relay wireless system propose u1 u2 communicate via uav mount irs af relay besides optimization problem maximize minimum rate cast variable namely af relay beamforme matrix irs phase shift two time slot need optimize achieve maximum rate low complexity alternately iterative ai scheme base zero forcing successive convex approximation lc zf sca algorithm put forward expression af relay beamforme matrix derive semi closed form zf method irs phase shift vector two time slot respectively optimize utilize sca algorithm obtain significant rate enhancement high performance ai method base one step semidefinite programming penalty sca ons sdp psca propose beamforme matrix af relay firstly solve singular value decomposition ons method irs phase shift matrix two time slot optimize sdp psca algorithm simulation result present rate performance propose lc zf sca ons sdp psca method surpass random phase af relay particular total transmit power equal 30dbm propose two method harvest 685 rate gain compare random phase af relay meanwhile rate performance ons sdp psca method cost extremely high complexity superior lc zf sca method
2346,doctoral thesis develop new method set base state estimation active fault diagnosis afd nonlinear discrete time system ii discrete time nonlinear system whose trajectory satisfy nonlinear equality constraint call invariant iii linear descriptor system iv joint state parameter estimation nonlinear descriptor system set base estimation aim compute tight enclosure possible system state time step subject unknown bound uncertainty address issue present doctoral thesis propose new method efficiently propagate constrain zonotope czs nonlinear mapping besides thesis improve standard prediction update framework system invariant use new algorithm refine cz base nonlinear constraint addition thesis introduce new approach set base afd class nonlinear discrete time system affine parametrization reachable set obtain design optimal input set base afd addition thesis present new method base cz set value state estimation afd linear descriptor system linear static constraint state variable directly incorporate cz moreover thesis propose new representation unbounded set base zonotope allow develop method state estimation afd also unstable linear descriptor system without knowledge enclosure trajectory system thesis also develop new method set base joint state parameter estimation nonlinear descriptor system use cz unified framework lastly manuscript apply propose set base state estimation afd method use cz unmanned aerial vehicle water distribution network lithium ion cell
2670,hand gesture recognition base surface electromyographic semg signal promising approach develop human machine interfaces hmis natural control intuitive robot interface poly articulate prosthesis however real world application limit reliability problem due motion artefact postural temporal variability sensor positioning master thesis first application deep learning unibo inail dataset first public semg dataset explore variability subject session arm posture collect datum 8 session 7 able bodied subject execute 6 hand gesture 4 arm posture recent study address variability strategy base training set composition improve inter posture inter day generalization non deep machine learn classifier among rbf kernel svm yield high accuracy deep architecture realize work 1d cnn inspire 2d cnn report perform well public benchmark database 1d cnn various training strategy base training set composition implement test multi session training prove yield high inter session validation accuracy single session training two posture training prove good postural training prove benefit training one posture yield 812 inter posture test accuracy five day training prove good multi day training yield 759 inter day test accuracy result close baseline moreover result multi day training highlight phenomenon user adaptation indicate training also prioritize recent datum though well baseline achieve classification accuracy rightfully place 1d cnn among candidate research
3818,background estimation temporospatial clinical feature gait cf step count length step duration step frequency gait speed distance travel important component community base mobility evaluation use wearable accelerometer however challenge arise device complexity availability cost analytical methodology limit widespread application tool research question accelerometer datum commercially available smartphone use extract gait cf across broad range attainable gait velocity child duchenne muscular dystrophy dmd typically develop control td use machine learning ml-based method method fifteen child dmd 15 td underwent supervise clinical testing across range gait speed use 10 25 run walk 10mrw 25mrw 100 run walk 100mrw 6 minute walk 6mwt free walk fw evaluation wear mobile phone base accelerometer waist near body center mass gait cf extract accelerometer datum use multi step machine learning base process result compare ground truth observation datum result model prediction vs observed value step count distance travel step length show strong correlation pearson r -09929 09986 p00001 estimate demonstrate mean sd percentage error 149 704 step count 118 991 distance travel 037 752 step length compare ground truth observation combine 6mwt 100mrw fw task significance study finding indicate single accelerometer place near body center mass accurately measure cf across different gait speed td dmd peer suggest potential accurately measure cf community consumer level smartphone
4693,present design textitin silico evaluation closed loop insulin delivery algorithm treat type 1 diabetes t1d consist data drive multi step ahead blood glucose bg predictor integrate linear time varying ltv model predictive control mpc framework instead identify open loop model glucoregulatory system available datum propose directly fit entire bg prediction predefine prediction horizon use mpc nonlinear function past input ouput datum affine function future insulin control input nonlinear part long short term memory lstm network propose affine component linear regression model choose assess benefit drawback compare traditional linear mpc base auto regressive exogenous arx input model identify datum evaluate propose lstm mpc controller three simulation scenario nominal case 3 meal per day random meal disturbance case meal generate recently publish meal generator case 25 decrease insulin sensitivity far scenario feedforward meal bolus administer challenging random meal generation scenario mean standard deviation percent time range 70 180 mg dl 7499 709 vs 5415 1489 mean standard deviation percent time tight range 70 140 mg dl 4778 855 vs 3462 904 mean standard deviation percent time sever hypoglycemia ie 54 mg dl 100 318 vs 945 1171 propose lstm mpc controller traditional arx mpc respectively approach provide accurate prediction future glucose concentration good closed loop performance overall mpc controller
4786,magnetic recording device still competitive storage density race solid state device thank new technology two dimensional magnetic recording tdmr advanced datum processing scheme need guarantee reliability tdmr datum pattern bit surround complementary bit four position manhattan distance tdmr grid call plus isolation pis pattern error prone recently introduce lexicographically order constrain loco code namely optimal plus loco op loco code prevent pattern write tdmr device however high density regime low energy regime additional error prone pattern emerge specifically datum pattern bit surround complementary bit three position manhattan distance call incomplete plus isolation ipis pattern paper present capacity achieve code forbid pis ipis pattern tdmr system wide read head collectively call pis ipis pattern rotate isolation rtis pattern call new code optimal loco ot loco code analyze ot loco code present simple encoding decode rule allow reconfigurability also present novel bridging idea code far increase rate simulation result demonstrate ot loco code capable eliminate medium noise effect entirely practical td density high rate far preserve storage capacity suggest use op loco code early device lifetime employ reconfiguration property switch ot loco code later point reconfiguration density energy axis decide manually moment next step use machine learning take decision base tdmr device status
6295,open vocabulary segmentation challenging task require segmenting recognize object open set category one way address challenge leverage multi modal model clip provide image text feature share embed space bridge gap closed vocabulary open vocabulary recognition hence exist method often adopt two stage framework tackle problem input first go mask generator clip model along predict mask process involve extract feature image multiple time ineffective inefficient contrast propose build everything single stage framework use share frozen convolutional clip backbone significantly simplify current two stage pipeline also remarkably yield well accuracy cost trade propose fc clip benefit follow observation frozen clip backbone maintain ability open vocabulary classification also serve strong mask generator convolutional clip generalize well large input resolution one use contrastive image text pretraining train coco panoptic datum test zero shot manner fc clip achieve 268 pq 168 ap 341 miou ade20 k 182 pq 279 miou mapillary vistas 440 pq 268 ap 562 miou cityscapes outperform prior art 42 pq 24 ap 42 miou ade20 k 40 pq mapillary vistas 201 pq cityscapes respectively additionally training testing time fc clip 75x 66x significantly fast prior art use 59x parameter fc clip also set new state art performance across various open vocabulary semantic segmentation dataset code httpsgithubcombytedancefc-clip
7231,background prostate cancer pc mri base risk calculator commonly base biological eg psa mri marker eg volume patient age whilst patient age measure amount year individual exist biological age ba might well reflect physiology individual however surrogate prostate mri linkage clinically significant pc cspc remain explore purpose obtain evaluate prostate age gap pag mri marker tool cspc risk study type retrospective population total 7243 prostate mri slice 468 participant undergo prostate biopsy deep learning model train 3223 mri slice crop around gland 81 low grade pc ncspc gleason score 6 131 negative case test remain 256 participant assessment chronological age define age participant time visit use train deep learning model predict age patient follow obtain pag define model predict age minus patient chronological age multivariate logistic regression model use estimate association odd ratio predictive value pag compare psa level pi rads3 statistical test test mann whitney u test permutation test roc curve analysis result multivariate adjust model show significant difference odd clinically significant pc cspc gleason score 7 378 95 confidence interval ci232 616 p 001 pag show well predictive ability compare pi rads3 adjust risk factor include psa level auc 0981 vs auc 0704 p001 conclusion pag significantly associate risk clinically significant pc outperform well establish pc risk factor


This all looks reasonable, apart from abstract 1341, which contains the same text in both English and German. Let's ignore this for now, though, and proceed to create the dataset and train our first baseline algorithm.

In [18]:
# Raw (uncleaned) text of abstract 1341, whose cleaned version above contains both
# English and German text.
X_train_raw[1341]

'The complexity and increasingly tight coupling of supply chains poses a major\nlogistical challenge for leading companies. Another challenge is that leading\ncompanies -- under pressure from consumers, a critical public and legislative\nmeasures such as supply chain laws -- have to take more responsibility than\nbefore for their suppliers\' labour standards. In this paper, we discuss a new\napproach that leading companies are using to try to address these challenges:\nalgorithmic prediction of business risks, but also environmental and social\nrisks. We describe the technical and cultural conditions for algorithmic\nprediction and explain how -- from the perspective of leading companies -- it\nhelps to address both challenges. We then develop scenarios on how and with\nwhat kind of social consequences algorithmic prediction can be used by leading\ncompanies. From the scenarios, we derive policy options for different\nstakeholder groups to help develop algorithmic prediction towards im

This looks reasonable. Let's proceed to create the dataset and train our first baseline algorithm. 

In [16]:
# Uncomment the following code, or run `make processed-data` instead.

# from arxiv_article_classifier.data.make_processed_data_bow import convert_interim_to_processed_data
#
# datafolder_processed_bow = DATAFOLDER_PROCESSED / "bow-model"
# datafolder_processed_bow.mkdir(exist_ok=True)
# convert_interim_to_processed_data(DATAFOLDER_INTERIM, datafolder_processed_bow)