# where do all the good links go?

extracting links and definitions from markdown using `markdown_it` tokens

In [1]:
    @__import__("functools").lru_cache
    def get_exporter(key="mkdocs", **kw):
        """Return a ``PidgyExporter`` with the notebook template registered under ``key``.

        The result is memoized by ``functools.lru_cache``, so repeated calls with
        the same arguments return the same exporter object (and every value in
        ``kw`` has to be hashable).

        Parameters
        ----------
        key:
            Name the template is stored under in the jinja ``DictLoader``; also
            used as ``template_file`` unless the caller passes one in ``kw``.
        **kw:
            Forwarded to the ``PidgyExporter`` constructor.
        """
        from jinja2 import DictLoader

        # the exporter pieces live in a notebook, hence the importnb context
        with __import__("importnb").Notebook():
            from tonyfast.xxii.__markdownish_notebook import template, HEAD, replace_attachments, PidgyExporter

        # an explicit template_file in kw takes precedence over key
        exporter = PidgyExporter(**{"template_file": key, **kw})

        filters = exporter.environment.filters
        if "attachment" not in filters:
            filters["attachment"] = replace_attachments

        # only the first DictLoader among the environment's loaders is patched
        dict_loader = next(
            (candidate for candidate in exporter.environment.loader.loaders if isinstance(candidate, DictLoader)),
            None,
        )
        if dict_loader is not None:
            dict_loader.mapping[key] = template
            dict_loader.mapping["HEAD"] = HEAD
        return exporter

In [2]:
    with __import__("importnb").Notebook():
        from tonyfast.xxiii.__duckdb_search import *
        from tonyfast.xxii.__markdownish_notebook import PidgyExporter, template   

    from midgy import Python
    import nbformat
    from markdown_it.tree import SyntaxTreeNode

In [3]:
    
    @dataclasses.dataclass
    class Finder:
        """Discover files below a directory and tabulate their file-system stats.

        ``dir``, ``include`` and ``exclude`` are passed unchanged to ``iter_files``,
        which decides which paths are yielded.
        """

        dir: str = ".."
        include: str = "*.ipynb\n*.md"
        exclude: str = ".ipynb_checkpoints"

        # Keys of every record from ``get_files_stats``; used to give an empty
        # frame the same columns as a populated one. Unannotated on purpose so the
        # dataclass does not turn it into a field.
        _columns = ("path", "suffix", "created_at", "modified_at", "size")

        def get_files_stats(self, path):
            """Return one record (a dict) describing ``path``.

            ``created_at`` holds ``st_ctime``, whose meaning is platform dependent:
            metadata-change time on Unix, creation time on Windows.
            """
            stat = path.stat()
            return dict(path=path, suffix=path.suffix, created_at=stat.st_ctime, modified_at=stat.st_mtime, size=stat.st_size)

        def get_files(self) -> list[dict]:
            """Return the stat records of every file yielded by ``iter_files``."""
            return list(map(self.get_files_stats, iter_files(self.dir, self.include, self.exclude)))

        def __iter__(self):
            """Iterate over the stat records, one dict per file."""
            yield from self.get_files()

        def to_frame(self, updated_from=None):
            """Return the records as a ``pandas.DataFrame``.

            Parameters
            ----------
            updated_from:
                Optional earlier frame with a ``modified_at`` column. When given,
                only rows whose ``modified_at`` differs from it are returned.

            When no file matches, the frame is empty but still carries the
            expected columns, so the ``updated_from`` comparison stays valid.
            """
            records = list(self)
            if records:
                df = pandas.DataFrame(records)
            else:
                # DataFrame([]) would have no columns and break `df.modified_at`
                df = pandas.DataFrame(columns=list(self._columns))
            if updated_from is not None:
                # Series.ne aligns on index labels, so rows are matched by index,
                # not by path.
                # NOTE(review): confirm updated_from shares this frame's index.
                return df[df.modified_at.ne(updated_from.modified_at)]
            return df

        def to_dask(self):
            """Return the frame as a dask dataframe with one partition per file.

            At least one partition is always requested: ``npartitions=0`` is not a
            valid partition count when no file matched.
            """
            from dask.dataframe import from_pandas
            df = self.to_frame()
            return from_pandas(df, npartitions=max(len(df), 1))

In [4]:
    # top-level notebook keys mapped to a dtype: "O" (object) or int
    order = {"cells": "O", "metadata": "O", "nbformat": int, "nbformat_minor": int}

In [5]:
    # Lazy dask frame of file stats; every step below only extends the task graph.
    ddf = Finder().to_dask()
    # Choose a reader per suffix: nbformat for notebooks, get_markdown_file for markdown.
    ddf = ddf.assign(loader=ddf.suffix.apply({".md": get_markdown_file, ".ipynb": nbformat.v4.reads}.get, meta=("loader", "O")))
    # Read as UTF-8 explicitly: notebooks are UTF-8 JSON, and a bare read_text()
    # would fall back to the platform's locale encoding.
    ddf = ddf.assign(
        data=ddf.apply(lambda s: s.loader(s.path.read_text(encoding="utf-8")), axis=1, meta=("data", "O"))
    )
    # Export each document and keep the first element of what from_notebook_node
    # returns (presumably the rendered markdown body — TODO confirm).
    ddf = ddf.assign(md = ddf.data.apply(
        compose_left(get_exporter().from_notebook_node, first), meta=("md", "O")))
    # Parse the exported markdown with midgy's Python parser.
    ddf = ddf.assign(tokens=ddf.md.apply(Python().parse, meta=("tokens", "O")))
    ddf

Unnamed: 0_level_0,path,suffix,created_at,modified_at,size,loader,data,md,tokens
npartitions=49,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,object,object,float64,float64,int64,object,object,object,object
1,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...
48,...,...,...,...,...,...,...,...,...
48,...,...,...,...,...,...,...,...,...


## how many tokens are there?

In [6]:
    # Build a syntax tree per document, flatten it with walk(), then explode so
    # that `s` holds one tree node per row.
    s = ddf.tokens.apply(
        lambda tokens: list(SyntaxTreeNode(tokens).walk()),
        meta=("token", "O"),
    ).explode()
    # Count how often each node type occurs across all documents.
    s.apply(lambda node: node.type, meta=("type", "O")).value_counts().compute()

text                  4658
html_inline           3739
inline                1019
paragraph              767
softbreak              615
code_inline            573
fence                  345
html_block             304
heading                239
code_block             202
list_item              186
link                   102
bullet_list             61
definition              50
root                    49
strong                  41
em                      16
blockquote              13
td                       9
ordered_list             8
image                    7
tr                       4
th                       3
dd                       2
footnote_reference       1
footnote_ref             1
dt                       1
table                    1
tbody                    1
dl                       1
thead                    1
Name: type, dtype: int64

## all the links

In [7]:
    # Keep only the nodes whose type is link, image or definition, then compute.
    links = s[
        s.apply(lambda node: node.type in ["link", "image", "definition"], meta=("link", bool))
    ].compute()

In [8]:
    # One row per node: its attrs and meta merged into a single record
    # (meta wins on key clashes, as with merge's left-to-right precedence).
    links.apply(lambda node: pandas.Series(merge(node.attrs, node.meta)))

Unnamed: 0,href,title,id,url,label,src,alt
0,https://duckdb.org/docs/extensions/full_text_s...,,,,,,
0,https://duckdb.org/docs/guides/python/import_p...,,,,,,
1,https://mermaid.js.org/syntax/flowchart.html,the mermaid.js documentation,,,,,
1,https://github.blog/2022-02-14-include-diagram...,announcement that github renders mermaid.js,,,,,
1,https://github.blog/2022-02-14-include-diagram...,announcement that github renders mermaid.js,,,,,
...,...,...,...,...,...,...,...
48,,,MKDOCS-JUPYTER,https://github.com/danielfrg/mkdocs-jupyter,mkdocs-jupyter,,
48,,,PLUGIN,https://www.mkdocs.org/dev-guide/plugins/,plugin,,
48,https://www.mkdocs.org/dev-guide/plugins/,,,,,,
48,,,MKDOCS-JUPYTER,https://github.com/danielfrg/mkdocs-jupyter,mkdocs-jupyter,,
