# Tools for AI contexts

In [None]:
from aix import contexts
from oa.chats import ChatDacc  # currently broken
from scraped import markdown_of_site, download_site, scrape_multiple_sites
import hubcap


## GitHub discussions (etc.)

In [1]:
from hubcap import RepoReader

url = 'https://github.com/thorwhalen/sonification'
r = RepoReader(url)

In [2]:
discussions = r['discussions']
list(discussions)

[4, 1]

In [3]:
# jdict = dict(discussions)  # get all discussions as a dict
jdict = discussions[4]


In [7]:
list(jdict)

['number', 'title', 'body', 'author', 'createdAt', 'updatedAt', 'comments']

In [13]:
subdict = jdict['comments'][-1]
subdict

{'body': '## Python Libraries for Sonification and Music Synthesis\r\n\r\nWe’ll look into Python libraries and tools that can help transform numerical feature vectors representing sentiments into musical or auditory forms. This includes tools that support MIDI generation, audio synthesis, real-time playback, and symbolic music generation, while enabling both symbolic and acoustic mappings. We’ll focus on options that support polyphonic outputs and span from high-level abstractions to low-level sound design libraries\r\n\r\nThis list includes Python tools for converting numerical feature vectors (e.g., emotion scores) into sound or music using both symbolic (MIDI, notes, instruments) and acoustic (pitch, volume, timbre) mappings. All tools support polyphony and span a range of abstraction levels.\r\n\r\n---\r\n\r\n### 🎵 High-Level Sonification Frameworks\r\n\r\n- **[Astronify](https://github.com/spacetelescope/astronify)**  \r\n  Time-series data to musical sound. Originally for astrono

In [16]:
from hubcap import create_markdown_from_discussion_jdict

# TODO: Make it work for a subdict of the discussion jdict (e.g. a single comment, like `subdict`)
# discussion_md = create_markdown_from_discussion_jdict(subdict)

# Markdown string built from the full discussion jdict; the cell displays its length
discussion_md = create_markdown_from_discussion_jdict(jdict)
len(discussion_md)

76326

In [None]:
# copy the markdown (to paste it elsewhere)
from pyperclip import copy
copy(discussion_md)

In [None]:
# ... or save it to a file
# import pathlib 
# pathlib.Path('sonification_discussion.md').write_text(discussion_md)


## Extract URLs from markdown

In [24]:
content = jdict['comments'][-1]['body']
print(content[:1500], '...\n...')

## Python Libraries for Sonification and Music Synthesis

We’ll look into Python libraries and tools that can help transform numerical feature vectors representing sentiments into musical or auditory forms. This includes tools that support MIDI generation, audio synthesis, real-time playback, and symbolic music generation, while enabling both symbolic and acoustic mappings. We’ll focus on options that support polyphonic outputs and span from high-level abstractions to low-level sound design libraries

This list includes Python tools for converting numerical feature vectors (e.g., emotion scores) into sound or music using both symbolic (MIDI, notes, instruments) and acoustic (pitch, volume, timbre) mappings. All tools support polyphony and span a range of abstraction levels.

---

### 🎵 High-Level Sonification Frameworks

- **[Astronify](https://github.com/spacetelescope/astronify)**  
  Time-series data to musical sound. Originally for astronomy but works with any 1D data. Maps values 

In [25]:
from aix.contexts import extract_urls

[x for x in dir(extract_urls) if not x.startswith('_')]

['html_links', 'only_urls', 'with_surrounding_context']

In [41]:
list(extract_urls.only_urls(content))[:4]

[('', 'https://github.com/spacetelescope/astronify)**'),
 ('', 'https://github.com/mrahim/sci-sonify)**'),
 ('', 'https://github.com/therevoman/miditime)**'),
 ('', 'https://github.com/FlorianWilhelm/audio-plot-lib)**')]

In [33]:
# list(extract_urls.with_surrounding_context(content))

# Extracting info from notebooks

Sometimes we want to get Jupyter notebook content into the AI context.
Notebooks are stored as quite verbose JSON, so what I often do is convert the notebook to markdown.
Even then, I might get more than I want, which can exceed AI context limits or, at the very least, hurt
the signal-to-noise ratio.

So I want to filter stuff (in and/or out). 

I can do that pre-conversion (on the json) or post-conversion (on the markdown). 
Both have their place. 

One con of doing this on the JSON is that it's more information-rich, so it can be more complicated to find
what you need to filter in or out.
One pro is that the JSON is structured, so it's easier to express many of your filtering needs
(less need for regular expressions, etc.).

In [None]:
from test2doc.notebook_utils import clear_outputs_of_largest_output_cells, ensure_notebook_dict

In [None]:
import json

# Notebook whose serialized size we want to inspect
notebook_filepath = '/Users/thorwhalen/Dropbox/py/proj/notebooks/003 - Scrap 2025.ipynb'

nb = ensure_notebook_dict(notebook_filepath)
# Number of characters in the JSON-serialized 'cells' entry of the notebook dict
len(json.dumps(nb['cells']))

15416894

In [None]:

# Imports first, so this cell does not rely on `json` having been imported by an earlier cell
import json
import pathlib

from dol import Pipe

nb = ensure_notebook_dict(notebook_filepath)
n = 5  # passed to clear_outputs_of_largest_output_cells below
print(f"{len(json.dumps(nb['cells']))=}")

# Write the result next to the source notebook, under the same file name prefixed with "_"
# (derived from notebook_filepath instead of repeating the hard-coded path).
source_path = pathlib.Path(notebook_filepath)
target_filepath = str(source_path.with_name(f"_{source_path.name}"))

# egress: serialize to JSON, then write the text to target_filepath
# (presumably called with the processed notebook dict — confirm against test2doc)
egress = Pipe(
    json.dumps,
    pathlib.Path(target_filepath).write_text
)
clear_outputs_of_largest_output_cells(nb, n, egress=egress)
# Printing the size again shows the effect of the call on `nb`
print(f"{len(json.dumps(nb['cells']))=}")


len(json.dumps(nb['cells']))=15415262
len(json.dumps(nb['cells']))=372927


In [None]:

import json

# Get the notebook dict again and print the size of its JSON-serialized cells.
nb = ensure_notebook_dict(notebook_filepath)
n = 5
print(f"{len(json.dumps(nb['cells']))=}")

# NOTE(review): sort_notebook_cells and get_output_size are not imported in any
# visible cell — presumably they come from test2doc.notebook_utils; confirm and
# import them so this cell runs on a fresh kernel.
# sorted_cells is consumed one cell at a time further down in this cell.
sorted_cells = sort_notebook_cells(
    nb,
    key=get_output_size,
    reverse=True,
    cell_egress=lambda x: x
)

def empty_output(cell):
    """Reset a cell's 'outputs' entry to an empty list, in place, and return the cell.

    A cell that has no 'outputs' key is returned untouched (no key is added).
    """
    if 'outputs' not in cell:
        return cell
    cell['outputs'] = []
    return cell

from itertools import islice

# Empty the outputs of at most the first n cells yielded by sorted_cells.
# islice simply stops when fewer than n cells are available, whereas calling
# next() n times would raise StopIteration in that case.
# `c` stays bound to the last cell that was emptied.
for c in islice(sorted_cells, n):
    empty_output(c)

print(f"{len(json.dumps(nb['cells']))=}")

In [None]:
import pathlib

# Save the notebook dict `nb` as JSON text; the cell's displayed value is what
# write_text returns (the number of characters written).
target_notebook_filepath = '/Users/thorwhalen/Dropbox/py/proj/notebooks/_003 - Scrap 2025.ipynb'
target_notebook_path = pathlib.Path(target_notebook_filepath)
target_notebook_path.write_text(json.dumps(nb))

375341

In [None]:
c

{'cell_type': 'code',
 'execution_count': 7,
 'id': '89a92820',
 'metadata': {},
 'outputs': [],
 'source': ['import pandas as pd \n', '\n', "pd.DataFrame(r['results'])"]}

In [None]:
c = ensure_notebook_dict('/Users/thorwhalen/Dropbox/py/proj/notebooks/003 - Scrap 2025.ipynb')
len(c)

4

In [None]:
len(c['cells'])
c['cells'][0]

{'cell_type': 'code',
 'execution_count': 1,
 'id': '75a6a00a',
 'metadata': {},
 'outputs': [{'name': 'stderr',
   'output_type': 'stream',
 'source': ['import qo']}

# Scrap

In [None]:
from aix import bytes_store_to_markdown_store
from aix.contexts import extensions_not_supported_by_converters

src_dir = 'ENTER_YOUR_SOURCE_DIR_HERE'

In [None]:
# Optional: write the markdown conversions to a folder on disk.
# While `targ_dir` is None, nothing in this cell is converted or written.
targ_dir = None  # also enter your target folder here

if targ_dir is not None:
    # Check the source folder's extensions before converting anything
    unsupported_extensions = extensions_not_supported_by_converters(src_dir)
    assert not unsupported_extensions, (
        "some extensions are not supported by the converters"
    )

    target_store = bytes_store_to_markdown_store(src_dir, targ_dir)

    print(f"You now have a folder with {len(target_store)} markdown files in it: {targ_dir}")


In [None]:
# doing it in memory
in_memory_target_store = bytes_store_to_markdown_store(src_dir, target_store={})
len(in_memory_target_store)

50

In [8]:
from aix.contexts import aggregate_store

md_string = aggregate_store(in_memory_target_store)
len(md_string)

14450386

In [9]:
from aix.contexts import get_extension

# Last two dot-separated parts of every key, e.g. "report.pdf.md" -> "pdf.md"
extensions = {'.'.join(key.split('.')[-2:]) for key in in_memory_target_store}
extensions

{'docx.md', 'pdf.md', 'pptx.md', 'xlsx.md'}

In [None]:
# One aggregate per extension, stored under the key "aggregate.<ext>"
aggregate_stores_by_ext = {}

for ext in extensions:
    aggregate_key = f"aggregate.{ext}"
    # Keep only the entries whose key ends with this extension
    substore = {
        key: value
        for key, value in in_memory_target_store.items()
        if key.endswith(ext)
    }
    aggregate_md = aggregate_store(substore)
    aggregate_stores_by_ext[aggregate_key] = aggregate_md
    print(f"aggregate for extension {ext} has {len(aggregate_md)} characters")


aggregate for extension pptx.md has 72835 characters
aggregate for extension xlsx.md has 1981094 characters
aggregate for extension pdf.md has 1577353 characters
aggregate for extension docx.md has 10819098 characters


In [14]:
# if you want to save this to disk
import dol 

# Destination folder for the per-extension aggregates.
# NOTE(review): '~' is passed as-is — confirm dol.TextFiles expands the home
# directory; otherwise expand it first (e.g. with os.path.expanduser).
target_sub_stores_folders = '~'  # change here
aggregate_stores_by_ext_files = dol.TextFiles(target_sub_stores_folders)

# Copy every key/value of aggregate_stores_by_ext (keys like "aggregate.pdf.md") into the store;
# presumably one text file per key — verify against dol.TextFiles
aggregate_stores_by_ext_files.update(aggregate_stores_by_ext)
# and now you have some files saved to target_sub_stores_folders