Skip to content

Fix: Remove unused attachment data from ipynb files when attachment links are deleted #251152

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 45 additions & 2 deletions extensions/ipynb/src/serializers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,23 @@ function createRawCellFromNotebookCell(cell: NotebookCellData): nbformat.IRawCel
source: splitMultilineString(cell.value.replace(/\r\n/g, '\n')),
metadata: cellMetadata?.metadata || {} // This cannot be empty.
};

// Only include attachments that are actually referenced in the cell content
if (cellMetadata?.attachments) {
rawCell.attachments = cellMetadata.attachments;
const referencedAttachments = getReferencedAttachmentNames(cell.value);
const usedAttachments: { [key: string]: any } = {};

for (const [filename, attachmentData] of Object.entries(cellMetadata.attachments)) {
if (referencedAttachments.has(filename)) {
usedAttachments[filename] = attachmentData;
}
}

if (Object.keys(usedAttachments).length > 0) {
rawCell.attachments = usedAttachments;
}
}

if (cellMetadata?.id) {
rawCell.id = cellMetadata.id;
}
Expand All @@ -149,6 +163,21 @@ function splitMultilineString(source: nbformat.MultilineString): string[] {
return [];
}

function getReferencedAttachmentNames(source: string): Set<string> {
const filenames = new Set<string>();
// Match: ![...](attachment:filename) or ![...](<attachment:filename>)
// Uses non-greedy match to stop at first ) or > that ends the link
const re = /!\[.*?\]\(<?attachment:(.*?)>?\)/gm;

let match;
while ((match = re.exec(source))) {
if (match[1]) {
filenames.add(match[1]);
}
}
return filenames;
}

function translateCellDisplayOutput(output: NotebookCellOutput): JupyterOutput {
const customMetadata = output.metadata as CellOutputMetadata | undefined;
let result: JupyterOutput;
Expand Down Expand Up @@ -371,9 +400,23 @@ export function createMarkdownCellFromNotebookCell(cell: NotebookCellData): nbfo
source: splitMultilineString(cell.value.replace(/\r\n/g, '\n')),
metadata: cellMetadata?.metadata || {} // This cannot be empty.
};

// Only include attachments that are actually referenced in the cell content
if (cellMetadata?.attachments) {
markdownCell.attachments = cellMetadata.attachments;
const referencedAttachments = getReferencedAttachmentNames(cell.value);
const usedAttachments: { [key: string]: any } = {};

for (const [filename, attachmentData] of Object.entries(cellMetadata.attachments)) {
if (referencedAttachments.has(filename)) {
usedAttachments[filename] = attachmentData;
}
}

if (Object.keys(usedAttachments).length > 0) {
markdownCell.attachments = usedAttachments;
}
}

if (cellMetadata?.id) {
markdownCell.id = cellMetadata.id;
}
Expand Down
71 changes: 68 additions & 3 deletions extensions/ipynb/src/test/serializers.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ suite(`ipynb serializer`, () => {


test('Serialize', async () => {
const markdownCell = new vscode.NotebookCellData(vscode.NotebookCellKind.Markup, '# header1', 'markdown');
const markdownCell = new vscode.NotebookCellData(vscode.NotebookCellKind.Markup, '# header1\n![img](attachment:image.png)', 'markdown');
markdownCell.metadata = {
attachments: {
'image.png': {
Expand All @@ -103,7 +103,7 @@ suite(`ipynb serializer`, () => {
}
});

const markdownCell2 = new vscode.NotebookCellData(vscode.NotebookCellKind.Markup, '# header1', 'markdown');
const markdownCell2 = new vscode.NotebookCellData(vscode.NotebookCellKind.Markup, '# header1\n![img](attachment:image.png)', 'markdown');
markdownCell2.metadata = {
id: '123',
metadata: {
Expand All @@ -122,7 +122,7 @@ suite(`ipynb serializer`, () => {

assert.deepStrictEqual(nbMarkdownCell, {
cell_type: 'markdown',
source: ['# header1'],
source: ['# header1\n', '![img](attachment:image.png)'],
metadata: {
foo: 'bar',
},
Expand All @@ -135,6 +135,71 @@ suite(`ipynb serializer`, () => {
});
});

test('Serialize with unused attachments should not include them', async () => {
// Test case: markdown cell has attachment metadata but no attachment reference in content
const markdownCell = new vscode.NotebookCellData(vscode.NotebookCellKind.Markup, '# header1', 'markdown');
markdownCell.metadata = {
attachments: {
'unused-image.png': {
'image/png': 'unusedImageData'
}
},
id: '123',
metadata: {
foo: 'bar'
}
};

const nbMarkdownCell = createMarkdownCellFromNotebookCell(markdownCell);

// Should not include attachments since they're not referenced in the content
assert.deepStrictEqual(nbMarkdownCell, {
cell_type: 'markdown',
source: ['# header1'],
metadata: {
foo: 'bar',
},
id: '123'
});
});

test('Serialize with used attachments should include them', async () => {
// Test case: markdown cell has attachment metadata and attachment reference in content
const markdownCell = new vscode.NotebookCellData(vscode.NotebookCellKind.Markup,
'# header1\n![alt text](attachment:used-image.png)', 'markdown');
markdownCell.metadata = {
attachments: {
'used-image.png': {
'image/png': 'usedImageData'
},
'unused-image.png': {
'image/png': 'unusedImageData'
}
},
id: '123',
metadata: {
foo: 'bar'
}
};

const nbMarkdownCell = createMarkdownCellFromNotebookCell(markdownCell);

// Should only include attachments that are referenced in the content
assert.deepStrictEqual(nbMarkdownCell, {
cell_type: 'markdown',
source: ['# header1\n', '![alt text](attachment:used-image.png)'],
metadata: {
foo: 'bar',
},
attachments: {
'used-image.png': {
'image/png': 'usedImageData'
}
},
id: '123'
});
});

suite('Outputs', () => {
function validateCellOutputTranslation(
outputs: nbformat.IOutput[],
Expand Down