In [None]:
# Known issues:
# Issue markdown library/Notion export: Empty bullet points lead to headers
# Issue Notion export: Database as Untitled database
# Issue Notion export: Database CSV has no unique reference to pages for database entries (by id)
# Issue Notion export: round and square brackets in URLs are not encoded 

In [None]:
import hashlib
import binascii
import evernote.edam.userstore.constants as UserStoreConstants
import evernote.edam.notestore.NoteStore as NoteStore
import evernote.edam.type.ttypes as Types
import configparser
import re
from urllib.parse import urlparse, unquote, parse_qsl, parse_qs
import mimetypes
from pathlib import Path
import markdown
from evernote.api.client import EvernoteClient
import pandas as pd

In [None]:
# Real applications authenticate with Evernote using OAuth, but for the
# purpose of exploring the API, you can get a developer token that allows
# you to access your own Evernote account. To get a developer token, visit
# https://sandbox.evernote.com/api/DeveloperToken.action

# Load the application and user settings from settings.ini.
settings_path = Path('settings.ini')
config = configparser.ConfigParser(allow_no_value=True)
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it did read, so an empty result means the settings are missing.
if not config.read(settings_path):
    raise FileNotFoundError(f"Settings file not found: {settings_path.resolve()}")

data_dir = Path(config['AppSettings']['data_dir'])

## When using a developer token:
# auth_token = config['AppSettings']['auth_token']
## When using oauth:
consumer_key = config['AppSettings']['consumer_key']
consumer_secret = config['AppSettings']['consumer_secret']
## When previous oauth already provided a token
# A missing section/key, a bare key (allow_no_value) and an empty value all
# mean "no token stored yet"; normalise them to None so that the OAuth cell
# (which tests `client.token is None`) requests a fresh token.
oauth_token = config.get('UserSettings', 'oauth_token', fallback=None) or None

# if auth_token == "your developer token":
#     print("Please fill in your developer token")
#     print("To get a developer token, visit " \
#           "https://sandbox.evernote.com/api/DeveloperToken.action")
#     exit(1)

# Initial development is performed on our sandbox server. To use the production
# service, change sandbox=False and replace your
# developer token above with a token from
# https://www.evernote.com/api/DeveloperToken.action
# To access Sandbox service, set sandbox to True
# To access production (International) service, set both sandbox and china to False
# To access production (China) service, set sandbox to False and china to True

# Service selection (see the comment above): sandbox for development,
# both flags False for the international production service.
sandbox = True
china = False
# client = EvernoteClient(token=auth_token, sandbox=sandbox,china=china)
client = EvernoteClient(
    consumer_key=consumer_key,
    consumer_secret=consumer_secret,
    token=oauth_token,
    sandbox=sandbox,
    china=china,
)

In [None]:
# Interactive OAuth flow: runs only when no token is stored yet and writes the
# resulting access token back to the settings file.
if client.token is None:
    temporary_token = client.get_request_token('https://wardweistra.nl')
    # Let the client build the authorization URL so it matches the configured
    # service (sandbox / production / China) instead of always using sandbox.
    auth_url = client.get_authorize_url(temporary_token)
    print("Go to this URL to authenticate:")
    print(auth_url)

    url = input("And paste the URL you are redirected to here: ")
    parsed = urlparse(url)
    parsed_query = parse_qs(parsed.query)
    if 'oauth_token' not in parsed_query or 'oauth_verifier' not in parsed_query:
        raise ValueError(
            "The pasted URL contains no oauth_token/oauth_verifier; "
            "was the authorization declined?")

    # parse_qs maps every key to a *list* of values, so take the first entry.
    oauth_token = parsed_query['oauth_token'][0]
    oauth_token_secret = temporary_token['oauth_token_secret']
    oauth_verifier = parsed_query['oauth_verifier'][0]

    # Pass the plain string values extracted above, not the parse_qs lists.
    access_token = client.get_access_token(
        oauth_token=oauth_token,
        oauth_token_secret=oauth_token_secret,
        oauth_verifier=oauth_verifier,
    )

    # config.set() fails on an unknown section, so create it on first use.
    if not config.has_section('UserSettings'):
        config.add_section('UserSettings')
    config.set('UserSettings', 'oauth_token', access_token)
    # Use a context manager so the settings file is flushed and closed.
    with settings_path.open("w") as settings_file:
        config.write(settings_file)
    print("Oauth token collected and stored")
else:
    print("Oauth token already exists")

In [None]:
# Obtain the user-store and note-store handles from the authenticated client.
user_store = client.get_user_store()
note_store = client.get_note_store()

# Compare the EDAM version of the installed SDK with what the service accepts.
version_ok = user_store.checkVersion(
    "Evernote EDAMTest (Python)",
    UserStoreConstants.EDAM_VERSION_MAJOR,
    UserStoreConstants.EDAM_VERSION_MINOR,
)
print("Is my Evernote API version up to date? ", version_ok)
print()
if not version_ok:
    exit(1)

In [None]:
# Text markers that stand in for Notion to-do checkboxes: placeholders_for_todos
# inserts them into the Markdown and update_todos_placeholders later swaps them
# for <en-todo> tags in the note content.
placeholder_checkbox_unchecked = "[NOTION-CHECKBOX-UNCHECKED]"
placeholder_checkbox_checked = "[NOTION-CHECKBOX-CHECKED]"

In [None]:
# Print the name of every notebook in the user's account
notebooks = note_store.listNotebooks()
print(f"Found  {len(notebooks)}  notebooks:")
for notebook in notebooks:
    print(f"  *  {notebook.name}")

In [None]:
# Create or reuse unique Notebook for our imported notes

def getNotebookGuid(notebook_name):
    """Return the GUID of the notebook named `notebook_name`.

    The first notebook with that exact name is reused; when there is none, a
    new notebook with that name is created and its GUID is returned.
    """
    existing_guid = next(
        (nb.guid for nb in note_store.listNotebooks() if nb.name == notebook_name),
        None,
    )
    if existing_guid is not None:
        return existing_guid

    # No notebook with this name yet: create it.
    requested = Types.Notebook()
    requested.name = notebook_name
    return note_store.createNotebook(requested).guid

# GUID of the notebook that receives every imported note (read by create_note).
notebook_guid = getNotebookGuid(notebook_name='Imported from Notion')

# Create or reuse unique tag for our imported notes

def getTagGuid(tag_name, tag_parent_guid=None):
    """Return the GUID of the tag `tag_name`, creating the tag when needed.

    Args:
        tag_name: Name of the tag to look up or create.
        tag_parent_guid: When given, only an existing tag with this parent is
            reused, and a newly created tag is placed under this parent.

    Returns:
        The GUID of the reused or newly created tag.

    Raises:
        EDAMUserException: When tag creation fails for any reason other than
            a name conflict (for example an invalid tag name).
        Any other exception raised by the note store (for example a rate
        limit) propagates unchanged instead of being retried forever.
    """
    # Local import so this cell does not depend on changes to the import cell.
    from evernote.edam.error.ttypes import EDAMErrorCode, EDAMUserException

    for tag in note_store.listTags():
        if tag.name != tag_name:
            continue
        if tag_parent_guid is not None and tag.parentGuid != tag_parent_guid:
            continue
        if tag.guid is not None:
            return tag.guid
        break

    tag = Types.Tag()
    tag.name = tag_name
    if tag_parent_guid is not None:
        tag.parentGuid = tag_parent_guid

    # The name may already be taken (e.g. by a tag under a different parent);
    # in that case retry with " [2]", " [3]", ... appended to the name.
    suffix_number = 1
    while True:
        try:
            return note_store.createTag(tag).guid
        except EDAMUserException as error:
            if error.errorCode != EDAMErrorCode.DATA_CONFLICT:
                # Not a name clash: renaming would never help, so give up.
                raise
            suffix_number += 1
            tag.name = f"{tag_name} [{suffix_number}]"

In [None]:
# # Delete the 250 oldest notes from our Notebook
# noteFilter = NoteStore.NoteFilter()
# noteFilter.notebookGuid = notebook_guid
# noteFilter.ascending = True

# spec = NoteStore.NotesMetadataResultSpec()
# spec.includeTitle = True

# ourNoteList = note_store.findNotesMetadata(noteFilter, 0, 250, spec)
# for note in ourNoteList.notes:
#     note_store.deleteNote(note.guid)

# # Remove all tags (not supported on Oauth token)

# # for tag in note_store.listTags():
# #     note_store.expungeTag(str(tag.guid))

In [None]:
# Find all todos in the markdown
def placeholders_for_todos(markdown_content):
    """Replace Markdown to-do bullets with the checkbox placeholder strings.

    "- [ ]  " becomes the unchecked placeholder and "- [x]  " the checked one;
    the modified Markdown text is returned.
    """
    substitutions = (
        ("- [ ]  ", placeholder_checkbox_unchecked),
        ("- [x]  ", placeholder_checkbox_checked),
    )
    for todo_markup, placeholder in substitutions:
        markdown_content = markdown_content.replace(todo_markup, placeholder)
    return markdown_content

In [None]:
def fix_links_including_brackets(markdown_content):
    """Percent-encode parentheses in link targets and unwrap empty links.

    Three kinds of Markdown links get '(' / ')' in their target rewritten to
    '%28' / '%29'; afterwards links with an empty target, ``[text]()``, are
    reduced to their text. Returns the rewritten Markdown.
    """

    def encode_parentheses(match):
        # group(1) is the complete link, group(2) the part of the target that
        # contains the parentheses.
        whole_link, target_part = match.group(1), match.group(2)
        encoded_part = target_part.replace('(', '%28').replace(')', '%29')
        return whole_link.replace(target_part, encoded_part)

    parenthesis_patterns = (
        # Parentheses located before the 32-character id in the target.
        r'(\[[^\]]*?\]\((\S*?(?:\(|\))\S*?)[a-z0-9]{32}\S*?\.[a-z]{2,5}\))',
        # Parentheses located between the 32-character id and the extension.
        r'(\[[^\]]*?\]\(\S*?[a-z0-9]{32}(\S*?(?:\(|\))\S*?)\.[a-z]{2,5}\))',
        # Images whose target contains a single-digit counter such as "(1).".
        r'(!\[[^\]]*?\]\((.*?\([0-9]\)\..*?)\))',
    )
    for pattern in parenthesis_patterns:
        markdown_content = re.sub(pattern, encode_parentheses, markdown_content)

    # Links without a target keep only their text.
    return re.sub(r'\[(.*?)\]\(\)', r'\1', markdown_content)

In [None]:
# Find all images in the markdown, specifically to find internal links
def placeholders_for_images(markdown_content):
    """Replace local Markdown images with placeholders.

    Every ``![name](target)`` whose target has no network location (i.e. it is
    not an absolute URL with a host) is replaced by
    ``[NOTION-EMBEDDED-IMAGE=<target>]``.

    Args:
        markdown_content: Markdown text to scan.

    Returns:
        A tuple ``(markdown_content, embedded_files)`` where `embedded_files`
        maps each replaced target to ``{'placeholder': <placeholder text>}``.
    """
    name_regex = r"[^]]+"
    url_regex = r"[^)]+"
    # Raw f-string: the backslashes belong to the regular expression. In a
    # plain string literal they are invalid escape sequences, which newer
    # Python versions warn about.
    markup_regex = rf'(\!\[({name_regex})]\(\s*({url_regex})\s*\))'

    embedded_files = {}

    for markup, _name, target in re.findall(markup_regex, markdown_content):
        if urlparse(target).netloc:
            # Image hosted elsewhere: leave the Markdown untouched.
            continue
        placeholder = f"[NOTION-EMBEDDED-IMAGE={target}]"
        embedded_files[target] = {'placeholder': placeholder}
        markdown_content = markdown_content.replace(markup, placeholder)

    return markdown_content, embedded_files

In [None]:
# Find all links in the markdown, specifically to find internal links
def placeholders_for_links(markdown_content, embedded_files):
    """Replace links to exported notes and local files with placeholders.

    Markdown links ``[text](target)`` without a network location are handled
    by target type:

    * ``.md`` / ``.csv`` targets become ``[NOTION-INTERNAL-LINK=<n>]`` and are
      recorded in `internal_links` with the Notion id taken from the file
      name and the link text;
    * ``mailto:`` targets are left alone;
    * every other target becomes ``[NOTION-EMBEDDED-FILE=<target>]`` and is
      added to `embedded_files`.

    Absolute links to www.notion.so only trigger a printed warning. Finally,
    bare ``...<32-character id>.md`` paths that appear as plain text become
    ``[NOTION-INTERNAL-LINK-AS-TEXT=<n>]``.

    Args:
        markdown_content: Markdown (or HTML) text to scan.
        embedded_files: Dict of already collected embedded files; it is
            extended in place.

    Returns:
        A tuple ``(markdown_content, internal_links, embedded_files)``.
    """
    name_regex = r"[^]]+"
    url_regex = r"[^)]+"
    # Raw f-string: the backslashes belong to the regular expression. In a
    # plain string literal they are invalid escape sequences, which newer
    # Python versions warn about.
    markup_regex = rf'(\[({name_regex})]\(\s*({url_regex})\s*\))'

    internal_links = {}
    link_id = 0

    for match in re.findall(markup_regex, markdown_content):
        netloc = urlparse(match[2]).netloc
        if not netloc:
            link_path = Path(match[2])
            if link_path.suffix in ['.md', '.csv']:
                placeholder = f"[NOTION-INTERNAL-LINK={str(link_id)}]"
                _, link_notion_id = get_note_name_and_guid(link_path)
                internal_links[str(link_id)] = {
                    'link_notion_id': link_notion_id,
                    'link_text': match[1],
                    'placeholder': placeholder,
                }
                markdown_content = markdown_content.replace(match[0], placeholder)

                link_id += 1
            elif match[2].startswith('mailto:'):
                # E-mail links stay as regular Markdown links.
                pass
            else:
                placeholder = f"[NOTION-EMBEDDED-FILE={match[2]}]"
                embedded_files[match[2]] = {'placeholder': placeholder}
                markdown_content = markdown_content.replace(match[0], placeholder)

        elif netloc == 'www.notion.so':
            print(f"Warning - Absolute link to www.notion.so found: {match[2]}. This might be a link to a note that was not included in your export.")

    # Paths to exported notes that appear as plain text rather than as a link
    # target (not preceded by "(" and not followed by ")").
    links_as_text_regex = r'((?<!\()[^\s>]*?([a-z0-9]{32})\.md(?!\)))'

    for match in re.findall(links_as_text_regex, markdown_content):
        if not bool(urlparse(match[0]).netloc):
            placeholder = f"[NOTION-INTERNAL-LINK-AS-TEXT={str(link_id)}]"
            internal_links[str(link_id)] = {
                'link_notion_id': match[1],
                'placeholder': placeholder,
            }
            markdown_content = markdown_content.replace(match[0], placeholder)

            link_id += 1

    return markdown_content, internal_links, embedded_files

In [None]:
def create_note_object(notion_note_guid, note_title, tag_guid=None):
    """Build an Evernote note object (not yet stored) for a Notion page.

    The note gets `note_title` as title, `tag_guid` as its only tag when one
    is given, and attributes recording the Notion page as its source.
    """
    attributes = Types.NoteAttributes()
    attributes.source = "Imported from Notion"
    attributes.sourceURL = f"https://notion.so/{notion_note_guid}"

    note = Types.Note()
    note.title = note_title
    note.attributes = attributes
    if tag_guid is not None:
        note.tagGuids = [tag_guid]
    return note

In [None]:
def get_note_name_and_guid(note_path):
    """Split an exported file or folder name into its name and Notion id.

    The last 32 characters of the path's stem are taken as the Notion id and
    everything before them as the name.

    Args:
        note_path: `pathlib.Path` of an exported page, database or folder.

    Returns:
        A tuple ``(notion_note_name, notion_note_guid)``. Surrounding
        whitespace and literal "%20" sequences (URL-encoded spaces in link
        targets) are removed from the name.

    Raises:
        AssertionError: When the stem does not end in 32 characters from
            ``[0-9a-z]``.
    """
    stem = note_path.stem
    notion_note_guid = stem[-32:]
    # Remove whole "%20" sequences only. str.strip('%20') would strip the
    # individual characters '%', '2' and '0' and so mangle names such as
    # "Report 2020".
    notion_note_name = re.sub(r'^(?:\s|%20)+|(?:\s|%20)+$', '', stem[:-32])

    assert re.fullmatch(r"[0-9a-z]{32}", notion_note_guid), \
        f"No 32-character Notion id at the end of {note_path.name!r}"
    return notion_note_name, notion_note_guid

In [None]:
def update_todos_placeholders(note_content):
    """Swap the checkbox placeholders in `note_content` for <en-todo> tags.

    Each tag is preceded by a line break; the updated content is returned.
    """
    todo_markup = {
        placeholder_checkbox_unchecked: '<br /><en-todo />',
        placeholder_checkbox_checked: '<br /><en-todo checked="true"/>',
    }
    for placeholder, enml_tag in todo_markup.items():
        note_content = note_content.replace(placeholder, enml_tag)
    return note_content

In [None]:
def update_images_placeholders(note_content, note, embedded_images, current_dir):
    """Attach embedded files to `note` and replace their placeholders.

    Args:
        note_content: Note content containing the placeholders.
        note: Note object that receives the resources; its `resources` list
            is reset when there is at least one embedded file.
        embedded_images: Mapping of (URL-quoted) relative file path to
            ``{'placeholder': <placeholder text>}``.
        current_dir: Directory the relative paths are resolved against.

    Returns:
        A tuple ``(note_content, note)`` with every placeholder replaced by
        an ``<en-media>`` tag referring to the attached resource.

    Raises:
        OSError: When an embedded file cannot be read.
    """
    if len(embedded_images) > 0:
        note.resources = []

    for image_path, image_info in embedded_images.items():
        # Link targets are URL-quoted; the files on disk are not.
        image_file = current_dir / unquote(image_path)
        image = image_file.read_bytes()

        # guess_type() returns None for unknown extensions; fall back to a
        # generic binary type instead of writing type="None" into the note.
        mime_type = mimetypes.guess_type(image_path)[0] or 'application/octet-stream'

        body_hash = hashlib.md5(image).digest()

        data = Types.Data()
        data.size = len(image)
        data.bodyHash = body_hash
        data.body = image

        resource_attributes = Types.ResourceAttributes()
        resource_attributes.fileName = image_file.name

        resource = Types.Resource()
        resource.mime = mime_type
        resource.data = data
        resource.attributes = resource_attributes

        # Now, add the new Resource to the note's list of resources
        note.resources.append(resource)

        # To display the Resource as part of the note's content, include an <en-media>
        # tag in the note's ENML content. The en-media tag identifies the corresponding
        # Resource using the MD5 hash.
        hash_str = binascii.hexlify(body_hash).decode("UTF-8")

        image_string = f'<en-media type="{mime_type}" hash="{hash_str}"/>'
        note_content = note_content.replace(image_info['placeholder'], image_string)

    return note_content, note

In [None]:
def create_note(note_content, note):
    """Wrap `note_content` in an ENML document and create the note in Evernote.

    The note is stored in the notebook identified by the module-level
    `notebook_guid`. Returns the GUID of the created note.
    """
    # The content of an Evernote note is represented using Evernote Markup Language
    # (ENML). The full ENML specification can be found in the Evernote API Overview
    # at http://dev.evernote.com/documentation/cloud/chapters/ENML.php
    enml_parts = (
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd">',
        '<en-note>',
        note_content,
        '</en-note>',
    )
    note.content = ''.join(enml_parts)
    note.notebookGuid = notebook_guid

    # Send the note to Evernote; the returned Note object carries the
    # server-generated attributes such as the new note's unique GUID.
    while True:
        created_note = note_store.createNote(note)
        if created_note is not None:
            break

    print("Successfully created a new note with GUID: ", created_note.guid)
    return created_note.guid

In [None]:
# from lxml import etree

# def validate(xmlfile,dtdfile):
#     print(1)
#     try:
#         tree = etree.parse(xmlfile)
#     except etree.XMLSyntaxError as err:
#         print("XMLSyntaxError:%s" %err)
#         exit(1)
#     print(2)

#     if (dtdfile == None):
#         print("%s is well-formed xml"%xmlfile)
#         return
#     dtd = etree.DTD(dtdfile)
#     root = tree.getroot()
#     status = dtd.validate(root)  # status is a Boolean
#     print(3)

#     if status:
#         print("ok")
#     else:
#         errmsg=dtd.error_log.filter_from_errors()[0]
#         print("Problem validating")
#         print(errmsg)

# validate('test.html', 'enml2.dtd')

# # parser = etree.XMLParser(dtd_validation=True)
# # etree.parse('test.html')

In [None]:
def store_note(note_path, tag_guid):
    """Convert one exported Markdown page into note content and a note object.

    Args:
        note_path: Path of the exported ``.md`` file.
        tag_guid: GUID of the tag to attach to the note (may be None).

    Returns:
        A tuple ``(notion_note_guid, note_title, note, note_content,
        internal_links)``. `note_content` is the converted body in which
        internal links are still placeholders; `internal_links` describes
        those placeholders.
    """
    print(f"Parsing page {note_path}")

    notion_note_name, notion_note_guid = get_note_name_and_guid(note_path)

    # Decode explicitly instead of relying on the platform's locale encoding
    # (assumes the export is UTF-8 - TODO confirm for your export).
    markdown_content = note_path.read_text(encoding='utf-8')

    # The title is the first line without heading markers; fall back to the
    # name from the file name when the file is empty or starts with a blank
    # line, so the title is not empty.
    lines = markdown_content.splitlines()
    note_title = lines[0].strip(' #') if lines else ''
    if not note_title:
        note_title = notion_note_name

    markdown_content = placeholders_for_todos(markdown_content)
    markdown_content = fix_links_including_brackets(markdown_content)
    # Neutralise angle brackets so they cannot end up as markup in the note.
    markdown_content = markdown_content.replace('<', '&#60;').replace('>', '&#62;')
    markdown_content, embedded_files = placeholders_for_images(markdown_content)
    markdown_content, internal_links, embedded_files = \
        placeholders_for_links(markdown_content, embedded_files)

    # Convert MD to XHTML
    note_content = markdown.markdown(markdown_content)

    note = create_note_object(notion_note_guid, note_title, tag_guid)

    note_content = update_todos_placeholders(note_content)
    note_content, note = update_images_placeholders(note_content, note, embedded_files, note_path.parent)

    return notion_note_guid, note_title, note, note_content, internal_links

In [None]:
def store_database(note_path, tag_guid):
    """Convert one exported database CSV into an HTML table note.

    Returns a tuple ``(notion_note_guid, note_title, note, note_content,
    internal_links)``; title and Notion id are taken from the file name.
    """
    table = pd.read_csv(note_path)
    note_title, notion_note_guid = get_note_name_and_guid(note_path)
    note = create_note_object(notion_note_guid, note_title, tag_guid)

    note_content = table.to_html(na_rep='', index=False, border=1, classes=None)

    # Applied in order: the two-character sequence backslash + n becomes a
    # line break, then the table and its cells receive inline styling.
    html_fixups = (
        ('\\n', '<br />'),
        ('<table border="1" class="dataframe">',
         '<table style="border-collapse: collapse; min-width: 100%;">'),
        ('<td>', '<td style="padding: 8px; border: 1px solid;">'),
    )
    for original_markup, replacement in html_fixups:
        note_content = note_content.replace(original_markup, replacement)

    # Same link and attachment handling as for Markdown pages.
    note_content = fix_links_including_brackets(note_content)
    note_content, embedded_files = placeholders_for_images(note_content)
    note_content, internal_links, embedded_files = placeholders_for_links(
        note_content, embedded_files)

    note_content, note = update_images_placeholders(
        note_content, note, embedded_files, note_path.parent)

    return notion_note_guid, note_title, note, note_content, internal_links

In [None]:
# NOTE(review): presumably meant to hold the nested tree that iterate_folders
# returns - confirm where it is supposed to be filled.
note_hierarchy = {}

# Every parsed page/database keyed by its Notion id; filled by iterate_folders.
note_flat_list = {}

def iterate_folders(current_folder, tag_parent_guid=None):
    """Parse all pages and databases below `current_folder`.

    Every ``.md`` file is parsed with store_note and every ``.csv`` file with
    store_database; the results are registered in the module-level
    `note_flat_list` under their Notion id. Notes of a folder share one tag:
    "Notion" for the top-level folder, otherwise a tag named after the folder
    and nested under the parent folder's tag.

    Folders that contain no ``.md``/``.csv`` file are not processed and their
    sub-folders are not visited.

    Args:
        current_folder: Directory (`pathlib.Path`) to scan.
        tag_parent_guid: GUID of the parent folder's tag; None for the
            top-level folder.

    Returns:
        A dict mapping Notion id to ``{'name', 'type'}`` plus a ``'children'``
        dict for entries that have a sub-folder with notes. An empty dict is
        returned when the folder holds no notes.
    """
    notes_md = list(current_folder.glob('*.md'))
    notes_csv = list(current_folder.glob('*.csv'))

    children = {}
    if not notes_md and not notes_csv:
        # Nothing to import here (e.g. a folder holding only attachments).
        return children

    if tag_parent_guid is None:
        tag_guid = getTagGuid("Notion")
    else:
        folder_name = get_note_name_and_guid(current_folder)[0]
        tag_guid = getTagGuid(folder_name, tag_parent_guid)

    # Pages first, then databases; both are registered the same way.
    note_sources = (
        ('page', notes_md, store_note),
        ('database', notes_csv, store_database),
    )
    for note_type, note_paths, parse_note in note_sources:
        for note_path in note_paths:
            print(note_path)
            notion_note_guid, note_title, note, note_content, internal_links = \
                parse_note(note_path, tag_guid)

            note_flat_list[notion_note_guid] = {
                'note_title': note_title,
                'type': note_type,
                'note': note,
                'note_content': note_content,
                'internal_links': internal_links,
            }

            children[notion_note_guid] = {
                'name': note_title,
                'type': note_type,
            }

    for sub_folder in [x for x in current_folder.iterdir() if x.is_dir()]:
        note_children = iterate_folders(sub_folder, tag_guid)
        if not note_children:
            # Sub-folder without notes: nothing to hang into the hierarchy.
            continue
        sub_folder_name, sub_folder_guid = get_note_name_and_guid(sub_folder)
        # Normally the sub-folder belongs to a page/database in this folder;
        # if no such entry exists, record the folder itself instead of failing.
        parent_entry = children.setdefault(
            sub_folder_guid, {'name': sub_folder_name, 'type': 'folder'})
        parent_entry['children'] = note_children

    return children

In [None]:
# Parse the whole export and keep the resulting tree instead of discarding it;
# the bare name on the last line still displays it as the cell output.
note_hierarchy = iterate_folders(data_dir)
note_hierarchy

In [None]:
# Store all notes
# You'll likely run into a rate limit when having a decent amount of notes.
# If so: Just run this cell again and it will pick up where it left off.

for note_id, note_entry in note_flat_list.items():
    # Entries that already carry an Evernote GUID were stored in an earlier run.
    if 'evernote_note_guid' in note_entry:
        continue
    print(note_entry['note_title'])
    evernote_note_guid = create_note(note_entry['note_content'], note_entry['note'])
    note_entry['evernote_note_guid'] = evernote_note_guid

In [None]:
# Update internal links
#
# Now that every note has an Evernote GUID, the link placeholders stored in
# the notes are replaced by in-app links to the target notes.

user = user_store.getUser()
user_id = user.id
shard_id = user.shardId

# service = 'sandbox.evernote.com'
# internal_link = f'evernote:///view/{userId}/{shardId}/{noteGuid}/{noteGuid}/'
# internal_link = f'https://{service}/shard/{shardId}/nl/{userId}/{noteGuid}/'

# An ampersand that does not already start a character reference. Link texts
# taken from pages contain "&#60;"/"&#62;" (inserted by store_note) and link
# texts taken from database tables are already HTML-escaped; escaping those
# ampersands again would show the reference literally.
bare_ampersand_regex = re.compile(
    r'&(?!(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);)')


def escape_link_text(text):
    """Escape &, < and > for use as element text, keeping existing references."""
    text = bare_ampersand_regex.sub('&amp;', text)
    return text.replace('<', '&lt;').replace('>', '&gt;')


for note_id in note_flat_list:
    note_entry = note_flat_list[note_id]
    if 'evernote_note_guid' not in note_entry:
        raise ValueError('Note has not been added to Evernote yet')
    if 'internal_links' not in note_entry or 'links_updated' in note_entry:
        continue

    internal_links = note_entry['internal_links']
    if not internal_links:
        # Nothing to rewrite: skip the download/upload round trip, which
        # would only count against the API rate limit.
        note_entry['links_updated'] = True
        continue

    print(note_entry['note_title'])
    note_content = note_store.getNoteContent(note_entry['evernote_note_guid'])
    for internal_link in internal_links.values():
        target_entry = note_flat_list.get(internal_link['link_notion_id'])
        if target_entry is None:
            # The link points to something that was not imported.
            print(f"Warning - link target {internal_link['link_notion_id']} "
                  f"was not imported; no Evernote link created.")
            if 'link_text' in internal_link:
                # Keep the visible text; without a text the placeholder stays.
                note_content = note_content.replace(
                    internal_link['placeholder'],
                    escape_link_text(internal_link['link_text'])
                )
            continue
        if 'evernote_note_guid' not in target_entry:
            raise ValueError('Note has not been added to Evernote yet')

        link_evernote_guid = target_entry['evernote_note_guid']
        link_url = f'evernote:///view/{user_id}/{shard_id}/{link_evernote_guid}/{link_evernote_guid}/'

        if 'link_text' in internal_link:
            link_text = internal_link["link_text"]
        else:
            # Plain-text links have no text of their own: use the target's title.
            link_text = target_entry['note_title']
        link_html = f'<a href="{link_url}">{escape_link_text(link_text)}</a>'

        note_content = note_content.replace(
            internal_link['placeholder'],
            link_html
        )

    note = Types.Note()
    note.title = note_entry['note_title']
    note.guid = note_entry['evernote_note_guid']
    note.content = note_content
    note_store.updateNote(note)

    note_entry['links_updated'] = True