#!/usr/bin/env python
# Copyright 2011 Peter Odding
# This program is licensed under the MIT license.
# This Python script can be used by the notes.vim plug-in to perform fast
# keyword searches in the user's notes. It has two advantages over just using
# Vim's internal :vimgrep command to search all of the user's notes:
# - Very large notes don't slow searching down so much;
# - Hundreds of notes can be searched in less than a second.
# For more information, see the documentation of the notes.vim plug-in.
# Standard library modules.
import fnmatch
import getopt
import os
import pickle
import re
import sys
# Try to import the optional (third-party) Levenshtein module. Don't error out
# when it's not installed: the flag below lets the keyword listing fall back to
# ranking without edit distances.
try:
  import Levenshtein
  levenshtein_supported = True
except ImportError:
  levenshtein_supported = False
class NotesIndex:

  '''
  Keyword index for a directory of plain text notes.

  Instantiating the class runs the whole program: the command line is parsed,
  the index is loaded from disk (or started empty), brought up to date with
  the notes directory, saved when it changed, and finally either the keywords
  matching ``--list`` or the names of the notes containing all of the given
  keywords are written to standard output.

  The code is written to run on both Python 2.6+ and Python 3.
  '''

  def __init__(self):
    ''' Entry point to the notes search. '''
    keywords = self.parse_args()
    self.load_index()
    self.update_index()
    if self.dirty:
      self.save_index()
    if self.keyword_filter is not None:
      self.list_keywords(self.keyword_filter)
    else:
      matches = self.search_index(keywords)
      self._write_line('\n'.join(sorted(matches)))

  def parse_args(self):
    '''
    Parse the command line arguments.

    Sets ``database_file``, ``user_directory``, ``character_encoding`` and
    ``keyword_filter`` on the instance and returns the set of tokenized
    keywords given as positional arguments. Exits the process when the
    options are invalid (status 2), when help is requested (status 0) or
    when the notes directory doesn't exist (status 1).
    '''
    try:
      opts, keywords = getopt.getopt(sys.argv[1:], 'l:d:n:e:h',
          ['list=', 'database=', 'notes=', 'encoding=', 'help'])
    except getopt.GetoptError as error:
      print(str(error))
      self.usage()
      sys.exit(2)
    # Define the command line option defaults.
    self.database_file = '~/.vim/misc/notes/index.pickle'
    self.user_directory = '~/.vim/misc/notes/user/'
    self.character_encoding = 'UTF-8'
    self.keyword_filter = None
    # Map command line options to variables.
    for opt, arg in opts:
      if opt in ('-l', '--list'):
        self.keyword_filter = arg.strip().lower()
      elif opt in ('-d', '--database'):
        self.database_file = arg
      elif opt in ('-n', '--notes'):
        self.user_directory = arg
      elif opt in ('-e', '--encoding'):
        self.character_encoding = arg
      elif opt in ('-h', '--help'):
        self.usage()
        sys.exit(0)
      else:
        # getopt only returns options it was told about, so this is an
        # internal invariant rather than input validation.
        assert False, "Unhandled option"
    # Decode only after the loop so that --encoding is honored regardless of
    # the order in which the options were given.
    if self.keyword_filter is not None:
      self.keyword_filter = self.decode(self.keyword_filter)
    # Canonicalize pathnames, check validity.
    self.database_file = self.munge_path(self.database_file)
    self.user_directory = self.munge_path(self.user_directory)
    if not os.path.isdir(self.user_directory):
      sys.stderr.write("Notes directory %s doesn't exist!\n" % self.user_directory)
      sys.exit(1)
    # Return tokenized keyword arguments.
    return self.tokenize(' '.join(keywords))

  def load_index(self):
    '''
    Load the keyword index or start with an empty one.

    Any failure to read a usable version 1 index (missing file, corrupt
    pickle, unexpected structure) results in an empty index with
    ``first_use`` and ``dirty`` set, so that all notes are scanned again.
    '''
    try:
      # NOTE(security): unpickling can execute arbitrary code, so the index
      # file must only ever be a file written by save_index().
      # Pickles are binary data, hence the 'rb' mode.
      with open(self.database_file, 'rb') as handle:
        index = pickle.load(handle)
      if index['version'] != 1 or 'keywords' not in index or 'files' not in index:
        raise ValueError("Unsupported keyword index")
    except Exception:
      self.first_use = True
      self.dirty = True
      self.index = {'keywords': {}, 'files': {}, 'version': 1}
    else:
      self.index = index
      self.first_use = False
      self.dirty = False

  def save_index(self):
    ''' Save the keyword index to disk. '''
    with open(self.database_file, 'wb') as handle:
      pickle.dump(self.index, handle)

  def update_index(self):
    '''
    Update the keyword index by scanning the notes directory.

    On first use every note is scanned. Otherwise notes that disappeared are
    forgotten, notes with a newer modification time are scanned again and
    notes that are not in the index yet are added.
    '''
    # First we find the filenames and last modified times of the notes on disk.
    notes_on_disk = {}
    for filename in os.listdir(self.user_directory):
      # (Vim swap files are ignored)
      if filename == '.swp' or fnmatch.fnmatch(filename, '.*.s??'):
        continue
      abspath = os.path.join(self.user_directory, filename)
      if os.path.isfile(abspath):
        notes_on_disk[abspath] = os.path.getmtime(abspath)
    # Then we either scan the whole bunch or be a bit more subtle.
    if self.first_use:
      for filename, last_modified in notes_on_disk.items():
        self.add_note(filename, last_modified)
      return
    # Check for updated and/or deleted notes since the last run. We iterate
    # over a snapshot because delete_note() and add_note() modify the
    # dictionary that's being walked.
    for filename, last_modified_in_db in list(self.index['files'].items()):
      if filename not in notes_on_disk:
        # Forget a deleted note.
        self.delete_note(filename)
        continue
      # Check whether previously seen note has changed? Either way we're done
      # with this note, so it's removed from the notes left to add.
      last_modified_on_disk = notes_on_disk.pop(filename)
      if last_modified_on_disk > last_modified_in_db:
        self.delete_note(filename)
        self.add_note(filename, last_modified_on_disk)
    # Whatever is left on disk was not indexed before: add the new notes.
    for filename, last_modified in notes_on_disk.items():
      self.add_note(filename, last_modified)

  def add_note(self, filename, last_modified):
    ''' Add a note to the index (assumes the note is not already indexed). '''
    sys.stderr.write("Scanning %s ..\n" % filename)
    # Read the raw bytes; tokenize() decodes them using the user's encoding.
    # Reading comes first so a failed read doesn't leave a half indexed note.
    with open(filename, 'rb') as handle:
      keywords = self.tokenize(handle.read())
    self.index['files'][filename] = last_modified
    keyword_index = self.index['keywords']
    for kw in keywords:
      keyword_index.setdefault(kw, []).append(filename)
    self.dirty = True

  def delete_note(self, filename):
    '''
    Remove a note from the index.

    Raises ``KeyError`` when the note is not in the index. Keywords that are
    no longer used by any note are dropped from the index.
    '''
    del self.index['files'][filename]
    keyword_index = self.index['keywords']
    # Walk a copy of the keys because unused keywords are deleted on the go.
    for kw in list(keyword_index):
      remaining = [name for name in keyword_index[kw] if name != filename]
      if remaining:
        keyword_index[kw] = remaining
      else:
        del keyword_index[kw]
    self.dirty = True

  def search_index(self, keywords):
    ''' Return names of files containing all of the given keywords. '''
    matches = None
    for kw in keywords:
      filenames = set(self.index['keywords'].get(kw, ()))
      matches = filenames if matches is None else matches & filenames
      if not matches:
        # No note can match anymore once the intersection is empty.
        break
    return list(matches) if matches else []

  def list_keywords(self, substring, limit=25):
    '''
    Print all (matching) keywords to standard output.

    At most ``limit`` keywords containing ``substring`` are printed, one per
    line. Keywords are ranked by edit distance to ``substring`` (only when
    the Levenshtein module is available), then by the number of notes that
    contain them (most used first), then alphabetically.
    '''
    decorated = []
    for kw, filenames in self.index['keywords'].items():
      lowered = kw.lower()
      if substring in lowered:
        if levenshtein_supported:
          decorated.append((Levenshtein.distance(lowered, substring), -len(filenames), kw))
        else:
          decorated.append((-len(filenames), kw))
    decorated.sort()
    selection = [d[-1] for d in decorated[:limit]]
    self._write_line('\n'.join(selection))

  def tokenize(self, text):
    ''' Tokenize a string into a set of normalized, unique keywords. '''
    text = self.decode(text).lower()
    # Matches of \w+ are never empty and never contain whitespace, so they
    # can be collected as they are.
    return set(re.findall(r'\w+', text, re.UNICODE))

  def encode(self, text):
    '''
    Encode a string in the user's preferred character encoding.

    Byte strings are returned unchanged because they're already encoded
    (e.g. the filenames returned by os.listdir() on Python 2).
    '''
    if isinstance(text, bytes):
      return text
    return text.encode(self.character_encoding, 'ignore')

  def decode(self, text):
    '''
    Decode a string in the user's preferred character encoding.

    Text that's already decoded (e.g. command line arguments on Python 3) is
    returned unchanged.
    '''
    if isinstance(text, bytes):
      return text.decode(self.character_encoding, 'ignore')
    return text

  def munge_path(self, path):
    ''' Canonicalize user-defined path, making it absolute. '''
    return os.path.abspath(os.path.expanduser(path))

  def _write_line(self, text):
    ''' Write a line to standard output in the user's preferred encoding. '''
    data = self.encode(text) + b'\n'
    binary_stdout = getattr(sys.stdout, 'buffer', None)
    if binary_stdout is not None:
      # Python 3: bypass the text layer so the requested encoding is used.
      binary_stdout.write(data)
    elif isinstance(data, str):
      # Python 2: byte strings can be written to standard output as is.
      sys.stdout.write(data)
    else:
      # Text-only replacement stream (no binary layer available).
      sys.stdout.write(data.decode(self.character_encoding, 'ignore'))

  def usage(self):
    ''' Print the command line usage message to standard output. '''
    print('''
search-notes [OPTIONS] KEYWORD...

Search a directory of plain text files using a full text index,
updated automatically during each invocation of the program.

Valid options include:

  -l, --list=SUBSTR    list keywords matching substring
  -d, --database=FILE  set path to keywords index file
  -n, --notes=DIR      set directory with user notes
  -e, --encoding=NAME  set character encoding of notes
  -h, --help           show this message and exit

For more information see the notes.vim plug-in documentation.
''')
if __name__ == '__main__':
  # Instantiating the class runs the program: NotesIndex.__init__() is the
  # entry point to the notes search.
  NotesIndex()
# vim: ts=2 sw=2 et
