In [20]:
import nltk, re

In [21]:
def split0(sents):
    """Tokenize each sentence on whitespace using an explicit loop.

    Baseline benchmark variant: builds the output list with repeated
    ``append`` calls rather than a comprehension.

    Args:
        sents: iterable of strings.

    Returns:
        A list with one list of tokens per input string, as produced by
        ``str.split()`` with no arguments (splits on whitespace runs and
        drops leading/trailing whitespace).
    """
    tokenized = []
    for sentence in sents:
        tokens = sentence.split()
        tokenized.append(tokens)
    return tokenized

def split1(sents):
    """Tokenize each sentence on whitespace using a list comprehension.

    Args:
        sents: iterable of strings.

    Returns:
        A list with one list of tokens per input string, as produced by
        ``str.split()`` with no arguments.
    """
    return [
        sentence.split()
        for sentence in sents
    ]

def split2(sents):
    """Tokenize each sentence by regex-splitting on single space characters.

    Unlike ``str.split()``, this splits on every individual ``' '``, so
    consecutive spaces yield empty-string tokens and other whitespace
    (tabs, trailing newlines) stays attached to the neighbouring token.

    Args:
        sents: iterable of strings.

    Returns:
        A list with one list of tokens per input string.
    """
    return [
        re.split(' ', sentence)
        for sentence in sents
    ]

def split3(sents):
    """Tokenize each sentence by regex-splitting on runs of whitespace.

    Unlike ``str.split()``, ``re.split`` keeps an empty string where the
    input starts or ends with whitespace (e.g. a trailing newline produces
    a final ``''`` token).

    Args:
        sents: iterable of strings.

    Returns:
        A list with one list of tokens per input string.
    """
    # Raw string literal: in a plain literal '\s' is an invalid escape
    # sequence, which newer Python versions warn about at compile time.
    return [re.split(r'\s+', s) for s in sents]

def split4(sents):
    """Tokenize each sentence with ``nltk.word_tokenize``.

    Args:
        sents: iterable of strings.

    Returns:
        A list with one ``nltk.word_tokenize`` result per input string.
    """
    return [
        nltk.word_tokenize(sentence)
        for sentence in sents
    ]

In [22]:
# Load the benchmark input. readlines() returns one string per line of the
# file, each still ending in its newline character.
# NOTE(review): presumably the file holds one sentence per line - confirm
# against the data file.
with open('data/brown_small.txt', 'r') as ifile:
    sents = ifile.readlines()

In [23]:
%timeit -n 10000 tok = split0(sents)

10000 loops, best of 3: 49.4 µs per loop


In [24]:
%timeit -n 10000 tok = split1(sents)

10000 loops, best of 3: 46.5 µs per loop


In [25]:
%timeit -n 10000 tok = split2(sents)

10000 loops, best of 3: 119 µs per loop


In [26]:
%timeit -n 10000 tok = split3(sents)

10000 loops, best of 3: 277 µs per loop


In [28]:
%timeit -n 100 tok = split4(sents)

100 loops, best of 3: 6.8 ms per loop


Conclusion:

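Calling `str.split()` inside a list comprehension is the fastest of the approaches timed above for splitting sentences into words on whitespace, although it is only marginally faster than the explicit loop (46.5 µs vs. 49.4 µs).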

In [36]:
def fancysplit0(sents):
    """Tokenize each sentence and wrap it in "SENT" ... "*" markers.

    Benchmark variant that builds the decorated string through a nested
    helper function before splitting.

    Args:
        sents: iterable of strings.

    Returns:
        A list with one token list per input string; each token list
        starts with "SENT" and ends with "*".
    """
    def wrap(text):
        # Surround the raw sentence with the start/end marker tokens.
        return "SENT " + text + " *"

    return [wrap(sentence).split() for sentence in sents]

def fancysplit1(sents):
    """Tokenize each sentence and wrap it in "SENT" ... "*" markers.

    Benchmark variant that splits first, then adds the markers to each
    token list using comprehensions evaluated only for their side effects.

    Args:
        sents: iterable of strings.

    Returns:
        A list with one token list per input string; each token list
        starts with "SENT" and ends with "*".
    """
    tokenized = [sentence.split() for sentence in sents]
    # The two comprehensions below are run purely to mutate each token
    # list; the lists of None they build are discarded. This is the
    # technique being timed against fancysplit2's explicit loop.
    [tokens.append("*") for tokens in tokenized]
    [tokens.insert(0, "SENT") for tokens in tokenized]
    return tokenized

def fancysplit2(sents):
    """Tokenize each sentence and wrap it in "SENT" ... "*" markers.

    Benchmark variant that splits first, then adds the markers to each
    token list in an explicit ``for`` loop.

    Args:
        sents: iterable of strings.

    Returns:
        A list with one token list per input string; each token list
        starts with "SENT" and ends with "*".
    """
    tokenized = [sentence.split() for sentence in sents]
    for tokens in tokenized:
        # End marker first, then the start marker at the front.
        tokens.append("*")
        tokens.insert(0, "SENT")
    return tokenized

def fancysplit3(sents):
    """Tokenize each sentence and wrap it in "SENT" ... "*" markers.

    Benchmark variant that concatenates the markers onto the string inline
    and splits the result, all inside one list comprehension.

    Args:
        sents: iterable of strings.

    Returns:
        A list with one token list per input string; each token list
        starts with "SENT" and ends with "*".
    """
    return [
        ("SENT " + sentence + " *").split()
        for sentence in sents
    ]

def fancysplit4(sents):
    """Tokenize each sentence and wrap it in "SENT" ... "*" markers.

    Benchmark variant that builds the decorated string with ``%``
    formatting before splitting.

    Args:
        sents: iterable of strings.

    Returns:
        A list with one token list per input string; each token list
        starts with "SENT" and ends with "*".
    """
    return [
        ("SENT %s *" % sentence).split()
        for sentence in sents
    ]

In [30]:
%timeit -n 10000 toks = fancysplit0(sents)

10000 loops, best of 3: 57 µs per loop


In [31]:
%timeit -n 10000 toks = fancysplit1(sents)

10000 loops, best of 3: 60.4 µs per loop


In [32]:
%timeit -n 10000 toks = fancysplit2(sents)

10000 loops, best of 3: 57.9 µs per loop


In [33]:
%timeit -n 10000 toks = fancysplit3(sents)

10000 loops, best of 3: 53 µs per loop


In [37]:
%timeit -n 10000 toks = fancysplit4(sents)

10000 loops, best of 3: 60.9 µs per loop


Conclusion:

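Concatenating the markers onto the string before splitting, directly inside the list comprehension (`fancysplit3`), is the fastest variant, but the improvement is marginal (53 µs vs. 57–61 µs for the others).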