Files() >> Lines() >> Normalize() >> Contains(keywords) >> Write()

In [3]:
from nutsflow import *

In [5]:
@nut_source
def Files():
    """Yield a (path, open file) pair for every ``data/*.txt`` file.

    Files are opened in text mode with the platform default encoding.
    A handle is only usable while the consumer is working on its pair:
    the ``with`` block closes it as soon as the next pair is requested.
    """
    import glob

    for filepath in glob.glob('data/*.txt'):
        with open(filepath) as handle:
            yield filepath, handle

In [6]:
# Calling the source on its own just builds the nut; the cell echoes the wrapper object.
Files()

<nutsflow.factory.nut_source.<locals>.Wrapper at 0x1f535659130>

In [7]:
# Collect() drains the source into a list of (path, file handle) tuples.
# The handles in that list are already closed: Files() leaves its `with`
# block whenever the next item is requested.
Files() >> Collect()

[('data\\parrot.txt',
  <_io.TextIOWrapper name='data\\parrot.txt' mode='r' encoding='cp1252'>),
 ('data\\zen.txt',
  <_io.TextIOWrapper name='data\\zen.txt' mode='r' encoding='cp1252'>)]

In [8]:
@nut_processor
def Lines(files):
    """Flatten (path, file) pairs into a stream of (path, line) pairs.

    files: iterable of (path, f) tuples where iterating f yields lines.
    Each line is passed through unchanged and tagged with the path it
    came from.
    """
    for filepath, handle in files:
        yield from ((filepath, row) for row in handle)

In [9]:
# Head(3) keeps only the first three (path, line) tuples of the stream.
Files() >> Lines() >> Head(3)

[('data\\parrot.txt', 'The Dead Parrot Sketch\n'),
 ('data\\parrot.txt', "Customer: 'Ello, I wish to register a complaint.\n"),
 ('data\\parrot.txt', "C: 'Ello, Miss?\n")]

In [10]:
@nut_function
def Normalize(line):
    """Return (path, text) with the text cleaned up for keyword matching.

    line: a (path, text) tuple.
    The characters '.', '?', ':' and ',' are each replaced by a space,
    then surrounding whitespace is stripped and the text is lower-cased.
    The path is returned untouched.
    """
    path, text = line
    # One translation pass instead of one replace() call per character.
    to_blank = str.maketrans({ch: ' ' for ch in '.?:,'})
    cleaned = text.translate(to_blank)
    return path, cleaned.strip().lower()

In [11]:
# Same preview as before, now with every line passed through Normalize().
Files() >> Lines() >> Normalize() >> Head(3)

[('data\\parrot.txt', 'the dead parrot sketch'),
 ('data\\parrot.txt', "customer  'ello  i wish to register a complaint"),
 ('data\\parrot.txt', "c  'ello  miss")]

In [12]:
@nut_filter
def Contains(line, keywords):
    """Return the keywords that occur as whitespace-separated words in the line.

    line: a (path, text) tuple; only the text is inspected.
    keywords: a set of words to look for.
    The result is the set intersection, so it is truthy exactly when
    at least one keyword is present.
    """
    _, text = line
    words = set(text.split())
    return keywords & words

In [13]:
@nut_sink
def Write(lines, outpath):
    """Write every (path, text) pair to outpath as one 'path: text' line.

    lines: iterable of (path, text) tuples.
    outpath: file to create or overwrite (opened in text mode 'w').
    """
    with open(outpath, 'w') as out:
        out.writelines('%s: %s\n' % (path, text) for path, text in lines)

In [15]:
# Words to search for; a set, because Contains() intersects it with each line's words.
keywords = {'polly', 'complex'}
# Full pipeline: read, split into lines, normalize, keep matching lines,
# echo them with Print() and save them to data/lines.txt.
Files() >> Lines() >> Normalize() >> Contains(keywords) >> Print() >> Write('data/lines.txt')

('data\\parrot.txt', "'ello  mister polly parrot! i've got a lovely fresh cuttle")
('data\\zen.txt', 'simple is better than complex')
('data\\zen.txt', 'complex is better than complicated')
