wireservice · qauge · Apr 5, 2016 · Apr 5, 2016 · Apr 5, 2016 · Apr 5, 2016
diff --git a/csvkit/sed.py b/csvkit/sed.py
@@ -0,0 +1,298 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+A stream-oriented CSV modification tool. Like a stripped-down "sed"
+command, but for tabular data.
+"""
+
+import re
+import subprocess
+import sys
+
+from csvkit.exceptions import ColumnIdentifierError
+
+import six
+
+class InvalidModifier(Exception):
+    def __init__(self, message):  # raised whenever a modifier string fails validation
+        Exception.__init__(self, 'Invalid modifier: %s' % message)
+
+class CSVModifier(six.Iterator):
+    r"""
+    Iterator that rewrites the cells of CSV records as they are read.
+
+    :Parameters:
+
+    reader : iter
+
+      Source of the CSV records. It is advanced with `next()`, and
+      each record it yields should be a list of values.
+
+    modifiers : { list, dict }
+
+      The modifiers to run against the records of `reader`. A
+      sequence applies each modifier to the cell at the same
+      position in the record. A dictionary maps a column to its
+      modifier, a key being either an integer (column position) or
+      a string (column name). Whichever container is used, each
+      modifier is one of:
+
+      * function : takes a single string argument and returns a string
+      * string : a sed-like modifier
+
+      Sed-like modifiers currently understood:
+
+      * Substitution: "s/REGEX/REPL/FLAGS"
+
+        Matches of the regular expression `REGEX` are replaced with
+        `REPL`, which may contain back references. Recognized
+        flags:
+
+        * i: case-insensitive
+        * g: global replacement (otherwise only the first is replaced)
+        * l: uses locale-dependent character classes
+        * m: enables multiline matching for "^" and "$"
+        * s: "." also matches the newline character
+        * u: enables unicode escape sequences
+        * x: `REGEX` uses verbose descriptors & comments
+
+      * Transliteration: "y/SRC/DST/FLAGS"
+
+        (This is a slightly modified version of sed's "y" command.)
+
+        Every character of `SRC` is replaced with the character at
+        the same position in `DST`. A dash ("-") denotes a range of
+        characters (e.g. "a-z" for all lowercase ASCII letters). For
+        a literal dash, put it first or last, or escape it with "\".
+        A "\" escapes itself as well. The only supported flag is
+        "i", which makes the matching of `SRC` case-insensitive.
+
+      The "/" delimiter may be swapped for any other character,
+      provided it is used consistently and does not occur inside
+      the modifier, e.g. ``s|a|b|`` is equivalent to ``s/a/b/``.
+
+    header : bool, optional, default: true
+
+      When truthy (the default), the first row is taken from
+      `reader` up front and returned as-is, without modification.
+    """
+    def __init__(self, reader, modifiers, header=True):
+        names = None
+        if header:
+            names = next(reader)
+        self.reader, self.header, self.column_names = reader, header, names
+        self.modifiers = standardize_modifiers(names, modifiers)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if not self.header:
+            record = next(self.reader)
+            for position, modify in self.modifiers.items():
+                record[position] = modify(record[position])
+            return record
+        self.header = False  # the header row is handed back once, untouched
+        return self.column_names
+
+def standardize_modifiers(column_names, modifiers):
+    """
+    Normalize `modifiers` into a dict mapping column indices to callable modifiers.
+
+    `modifiers` is either a dictionary keyed by column position or column name, or
+    a sequence whose modifiers apply to the equivalently positioned columns. Keys
+    found in `column_names` are replaced with the integer position of that name;
+    other keys are kept as they are. Raises ColumnIdentifierError if a name
+    resolves to an index that is itself a key of `modifiers`.
+    """
+    # A sequence has neither .items() nor .values(): detect it up front and enumerate it directly.
+    if not hasattr(modifiers, 'items'):
+        return dict((idx, modifier_as_function(x)) for idx, x in enumerate(modifiers))
+    modifiers = dict((k, modifier_as_function(v)) for k, v in modifiers.items())
+    if not column_names:
+        return modifiers
+    p2 = {}
+    for k in modifiers:
+        if k in column_names:
+            idx = column_names.index(k)
+            if idx in modifiers:
+                raise ColumnIdentifierError("Column %s has index %i which already has a pattern." % (k, idx))
+            p2[idx] = modifiers[k]
+        else:
+            p2[k] = modifiers[k]
+    return p2
+
+def modifier_as_function(modifier):
+    """
+    Return a callable modifier for `modifier`, which is either already a callable
+    (returned unchanged) or a sed-like string whose first character selects the
+    modifier class to instantiate. Raises InvalidModifier for an empty modifier
+    or an unsupported modifier type.
+    """
+    if callable(modifier):
+        return modifier
+
+    if not modifier:
+        raise InvalidModifier('empty modifier')
+    supported_modifier_types = ['s', 'y']
+    modifier_type = modifier[0]
+    if modifier_type not in supported_modifier_types:
+        raise InvalidModifier('unsupported type `%s` in modifier `%s`; supported modifier types are %s' % (modifier_type, modifier, ', '.join(supported_modifier_types)))
+
+    # Dispatch through an explicit table rather than eval() on a constructed class
+    # name; it is built here because the classes are defined later in the module.
+    modifier_classes = {'s': SModifier, 'y': YModifier}
+    return modifier_classes[modifier_type](modifier)
+
+class Modifier(object):
+    """
+    Base class of the string modifiers. Validates the supplied modifier and splits it
+    into the parts that subclasses build upon: `modifier_lhs`, `modifier_rhs` and
+    `modifier_flags`. Subclasses are expected to set `modifier_form` and
+    `supported_flags` before calling this initializer, which reads both.
+    """
+    def __init__(self, modifier):
+        if len(modifier) < 4:
+            raise InvalidModifier('modifier is too short: `%s`' % modifier)
+
+        # first character: modifier type; second character: separator of the parts
+        kind, sep = modifier[0], modifier[1]
+        # the expected type is the first character of the subclass' modifier form
+        expected_kind = self.modifier_form[0] if self.modifier_form else None
+        if kind != expected_kind:
+            raise InvalidModifier('expected type `%s`, got `%s` in `%s`' % (expected_kind, kind, modifier))
+
+        parts = modifier.split(sep)
+        if len(parts) != 4:
+            raise InvalidModifier('expected modifier of the form `%s`, got `%s`' % (self.modifier_form.replace('/', sep), modifier))
+        lhs, rhs, flags = parts[1:]
+
+        if not lhs:
+            raise InvalidModifier('%s: no previous regular expression' % modifier)
+        self.modifier_lhs = lhs
+        self.modifier_rhs = rhs
+
+        # only the first unsupported flag is reported
+        rejected = [flag for flag in flags if flag not in self.supported_flags]
+        if rejected:
+            message = 'invalid flag `%s` in `%s`' % (rejected[0], modifier)
+            supported_count = len(self.supported_flags)
+            if supported_count == 0:
+                message += '; no flag is supported for type `%s`' % kind
+            elif supported_count == 1:
+                message += '; the only supported flag for type `%s` is %s' % (kind, self.supported_flags[0])
+            else:
+                message += '; supported flags for type `%s` are %s' % (kind, ', '.join(self.supported_flags))
+            raise InvalidModifier(message)
+        self.modifier_flags = flags
+
+class SModifier(Modifier):
+    """
+    The "substitution" modifier ("s/REGEX/REPL/FLAGS").
+
+    Replaces regular expression `REGEX` with replacement string
+    `REPL`, which can use back references. Supports the following
+    flags:
+
+    * i: case-insensitive
+    * g: global replacement (otherwise only the first is replaced)
+    * l: uses locale-dependent character classes
+    * m: enables multiline matching for "^" and "$"
+    * s: "." also matches the newline character
+    * u: enables unicode escape sequences
+    * x: `REGEX` uses verbose descriptors & comments
+
+    The "/" separator can be any character that is used consistently and
+    does not appear within the modifier, e.g. ``s|a|b|``. Raises
+    InvalidModifier if the modifier or its regular expression is invalid.
+    """
+    def __init__(self, modifier):
+        self.modifier_form = 's/REGEX/REPL/FLAGS'
+        self.supported_flags = ['i', 'g', 'l', 'm', 's', 'u', 'x']
+        super(SModifier, self).__init__(modifier)
+        self.repl = self.modifier_rhs
+
+        # "g" has no `re` constant (getattr yields 0); it only drives `count` below
+        re_flags = 0
+        for flag in self.modifier_flags:
+            re_flags |= getattr(re, flag.upper(), 0)
+
+        try:
+            self.regex = re.compile(self.modifier_lhs, re_flags)
+        except (re.error, ValueError) as e:
+            # `e.message` is gone in Python 3; ValueError is what `re` raises there for unusable flags
+            raise InvalidModifier('%s in `%s`' % (str(e), modifier))
+
+        self.count = 0 if 'g' in self.modifier_flags else 1  # 0: replace every match
+
+    def __call__(self, value):
+        return self.regex.sub(self.repl, value, count=self.count)
+
+def cranges(pattern):
+    r"""
+    Expand the character ranges of `pattern` and return the expanded string.
+
+    A dash ("-") placed between two characters stands for every character
+    from the first to the second one, inclusive (e.g. "a-f" is "abcdef").
+    A dash is taken literally when it is the first or the last character of
+    the pattern, or when it is escaped with "\". A "\" followed by another
+    character yields that character as is.
+
+    Examples:
+      [pattern]  ->  [crange]
+      'a-f'      ->  'abcdef'
+      'a\-f'     ->  'a-f'
+      'abc-'     ->  'abc-'
+      '-abc'     ->  '-abc'
+      'a-c-e-g'  ->  'abcdefg'
+    """
+    expanded = ''
+    pos, size = 0, len(pattern)
+    while pos < size:
+        current = pattern[pos]
+        pos += 1
+        if current == '-' and expanded and pos < size:
+            # a range: continue from the last emitted character up to the one after the dash
+            start, stop = ord(expanded[-1]) + 1, ord(pattern[pos]) + 1
+            expanded += ''.join(chr(code) for code in range(start, stop))
+            pos += 1
+        else:
+            if current == '\\' and pos < size:
+                current = pattern[pos]
+                pos += 1
+            expanded += current
+    return expanded
+
+class YModifier(Modifier):
+    """
+    The "transliterate" modifier ("y/SRC/DST/FLAGS"), a slightly modified
+    version of sed's "y" command.
+
+    Each character in `SRC` is replaced with the corresponding character in `DST`.
+    Character ranges are supported in SRC and DST, which must expand to the same
+    length; otherwise InvalidModifier is raised. The only supported flag is "i",
+    which makes the matching of `SRC` case-insensitive.
+
+    The "/" separator can be any character that is used consistently and does
+    not appear within the modifier, e.g. ``y|a|b|`` is equivalent to ``y/a/b/``.
+    """
+    def __init__(self, modifier):
+        self.modifier_form = 'y/SRC/DST/FLAGS'
+        self.supported_flags = ['i']
+        super(YModifier, self).__init__(modifier)
+        src = cranges(self.modifier_lhs)
+        dst = cranges(self.modifier_rhs)
+
+        if len(src) != len(dst):
+            # %i needs the lengths: formatting the strings themselves raises TypeError
+            raise InvalidModifier('expecting source and destination to have the same length, but %i != %i, got `%s`' % (len(src), len(dst), modifier))
+
+        if 'i' in self.modifier_flags:
+            src = src.lower() + src.upper()
+            dst = 2 * dst
+
+        self.table = {ord(src_char) : ord(dst_char) for src_char, dst_char in zip(src, dst)}
+
+    def __call__(self, value):
+        return value.translate(self.table)
diff --git a/csvkit/utilities/csvsed.py b/csvkit/utilities/csvsed.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Command-line interface to `csvkit.sed`.
+"""
+
+import agate
+from csvkit.cli import CSVKitUtility
+from csvkit.sed import CSVModifier
+
+class CSVSed(CSVKitUtility):
+    """Command-line utility that applies one sed-like modifier to selected columns of a CSV."""
+    description = 'A stream-oriented CSV modification tool. Like a ' \
+                  'stripped-down "sed" command, but for tabular data.'
+
+    def add_arguments(self):
+        self.argparser.add_argument('-n', '--names', dest='names_only', action='store_true',
+                                    help='Display column names and indices from the input CSV and exit.')
+        self.argparser.add_argument('-c', '--columns', dest='columns',
+                                    help='A comma separated list of column indices or names to be modified.')
+        self.argparser.add_argument('-m', '--modifier', dest='modifier',
+                                    help='If specified, the "sed" modifier to evaluate: currently supports substitution '
+                                      '(s/REGEX/REPL/FLAGS) and transliteration (y/SRC/DEST/FLAGS).')
+
+    def main(self):
+        """Validate the options, then write the input rows with the modifier applied to the chosen columns."""
+        if self.args.names_only:
+            self.print_column_names()
+            return
+
+        if not self.args.columns:
+            self.argparser.error('You must specify at least one column to modify using the -c option.')
+
+        if self.args.modifier is None:
+            self.argparser.error('-m must be specified, unless using the -n option.')
+
+        # On Python 2 the argument is a byte string: decode it to work exclusively
+        # with unicode modifiers. On Python 3 it is already text and is left alone.
+        if isinstance(self.args.modifier, bytes):
+            self.args.modifier = self.args.modifier.decode('utf-8')
+
+        reader_kwargs = self.reader_kwargs
+        writer_kwargs = self.writer_kwargs
+        # Move the line_numbers option from the writer to the reader, keeping the other reader options.
+        if writer_kwargs.pop('line_numbers', False):
+            reader_kwargs['line_numbers'] = True
+
+        rows, column_names, column_ids = self.get_rows_and_column_names_and_column_ids(**reader_kwargs)
+
+        modifiers = {idx: self.args.modifier for idx in column_ids}
+        # header=False: presumably `rows` no longer includes the header row, which is written separately below
+        reader = CSVModifier(rows, modifiers, header=False)
+
+        output = agate.csv.writer(self.output_file, **writer_kwargs)
+        output.writerow(column_names)
+
+        for row in reader:
+            output.writerow(row)
+
+def launch_new_instance():
+    # entry point: build the utility and invoke its main() directly
+    CSVSed().main()
+
+if __name__ == '__main__':  # run the utility only when executed as a script, not on import
+    launch_new_instance()
diff --git a/docs/cli.rst b/docs/cli.rst
@@ -23,6 +23,7 @@ Processing
     scripts/csvcut
     scripts/csvgrep
     scripts/csvjoin
+    scripts/csvsed
     scripts/csvsort
     scripts/csvstack