diff --git a/csvkit/table.py b/csvkit/table.py index 168aebe83..777284e67 100644 --- a/csvkit/table.py +++ b/csvkit/table.py @@ -6,6 +6,7 @@ from csvkit import CSVKitReader, CSVKitWriter from csvkit import sniffer from csvkit import typeinference +from csvkit.cli import parse_column_identifiers class InvalidType(object): """ @@ -179,7 +180,7 @@ def row(self, i): return row_data @classmethod - def from_csv(cls, f, name='from_csv_table', snifflimit=None, **kwargs): + def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, **kwargs): """ Creates a new Table from a file-like object containing CSV data. """ @@ -198,20 +199,30 @@ def from_csv(cls, f, name='from_csv_table', snifflimit=None, **kwargs): reader = CSVKitReader(f, dialect=dialect, **kwargs) headers = reader.next() - - data_columns = [[] for c in headers] + + # Prepare the proper number of containers + if column_ids: + column_ids = parse_column_identifiers(column_ids, headers) + # Spin off list of chosen column names + headers_copy = list(headers) + for i, c in enumerate(column_ids): + headers[i] = headers_copy[c] + data_columns = [[] for c in column_ids] + else: + column_ids = [i for i in range(len(headers))] + data_columns = [[] for c in headers] for row in reader: for i, d in enumerate(row): try: - data_columns[i].append(d.strip()) + data_columns[i].append(row[column_ids[i]].strip()) except IndexError: # Non-rectangular data is truncated break columns = [] - for i, c in enumerate(data_columns): + for i, c in enumerate(data_columns): columns.append(Column(i, headers[i], c)) return Table(columns, name=name) diff --git a/csvkit/utilities/csvstat.py b/csvkit/utilities/csvstat.py index f0f1a2a20..83b36fc36 100644 --- a/csvkit/utilities/csvstat.py +++ b/csvkit/utilities/csvstat.py @@ -3,7 +3,7 @@ import datetime from csvkit import table -from csvkit.cli import CSVKitUtility +from csvkit.cli import CSVKitUtility from heapq import nlargest from operator import itemgetter @@ -14,9 +14,11 @@ class CSVStat(CSVKitUtility): def add_arguments(self): self.argparser.add_argument('-y', '--snifflimit', dest='snifflimit', type=int, help='Limit CSV dialect sniffing to the specified number of bytes.') + self.argparser.add_argument('-c', '--columns', dest='columns', + help='A comma separated list of column indices or names to be examined. Defaults to all columns.') def main(self): - tab = table.Table.from_csv(self.args.file, snifflimit=self.args.snifflimit, **self.reader_kwargs) + tab = table.Table.from_csv(self.args.file, snifflimit=self.args.snifflimit, column_ids=self.args.columns, **self.reader_kwargs) null_excluder = lambda i: i is not None diff --git a/docs/tutorial/examining_the_data.rst b/docs/tutorial/examining_the_data.rst index e6c216652..c37f7b34c 100644 --- a/docs/tutorial/examining_the_data.rst +++ b/docs/tutorial/examining_the_data.rst @@ -78,7 +78,7 @@ Searching for rows with csvgrep After reviewing the summary statistics you might wonder where your home state falls in the order. To get a simple answer to the question we can use :doc:`/scripts/csvgrep` to search for the state's name amongst the rows. Let's also use csvcut to just look at the columns we care about:: - $ csvcut -c 1,"TOTAL" 2009.csv | csvgrep -c 1 ILLINOIS + $ csvcut -c 1,"TOTAL" 2009.csv | csvgrep -c 1 -m ILLINOIS State Name,TOTAL ILLINOIS,"21,964" @@ -117,7 +117,6 @@ Now we can use :doc:`/scripts/csvsort` to sort the rows by the first column:: 40402,TEXAS 36394,FLORIDA 33986,ARIZONA - 21964,ILLINOIS The -r tells ``csvsort`` to sort in descending order.