Skip to content

Commit

Permalink
csvjoin supports --no-header-row, closes #404
Browse files Browse the repository at this point in the history
  • Loading branch information
James McKinney committed Feb 9, 2016
1 parent 50757a9 commit 3927ddd
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 71 deletions.
1 change: 1 addition & 0 deletions CHANGELOG
Expand Up @@ -17,6 +17,7 @@ Improvements:

* "import csvkit as csv" will now defer to agate readers/writers.
* csvgrep supports --no-header-row.
* csvjoin supports --no-header-row.
* csvjson supports --snifflimit and --no-inference.
* csvlook supports --snifflimit and --no-inference.
* csvsql supports custom SQLAlchemy dialects.
Expand Down
102 changes: 53 additions & 49 deletions csvkit/join.py
Expand Up @@ -22,47 +22,49 @@ def _get_mapped_keys(rows, column_index):
return mapped_keys


def sequential_join(left_rows, right_rows, header=True):
    """
    Join two tables by aligning them horizontally without performing any filtering.

    Row N of the left table is concatenated with row N of the right table.
    When one table has fewer rows than the other, the missing side is padded
    with empty strings so every output row keeps the combined width.

    :param left_rows: The left table, as a list of rows (each row a list).
    :param right_rows: The right table, as a list of rows (each row a list).
    :param header: If True (the default), the first row of each table is a
        header row and the two headers are concatenated into the first output
        row. If False, every row is treated as data.
    :returns: The joined table, as a new list of rows.
    """
    # Padding widths come from each table's first row, whether it is a header
    # or data. An empty table is treated as zero columns wide instead of
    # raising IndexError.
    len_left_headers = len(left_rows[0]) if left_rows else 0
    len_right_headers = len(right_rows[0]) if right_rows else 0

    output = []

    if header:
        if left_rows or right_rows:
            left_header = left_rows[0] if left_rows else []
            right_header = right_rows[0] if right_rows else []
            output.append(left_header + right_header)

        left_rows = left_rows[1:]
        right_rows = right_rows[1:]

    # Always walk the right table through an explicit iterator, in both the
    # header and the no-header case: next() requires an iterator (a plain list
    # raises TypeError), and the trailing loop below must resume where the
    # pairing stopped rather than restart from the first right row.
    right_iter = iter(right_rows)

    for left_row in left_rows:
        try:
            right_row = next(right_iter)
        except StopIteration:
            # Right table exhausted: pad the right side with blanks.
            output.append(left_row + [u''] * len_right_headers)
        else:
            output.append(left_row + right_row)

    # Left table exhausted: emit remaining right rows with a blank left side.
    for right_row in right_iter:
        output.append([u''] * len_left_headers + right_row)

    return output


def inner_join(left_table, left_column_id, right_table, right_column_id):
def inner_join(left_rows, left_column_id, right_rows, right_column_id, header=True):
"""
Execute an inner join on two tables and return the combined table.
"""
# Grab headers
left_headers = left_table[0]
len_left_headers = len(left_headers)
right_headers = right_table[0]
left_rows = left_table[1:]
right_rows = right_table[1:]
len_left_headers = len(left_rows[0])

if header:
output = [left_rows[0] + right_rows[0]]
left_rows = left_rows[1:]
right_rows = right_rows[1:]
else:
output = []

# Map right rows to keys
right_mapped_keys = _get_mapped_keys(right_rows, right_column_id)

output = [left_headers + right_headers]

for left_row in left_rows:
len_left_row = len(left_row)

Expand All @@ -78,25 +80,26 @@ def inner_join(left_table, left_column_id, right_table, right_column_id):
return output


def full_outer_join(left_table, left_column_id, right_table, right_column_id):
def full_outer_join(left_rows, left_column_id, right_rows, right_column_id, header=True):
"""
Execute full outer join on two tables and return the combined table.
"""
# Grab headers
left_headers = left_table[0]
len_left_headers = len(left_headers)
right_headers = right_table[0]
left_rows = left_table[1:]
right_rows = right_table[1:]
len_left_headers = len(left_rows[0])
len_right_headers = len(right_rows[0])

if header:
output = [left_rows[0] + right_rows[0]]
left_rows = left_rows[1:]
right_rows = right_rows[1:]
else:
output = []

# Get ordered keys
left_ordered_keys = _get_ordered_keys(left_rows, left_column_id)

# Get mapped keys
right_mapped_keys = _get_mapped_keys(right_rows, right_column_id)

output = [left_headers + right_headers]

for left_row in left_rows:
len_left_row = len(left_row)
left_key = left_row[left_column_id]
Expand All @@ -108,33 +111,34 @@ def full_outer_join(left_table, left_column_id, right_table, right_column_id):
for right_row in right_mapped_keys[left_key]:
output.append(left_row + right_row)
else:
output.append(left_row + ([u''] * len(right_headers)))
output.append(left_row + ([u''] * len_right_headers))

for right_row in right_rows:
right_key = right_row[right_column_id]

if right_key not in left_ordered_keys:
output.append(([u''] * len(left_headers)) + right_row)
output.append(([u''] * len_left_headers) + right_row)

return output


def left_outer_join(left_table, left_column_id, right_table, right_column_id):
def left_outer_join(left_rows, left_column_id, right_rows, right_column_id, header=True):
"""
Execute left outer join on two tables and return the combined table.
"""
# Grab headers
left_headers = left_table[0]
len_left_headers = len(left_headers)
right_headers = right_table[0]
left_rows = left_table[1:]
right_rows = right_table[1:]
len_left_headers = len(left_rows[0])
len_right_headers = len(right_rows[0])

if header:
output = [left_rows[0] + right_rows[0]]
left_rows = left_rows[1:]
right_rows = right_rows[1:]
else:
output = []

# Get mapped keys
right_mapped_keys = _get_mapped_keys(right_rows, right_column_id)

output = [left_headers + right_headers]

for left_row in left_rows:
len_left_row = len(left_row)
left_key = left_row[left_column_id]
Expand All @@ -146,30 +150,30 @@ def left_outer_join(left_table, left_column_id, right_table, right_column_id):
for right_row in right_mapped_keys[left_key]:
output.append(left_row + right_row)
else:
output.append(left_row + ([u''] * len(right_headers)))
output.append(left_row + ([u''] * len_right_headers))

return output


def right_outer_join(left_table, left_column_id, right_table, right_column_id):
def right_outer_join(left_rows, left_column_id, right_rows, right_column_id, header=True):
"""
Execute right outer join on two tables and return the combined table.
"""
# Grab headers
left_headers = left_table[0]
len_left_headers = len(left_headers)
right_headers = right_table[0]
left_rows = left_table[1:]
right_rows = right_table[1:]
len_left_headers = len(left_rows[0])

if header:
output = [left_rows[0] + right_rows[0]]
left_rows = left_rows[1:]
right_rows = right_rows[1:]
else:
output = []

# Get ordered keys
left_ordered_keys = _get_ordered_keys(left_rows, left_column_id)

# Get mapped keys
right_mapped_keys = _get_mapped_keys(right_rows, right_column_id)

output = [left_headers + right_headers]

for left_row in left_rows:
len_left_row = len(left_row)
left_key = left_row[left_column_id]
Expand All @@ -185,6 +189,6 @@ def right_outer_join(left_table, left_column_id, right_table, right_column_id):
right_key = right_row[right_column_id]

if right_key not in left_ordered_keys:
output.append(([u''] * len(left_headers)) + right_row)
output.append(([u''] * len_left_headers) + right_row)

return output
36 changes: 14 additions & 22 deletions csvkit/utilities/csvjoin.py
Expand Up @@ -9,7 +9,7 @@
class CSVJoin(CSVKitUtility):
description = 'Execute a SQL-like join to merge CSV files on a specified column or columns.'
epilog = 'Note that the join operation requires reading all files into memory. Don\'t try this on very large files.'
override_flags = ['f', 'H']
override_flags = ['f']

def add_arguments(self):
self.argparser.add_argument(metavar="FILE", nargs='*', dest='input_paths', default=['-'],
Expand Down Expand Up @@ -48,9 +48,10 @@ def main(self):
self.argparser.error('It is not valid to specify both a left and a right join.')

tables = []
header = not self.args.no_header_row

for f in self.input_files:
tables.append(list(agate.reader(f, **self.reader_kwargs)))
tables.append(list(agate.reader(f, header=header, **self.reader_kwargs)))
f.close()

join_column_ids = []
Expand All @@ -59,14 +60,12 @@ def main(self):
for i, t in enumerate(tables):
join_column_ids.append(match_column_identifier(t[0], join_column_names[i]))

jointab = []
jointab = tables[0]

if self.args.left_join:
# Left outer join
jointab = tables[0]

for i, t in enumerate(tables[1:]):
jointab = join.left_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
jointab = join.left_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1], header=header)
elif self.args.right_join:
# Right outer join
jointab = tables[-1]
Expand All @@ -75,26 +74,19 @@ def main(self):
remaining_tables.reverse()

for i, t in enumerate(remaining_tables):
jointab = join.right_outer_join(t, join_column_ids[-(i + 2)], jointab, join_column_ids[-1])
jointab = join.right_outer_join(t, join_column_ids[-(i + 2)], jointab, join_column_ids[-1], header=header)
elif self.args.outer_join:
# Full outer join
jointab = tables[0]

for i, t in enumerate(tables[1:]):
jointab = join.full_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
jointab = join.full_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1], header=header)
elif self.args.columns:
# Inner join
for i, t in enumerate(tables[1:]):
jointab = join.inner_join(jointab, join_column_ids[0], t, join_column_ids[i + 1], header=header)
else:
if self.args.columns:
# Inner join
jointab = tables[0]

for i, t in enumerate(tables[1:]):
jointab = join.inner_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
else:
jointab = tables[0]

# Sequential join
for t in tables[1:]:
jointab = join.sequential_join(jointab, t)
# Sequential join
for t in tables[1:]:
jointab = join.sequential_join(jointab, t, header=header)

output = agate.writer(self.output_file, **self.writer_kwargs)

Expand Down
4 changes: 4 additions & 0 deletions tests/test_utilities/test_csvjoin.py
Expand Up @@ -43,3 +43,7 @@ def test_left_short_columns(self):
output = self.get_output_as_io(['-c', 'a', 'examples/join_a_short.csv', 'examples/join_b.csv'])
with open('examples/join_short.csv') as f:
self.assertEqual(output.readlines(), f.readlines())

# Exercises csvjoin with the -H flag (presumably --no-header-row, per this commit's title):
# joins examples/join_a.csv with examples/join_no_header_row.csv using "-c 1" and
# checks only that the command produces exactly two lines of output.
def test_no_header_row(self):
output = self.get_output_as_io(['-c', '1', '-H', 'examples/join_a.csv', 'examples/join_no_header_row.csv'])
self.assertEqual(len(output.readlines()), 2)

0 comments on commit 3927ddd

Please sign in to comment.