csvjoin supports --no-header-row, closes #404

wireservice · Feb 9, 2016 · 3927ddd · 3927ddd
1 parent 50757a9
commit 3927ddd
Show file tree

Hide file tree

Showing 4 changed files with 72 additions and 71 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -17,6 +17,7 @@ Improvements:
 
 * "import csvkit as csv" will now defer to agate readers/writers.
 * csvgrep supports --no-header-row.
+* csvjoin supports --no-header-row.
 * csvjson supports --snifflimit and --no-inference.
 * csvlook supports --snifflimit and --no-inference.
 * csvsql supports custom SQLAlchemy dialects.

diff --git a/csvkit/join.py b/csvkit/join.py
@@ -22,47 +22,49 @@ def _get_mapped_keys(rows, column_index):
     return mapped_keys
 
 
-def sequential_join(left_table, right_table):
+def sequential_join(left_rows, right_rows, header=True):
     """
     Join two tables by aligning them horizontally without performing any filtering.
     """
-    # Grab headers
-    left_headers = left_table[0]
-    right_headers = right_table[0]
-    left_rows = left_table[1:]
-    right_rows = iter(right_table[1:])
+    # Column counts come from the first row of each table, whether or not that
+    # row is a header; they size the blank padding for unmatched rows below.
+    len_left_headers = len(left_rows[0])
+    len_right_headers = len(right_rows[0])
 
-    output = [left_headers + right_headers]
+    if header:
+        output = [left_rows[0] + right_rows[0]]
+        left_rows = left_rows[1:]
+        right_rows = iter(right_rows[1:])
+    else:
+        output = []
+        # next() below requires a real iterator; a plain list would raise
+        # TypeError, and the trailing loop would re-walk rows already paired.
+        right_rows = iter(right_rows)
 
     for left_row in left_rows:
         try:
             right_row = next(right_rows)
             output.append(left_row + right_row)
         except StopIteration:
-            output.append(left_row + [u''] * len(right_headers))
+            output.append(left_row + [u''] * len_right_headers)
 
+    # Whatever the shared iterator has not yielded yet had no left-hand partner.
     for right_row in right_rows:
-        output.append([u''] * len(left_headers) + right_row)
+        output.append([u''] * len_left_headers + right_row)
 
     return output
 
 
-def inner_join(left_table, left_column_id, right_table, right_column_id):
+def inner_join(left_rows, left_column_id, right_rows, right_column_id, header=True):
     """
     Execute an inner join on two tables and return the combined table.
     """
-    # Grab headers
-    left_headers = left_table[0]
-    len_left_headers = len(left_headers)
-    right_headers = right_table[0]
-    left_rows = left_table[1:]
-    right_rows = right_table[1:]
+    len_left_headers = len(left_rows[0])
+
+    if header:
+        output = [left_rows[0] + right_rows[0]]
+        left_rows = left_rows[1:]
+        right_rows = right_rows[1:]
+    else:
+        output = []
 
     # Map right rows to keys
     right_mapped_keys = _get_mapped_keys(right_rows, right_column_id)
 
-    output = [left_headers + right_headers]
-
     for left_row in left_rows:
         len_left_row = len(left_row)
 
@@ -78,25 +80,26 @@ def inner_join(left_table, left_column_id, right_table, right_column_id):
     return output
 
 
-def full_outer_join(left_table, left_column_id, right_table, right_column_id):
+def full_outer_join(left_rows, left_column_id, right_rows, right_column_id, header=True):
     """
     Execute full outer join on two tables and return the combined table.
     """
-    # Grab headers
-    left_headers = left_table[0]
-    len_left_headers = len(left_headers)
-    right_headers = right_table[0]
-    left_rows = left_table[1:]
-    right_rows = right_table[1:]
+    len_left_headers = len(left_rows[0])
+    len_right_headers = len(right_rows[0])
+
+    if header:
+        output = [left_rows[0] + right_rows[0]]
+        left_rows = left_rows[1:]
+        right_rows = right_rows[1:]
+    else:
+        output = []
 
     # Get ordered keys
     left_ordered_keys = _get_ordered_keys(left_rows, left_column_id)
 
     # Get mapped keys
     right_mapped_keys = _get_mapped_keys(right_rows, right_column_id)
 
-    output = [left_headers + right_headers]
-
     for left_row in left_rows:
         len_left_row = len(left_row)
         left_key = left_row[left_column_id]
@@ -108,33 +111,34 @@ def full_outer_join(left_table, left_column_id, right_table, right_column_id):
             for right_row in right_mapped_keys[left_key]:
                 output.append(left_row + right_row)
         else:
-            output.append(left_row + ([u''] * len(right_headers)))
+            output.append(left_row + ([u''] * len_right_headers))
 
     for right_row in right_rows:
         right_key = right_row[right_column_id]
 
         if right_key not in left_ordered_keys:
-            output.append(([u''] * len(left_headers)) + right_row)
+            output.append(([u''] * len_left_headers) + right_row)
 
     return output
 
 
-def left_outer_join(left_table, left_column_id, right_table, right_column_id):
+def left_outer_join(left_rows, left_column_id, right_rows, right_column_id, header=True):
     """
     Execute left outer join on two tables and return the combined table.
     """
-    # Grab headers
-    left_headers = left_table[0]
-    len_left_headers = len(left_headers)
-    right_headers = right_table[0]
-    left_rows = left_table[1:]
-    right_rows = right_table[1:]
+    len_left_headers = len(left_rows[0])
+    len_right_headers = len(right_rows[0])
+
+    if header:
+        output = [left_rows[0] + right_rows[0]]
+        left_rows = left_rows[1:]
+        right_rows = right_rows[1:]
+    else:
+        output = []
 
     # Get mapped keys
     right_mapped_keys = _get_mapped_keys(right_rows, right_column_id)
 
-    output = [left_headers + right_headers]
-
     for left_row in left_rows:
         len_left_row = len(left_row)
         left_key = left_row[left_column_id]
@@ -146,30 +150,30 @@ def left_outer_join(left_table, left_column_id, right_table, right_column_id):
             for right_row in right_mapped_keys[left_key]:
                 output.append(left_row + right_row)
         else:
-            output.append(left_row + ([u''] * len(right_headers)))
+            output.append(left_row + ([u''] * len_right_headers))
 
     return output
 
 
-def right_outer_join(left_table, left_column_id, right_table, right_column_id):
+def right_outer_join(left_rows, left_column_id, right_rows, right_column_id, header=True):
     """
     Execute right outer join on two tables and return the combined table.
     """
-    # Grab headers
-    left_headers = left_table[0]
-    len_left_headers = len(left_headers)
-    right_headers = right_table[0]
-    left_rows = left_table[1:]
-    right_rows = right_table[1:]
+    len_left_headers = len(left_rows[0])
+
+    if header:
+        output = [left_rows[0] + right_rows[0]]
+        left_rows = left_rows[1:]
+        right_rows = right_rows[1:]
+    else:
+        output = []
 
     # Get ordered keys
     left_ordered_keys = _get_ordered_keys(left_rows, left_column_id)
 
     # Get mapped keys
     right_mapped_keys = _get_mapped_keys(right_rows, right_column_id)
 
-    output = [left_headers + right_headers]
-
     for left_row in left_rows:
         len_left_row = len(left_row)
         left_key = left_row[left_column_id]
@@ -185,6 +189,6 @@ def right_outer_join(left_table, left_column_id, right_table, right_column_id):
         right_key = right_row[right_column_id]
 
         if right_key not in left_ordered_keys:
-            output.append(([u''] * len(left_headers)) + right_row)
+            output.append(([u''] * len_left_headers) + right_row)
 
     return output
diff --git a/csvkit/utilities/csvjoin.py b/csvkit/utilities/csvjoin.py
@@ -9,7 +9,7 @@
 class CSVJoin(CSVKitUtility):
     description = 'Execute a SQL-like join to merge CSV files on a specified column or columns.'
     epilog = 'Note that the join operation requires reading all files into memory. Don\'t try this on very large files.'
-    override_flags = ['f', 'H']
+    override_flags = ['f']
 
     def add_arguments(self):
         self.argparser.add_argument(metavar="FILE", nargs='*', dest='input_paths', default=['-'],
@@ -48,9 +48,10 @@ def main(self):
             self.argparser.error('It is not valid to specify both a left and a right join.')
 
         tables = []
+        header = not self.args.no_header_row
 
         for f in self.input_files:
-            tables.append(list(agate.reader(f, **self.reader_kwargs)))
+            tables.append(list(agate.reader(f, header=header, **self.reader_kwargs)))
             f.close()
 
         join_column_ids = []
@@ -59,14 +60,12 @@ def main(self):
             for i, t in enumerate(tables):
                 join_column_ids.append(match_column_identifier(t[0], join_column_names[i]))
 
-        jointab = []
+        jointab = tables[0]
 
         if self.args.left_join:
             # Left outer join
-            jointab = tables[0]
-
             for i, t in enumerate(tables[1:]):
-                jointab = join.left_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
+                jointab = join.left_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1], header=header)
         elif self.args.right_join:
             # Right outer join
             jointab = tables[-1]
@@ -75,26 +74,19 @@ def main(self):
             remaining_tables.reverse()
 
             for i, t in enumerate(remaining_tables):
-                jointab = join.right_outer_join(t, join_column_ids[-(i + 2)], jointab, join_column_ids[-1])
+                jointab = join.right_outer_join(t, join_column_ids[-(i + 2)], jointab, join_column_ids[-1], header=header)
         elif self.args.outer_join:
             # Full outer join
-            jointab = tables[0]
-
             for i, t in enumerate(tables[1:]):
-                jointab = join.full_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
+                jointab = join.full_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1], header=header)
+        elif self.args.columns:
+            # Inner join
+            for i, t in enumerate(tables[1:]):
+                jointab = join.inner_join(jointab, join_column_ids[0], t, join_column_ids[i + 1], header=header)
         else:
-            if self.args.columns:
-                # Inner join
-                jointab = tables[0]
-
-                for i, t in enumerate(tables[1:]):
-                    jointab = join.inner_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
-            else:
-                jointab = tables[0]
-
-                # Sequential join
-                for t in tables[1:]:
-                    jointab = join.sequential_join(jointab, t)
+            # Sequential join
+            for t in tables[1:]:
+                jointab = join.sequential_join(jointab, t, header=header)
 
         output = agate.writer(self.output_file, **self.writer_kwargs)
 

diff --git a/tests/test_utilities/test_csvjoin.py b/tests/test_utilities/test_csvjoin.py
@@ -43,3 +43,7 @@ def test_left_short_columns(self):
         output = self.get_output_as_io(['-c', 'a', 'examples/join_a_short.csv', 'examples/join_b.csv'])
         with open('examples/join_short.csv') as f:
             self.assertEqual(output.readlines(), f.readlines())
+
+    def test_no_header_row(self):
+        # Runs csvjoin with -c 1 and -H (presumably the short form of
+        # --no-header-row; confirm against the CLI flags) on two example files.
+        # Only the number of emitted lines is checked, not their content.
+        output = self.get_output_as_io(['-c', '1', '-H', 'examples/join_a.csv', 'examples/join_no_header_row.csv'])
+        self.assertEqual(len(output.readlines()), 2)