forked from wireservice/csvkit
/
csvstack.py
89 lines (63 loc) · 3.18 KB
/
csvstack.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python
import os
from csvkit import CSVKitReader, CSVKitWriter
from csvkit.cli import CSVKitUtility
from csvkit.headers import make_default_headers
class CSVStack(CSVKitUtility):
    """
    Command-line utility that concatenates the rows of several CSV files.

    The header row is emitted once, taken from the first input file. An
    optional "grouping" column can be prepended to every row, so that each
    output row records which input file it came from.
    """
    description = 'Stack up the rows from multiple CSV files, optionally adding a grouping value.'
    # NOTE(review): presumably tells the base class not to register its own
    # single-file "-f" argument, since this tool takes several input paths --
    # confirm against CSVKitUtility.
    override_flags = ['f']

    def add_arguments(self):
        """
        Register the command-line arguments specific to csvstack.

        Adds the positional input paths plus the -g/--groups,
        -n/--group-name and --filenames options to ``self.argparser``.
        """
        self.argparser.add_argument(metavar="FILE", nargs='+', dest='input_paths', default=['-'],
            help='The CSV file(s) to operate on. If omitted, will accept input on STDIN.')
        self.argparser.add_argument('-g', '--groups', dest='groups',
            help='A comma-separated list of values to add as "grouping factors", one for each CSV being stacked. These will be added to the stacked CSV as a new column. You may specify a name for the grouping column using the -n flag.')
        self.argparser.add_argument('-n', '--group-name', dest='group_name',
            help='A name for the grouping column, e.g. "year". Only used when also specifying -g.')
        self.argparser.add_argument('--filenames', dest='group_by_filenames', action='store_true',
            help='Use the filename of each input file as its grouping value. When specified, -g will be ignored.')

    def main(self):
        """
        Stack the rows of every input file and write them to the output.

        Reports an error through ``self.argparser.error`` when fewer than two
        input files are given, or when the number of -g values differs from
        the number of input files.

        Every input file that was opened is closed before this method
        returns, including when an error interrupts processing.
        """
        self.input_files = []

        try:
            # Append one at a time (rather than building the list in a single
            # expression) so that files opened before a failing open are
            # still reachable by the cleanup in ``finally``.
            for path in self.args.input_paths:
                self.input_files.append(self._open_input_file(path))

            if len(self.input_files) < 2:
                self.argparser.error('You must specify at least two files to stack.')

            # Work out the per-file grouping values, if any were requested.
            # --filenames takes precedence over -g.
            if self.args.group_by_filenames:
                groups = [os.path.split(f.name)[1] for f in self.input_files]
            elif self.args.groups:
                groups = self.args.groups.split(',')

                if len(groups) != len(self.input_files):
                    self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')
            else:
                groups = None

            group_name = self.args.group_name or 'group'

            output = CSVKitWriter(self.output_file, **self.writer_kwargs)

            for i, f in enumerate(self.input_files):
                rows = CSVKitReader(f, **self.reader_kwargs)

                if self.args.no_header_row:
                    # Without a header row the first row is data: generate
                    # simple column names from its width. ``None`` marks an
                    # exhausted reader so that an empty file does not
                    # produce a spurious data row.
                    first_row = next(rows, None)
                    column_count = 0 if first_row is None else len(first_row)
                    headers = make_default_headers(column_count)
                else:
                    # The first row is the header; it is consumed for every
                    # file but only written for the first one.
                    first_row = None
                    headers = next(rows, [])

                if i == 0:
                    if groups:
                        headers.insert(0, group_name)

                    output.writerow(headers)

                if first_row is not None:
                    if groups:
                        first_row.insert(0, groups[i])

                    output.writerow(first_row)

                for row in rows:
                    if groups:
                        row.insert(0, groups[i])

                    output.writerow(row)
        finally:
            # Release every handle that was opened, whether stacking
            # completed, raised, or exited through an argument error.
            for f in self.input_files:
                f.close()
def launch_new_instance():
    """Create a CSVStack utility and invoke its main() method."""
    CSVStack().main()
# Run the utility only when this file is executed as a script, not on import.
if __name__ == "__main__":
    launch_new_instance()