/
sheet.go
275 lines (242 loc) · 9.39 KB
/
sheet.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
package main
import (
"fmt"
"io"
"net/url"
"regexp"
"strconv"
"strings"
"time"
"github.com/usdigitalresponse/grants-ingest/internal/log"
"github.com/usdigitalresponse/grants-ingest/pkg/grantsSchemas/ffis"
"github.com/xuri/excelize/v2"
)
// New-style (2024) Grants.gov URL path pattern for grant opportunities,
// e.g. `/search-results-detail/1234`, where `1234` is the grant opportunity ID.
var GrantsGovURLPathPattern = regexp.MustCompile(`^/search-results-detail/\d+$`)
// Currently, the FFIS spreadsheet uses an "X" to indicate eligibility
func parseEligibility(value string) bool {
return value == "X"
}
// parseXLSXFile is a function that reads and processes an Excel file stream, converting the data into a slice
// of ffis.FFISFundingOpportunity objects. The file is expected to be provided as an io.Reader.
// The function filters and retains only those funding opportunities that possess a valid grant ID.
//
// Any errors encountered during the parsing of individual cells within the Excel file are not returned as function errors,
// but are instead logged at the WARN level, accompanied by the associated row and column indices for easy identification.
//
// Parameters:
// r: The io.Reader providing the Excel file stream to be parsed.
// logger: The log.Logger used to log any parsing errors at the WARN level.
//
// Returns:
// A slice of ffis.FFISFundingOpportunity objects representing the parsed funding opportunities from the Excel file.
// An error is returned if the parsing process fails at a level beyond individual cell parsing.
func parseXLSXFile(r io.Reader, logger log.Logger) ([]ffis.FFISFundingOpportunity, error) {
xlFile, err := excelize.OpenReader(r)
if err != nil {
return nil, err
}
// Used to test if a cell is a CFDA number. Apparently
// this is a consistent CFDA format based on this page:
// https://grantsgovprod.wordpress.com/2018/06/04/what-is-a-cfda-number-2/
// Sometimes, FFIS appends a + sign character to this cell to indicate
// that there are additional CFDA numbers not included in the spreadsheet.
cfdaRegex, err := regexp.Compile(`^([0-9]{1,2}\.[0-9]{0,3})\+?$`)
if err != nil {
return nil, err
}
defer func() {
if err := xlFile.Close(); err != nil {
log.Error(logger, "Error closing excel file", err)
}
}()
// We assume the excel file only has one sheet
sheet := "Sheet1"
// Returns all rows in the sheet. Note this assumes the sheet is somewhat limited in
// size, and will not scale to extremely large worksheets (memory overhead)
rows, err := xlFile.GetRows(sheet)
if err != nil {
return nil, err
}
sendMetric("spreadsheet.row_count", float64(len(rows)))
log.Info(logger, "Parsing spreadsheet", "total_rows", len(rows))
var opportunities []ffis.FFISFundingOpportunity
// Tracks if the iterator has found headers for the sheet. A header
// is a column header, like "CFDA", "Opportunity Title", etc.
foundHeaders := false
// The sheet has rows as bills that we want to assign to
// each opportunity underneath that bill, so we use this
// as we iterate through the rows. This could be something like
// "Inflation Reduction Act".
bill := ""
rowLoop:
for rowIndex, row := range rows {
opportunity := ffis.FFISFundingOpportunity{}
for colIndex, cell := range row {
logger := log.With(logger, "row_index", row, "column_index", colIndex)
// We assume the first column header is "CFDA", if it is,
// we're in the headers row, so skip it and set the flag that
// the content follows
if colIndex == 0 && cell == "CFDA" {
foundHeaders = true
continue rowLoop
}
// If we have not yet found the headers, skip the row
if !foundHeaders {
continue rowLoop
}
// Test the first cell, which is likely either a CFDA number,
// a blank row, or a category (eg Inflation Reduction Act)
if colIndex == 0 {
// If the cell is blank, skip the row
if cell == "" {
continue rowLoop
}
// If we don't match a CFDA number and the row isn't blank, we
// assume it's a bill and continue
if !cfdaRegex.MatchString(cell) {
bill = cell
continue rowLoop
}
}
// Populate opportunity based on an assumed format
// where colIndex is a column (zero is A, 1 is B, etc.)
switch colIndex {
case 0:
if f, err := strconv.ParseFloat(strings.TrimRight(cell, "+"), 64); err != nil {
log.Warn(logger, "Error parsing CFDA", err)
sendMetric("spreadsheet.cell_parsing_errors", 1, "target:CFDA")
continue
} else {
opportunity.CFDA = fmt.Sprintf("%06.3f", f)
}
case 1:
opportunity.OppTitle = cell
case 2:
opportunity.Agency = cell
case 3:
// If estimated funding is N/A, assume 0
if cell == "N/A" {
continue
}
num, err := strconv.ParseInt(cell, 10, 64)
// If we can't parse the funding amount, just skip the column
if err != nil {
log.Warn(logger, "Error parsing estimated funding", "error", err)
sendMetric("spreadsheet.cell_parsing_errors", 1, "target:EstimatedFunding")
continue
}
opportunity.EstimatedFunding = num
case 4:
opportunity.ExpectedAwards = cell
case 5:
opportunity.OppNumber = cell
// cellAxis (eg. A4) is used to get a hyperlink for a cell. We
// need to increment the index because Excel is not zero-indexed
cellAxis, err := excelize.CoordinatesToCellName(colIndex+1, rowIndex+1)
if err != nil {
log.Warn(logger, "Error parsing cell axis for grant ID", "error", err)
sendMetric("spreadsheet.cell_parsing_errors", 1, "target:GrantID")
continue
}
hasLink, target, err := xlFile.GetCellHyperLink(sheet, cellAxis)
if err != nil {
// log this, it is not worth aborting the whole extraction for
log.Warn(logger, "Error getting cell hyperlink for grant ID", "error", err)
sendMetric("spreadsheet.cell_parsing_errors", 1, "target:GrantID")
continue
}
// If we have a link, parse the URL for the opportunity ID which
// is the only way to get it from the spreadsheet
if hasLink {
logger := log.With(logger, "target", target)
linkURL, err := url.Parse(target)
if err != nil {
log.Warn(logger, "Error parsing link URL for grant ID", "error", err)
sendMetric("spreadsheet.cell_parsing_errors", 1, "target:GrantID")
continue
}
if h := linkURL.Hostname(); h != "grants.gov" && h != "www.grants.gov" {
log.Warn(logger, "Link URL for grant ID has invalid domain")
sendMetric("spreadsheet.cell_parsing_errors", 1, "target:GrantID")
continue
}
// See if it can be parsed as a newer, path-based Grants.gov URL
// Expected format: https://www.grants.gov/search-results-detail/<value>
if GrantsGovURLPathPattern.MatchString(linkURL.Path) {
lastPathSegment := target[strings.LastIndex(target, "/")+1:]
if val, err := strconv.ParseInt(lastPathSegment, 10, 64); err != nil {
// This will only happen due to programming error getting lastPathSegment
log.Warn(logger, "Error parsing Opportunity ID from link URL path",
"error", err, "match_pattern", GrantsGovURLPathPattern.String(),
"extracted_path_segment", lastPathSegment)
} else {
opportunity.GrantID = val
continue
}
} else {
log.Warn(logger, "Link URL path does not match expected pattern",
"match_pattern", GrantsGovURLPathPattern.String(), "url_path", linkURL.Path,
)
}
// We no longer (as of 2024) expect to see URLs that provide IDs via querystring,
// but attempt for backwards-compatibility.
log.Warn(logger,
"Link does does not match the expected path-based pattern; "+
"attempting fallback extraction from old-style query parameter",
"match_pattern", GrantsGovURLPathPattern.String())
// The opportunity ID should be a < 20 digit numeric value
oppID, err := strconv.ParseInt(linkURL.Query().Get("oppId"), 10, 64)
if err != nil {
log.Warn(logger, "Error parsing opportunity ID", "error", err)
sendMetric("spreadsheet.cell_parsing_errors", 1, "target:GrantID")
continue
}
opportunity.GrantID = oppID
}
case 6:
opportunity.Eligibility.State = parseEligibility(cell)
case 7:
opportunity.Eligibility.Local = parseEligibility(cell)
case 8:
opportunity.Eligibility.Tribal = parseEligibility(cell)
case 9:
opportunity.Eligibility.HigherEducation = parseEligibility(cell)
case 10:
opportunity.Eligibility.NonProfits = parseEligibility(cell)
case 11:
opportunity.Eligibility.Other = parseEligibility(cell)
case 12:
// If we fail to parse the date, just skip the column
// and not the whole row
dateParsed := false
dateLayouts := [...]string{"1-2-06", "1/2/06"}
for _, layout := range dateLayouts {
t, err := time.Parse(layout, cell)
if err == nil {
log.Debug(logger, "Parsed DueDate with layout", "layout", layout)
opportunity.DueDate = t
dateParsed = true
break
}
}
if !dateParsed {
log.Warn(logger, "Could not parse DueDate according to any attempted layouts",
"attempted_layouts", dateLayouts, "raw_value", cell)
sendMetric("spreadsheet.cell_parsing_errors", 1, "target:DueDate")
continue
}
case 13:
opportunity.Match = parseEligibility(cell)
}
// We will use the most recent bill
opportunity.Bill = bill
}
// Only add valid opportunities
if opportunity.GrantID > 0 {
opportunities = append(opportunities, opportunity)
}
}
return opportunities, nil
}