-
Notifications
You must be signed in to change notification settings - Fork 0
/
generic_csv.go
108 lines (87 loc) · 2.5 KB
/
generic_csv.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
/* SPDX-License-Identifier: MPL-2.0
*
* Zymatik Nucleo - A Bioinformatics library for Go.
* Copyright (C) 2024 Damian Peckett <damian@pecke.tt>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the Mozilla Public License v2.0.
*
* You should have received a copy of the Mozilla Public License v2.0
* along with this program. If not, see <https://mozilla.org/MPL/2.0/>.
*/
package snparray
import (
"bufio"
"encoding/csv"
"fmt"
"io"
"strconv"
"strings"
"github.com/zymatik-com/genobase/types"
"github.com/zymatik-com/nucleo/names"
)
// genericCSVCodec handles SNP array exports stored as comma-separated text.
type genericCSVCodec struct{}

// Detect reports whether the first line available from r looks like
// comma-separated data: it must contain at least one comma and no tab.
//
// Only the first line is inspected. An empty input yields (false, nil). A
// read error other than io.EOF is returned even when a partial first line
// was recovered before the failure. Detect reads from r and does not rewind
// it.
func (c *genericCSVCodec) Detect(r io.Reader) (bool, error) {
	lines := bufio.NewScanner(r)

	// Scan can hand back a partial final line while still having recorded a
	// read error, so the error is consulted regardless of Scan's result.
	gotLine := lines.Scan()
	if err := lines.Err(); !gotLine || err != nil {
		return false, err
	}

	firstLine := lines.Text()
	if strings.ContainsRune(firstLine, '\t') {
		return false, nil
	}
	return strings.ContainsRune(firstLine, ','), nil
}
// genericCSVReader holds the state produced by genericCSVCodec.Open: the CSV
// reader positioned after the header row, and the column layout taken from
// that header.
type genericCSVReader struct {
// reader is the CSV reader records are pulled from; Open has already
// consumed the header row from it.
reader *csv.Reader
// columnMappings maps each header name (trimmed of surrounding whitespace
// and lower-cased) to its zero-based column index.
columnMappings map[string]int
}
// Open prepares r for record-by-record reading.
//
// A leading UTF-8 byte order mark is discarded, lines starting with '#' are
// treated as comments, and the first remaining record is taken as the header
// row. Header names are trimmed and lower-cased before being mapped to their
// column index. The header must contain the "rsid", "chromosome", "position"
// and "result" columns that Read relies on.
//
// An error is returned if the header cannot be read (including an empty
// input) or if any required column is missing.
func (c *genericCSVCodec) Open(r io.Reader) (Reader, error) {
	// encoding/csv does not strip a byte order mark. Left in place it would
	// become part of the first header name, break a quoted first field, and
	// stop a '#' comment on the first line from being recognised.
	const utf8BOM = "\xef\xbb\xbf"
	buffered := bufio.NewReader(r)
	if head, err := buffered.Peek(len(utf8BOM)); err == nil && string(head) == utf8BOM {
		if _, err := buffered.Discard(len(utf8BOM)); err != nil {
			return nil, fmt.Errorf("error reading genome file: %w", err)
		}
	}

	reader := csv.NewReader(buffered)
	reader.Comment = '#'

	header, err := reader.Read()
	if err != nil {
		return nil, fmt.Errorf("error reading genome file: %w", err)
	}

	// TODO: guess column mappings if not present.
	columnMappings := make(map[string]int, len(header))
	for i, colName := range header {
		columnMappings[strings.ToLower(strings.TrimSpace(colName))] = i
	}

	// Read looks these names up directly in the map; a missing name would
	// silently resolve to column 0 and yield wrong data, so fail fast here.
	for _, required := range []string{"rsid", "chromosome", "position", "result"} {
		if _, ok := columnMappings[required]; !ok {
			return nil, fmt.Errorf("error reading genome file: missing required column %q", required)
		}
	}

	return &genericCSVReader{
		reader:         reader,
		columnMappings: columnMappings,
	}, nil
}
// Reference returns the reference assembly assumed for the file's
// coordinates. It currently returns types.ReferenceGRCh37 unconditionally and
// does not inspect the data.
func (r *genericCSVReader) Reference() types.Reference {
// TODO: determine the reference assembly from the coordinates
// of some of the most common SNPs.
return types.ReferenceGRCh37
}
// Read returns the next SNP that has a called genotype.
//
// Records whose "result" column is "--" or "00" (no-calls) are skipped. Any
// error from the underlying CSV reader, including io.EOF once the input is
// exhausted, is returned unchanged so callers can compare against it
// directly. A position that is not a base-10 integer yields an error that
// wraps the strconv failure.
func (r *genericCSVReader) Read() (*SNP, error) {
	// The column layout does not change between records, so resolve the
	// genotype column once instead of on every skipped record.
	resultCol := r.columnMappings["result"]

	var (
		record   []string
		genotype string
	)
	// Skip over no call variants.
	for {
		var err error
		record, err = r.reader.Read()
		if err != nil {
			return nil, err
		}

		if len(record) < len(r.columnMappings) {
			return nil, fmt.Errorf("not enough columns")
		}

		genotype = record[resultCol]
		if genotype != "--" && genotype != "00" {
			break
		}
	}

	// TODO: support a more fuzzy matching of column names.
	position, err := strconv.ParseInt(record[r.columnMappings["position"]], 10, 64)
	if err != nil {
		// Wrap with %w so callers can inspect the underlying *strconv.NumError.
		return nil, fmt.Errorf("error parsing position: %w", err)
	}

	return &SNP{
		RSID:       record[r.columnMappings["rsid"]],
		Chromosome: names.Chromosome(record[r.columnMappings["chromosome"]]),
		Position:   position,
		Genotype:   genotype,
	}, nil
}