Skip to content

Commit 69391f8

Browse files
committed
CSV column discernment
1 parent 85dcc45 commit 69391f8

File tree

3 files changed

+132
-32
lines changed

3 files changed

+132
-32
lines changed

Diff for: src/databases/csv/discern.rs

+102
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
use {
2+
super::string_record_to_row,
3+
crate::{
4+
data::Schema, CSVDatabaseError, CSVSettings, Cast, Column, Result, Value,
5+
ValueType,
6+
},
7+
csv::{Reader, ReaderBuilder, StringRecord},
8+
std::{
9+
default::Default,
10+
fs::{File},
11+
},
12+
};
13+
14+
impl CSVSettings {
15+
pub(crate) fn new_reader(&self, file: File) -> Reader<File> {
16+
ReaderBuilder::new()
17+
.delimiter(self.delimiter)
18+
.has_headers(self.has_header.unwrap_or(true))
19+
.from_reader(file)
20+
}
21+
pub(crate) fn discern_header(&mut self, header: &StringRecord) -> Vec<String> {
22+
let header = string_record_to_row(header);
23+
24+
let has_header = if let Some(has_header) = self.has_header {
25+
has_header
26+
} else {
27+
let has_header = !header
28+
.iter()
29+
.map(ValueType::from)
30+
.any(|vt| vt != ValueType::Str);
31+
self.has_header = Some(has_header);
32+
has_header
33+
};
34+
35+
if has_header {
36+
header
37+
.into_iter()
38+
.map(Cast::cast)
39+
.collect::<Result<Vec<String>>>()
40+
.unwrap()
41+
} else {
42+
header
43+
.into_iter()
44+
.enumerate()
45+
.map(|(index, _)| format!("column_{}", index))
46+
.collect()
47+
}
48+
}
49+
pub(crate) fn discern_schema(&mut self, file: File) -> Result<Option<Schema>> {
50+
let mut reader = self.new_reader(file);
51+
let header = reader
52+
.headers()
53+
.map_err(|error| CSVDatabaseError::HeaderError(format!("{:?}", error)))?;
54+
let header = self.discern_header(header);
55+
if header.is_empty() {
56+
return Ok(None);
57+
}
58+
let value_types = self.discern_types(reader);
59+
60+
let column_defs = header
61+
.into_iter()
62+
.zip(value_types)
63+
.map(|(header, value_type)| {
64+
let mut column = Column::default();
65+
column.name = header;
66+
column.data_type = value_type;
67+
column
68+
})
69+
.collect();
70+
Ok(Some(Schema {
71+
table_name: String::new(),
72+
column_defs,
73+
indexes: vec![],
74+
}))
75+
}
76+
pub(crate) fn discern_types(&self, reader: Reader<File>) -> Vec<ValueType> {
77+
let sample = reader
78+
.into_records()
79+
.take(self.sample_rows)
80+
.map(|record| string_record_to_row(&record.unwrap()))
81+
.collect();
82+
discern_types_from_sample(sample)
83+
}
84+
}
85+
86+
pub(crate) fn discern_types_from_sample(sample: Vec<Vec<Value>>) -> Vec<ValueType> {
87+
let mut types = sample
88+
.into_iter()
89+
.map(|row| row.iter().map(ValueType::from).collect());
90+
let first_types = types.next().unwrap();
91+
types.fold(first_types, |mut out_types, row_types| {
92+
out_types
93+
.iter_mut()
94+
.zip(row_types)
95+
.for_each(|(out_type, row_type)| {
96+
if out_type != &row_type {
97+
*out_type = ValueType::Any
98+
}
99+
});
100+
out_types
101+
})
102+
}

Diff for: src/databases/csv/mod.rs

+15-32
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,19 @@
11
mod auto_increment;
22
mod base;
3+
mod discern;
34
mod mutable;
5+
mod record;
46
mod utils;
57

8+
pub use {discern::*, record::*};
9+
610
use {
7-
crate::{data::Schema, Column, DBFull, Database, Result, ValueType, WIPError},
8-
csv::ReaderBuilder,
11+
crate::{data::Schema, DBFull, Database, Result, WIPError},
912
serde::{Deserialize, Serialize},
1013
std::{
1114
default::Default,
1215
fmt::Debug,
13-
fs::{File, OpenOptions},
16+
fs::{OpenOptions},
1417
},
1518
thiserror::Error,
1619
};
@@ -19,6 +22,9 @@ use {
1922
/// Errors specific to the CSV database.
pub enum CSVDatabaseError {
2023
/// A CSV storage holds a single table only (see the error message below).
#[error("CSV storages only support one table at a time")]
2124
OnlyOneTableAllowed,
25+
26+
/// The header record of the CSV file could not be read; the payload is the debug-formatted
/// underlying error.
#[error("Failed to open CSV because of a error with header: {0}")]
27+
HeaderError(String),
2228
}
2329

2430
pub struct CSVDatabase {
@@ -30,12 +36,16 @@ pub struct CSVDatabase {
3036
/// Options controlling how a CSV file is read.
pub struct CSVSettings {
3137
/// Field delimiter byte handed to the CSV reader builder.
pub delimiter: u8,
3238
// NOTE(review): not referenced by the reader construction visible in this change —
// confirm where this setting is applied.
pub quoting: bool,
39+
/// Whether the first record is a header row; `None` leaves the decision to header
/// discernment, which stores its verdict back into this field.
pub has_header: Option<bool>,
40+
/// Upper bound on the number of records sampled when discerning column types.
pub sample_rows: usize,
3341
}
3442
/// Default settings: comma-delimited, quoting enabled, header presence undecided, and up to
/// 100 records sampled for type discernment.
impl Default for CSVSettings {
3543
fn default() -> Self {
3644
Self {
3745
delimiter: b',',
3846
quoting: true,
47+
// `None`: decide from the first record whether the file has a header.
has_header: None,
48+
sample_rows: 100,
3949
}
4050
}
4151
}
@@ -51,46 +61,19 @@ impl CSVDatabase {
5161
/// Opens the CSV database at `path` with `CSVSettings::default()`.
///
/// Delegates to `new_with_settings` and returns whatever error it reports.
pub fn new(path: &str) -> Result<Self> {
5262
Self::new_with_settings(path, CSVSettings::default())
5363
}
54-
pub fn new_with_settings(path: &str, csv_settings: CSVSettings) -> Result<Self> {
64+
pub fn new_with_settings(path: &str, mut csv_settings: CSVSettings) -> Result<Self> {
5565
let file = OpenOptions::new()
5666
.read(true)
5767
.write(true)
5868
.create(true)
5969
.open(path)
6070
.map_err(|error| WIPError::Debug(format!("{:?}", error)))?;
6171

62-
let schema = discern_schema(file, &csv_settings)?;
72+
let schema = csv_settings.discern_schema(file)?;
6373
Ok(Self {
6474
schema,
6575
path: path.to_string(),
6676
csv_settings,
6777
})
6878
}
6979
}
70-
71-
fn discern_schema(file: File, csv_settings: &CSVSettings) -> Result<Option<Schema>> {
72-
let mut reader = ReaderBuilder::new()
73-
.delimiter(csv_settings.delimiter)
74-
.from_reader(file);
75-
let headers = reader
76-
.headers()
77-
.map_err(|error| WIPError::Debug(format!("{:?}", error)))?;
78-
let column_defs = headers
79-
.iter()
80-
.map(|header| {
81-
let mut column = Column::default();
82-
column.name = header.to_string();
83-
column.data_type = ValueType::Str;
84-
column
85-
})
86-
.collect();
87-
if headers.is_empty() {
88-
Ok(None)
89-
} else {
90-
Ok(Some(Schema {
91-
table_name: String::new(),
92-
column_defs,
93-
indexes: vec![],
94-
}))
95-
}
96-
}

Diff for: src/databases/csv/record.rs

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
use {crate::Value, csv::StringRecord};
2+
3+
pub fn string_record_to_row(record: &StringRecord) -> Vec<Value> {
4+
record.iter().map(csv_cell_to_value).collect()
5+
}
6+
7+
fn csv_cell_to_value(cell: &str) -> Value {
8+
let cell = cell.to_string();
9+
cell.parse::<bool>()
10+
.map(|v| Value::Bool(v))
11+
.or_else(|_| cell.parse::<u64>().map(|v| Value::U64(v)))
12+
.or_else(|_| cell.parse::<i64>().map(|v| Value::I64(v)))
13+
.or_else(|_| cell.parse::<f64>().map(|v| Value::F64(v)))
14+
.unwrap_or_else(|_| Value::Str(cell))
15+
}

0 commit comments

Comments
 (0)