From 505401df33e0eedcf21c2e5c786a0f6db32ec698 Mon Sep 17 00:00:00 2001 From: David Lutterkort <lutter@watzmann.net> Date: Tue, 21 Jan 2025 16:26:13 -0800 Subject: [PATCH 01/13] store: Make Layout.table a little easier to use --- store/postgres/src/relational.rs | 9 +++++---- store/postgres/src/relational/ddl_tests.rs | 4 +--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/store/postgres/src/relational.rs b/store/postgres/src/relational.rs index d148060efc2..c5c929e189a 100644 --- a/store/postgres/src/relational.rs +++ b/store/postgres/src/relational.rs @@ -443,12 +443,13 @@ impl Layout { Ok(()) } - /// Find the table with the provided `name`. The name must exactly match - /// the name of an existing table. No conversions of the name are done - pub fn table(&self, name: &SqlName) -> Option<&Table> { + /// Find the table with the provided `sql_name`. The name must exactly + /// match the name of an existing table. No conversions of the name are + /// done + pub fn table(&self, sql_name: &str) -> Option<&Table> { self.tables .values() - .find(|table| &table.name == name) + .find(|table| &table.name == sql_name) .map(|rc| rc.as_ref()) } diff --git a/store/postgres/src/relational/ddl_tests.rs b/store/postgres/src/relational/ddl_tests.rs index 86e9f232d49..bab910104bf 100644 --- a/store/postgres/src/relational/ddl_tests.rs +++ b/store/postgres/src/relational/ddl_tests.rs @@ -26,9 +26,7 @@ fn test_layout(gql: &str) -> Layout { #[test] fn table_is_sane() { let layout = test_layout(THING_GQL); - let table = layout - .table(&"thing".into()) - .expect("failed to get 'thing' table"); + let table = layout.table("thing").expect("failed to get 'thing' table"); assert_eq!(SqlName::from("thing"), table.name); assert_eq!("Thing", table.object.as_str()); From f8743a7063965e9c36c0c90fc6be90272af68d6a Mon Sep 17 00:00:00 2001 From: Gustavo Inacio <gustavo@semiotic.ai> Date: Tue, 7 May 2024 21:29:34 -0300 Subject: [PATCH 02/13] graph, store: Create database sql executor 
--- Cargo.lock | 14 + Cargo.toml | 7 +- graph/src/components/store/traits.rs | 4 +- graph/src/data/query/error.rs | 3 + graph/src/data/store/mod.rs | 3 + store/postgres/Cargo.toml | 2 + store/postgres/src/deployment_store.rs | 23 +- store/postgres/src/lib.rs | 1 + store/postgres/src/query_store.rs | 27 +- store/postgres/src/relational_queries.rs | 43 +++ store/postgres/src/sql/constants.rs | 435 +++++++++++++++++++++++ store/postgres/src/sql/formatter.rs | 101 ++++++ store/postgres/src/sql/mod.rs | 10 + store/postgres/src/sql/parser.rs | 191 ++++++++++ store/postgres/src/sql/validation.rs | 290 +++++++++++++++ 15 files changed, 1145 insertions(+), 9 deletions(-) create mode 100644 store/postgres/src/sql/constants.rs create mode 100644 store/postgres/src/sql/formatter.rs create mode 100644 store/postgres/src/sql/mod.rs create mode 100644 store/postgres/src/sql/parser.rs create mode 100644 store/postgres/src/sql/validation.rs diff --git a/Cargo.lock b/Cargo.lock index c511d00601f..63ffc8446af 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2149,7 +2149,9 @@ dependencies = [ "rand 0.8.5", "serde", "serde_json", + "sqlparser", "stable-hash 0.3.4", + "thiserror 1.0.61", ] [[package]] @@ -4708,6 +4710,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "11a81a8cad9befe4cf1b9d2d4b9c6841c76f0882a3fec00d95133953c13b3d3d" dependencies = [ "log", + "sqlparser_derive", +] + +[[package]] +name = "sqlparser_derive" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index a8193c3f0ed..b93c2107781 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -69,11 +69,8 @@ serde_derive = "1.0.125" serde_json = { version = "1.0", features = ["arbitrary_precision"] } serde_regex = "1.1.0" serde_yaml = "0.9.21" -slog = { version = "2.7.0", features = [ 
- "release_max_level_trace", - "max_level_trace", -] } -sqlparser = "0.46.0" +slog = { version = "2.7.0", features = ["release_max_level_trace", "max_level_trace"] } +sqlparser = { version = "0.46.0", features = ["visitor"] } strum = { version = "0.26", features = ["derive"] } syn = { version = "2.0.87", features = ["full"] } test-store = { path = "./store/test-store" } diff --git a/graph/src/components/store/traits.rs b/graph/src/components/store/traits.rs index 27cb3768e2c..bed1b3af548 100644 --- a/graph/src/components/store/traits.rs +++ b/graph/src/components/store/traits.rs @@ -15,7 +15,7 @@ use crate::components::transaction_receipt; use crate::components::versions::ApiVersion; use crate::data::query::Trace; use crate::data::store::ethereum::call; -use crate::data::store::QueryObject; +use crate::data::store::{QueryObject, SqlQueryObject}; use crate::data::subgraph::{status, DeploymentFeatures}; use crate::data::{query::QueryTarget, subgraph::schema::*}; use crate::prelude::{DeploymentState, NodeId, QueryExecutionError, SubgraphName}; @@ -635,6 +635,8 @@ pub trait QueryStore: Send + Sync { query: EntityQuery, ) -> Result<(Vec<QueryObject>, Trace), QueryExecutionError>; + fn execute_sql(&self, sql: &str) -> Result<Vec<SqlQueryObject>, QueryExecutionError>; + async fn is_deployment_synced(&self) -> Result<bool, Error>; async fn block_ptr(&self) -> Result<Option<BlockPtr>, StoreError>; diff --git a/graph/src/data/query/error.rs b/graph/src/data/query/error.rs index 65fc1bcd259..83c2f5ab8b2 100644 --- a/graph/src/data/query/error.rs +++ b/graph/src/data/query/error.rs @@ -72,6 +72,7 @@ pub enum QueryExecutionError { InvalidSubgraphManifest, ResultTooBig(usize, usize), DeploymentNotFound(String), + SqlError(String), IdMissing, IdNotString, ConstraintViolation(String), @@ -133,6 +134,7 @@ impl QueryExecutionError { | IdMissing | IdNotString | ConstraintViolation(_) => false, + SqlError(_) => false, } } } @@ -275,6 +277,7 @@ impl fmt::Display for QueryExecutionError 
{ IdMissing => write!(f, "entity is missing an `id` attribute"), IdNotString => write!(f, "entity `id` attribute is not a string"), ConstraintViolation(msg) => write!(f, "internal constraint violated: {}", msg), + SqlError(e) => write!(f, "sql error: {}", e), } } } diff --git a/graph/src/data/store/mod.rs b/graph/src/data/store/mod.rs index c8786e9b473..cf464ff7b6b 100644 --- a/graph/src/data/store/mod.rs +++ b/graph/src/data/store/mod.rs @@ -1102,6 +1102,9 @@ pub struct QueryObject { pub entity: r::Object, } +/// An object that is returned from a SQL query. It wraps an `r::Value` +pub struct SqlQueryObject(pub r::Value); + impl CacheWeight for QueryObject { fn indirect_weight(&self) -> usize { self.parent.indirect_weight() + self.entity.indirect_weight() diff --git a/store/postgres/Cargo.toml b/store/postgres/Cargo.toml index 9a746646807..574926c7010 100644 --- a/store/postgres/Cargo.toml +++ b/store/postgres/Cargo.toml @@ -32,6 +32,8 @@ git-testament = "0.2.5" itertools = "0.13.0" hex = "0.4.3" pretty_assertions = "1.4.0" +sqlparser = { workspace = true } +thiserror = { workspace = true } [dev-dependencies] clap.workspace = true diff --git a/store/postgres/src/deployment_store.rs b/store/postgres/src/deployment_store.rs index b148129d924..de342075d4d 100644 --- a/store/postgres/src/deployment_store.rs +++ b/store/postgres/src/deployment_store.rs @@ -12,8 +12,9 @@ use graph::components::store::{ PruningStrategy, QueryPermit, StoredDynamicDataSource, VersionStats, }; use graph::components::versions::VERSIONS; +use graph::data::graphql::IntoValue; use graph::data::query::Trace; -use graph::data::store::IdList; +use graph::data::store::{IdList, SqlQueryObject}; use graph::data::subgraph::{status, SPEC_VERSION_0_0_6}; use graph::data_source::CausalityRegion; use graph::derive::CheapClone; @@ -54,7 +55,7 @@ use crate::dynds::DataSourcesTable; use crate::primary::DeploymentId; use crate::relational::index::{CreateIndex, IndexList, Method}; use 
crate::relational::{Layout, LayoutCache, SqlName, Table}; -use crate::relational_queries::FromEntityData; +use crate::relational_queries::{FromEntityData, JSONData}; use crate::{advisory_lock, catalog, retry}; use crate::{connection_pool::ConnectionPool, detail}; use crate::{dynds, primary::Site}; @@ -286,6 +287,24 @@ impl DeploymentStore { layout.query(&logger, conn, query) } + pub(crate) fn execute_sql( + &self, + conn: &mut PgConnection, + query: &str, + ) -> Result<Vec<SqlQueryObject>, QueryExecutionError> { + let query = diesel::sql_query(query); + + // Execute the provided SQL query + let results = query + .load::<JSONData>(conn) + .map_err(|e| QueryExecutionError::SqlError(e.to_string()))?; + + Ok(results + .into_iter() + .map(|e| SqlQueryObject(e.into_value())) + .collect::<Vec<_>>()) + } + fn check_intf_uniqueness( &self, conn: &mut PgConnection, diff --git a/store/postgres/src/lib.rs b/store/postgres/src/lib.rs index 759e8601313..713adfdb405 100644 --- a/store/postgres/src/lib.rs +++ b/store/postgres/src/lib.rs @@ -32,6 +32,7 @@ pub mod query_store; mod relational; mod relational_queries; mod retry; +mod sql; mod store; mod store_events; mod subgraph_store; diff --git a/store/postgres/src/query_store.rs b/store/postgres/src/query_store.rs index 8fc2da822e4..f6b2a22712c 100644 --- a/store/postgres/src/query_store.rs +++ b/store/postgres/src/query_store.rs @@ -2,9 +2,10 @@ use std::collections::HashMap; use std::time::Instant; use crate::deployment_store::{DeploymentStore, ReplicaId}; +use crate::sql::Parser; use graph::components::store::{DeploymentId, QueryPermit, QueryStore as QueryStoreTrait}; use graph::data::query::Trace; -use graph::data::store::QueryObject; +use graph::data::store::{QueryObject, SqlQueryObject}; use graph::prelude::*; use graph::schema::{ApiSchema, InputSchema}; @@ -16,6 +17,7 @@ pub(crate) struct QueryStore { store: Arc<DeploymentStore>, chain_store: Arc<crate::ChainStore>, api_version: Arc<ApiVersion>, + sql_parser: Result<Parser, 
StoreError>, } impl QueryStore { @@ -26,12 +28,16 @@ impl QueryStore { replica_id: ReplicaId, api_version: Arc<ApiVersion>, ) -> Self { + let sql_parser = store + .find_layout(site.clone()) + .map(|layout| Parser::new(layout)); QueryStore { site, replica_id, store, chain_store, api_version, + sql_parser, } } } @@ -57,6 +63,25 @@ impl QueryStoreTrait for QueryStore { }) } + fn execute_sql( + &self, + sql: &str, + ) -> Result<Vec<SqlQueryObject>, graph::prelude::QueryExecutionError> { + let mut conn = self + .store + .get_replica_conn(self.replica_id) + .map_err(|e| QueryExecutionError::SqlError(format!("SQL error: {}", e)))?; + + let parser = self + .sql_parser + .as_ref() + .map_err(|e| QueryExecutionError::SqlError(format!("SQL error: {}", e)))?; + + let sql = parser.parse_and_validate(sql)?; + + self.store.execute_sql(&mut conn, &sql) + } + /// Return true if the deployment with the given id is fully synced, /// and return false otherwise. Errors from the store are passed back up async fn is_deployment_synced(&self) -> Result<bool, Error> { diff --git a/store/postgres/src/relational_queries.rs b/store/postgres/src/relational_queries.rs index 19f9400c470..ed048699a1a 100644 --- a/store/postgres/src/relational_queries.rs +++ b/store/postgres/src/relational_queries.rs @@ -14,6 +14,8 @@ use diesel::sql_types::{Array, BigInt, Binary, Bool, Int8, Integer, Jsonb, Text, use diesel::QuerySource as _; use graph::components::store::write::{EntityWrite, RowGroup, WriteChunk}; use graph::components::store::{Child as StoreChild, DerivedEntityQuery}; + +use graph::data::graphql::IntoValue; use graph::data::store::{Id, IdType, NULL}; use graph::data::store::{IdList, IdRef, QueryObject}; use graph::data::value::{Object, Word}; @@ -439,6 +441,47 @@ pub fn parse_id(id_type: IdType, json: serde_json::Value) -> Result<Id, StoreErr } } +#[derive(QueryableByName, Debug)] +pub struct JSONData { + #[diesel(sql_type = Jsonb)] + pub data: serde_json::Value, +} + +impl IntoValue for 
JSONData { + fn into_value(self) -> r::Value { + JSONData::to_value(self.data) + } +} + +impl JSONData { + pub fn to_value(data: serde_json::Value) -> r::Value { + match data { + serde_json::Value::Null => r::Value::Null, + serde_json::Value::Bool(b) => r::Value::Boolean(b), + serde_json::Value::Number(n) => { + if let Some(i) = n.as_i64() { + r::Value::Int(i) + } else { + r::Value::Float(n.as_f64().unwrap()) + } + } + serde_json::Value::String(s) => r::Value::String(s), + serde_json::Value::Array(vals) => { + let vals: Vec<_> = vals.into_iter().map(JSONData::to_value).collect::<Vec<_>>(); + r::Value::List(vals) + } + serde_json::Value::Object(map) => { + let mut m = std::collections::BTreeMap::new(); + for (k, v) in map { + let value = JSONData::to_value(v); + m.insert(Word::from(k), value); + } + r::Value::object(m) + } + } + } +} + /// Helper struct for retrieving entities from the database. With diesel, we /// can only run queries that return columns whose number and type are known /// at compile time. Because of that, we retrieve the actual data for an diff --git a/store/postgres/src/sql/constants.rs b/store/postgres/src/sql/constants.rs new file mode 100644 index 00000000000..b24f191f938 --- /dev/null +++ b/store/postgres/src/sql/constants.rs @@ -0,0 +1,435 @@ +use std::collections::HashSet; + +use lazy_static::lazy_static; +use sqlparser::dialect::PostgreSqlDialect; + +lazy_static! 
{ + pub(super) static ref ALLOWED_FUNCTIONS: HashSet<&'static str> = { + vec![ + // Comparison Functions see https://www.postgresql.org/docs/14/functions-comparison.html#FUNCTIONS-COMPARISON-FUNC-TABLE + "num_nonnulls", // Number of non-null arguments + "num_nulls", // Number of null arguments + + // Mathematical Functions see https://www.postgresql.org/docs/14/functions-math.html#FUNCTIONS-MATH-FUNC-TABLE + "abs", // Absolute value + "cbrt", // Cube root + "ceil", // Nearest integer greater than or equal to argument + "ceiling", // Nearest integer greater than or equal to argument + "degrees", // Converts radians to degrees + "div", // Integer quotient of y/x (truncates towards zero) + "exp", // Exponential (e raised to the given power) + "factorial", // Factorial + "floor", // Nearest integer less than or equal to argument + "gcd", // Greatest common divisor (the largest positive number that divides both inputs with no remainder); returns 0 if both inputs are zero; available for integer, bigint, and numeric + "lcm", // Least common multiple (the smallest strictly positive number that is an integral multiple of both inputs); returns 0 if either input is zero; available for integer, bigint, and numeric + "ln", // Natural logarithm + "log", // Base 10 logarithm + "log10", // Base 10 logarithm (same as log) + "mod", // Remainder of y/x; available for smallint, integer, bigint, and numeric + "pi", // Approximate value of π + "power", // a raised to the power of b + "radians", // Converts degrees to radians + "round", // Rounds to nearest integer. For numeric, ties are broken by rounding away from zero. For double precision, the tie-breaking behavior is platform dependent, but “round to nearest even” is the most common rule.
+ "scale", // Scale of the argument (the number of decimal digits in the fractional part) + "sign", // Sign of the argument (-1, 0, or +1) + "sqrt", // Square root + "trim_scale", // Reduces the value's scale (number of fractional decimal digits) by removing trailing zeroes + "trunc", // Truncates to integer (towards zero) + "width_bucket", // Returns the number of the bucket in which operand falls in a histogram having count equal-width buckets spanning the range low to high. Returns 0 or count+1 for an input outside that range. + + // Random Functions see https://www.postgresql.org/docs/14/functions-math.html#FUNCTIONS-MATH-RANDOM-TABLE + "random", // Returns a random value in the range 0.0 <= x < 1.0 + "setseed", // Sets the seed for subsequent random() calls; argument must be between -1.0 and 1.0, inclusive + + // Trigonometric Functions see https://www.postgresql.org/docs/14/functions-math.html#FUNCTIONS-MATH-TRIG-TABLE + "acos", // Arc cosine, result in radians + "acosd", // Arc cosine, result in degrees + "asin", // Arc sine, result in radians + "asind", // Arc sine, result in degrees + "atan", // Arc tangent, result in radians + "atand", // Arc tangent, result in degrees + "atan2", // Arc tangent of y/x, result in radians + "atan2d", // Arc tangent of y/x, result in degrees + "cos", // Cosine, argument in radians + "cosd", // Cosine, argument in degrees + "cot", // Cotangent, argument in radians + "cotd", // Cotangent, argument in degrees + "sin", // Sine, argument in radians + "sind", // Sine, argument in degrees + "tan", // Tangent, argument in radians + "tand", // Tangent, argument in degrees + + // Hyperbolic Functions see https://www.postgresql.org/docs/14/functions-math.html#FUNCTIONS-MATH-HYPERBOLIC-TABLE + "sinh", // Hyperbolic sine + "cosh", // Hyperbolic cosine + "tanh", // Hyperbolic tangent + "asinh", // Inverse hyperbolic sine + "acosh", // Inverse hyperbolic cosine + "atanh", // Inverse hyperbolic tangent + + // String Functions see 
https://www.postgresql.org/docs/14/functions-string.html#FUNCTIONS-STRING-SQL + "bit_length", // Number of bits in string + "char_length", // Number of characters in string + "character_length", // Synonym for char_length + "lower", // Convert string to lower case + "normalize", // Convert string to specified Unicode normalization form + "octet_length", // Number of bytes in string + "overlay", // Replace substring + "position", // Location of specified substring + "substring", // Extract substring + "trim", // Remove leading and trailing characters + "upper", // Convert string to upper case + + //Additional string functions see https://www.postgresql.org/docs/14/functions-string.html#FUNCTIONS-STRING-OTHER + "ascii", // Convert first character to its numeric code + "btrim", // Remove the longest string containing only characters from characters (a space by default) from the start and end of string + "chr", // Convert integer to character + "concat", // Concatenate strings + "concat_ws", // Concatenate with separator + "format", // Format arguments according to a format string + "initcap", // Convert first letter of each word to upper case and the rest to lower case + "left", // Extract substring + "length", // Number of characters in string + "lpad", // Pad string to length length by prepending the characters fill (a space by default) + "ltrim", // Remove the longest string containing only characters from characters (a space by default) from the start of string + "md5", // Compute MD5 hash + "parse_ident", // Split qualified_identifier into an array of identifiers, removing any quoting of individual identifiers + "quote_ident", // Returns the given string suitably quoted to be used as an identifier in an SQL statement string + "quote_literal", // Returns the given string suitably quoted to be used as a string literal in an SQL statement string + "quote_nullable", // Returns the given string suitably quoted to be used as a string literal in an SQL statement string; 
or, if the argument is null, returns NULL + "regexp_match", // Returns captured substrings resulting from the first match of a POSIX regular expression to the string + "regexp_matches", // Returns captured substrings resulting from the first match of a POSIX regular expression to the string, or multiple matches if the g flag is used + "regexp_replace", // Replaces substrings resulting from the first match of a POSIX regular expression, or multiple substring matches if the g flag is used + "regexp_split_to_array", // Splits string using a POSIX regular expression as the delimiter, producing an array of results + "regexp_split_to_table", // Splits string using a POSIX regular expression as the delimiter, producing a set of results + "repeat", // Repeats string the specified number of times + "replace", // Replaces all occurrences in string of substring from with substring to + "reverse", // Reverses the order of the characters in the string + "right", // Extract substring + "rpad", // Pad string to length length by appending the characters fill (a space by default) + "rtrim", // Remove the longest string containing only characters from characters (a space by default) from the end of string + "split_part", // Splits string at occurrences of delimiter and returns the n'th field (counting from one), or when n is negative, returns the |n|'th-from-last field + "strpos", // Returns first starting index of the specified substring within string, or zero if it's not present + "substr", // Extracts the substring of string starting at the start'th character, and extending for count characters if that is specified + "starts_with", // Returns true if string starts with prefix + "string_to_array", // Splits the string at occurrences of delimiter and forms the resulting fields into a text array + "string_to_table", // Splits the string at occurrences of delimiter and returns the resulting fields as a set of text rows + "to_ascii", // Converts string to ASCII from another encoding, 
which may be identified by name or number + "to_hex", // Converts the number to its equivalent hexadecimal representation + "translate", // Replaces each character in string that matches a character in the from set with the corresponding character in the to set + "unistr", // Evaluate escaped Unicode characters in the argument + + // Binary String Functions see https://www.postgresql.org/docs/14/functions-binarystring.html#FUNCTIONS-BINARYSTRING-OTHER + "bit_count", // Number of bits set in the argument + "get_bit", // Extracts the n'th bit from string + "get_byte", // Extracts the n'th byte from string + "set_bit", // Sets the n'th bit in string to newvalue + "set_byte", // Sets the n'th byte in string to newvalue + "sha224", // Compute SHA-224 hash + "sha256", // Compute SHA-256 hash + "sha384", // Compute SHA-384 hash + "sha512", // Compute SHA-512 hash + + // String conversion functions see https://www.postgresql.org/docs/14/functions-binarystring.html#FUNCTIONS-BINARYSTRING-CONVERSIONS + "convert", // Converts a binary string representing text in encoding src_encoding to a binary string in encoding dest_encoding + "convert_from", // Converts a binary string representing text in encoding src_encoding to text in the database encoding + "convert_to", // Converts a text string (in the database encoding) to a binary string encoded in encoding dest_encoding + "encode", // Encodes binary data into a textual representation + "decode", // Decodes binary data from a textual representation + + // Formatting Functions see https://www.postgresql.org/docs/14/functions-formatting.html#FUNCTIONS-FORMATTING-TABLE + "to_char", // Converts number to a string according to the given format + "to_date", // Converts string to date + "to_number", // Converts string to number + "to_timestamp", // Converts string to timestamp with time zone + + // Date/Time Functions see https://www.postgresql.org/docs/14/functions-datetime.html + "age", // Subtract arguments, producing a “symbolic” 
result that uses years and months, rather than just days + "clock_timestamp", // Current date and time (changes during statement execution) + "current_date", // Current date + "current_time", // Current time of day + "current_timestamp", // Current date and time (start of current transaction) + "date_bin", // Bin input into specified interval aligned with specified origin + "date_part", // Get subfield (equivalent to extract) + "date_trunc", // Truncate to specified precision + "extract", // Get subfield + "isfinite", // Test for finite date (not +/-infinity) + "justify_days", // Adjust interval so 30-day time periods are represented as months + "justify_hours", // Adjust interval so 24-hour time periods are represented as days + "justify_interval", // Adjust interval using justify_days and justify_hours, with additional sign adjustments + "localtime", // Current time of day + "localtimestamp", // Current date and time (start of current transaction) + "make_date", // Create date from year, month and day fields (negative years signify BC) + "make_interval", // Create interval from years, months, weeks, days, hours, minutes and seconds fields, each of which can default to zero + "make_time", // Create time from hour, minute and seconds fields + "make_timestamp", // Create timestamp from year, month, day, hour, minute and seconds fields (negative years signify BC) + "make_timestamptz", // Create timestamp with time zone from year, month, day, hour, minute and seconds fields (negative years signify BC). 
+ "now", // Current date and time (start of current transaction) + "statement_timestamp", // Current date and time (start of current statement) + "timeofday", // Current date and time (like clock_timestamp, but as a text string) + "transaction_timestamp", // Current date and time (start of current transaction) + + // Enum support functions see https://www.postgresql.org/docs/14/functions-enum.html#FUNCTIONS-ENUM-SUPPORT + "enum_first", // Returns the first value of an enum type + "enum_last", // Returns the last value of an enum type + "enum_range", // Returns a range of values of an enum type + + // Geometric Functions see https://www.postgresql.org/docs/14/functions-geometry.html + "area", // Computes area + "center", // Computes center point + "diagonal", // Extracts box's diagonal as a line segment (same as lseg(box)) + "diameter", // Computes diameter of circle + "height", // Computes vertical size of box + "isclosed", // Is path closed? + "isopen", // Is path open? + "length", // Computes the total length + "npoints", // Returns the number of points + "pclose", // Converts path to closed form + "popen", // Converts path to open form + "radius", // Computes radius of circle + "slope", // Computes slope of a line drawn through the two points + "width", // Computes horizontal size of box + + // Geometric Type Conversion Functions see https://www.postgresql.org/docs/14/functions-geometry.html + "box", // Convert to a box + "circle", // Convert to a circle + "line", // Convert to a line + "lseg", // Convert to a line segment + "path", // Convert to a path + "point", // Convert to a point + "polygon", // Convert to a polygon + + // IP Address Functions see https://www.postgresql.org/docs/14/functions-net.html + "abbrev", // Creates an abbreviated display format as text + "broadcast", // Computes the broadcast address for the address's network + "family", // Returns the address's family: 4 for IPv4, 6 for IPv6 + "host", // Returns the IP address as text, ignoring 
the netmask + "hostmask", // Computes the host mask for the address's network + "inet_merge", // Computes the smallest network that includes both of the given networks + "inet_same_family", // Tests whether the addresses belong to the same IP family + "masklen", // Returns the netmask length in bits + "netmask", // Computes the network mask for the address's network + "network", // Returns the network part of the address, zeroing out whatever is to the right of the netmask + "set_masklen", // Sets the netmask length for an inet value. The address part does not change + "text", // Returns the unabbreviated IP address and netmask length as text + + // MAC Address Functions see https://www.postgresql.org/docs/14/functions-net.html#MACADDR-FUNCTIONS-TABLE + "macaddr8_set7bit", //Sets the 7th bit of the address to one, creating what is known as modified EUI-64, for inclusion in an IPv6 address. + + // Text Search Functions see https://www.postgresql.org/docs/14/functions-textsearch.html + "array_to_tsvector", // Converts an array of lexemes to a tsvector + "get_current_ts_config", // Returns the OID of the current default text search configuration (as set by default_text_search_config) + "numnode", // Returns the number of lexemes plus operators in the tsquery + "plainto_tsquery", // Converts text to a tsquery, normalizing words according to the specified or default configuration. + "phraseto_tsquery", // Converts text to a tsquery, normalizing words according to the specified or default configuration. + "websearch_to_tsquery", // Converts text to a tsquery, normalizing words according to the specified or default configuration. + "querytree", // Produces a representation of the indexable portion of a tsquery. A result that is empty or just T indicates a non-indexable query. + "setweight", // Assigns the specified weight to each element of the vector. + "strip", // Removes positions and weights from the tsvector. 
+ "to_tsquery", // Converts text to a tsquery, normalizing words according to the specified or default configuration. + "to_tsvector", // Converts text to a tsvector, normalizing words according to the specified or default configuration. + "json_to_tsvector", // Selects each item in the JSON document that is requested by the filter and converts each one to a tsvector, normalizing words according to the specified or default configuration. + "jsonb_to_tsvector", // Selects each item in the JSONB document that is requested by the filter and converts each one to a tsvector, normalizing words according to the specified or default configuration. + "ts_delete", // Removes any occurrence of the given lexeme from the vector. + "ts_filter", // Selects only elements with the given weights from the vector. + "ts_headline", // Displays, in an abbreviated form, the match(es) for the query in the document, which must be raw text not a tsvector. + "ts_rank", // Computes a score showing how well the vector matches the query. See Section 12.3.3 for details. + "ts_rank_cd", // Computes a score showing how well the vector matches the query, using a cover density algorithm. See Section 12.3.3 for details. + "ts_rewrite", // Replaces occurrences of target with substitute within the query. See Section 12.4.2.1 for details. + "tsquery_phrase", // Constructs a phrase query that searches for matches of query1 and query2 at successive lexemes (same as <-> operator). + "tsvector_to_array", // Converts a tsvector to an array of lexemes. + + // Text search debugging functions see https://www.postgresql.org/docs/14/functions-textsearch.html#TEXTSEARCH-FUNCTIONS-DEBUG-TABLE + "ts_debug", // Extracts and normalizes tokens from the document according to the specified or default text search configuration, and returns information about how each token was processed. See Section 12.8.1 for details.
+ "ts_lexize", // Returns an array of replacement lexemes if the input token is known to the dictionary, or an empty array if the token is known to the dictionary but it is a stop word, or NULL if it is not a known word. See Section 12.8.3 for details. + "ts_parse", // Extracts tokens from the document using the named parser. See Section 12.8.2 for details. + "ts_token_type", // Returns a table that describes each type of token the named parser can recognize. See Section 12.8.2 for details. + + // UUID Functions see https://www.postgresql.org/docs/14/functions-uuid.html + "gen_random_uuid", // Generate a version 4 (random) UUID + + // XML Functions see https://www.postgresql.org/docs/14/functions-xml.html + "xmlcomment", // Creates an XML comment + "xmlconcat", // Concatenates XML values + "xmlelement", // Creates an XML element + "xmlforest", // Creates an XML forest (sequence) of elements + "xmlpi", // Creates an XML processing instruction + "xmlagg", // Concatenates the input values to the aggregate function call, much like xmlconcat does, except that concatenation occurs across rows rather than across expressions in a single row. + "xmlexists", // Evaluates an XPath 1.0 expression (the first argument), with the passed XML value as its context item. + "xml_is_well_formed", // Checks whether the argument is a well-formed XML document or fragment. + "xml_is_well_formed_content", // Checks whether the argument is a well-formed XML document or fragment, and that it contains no document type declaration. + "xml_is_well_formed_document", // Checks whether the argument is a well-formed XML document. + "xpath", // Evaluates the XPath 1.0 expression xpath (given as text) against the XML value xml. + "xpath_exists", // Evaluates the XPath 1.0 expression xpath (given as text) against the XML value xml, and returns true if the expression selects at least one node, otherwise false. 
+ "xmltable", // Expands an XML value into a table whose columns match the rowtype defined by the function's parameter list. + "table_to_xml", // Converts a table to XML. + "cursor_to_xml", // Converts a cursor to XML. + + // JSON and JSONB creation functions see https://www.postgresql.org/docs/14/functions-json.html#FUNCTIONS-JSON-CREATION-TABLE + "to_json", // Converts any SQL value to JSON. + "to_jsonb", // Converts any SQL value to JSONB. + "array_to_json", // Converts an SQL array to a JSON array. + "row_to_json", // Converts an SQL composite value to a JSON object. + "json_build_array", // Builds a possibly-heterogeneously-typed JSON array out of a variadic argument list. + "jsonb_build_array", // Builds a possibly-heterogeneously-typed JSON array out of a variadic argument list. + "json_build_object", // Builds a JSON object out of a variadic argument list. + "json_object", // Builds a JSON object out of a text array. + "jsonb_object", // Builds a JSONB object out of a text array. + + // JSON and JSONB processing functions see https://www.postgresql.org/docs/14/functions-json.html#FUNCTIONS-JSON-PROCESSING-TABLE + "json_array_elements", // Expands the top-level JSON array into a set of JSON values. + "jsonb_array_elements", // Expands the top-level JSON array into a set of JSONB values. + "json_array_elements_text", // Expands the top-level JSON array into a set of text values. + "jsonb_array_elements_text", // Expands the top-level JSONB array into a set of text values. + "json_array_length", // Returns the number of elements in the top-level JSON array. + "jsonb_array_length", // Returns the number of elements in the top-level JSONB array. + "json_each", // Expands the top-level JSON object into a set of key/value pairs. + "jsonb_each", // Expands the top-level JSONB object into a set of key/value pairs. + "json_each_text", // Expands the top-level JSON object into a set of key/value pairs. The returned values will be of type text. 
+ "jsonb_each_text", // Expands the top-level JSONB object into a set of key/value pairs. The returned values will be of type text. + "json_extract_path", // Extracts JSON sub-object at the specified path. + "jsonb_extract_path", // Extracts JSONB sub-object at the specified path. + "json_extract_path_text", // Extracts JSON sub-object at the specified path as text. + "jsonb_extract_path_text", // Extracts JSONB sub-object at the specified path as text. + "json_object_keys", // Returns the set of keys in the top-level JSON object. + "jsonb_object_keys", // Returns the set of keys in the top-level JSONB object. + "json_populate_record", // Expands the top-level JSON object to a row having the composite type of the base argument. + "jsonb_populate_record", // Expands the top-level JSON object to a row having the composite type of the base argument. + "json_populate_recordset", // Expands the top-level JSON array of objects to a set of rows having the composite type of the base argument. + "jsonb_populate_recordset", // Expands the top-level JSONB array of objects to a set of rows having the composite type of the base argument. + "json_to_record", // Expands the top-level JSON object to a row having the composite type defined by an AS clause. + "jsonb_to_record", // Expands the top-level JSONB object to a row having the composite type defined by an AS clause. + "json_to_recordset", // Expands the top-level JSON array of objects to a set of rows having the composite type defined by an AS clause. + "jsonb_to_recordset", // Expands the top-level JSONB array of objects to a set of rows having the composite type defined by an AS clause. + "json_strip_nulls", // Deletes all object fields that have null values from the given JSON value, recursively. + "jsonb_strip_nulls", // Deletes all object fields that have null values from the given JSONB value, recursively. 
+ "jsonb_set", // Returns target with the item designated by path replaced by new_value, or with new_value added if create_if_missing is true (which is the default) and the item designated by path does not exist. + "jsonb_set_lax", // If new_value is not NULL, behaves identically to jsonb_set. Otherwise behaves according to the value of null_value_treatment which must be one of 'raise_exception', 'use_json_null', 'delete_key', or 'return_target'. The default is 'use_json_null'. + "jsonb_insert", //Returns target with new_value inserted. + "jsonb_path_exists", // Checks whether the JSON path returns any item for the specified JSON value. + "jsonb_path_match", // Returns the result of a JSON path predicate check for the specified JSON value. + "jsonb_path_query", // Returns all JSON items returned by the JSON path for the specified JSON value. + "jsonb_path_query_array", // Returns all JSON items returned by the JSON path for the specified JSON value, as a JSON array. + "jsonb_path_query_first", // Returns the first JSON item returned by the JSON path for the specified JSON value. Returns NULL if there are no results. + "jsonb_path_exists_tz", // Support comparisons of date/time values that require timezone-aware conversions. + "jsonb_path_match_tz", // Support comparisons of date/time values that require timezone-aware conversions. + "jsonb_path_query_tz", // Support comparisons of date/time values that require timezone-aware conversions. + "jsonb_path_query_array_tz", // Support comparisons of date/time values that require timezone-aware conversions. + "jsonb_path_query_first_tz", // Support comparisons of date/time values that require timezone-aware conversions. + "jsonb_pretty", // Converts the given JSON value to pretty-printed, indented text. + "json_typeof", // Returns the type of the top-level JSON value as a text string. + "jsonb_typeof", // Returns the type of the top-level JSONB value as a text string. 
+ + // Conditional Expressions https://www.postgresql.org/docs/14/functions-conditional.html + "coalesce", // Return first non-null argument. + "nullif", // Return null if two arguments are equal, otherwise return the first argument. + "greatest", // Return greatest of a list of values. + "least", // Return smallest of a list of values. + + // Array Functions https://www.postgresql.org/docs/14/functions-array.html#ARRAY-FUNCTIONS-TABLE + "array_append", // Appends an element to the end of an array (same as the || operator). + "array_cat", // Concatenates two arrays (same as the || operator). + "array_dims", // Returns a text representation of the array's dimensions. + "array_fill", // Returns an array filled with copies of the given value, having dimensions of the lengths specified by the second argument. The optional third argument supplies lower-bound values for each dimension (which default to all 1). + "array_length", // Returns the length of the requested array dimension. (Produces NULL instead of 0 for empty or missing array dimensions.) + "array_lower", // Returns the lower bound of the requested array dimension. + "array_ndims", // Returns the number of dimensions of the array. + "array_position", // Returns the subscript of the first occurrence of the second argument in the array, or NULL if it's not present. + "array_prepend", // Prepends an element to the beginning of an array (same as the || operator). + "array_remove", // Removes all elements equal to the given value from the array. The array must be one-dimensional. Comparisons are done using IS NOT DISTINCT FROM semantics, so it is possible to remove NULLs. + "array_replace", // Replaces each array element equal to the second argument with the third argument. + "array_to_string", // Converts each array element to its text representation, and concatenates those separated by the delimiter string. 
If null_string is given and is not NULL, then NULL array entries are represented by that string; otherwise, they are omitted. + "array_upper", // Returns the upper bound of the requested array dimension. + "cardinality", // Returns the total number of elements in the array, or 0 if the array is empty. + "trim_array", // Trims an array by removing the last n elements. If the array is multidimensional, only the first dimension is trimmed. + "unnest", // Expands an array into a set of rows. The array's elements are read out in storage order. + + // Range Functions https://www.postgresql.org/docs/14/functions-range.html#RANGE-FUNCTIONS-TABLE + "lower", // Extracts the lower bound of the range (NULL if the range is empty or the lower bound is infinite). + "upper", // Extracts the upper bound of the range (NULL if the range is empty or the upper bound is infinite). + "isempty", // Is the range empty? + "lower_inc", // Is the range's lower bound inclusive? + "upper_inc", // Is the range's upper bound inclusive? + "lower_inf", // Is the range's lower bound infinite? + "upper_inf", // Is the range's upper bound infinite? + "range_merge", // Computes the smallest range that includes both of the given ranges. + + // Multi-range Functions https://www.postgresql.org/docs/14/functions-range.html#MULTIRANGE-FUNCTIONS-TABLE + "multirange", // Returns a multirange containing just the given range. + + // General purpose aggregate functions https://www.postgresql.org/docs/14/functions-aggregate.html#FUNCTIONS-AGGREGATE-TABLE + "array_agg", // Collects all the input values, including nulls, into an array. + "avg", // Computes the average (arithmetic mean) of all the non-null input values. + "bit_and", // Computes the bitwise AND of all non-null input values. + "bit_or", // Computes the bitwise OR of all non-null input values. + "bit_xor", // Computes the bitwise exclusive OR of all non-null input values. Can be useful as a checksum for an unordered set of values. 
+ "bool_and", // Returns true if all non-null input values are true, otherwise false. + "bool_or", // Returns true if any non-null input value is true, otherwise false. + "count", // Computes the number of input rows. + "every", // This is the SQL standard's equivalent to bool_and. + "json_agg", // Collects all the input values, including nulls, into a JSON array. Values are converted to JSON as per to_json or to_jsonb. + "json_object_agg", // Collects all the key/value pairs into a JSON object. Key arguments are coerced to text; value arguments are converted as per to_json or to_jsonb. Values can be null, but not keys. + "max", // Computes the maximum of the non-null input values. Available for any numeric, string, date/time, or enum type, as well as inet, interval, money, oid, pg_lsn, tid, and arrays of any of these types. + "min", // Computes the minimum of the non-null input values. Available for any numeric, string, date/time, or enum type, as well as inet, interval, money, oid, pg_lsn, tid, and arrays of any of these types. + "range_agg", // Computes the union of the non-null input values. + "range_intersect_agg", // Computes the intersection of the non-null input values. + "string_agg", // Concatenates the non-null input values into a string. Each value after the first is preceded by the corresponding delimiter (if it's not null). + "sum", // Computes the sum of the non-null input values. + "xmlagg", // Concatenates the non-null XML input values. + + // Statistical aggregate functions https://www.postgresql.org/docs/14/functions-aggregate.html#FUNCTIONS-AGGREGATE-STATISTICS-TABLE + "corr", // Computes the correlation coefficient. + "covar_pop", // Computes the population covariance. + "covar_samp", // Computes the sample covariance. + "regr_avgx", // Computes the average of the independent variable, sum(X)/N. + "regr_avgy", // Computes the average of the dependent variable, sum(Y)/N. 
+ "regr_count", // Computes the number of rows in which both inputs are non-null. + "regr_intercept", // Computes the y-intercept of the least-squares-fit linear equation determined by the (X, Y) pairs. + "regr_r2", // Computes the square of the correlation coefficient. + "regr_slope", // Computes the slope of the least-squares-fit linear equation determined by the (X, Y) pairs. + "regr_sxx", // Computes the “sum of squares” of the independent variable, sum(X^2) - sum(X)^2/N. + "regr_sxy", // Computes the “sum of products” of independent times dependent variables, sum(X*Y) - sum(X) * sum(Y)/N. + "regr_syy", // Computes the “sum of squares” of the dependent variable, sum(Y^2) - sum(Y)^2/N. + "stddev", // This is a historical alias for stddev_samp. + "stddev_pop", // Computes the population standard deviation of the input values. + "stddev_samp", // Computes the sample standard deviation of the input values. + "variance", // This is a historical alias for var_samp. + "var_pop", // Computes the population variance of the input values (square of the population standard deviation). + "var_samp", // Computes the sample variance of the input values (square of the sample standard deviation). + + // Ordered-set aggregate functions https://www.postgresql.org/docs/14/functions-aggregate.html#FUNCTIONS-AGGREGATE-ORDEREDSET-TABLE + "mode", // Computes the mode (most frequent value) of the input values. + "percentile_cont", // Computes the continuous percentile of the input values. + "percentile_disc", // Computes the discrete percentile of the input values. + + // Hypothetical-set aggregate functions https://www.postgresql.org/docs/14/functions-aggregate.html#FUNCTIONS-AGGREGATE-HYPOTHETICAL-TABLE + "rank", // Computes the rank of the current row with gaps; same as row_number of its first peer. + "dense_rank", // Computes the rank of the current row without gaps; this function counts peer groups. 
+ "percent_rank", // Computes the relative rank (percentile) of the current row: (rank - 1) / (total partition rows - 1). + "cume_dist", // Computes the relative rank of the current row: (number of partition rows preceding or peer with current row) / (total partition rows). + + // Grouping set aggregate functions https://www.postgresql.org/docs/14/functions-aggregate.html#FUNCTIONS-AGGREGATE-GROUPINGSET-TABLE + "grouping", // Returns a bit mask indicating which GROUP BY expressions are not included in the current grouping set. + + // Window functions https://www.postgresql.org/docs/14/functions-window.html#FUNCTIONS-WINDOW-TABLE + "row_number", // Number of the current row within its partition, counting from 1. + "ntile", // Integer ranging from 1 to the argument value, dividing the partition as equally as possible. + "lag", // Returns value evaluated at the row that is offset rows before the current row within the partition; if there is no such row, instead returns default (which must be of a type compatible with value). + "lead", // Returns value evaluated at the row that is offset rows after the current row within the partition; if there is no such row, instead returns default (which must be of a type compatible with value). + "first_value", // Returns value evaluated at the row that is the first row of the window frame. + "last_value", // Returns value evaluated at the row that is the last row of the window frame. + "nth_value", // Returns value evaluated at the row that is the n'th row of the window frame (counting from 1); returns NULL if there is no such row. + + // Set returning functions https://www.postgresql.org/docs/14/functions-srf.html + "generate_series", // Expands range arguments into a set of rows. + "generate_subscripts", // Expands array arguments into a set of rows. 
+ + // Abbreivated syntax for common functions + "pow", // see power function + "date", // see to_date + + ].into_iter().collect() + }; +} + +pub(super) static SQL_DIALECT: PostgreSqlDialect = PostgreSqlDialect {}; diff --git a/store/postgres/src/sql/formatter.rs b/store/postgres/src/sql/formatter.rs new file mode 100644 index 00000000000..dff810aa6ea --- /dev/null +++ b/store/postgres/src/sql/formatter.rs @@ -0,0 +1,101 @@ +use sqlparser::ast::{ObjectName, Statement, TableFactor, VisitMut, VisitorMut}; +use std::ops::ControlFlow; + +use super::Schema; + +pub struct Formatter<'a> { + prelude: &'a str, + schema: &'a Schema, +} + +impl<'a> Formatter<'a> { + pub fn new(prelude: &'a str, schema: &'a Schema) -> Self { + Self { prelude, schema } + } + + fn prepend_prefix_to_object_name_mut(&self, name: &mut ObjectName) { + let table_identifier = &mut name.0; + // remove all but the last identifier + table_identifier.drain(0..table_identifier.len() - 1); + + // Ensure schema tables has quotation to match up with prelude generated cte. + if let Some(table_name) = table_identifier.last_mut() { + if self.schema.contains_key(&table_name.value) { + table_name.quote_style = Some('"'); + } + } + } + + pub fn format(&mut self, statement: &mut Statement) -> String { + statement.visit(self); + + format!( + "{} SELECT to_jsonb(sub.*) AS data FROM ( {} ) AS sub", + self.prelude, statement + ) + } +} + +impl VisitorMut for Formatter<'_> { + type Break = (); + + fn pre_visit_table_factor( + &mut self, + table_factor: &mut TableFactor, + ) -> ControlFlow<Self::Break> { + if let TableFactor::Table { name, .. 
} = table_factor { + self.prepend_prefix_to_object_name_mut(name); + } + ControlFlow::Continue(()) + } +} + +#[cfg(test)] +mod test { + use std::collections::HashSet; + + use super::*; + use crate::sql::constants::SQL_DIALECT; + const CTE_PREFIX: &str = "WITH \"swap\" AS ( + SELECT + id, + amount_in, + amount_out, + concat('0x',encode(token_in,'hex') as token_in, + concat('0x',token_out,'hex') AS token_out + FROM + sdg1.swap + )"; + + #[test] + fn format_sql() { + let mut schema = Schema::new(); + schema.insert( + "swap".to_string(), + HashSet::from_iter( + ["id", "amount_in", "amount_out", "token_in", "token_out"] + .into_iter() + .map(|s| s.to_string()), + ), + ); + + let mut formatter = Formatter::new(CTE_PREFIX, &schema); + + let sql = "SELECT token_in, SUM(amount_in) AS amount FROM unknown.swap GROUP BY token_in"; + + let mut statements = sqlparser::parser::Parser::parse_sql(&SQL_DIALECT, sql).unwrap(); + + let mut statement = statements.get_mut(0).unwrap(); + + let result = formatter.format(&mut statement); + + assert_eq!( + result, + format!( + "{} SELECT to_jsonb(sub.*) AS data FROM ( {} ) AS sub", + CTE_PREFIX, + "SELECT token_in, SUM(amount_in) AS amount FROM \"swap\" GROUP BY token_in" + ) + ); + } +} diff --git a/store/postgres/src/sql/mod.rs b/store/postgres/src/sql/mod.rs new file mode 100644 index 00000000000..d3962ae968e --- /dev/null +++ b/store/postgres/src/sql/mod.rs @@ -0,0 +1,10 @@ +mod constants; +mod formatter; +mod parser; +mod validation; + +use std::collections::{HashMap, HashSet}; + +pub(self) type Schema = HashMap<String, HashSet<String>>; // HashMap<Table, HashSet<Column>> + +pub use parser::Parser; diff --git a/store/postgres/src/sql/parser.rs b/store/postgres/src/sql/parser.rs new file mode 100644 index 00000000000..c449803fe86 --- /dev/null +++ b/store/postgres/src/sql/parser.rs @@ -0,0 +1,191 @@ +use super::{constants::SQL_DIALECT, formatter::Formatter, validation::Validator}; +use crate::relational::{ColumnType, Layout}; +use 
anyhow::{anyhow, Ok, Result}; +use graph::components::store::BLOCK_NUMBER_MAX; +use itertools::Itertools; +use std::sync::Arc; + +pub fn generate_table_prelude_from_layout(layout: &Layout) -> String { + let schema = &layout.catalog.site.namespace; + let ctes = layout + .tables + .iter() + .filter(|(entity, _)| !entity.is_poi()) + .map(|(_, table)| { + let table_name = table.name.as_str(); + + let (block_column, filter) = if !table.immutable { + ( + "block_range", + Some(format!(" WHERE \"block_range\" @> {}", BLOCK_NUMBER_MAX)), + ) + } else { + ("block$", None) + }; + + let columns = table + .columns + .iter() + .map(|col| { + if !col.is_list() && col.column_type == ColumnType::Bytes { + format!( + r#"concat('0x', encode("{}", 'hex')) AS "{}""#, + col.name.as_str(), + col.name.as_str() + ) + } else { + format!(r#""{}""#, col.name.as_str()) + } + }) + .chain(std::iter::once(format!(r#""{}""#, block_column))) + .collect::<Vec<_>>() + .join(", "); + format!( + "\"{table_name}\" AS (SELECT {columns} FROM \"{schema}\".\"{table_name}\"{})", + filter.unwrap_or_default() + ) + }) + .sorted() + .collect::<Vec<_>>() + .join(",\n"); + format!("WITH {ctes}") +} + +pub struct Parser { + schema: super::Schema, + prelude: String, +} + +impl Parser { + pub fn new(layout: Arc<Layout>) -> Self { + Self { + schema: layout + .tables + .iter() + .filter(|(entity, _)| !entity.is_poi()) + .map(|(_, table)| { + ( + table.name.to_string(), + table + .columns + .iter() + .map(|column| column.name.to_string()) + .collect(), + ) + }) + .collect(), + prelude: generate_table_prelude_from_layout(&layout), + } + } + + pub fn parse_and_validate(&self, sql: &str) -> Result<String> { + let mut statements = sqlparser::parser::Parser::parse_sql(&SQL_DIALECT, sql)?; + + let mut validator = Validator::new(&self.schema); + validator.validate_statements(&statements)?; + + let mut formatter = Formatter::new(&self.prelude, &self.schema); + + let statement = statements + .get_mut(0) + .ok_or_else(|| 
anyhow!("No SQL statements found"))?; + + let result = formatter.format(statement); + + Ok(result) + } +} + +#[cfg(test)] +mod test { + + use crate::layout_for_tests::{make_dummy_site, Catalog, Namespace}; + + use super::*; + use graph::{data::subgraph::DeploymentHash, schema::InputSchema}; + + const TEST_GQL: &str = " + type SwapMulti @entity(immutable: true) { + id: Bytes! + sender: Bytes! # address + amountsIn: [BigInt!]! # uint256[] + tokensIn: [Bytes!]! # address[] + amountsOut: [BigInt!]! # uint256[] + tokensOut: [Bytes!]! # address[] + referralCode: BigInt! # uint32 + blockNumber: BigInt! + blockTimestamp: BigInt! + transactionHash: Bytes! + } + + type Token @entity { + id: ID! + address: Bytes! # address + symbol: String! + name: String! + decimals: Int! + } + "; + + const NAMESPACE: &str = "sgd0815"; + + const SQL_QUERY: &str = " + with tokens as ( + select * from (values + ('0x0000000000000000000000000000000000000000','ETH','Ethereum',18), + ('0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48','USDC','USD Coin',6) + ) as t(address,symbol,name,decimals) + ) + + select + date, + t.symbol, + SUM(amount)/pow(10,t.decimals) as amount + from (select + date(to_timestamp(block_timestamp) at time zone 'utc') as date, + token, + amount + from swap_multi as sm + ,unnest(sm.amounts_in,sm.tokens_in) as smi(amount,token) + union all + select + date(to_timestamp(block_timestamp) at time zone 'utc') as date, + token, + amount + from sgd1.swap_multi as sm + ,unnest(sm.amounts_out,sm.tokens_out) as smo(amount,token) + ) as tp + inner join tokens as t on t.address = '0x' || encode(tp.token,'hex') + group by tp.date,t.symbol,t.decimals + order by tp.date desc ,amount desc + + "; + + fn test_layout() -> Layout { + let subgraph = DeploymentHash::new("subgraph").unwrap(); + let schema = + InputSchema::parse_latest(TEST_GQL, subgraph.clone()).expect("Test schema invalid"); + let namespace = Namespace::new(NAMESPACE.to_owned()).unwrap(); + let site = Arc::new(make_dummy_site(subgraph, 
namespace, "anet".to_string())); + let catalog = + Catalog::for_tests(site.clone(), Default::default()).expect("Can not create catalog"); + Layout::new(site, &schema, catalog).expect("Failed to construct Layout") + } + + #[test] + fn parse_sql() { + let parser = Parser::new(Arc::new(test_layout())); + + let result = parser.parse_and_validate(SQL_QUERY); + + assert!(result.is_ok()); + + let query = result.unwrap(); + + assert_eq!( + query, + r#"WITH "swap_multi" AS (SELECT concat('0x', encode("id", 'hex')) AS "id", concat('0x', encode("sender", 'hex')) AS "sender", "amounts_in", "tokens_in", "amounts_out", "tokens_out", "referral_code", "block_number", "block_timestamp", concat('0x', encode("transaction_hash", 'hex')) AS "transaction_hash", "block$" FROM "sgd0815"."swap_multi"), +"token" AS (SELECT "id", concat('0x', encode("address", 'hex')) AS "address", "symbol", "name", "decimals", "block_range" FROM "sgd0815"."token" WHERE "block_range" @> 2147483647) SELECT to_jsonb(sub.*) AS data FROM ( WITH tokens AS (SELECT * FROM (VALUES ('0x0000000000000000000000000000000000000000', 'ETH', 'Ethereum', 18), ('0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48', 'USDC', 'USD Coin', 6)) AS t (address, symbol, name, decimals)) SELECT date, t.symbol, SUM(amount) / pow(10, t.decimals) AS amount FROM (SELECT date(to_timestamp(block_timestamp) AT TIME ZONE 'utc') AS date, token, amount FROM "swap_multi" AS sm, UNNEST(sm.amounts_in, sm.tokens_in) AS smi (amount, token) UNION ALL SELECT date(to_timestamp(block_timestamp) AT TIME ZONE 'utc') AS date, token, amount FROM "swap_multi" AS sm, UNNEST(sm.amounts_out, sm.tokens_out) AS smo (amount, token)) AS tp JOIN tokens AS t ON t.address = '0x' || encode(tp.token, 'hex') GROUP BY tp.date, t.symbol, t.decimals ORDER BY tp.date DESC, amount DESC ) AS sub"# + ); + } +} diff --git a/store/postgres/src/sql/validation.rs b/store/postgres/src/sql/validation.rs new file mode 100644 index 00000000000..674cee4cc5d --- /dev/null +++ 
b/store/postgres/src/sql/validation.rs @@ -0,0 +1,290 @@ +use sqlparser::ast::{Expr, ObjectName, Query, SetExpr, Statement, TableFactor, Visit, Visitor}; +use std::result::Result; +use std::{collections::HashSet, ops::ControlFlow}; + +use super::{constants::ALLOWED_FUNCTIONS, Schema}; + +#[derive(thiserror::Error, Debug, PartialEq)] +pub enum Error { + #[error("Unknown or unsupported function {0}")] + UnknownFunction(String), + #[error("Multi statement is not supported.")] + MultiStatementUnSupported, + #[error("Only SELECT query is supported.")] + NotSelectQuery, + #[error("Unknown table {0}")] + UnknownTable(String), +} + +pub struct Validator<'a> { + schema: &'a Schema, + ctes: HashSet<String>, +} + +impl<'a> Validator<'a> { + pub fn new(schema: &'a Schema) -> Self { + Self { + schema, + ctes: Default::default(), + } + } + + fn validate_function_name(&self, name: &ObjectName) -> ControlFlow<Error> { + let name = name.to_string().to_lowercase(); + if ALLOWED_FUNCTIONS.contains(name.as_str()) { + ControlFlow::Continue(()) + } else { + ControlFlow::Break(Error::UnknownFunction(name)) + } + } + + pub fn validate_statements(&mut self, statements: &Vec<Statement>) -> Result<(), Error> { + self.ctes.clear(); + + if statements.len() > 1 { + return Err(Error::MultiStatementUnSupported); + } + + if let ControlFlow::Break(error) = statements.visit(self) { + return Err(error); + } + + Ok(()) + } + + fn validate_table_name(&mut self, name: &ObjectName) -> ControlFlow<Error> { + if let Some(table_name) = name.0.last() { + let table_name = table_name.to_string().to_lowercase(); + if !self.schema.contains_key(&table_name) && !self.ctes.contains(&table_name) { + return ControlFlow::Break(Error::UnknownTable(table_name)); + } + } + ControlFlow::Continue(()) + } +} + +impl Visitor for Validator<'_> { + type Break = Error; + + fn pre_visit_statement(&mut self, _statement: &Statement) -> ControlFlow<Self::Break> { + match _statement { + Statement::Query(_) => 
ControlFlow::Continue(()), + _ => ControlFlow::Break(Error::NotSelectQuery), + } + } + + fn pre_visit_query(&mut self, _query: &Query) -> ControlFlow<Self::Break> { + // Add common table expressions to the set of known tables + if let Some(ref with) = _query.with { + self.ctes.extend( + with.cte_tables + .iter() + .map(|cte| cte.alias.name.value.to_lowercase()), + ); + } + + match *_query.body { + SetExpr::Update(_) | SetExpr::Insert(_) => ControlFlow::Break(Error::NotSelectQuery), + _ => ControlFlow::Continue(()), + } + } + + /// Invoked for any table function in the AST. + /// See [TableFactor::Table.args](sqlparser::ast::TableFactor::Table::args) for more details identifying a table function + fn pre_visit_table_factor(&mut self, table_factor: &TableFactor) -> ControlFlow<Self::Break> { + if let TableFactor::Table { name, args, .. } = table_factor { + if args.is_some() { + return self.validate_function_name(name); + } else { + return self.validate_table_name(name); + } + } + ControlFlow::Continue(()) + } + + /// Invoked for any function expressions that appear in the AST + fn pre_visit_expr(&mut self, _expr: &Expr) -> ControlFlow<Self::Break> { + if let Expr::Function(function) = _expr { + return self.validate_function_name(&function.name); + } + ControlFlow::Continue(()) + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::sql::constants::SQL_DIALECT; + use std::collections::{HashMap, HashSet}; + + fn validate(sql: &str) -> Result<(), Error> { + let statements = sqlparser::parser::Parser::parse_sql(&SQL_DIALECT, sql).unwrap(); + + let schema: Schema = HashMap::from([( + "swap".to_owned(), + HashSet::from([ + "vid".to_owned(), + "block$".to_owned(), + "id".to_owned(), + "sender".to_owned(), + "input_amount".to_owned(), + "input_token".to_owned(), + "amount_out".to_owned(), + "output_token".to_owned(), + "slippage".to_owned(), + "referral_code".to_owned(), + "block_number".to_owned(), + "block_timestamp".to_owned(), + "transaction_hash".to_owned(), + 
]), + )]); + + let mut validator = Validator::new(&schema); + + validator.validate_statements(&statements) + } + + #[test] + fn test_function_blacklisted() { + let result = validate( + " + SELECT + input_token + FROM swap + WHERE '' = ( + SELECT + CAST(pg_sleep(5) AS text + ) + )", + ); + assert_eq!(result, Err(Error::UnknownFunction("pg_sleep".to_owned()))); + } + + #[test] + fn test_table_function_blacklisted() { + let result = validate( + " + SELECT + vid, + k.sname + FROM swap, + LATERAL( + SELECT + current_schemas as sname + FROM current_schemas(true) + ) as k", + ); + assert_eq!( + result, + Err(Error::UnknownFunction("current_schemas".to_owned())) + ); + } + + #[test] + fn test_function_blacklisted_without_paranthesis() { + let result = validate( + " + SELECT + input_token + FROM swap + WHERE '' = ( + SELECT user + )", + ); + assert_eq!(result, Err(Error::UnknownFunction("user".to_owned()))); + } + + #[test] + fn test_function_whitelisted() { + let result = validate( + " + SELECT + input_token, + SUM(input_amount) AS total_amount + FROM swap + GROUP BY input_token + HAVING SUM(input_amount) > 1000 + ", + ); + assert_eq!(result, Ok(())); + } + + #[test] + fn test_function_unknown() { + let result = validate( + " + SELECT + input_token + FROM swap + WHERE '' = ( + SELECT + CAST(do_strange_math(amount_in) AS text + ) + )", + ); + assert_eq!( + result, + Err(Error::UnknownFunction("do_strange_math".to_owned())) + ); + } + + #[test] + fn test_not_select_ddl() { + let result = validate( + " + CREATE TABLE foo (id INT PRIMARY KEY); + ", + ); + assert_eq!(result, Err(Error::NotSelectQuery)); + } + + #[test] + fn test_not_select_insert() { + let result = validate( + " + INSERT INTO foo VALUES (1); + ", + ); + assert_eq!(result, Err(Error::NotSelectQuery)); + } + + #[test] + fn test_common_table_expression() { + let result = validate( + " + WITH foo AS (SELECT 1) SELECT * FROM foo; + ", + ); + assert_eq!(result, Ok(())); + } + + #[test] + fn 
test_common_table_expression_with_effect() { + let result = validate( + " + WITH foo AS (INSERT INTO target VALUES(1)) SELECT * FROM bar; + ", + ); + assert_eq!(result, Err(Error::NotSelectQuery)); + } + + #[test] + fn test_no_multi_statement() { + let result = validate( + " + SELECT 1; SELECT 2; + ", + ); + assert_eq!(result, Err(Error::MultiStatementUnSupported)); + } + + #[test] + fn test_table_unknown() { + let result = validate( + " + SELECT * FROM unknown_table; + ", + ); + assert_eq!(result, Err(Error::UnknownTable("unknown_table".to_owned()))); + } +} From 8b5061564bb0e99fb91527d6ec2fa56d58a116b0 Mon Sep 17 00:00:00 2001 From: David Lutterkort <lutter@watzmann.net> Date: Tue, 21 Jan 2025 16:41:51 -0800 Subject: [PATCH 03/13] store: Use Layout for schema information --- store/postgres/src/sql/formatter.rs | 38 ++++++++-------- store/postgres/src/sql/mod.rs | 25 +++++++++-- store/postgres/src/sql/parser.rs | 26 +++-------- store/postgres/src/sql/validation.rs | 65 ++++++++++++++-------------- 4 files changed, 79 insertions(+), 75 deletions(-) diff --git a/store/postgres/src/sql/formatter.rs b/store/postgres/src/sql/formatter.rs index dff810aa6ea..0a2aafb6413 100644 --- a/store/postgres/src/sql/formatter.rs +++ b/store/postgres/src/sql/formatter.rs @@ -1,16 +1,16 @@ use sqlparser::ast::{ObjectName, Statement, TableFactor, VisitMut, VisitorMut}; use std::ops::ControlFlow; -use super::Schema; +use crate::relational::{Layout, SqlName}; pub struct Formatter<'a> { prelude: &'a str, - schema: &'a Schema, + layout: &'a Layout, } impl<'a> Formatter<'a> { - pub fn new(prelude: &'a str, schema: &'a Schema) -> Self { - Self { prelude, schema } + pub fn new(prelude: &'a str, layout: &'a Layout) -> Self { + Self { prelude, layout } } fn prepend_prefix_to_object_name_mut(&self, name: &mut ObjectName) { @@ -20,7 +20,8 @@ impl<'a> Formatter<'a> { // Ensure schema tables has quotation to match up with prelude generated cte. 
if let Some(table_name) = table_identifier.last_mut() { - if self.schema.contains_key(&table_name.value) { + let sql_name = SqlName::verbatim(table_name.to_string()); + if self.layout.table(&sql_name).is_some() { table_name.quote_style = Some('"'); } } @@ -52,10 +53,19 @@ impl VisitorMut for Formatter<'_> { #[cfg(test)] mod test { - use std::collections::HashSet; - use super::*; - use crate::sql::constants::SQL_DIALECT; + use crate::sql::{constants::SQL_DIALECT, test::make_layout}; + + const GQL: &str = " + type Swap @entity { + id: ID! + amountIn: BigDecimal! + amountOut: BigDecimal! + tokenIn: Bytes! + tokenOut: Bytes! + } + "; + const CTE_PREFIX: &str = "WITH \"swap\" AS ( SELECT id, @@ -69,17 +79,9 @@ mod test { #[test] fn format_sql() { - let mut schema = Schema::new(); - schema.insert( - "swap".to_string(), - HashSet::from_iter( - ["id", "amount_in", "amount_out", "token_in", "token_out"] - .into_iter() - .map(|s| s.to_string()), - ), - ); + let layout = make_layout(GQL); - let mut formatter = Formatter::new(CTE_PREFIX, &schema); + let mut formatter = Formatter::new(CTE_PREFIX, &layout); let sql = "SELECT token_in, SUM(amount_in) AS amount FROM unknown.swap GROUP BY token_in"; diff --git a/store/postgres/src/sql/mod.rs b/store/postgres/src/sql/mod.rs index d3962ae968e..cf655050753 100644 --- a/store/postgres/src/sql/mod.rs +++ b/store/postgres/src/sql/mod.rs @@ -3,8 +3,27 @@ mod formatter; mod parser; mod validation; -use std::collections::{HashMap, HashSet}; +pub use parser::Parser; -pub(self) type Schema = HashMap<String, HashSet<String>>; // HashMap<Table, HashSet<Column>> +#[cfg(test)] +mod test { + use std::{collections::BTreeSet, sync::Arc}; -pub use parser::Parser; + use graph::{prelude::DeploymentHash, schema::InputSchema}; + + use crate::{ + catalog::Catalog, + primary::{make_dummy_site, Namespace}, + relational::Layout, + }; + + pub(crate) fn make_layout(gql: &str) -> Layout { + let subgraph = DeploymentHash::new("Qmasubgraph").unwrap(); + let 
schema = InputSchema::parse_latest(gql, subgraph.clone()).unwrap(); + let namespace = Namespace::new("sgd0815".to_string()).unwrap(); + let site = Arc::new(make_dummy_site(subgraph, namespace, "anet".to_string())); + let catalog = Catalog::for_tests(site.clone(), BTreeSet::new()).unwrap(); + let layout = Layout::new(site, &schema, catalog).unwrap(); + layout + } +} diff --git a/store/postgres/src/sql/parser.rs b/store/postgres/src/sql/parser.rs index c449803fe86..7e972d96777 100644 --- a/store/postgres/src/sql/parser.rs +++ b/store/postgres/src/sql/parser.rs @@ -52,39 +52,23 @@ pub fn generate_table_prelude_from_layout(layout: &Layout) -> String { } pub struct Parser { - schema: super::Schema, + layout: Arc<Layout>, prelude: String, } impl Parser { pub fn new(layout: Arc<Layout>) -> Self { - Self { - schema: layout - .tables - .iter() - .filter(|(entity, _)| !entity.is_poi()) - .map(|(_, table)| { - ( - table.name.to_string(), - table - .columns - .iter() - .map(|column| column.name.to_string()) - .collect(), - ) - }) - .collect(), - prelude: generate_table_prelude_from_layout(&layout), - } + let prelude = generate_table_prelude_from_layout(&layout); + Self { layout, prelude } } pub fn parse_and_validate(&self, sql: &str) -> Result<String> { let mut statements = sqlparser::parser::Parser::parse_sql(&SQL_DIALECT, sql)?; - let mut validator = Validator::new(&self.schema); + let mut validator = Validator::new(&self.layout); validator.validate_statements(&statements)?; - let mut formatter = Formatter::new(&self.prelude, &self.schema); + let mut formatter = Formatter::new(&self.prelude, &self.layout); let statement = statements .get_mut(0) diff --git a/store/postgres/src/sql/validation.rs b/store/postgres/src/sql/validation.rs index 674cee4cc5d..4d188e0cc38 100644 --- a/store/postgres/src/sql/validation.rs +++ b/store/postgres/src/sql/validation.rs @@ -2,7 +2,9 @@ use sqlparser::ast::{Expr, ObjectName, Query, SetExpr, Statement, TableFactor, V use std::result::Result; 
use std::{collections::HashSet, ops::ControlFlow}; -use super::{constants::ALLOWED_FUNCTIONS, Schema}; +use crate::relational::Layout; + +use super::constants::ALLOWED_FUNCTIONS; #[derive(thiserror::Error, Debug, PartialEq)] pub enum Error { @@ -17,14 +19,14 @@ pub enum Error { } pub struct Validator<'a> { - schema: &'a Schema, + layout: &'a Layout, ctes: HashSet<String>, } impl<'a> Validator<'a> { - pub fn new(schema: &'a Schema) -> Self { + pub fn new(layout: &'a Layout) -> Self { Self { - schema, + layout, ctes: Default::default(), } } @@ -54,9 +56,9 @@ impl<'a> Validator<'a> { fn validate_table_name(&mut self, name: &ObjectName) -> ControlFlow<Error> { if let Some(table_name) = name.0.last() { - let table_name = table_name.to_string().to_lowercase(); - if !self.schema.contains_key(&table_name) && !self.ctes.contains(&table_name) { - return ControlFlow::Break(Error::UnknownTable(table_name)); + let name = &table_name.value; + if !self.layout.table(name).is_some() && !self.ctes.contains(name) { + return ControlFlow::Break(Error::UnknownTable(name.to_string())); } } ControlFlow::Continue(()) @@ -114,38 +116,35 @@ impl Visitor for Validator<'_> { #[cfg(test)] mod test { use super::*; - use crate::sql::constants::SQL_DIALECT; - use std::collections::{HashMap, HashSet}; + use crate::sql::{constants::SQL_DIALECT, test::make_layout}; fn validate(sql: &str) -> Result<(), Error> { let statements = sqlparser::parser::Parser::parse_sql(&SQL_DIALECT, sql).unwrap(); - let schema: Schema = HashMap::from([( - "swap".to_owned(), - HashSet::from([ - "vid".to_owned(), - "block$".to_owned(), - "id".to_owned(), - "sender".to_owned(), - "input_amount".to_owned(), - "input_token".to_owned(), - "amount_out".to_owned(), - "output_token".to_owned(), - "slippage".to_owned(), - "referral_code".to_owned(), - "block_number".to_owned(), - "block_timestamp".to_owned(), - "transaction_hash".to_owned(), - ]), - )]); - - let mut validator = Validator::new(&schema); + const GQL: &str = " + type 
Swap @entity { + id: ID! + sender: Bytes! + inputAmount: BigDecimal! + inputToken: Bytes! + amountOut: BigDecimal! + outputToken: Bytes! + slippage: BigDecimal! + referralCode: String + blockNumber: Int! + blockTimestamp: Timestamp! + transactionHash: Bytes! + }"; + + let layout = make_layout(GQL); + + let mut validator = Validator::new(&layout); validator.validate_statements(&statements) } #[test] - fn test_function_blacklisted() { + fn test_function_disallowed() { let result = validate( " SELECT @@ -161,7 +160,7 @@ mod test { } #[test] - fn test_table_function_blacklisted() { + fn test_table_function_disallowed() { let result = validate( " SELECT @@ -181,7 +180,7 @@ mod test { } #[test] - fn test_function_blacklisted_without_paranthesis() { + fn test_function_disallowed_without_paranthesis() { let result = validate( " SELECT @@ -195,7 +194,7 @@ mod test { } #[test] - fn test_function_whitelisted() { + fn test_function_allowed() { let result = validate( " SELECT From a2dca57777c0ea720bffddb18a315987a114c730 Mon Sep 17 00:00:00 2001 From: David Lutterkort <lutter@watzmann.net> Date: Thu, 23 Jan 2025 21:07:59 -0800 Subject: [PATCH 04/13] store: Rewrite table names in from clauses --- store/postgres/src/sql/formatter.rs | 103 --------------------------- store/postgres/src/sql/mod.rs | 1 - store/postgres/src/sql/parser.rs | 102 ++++++-------------------- store/postgres/src/sql/validation.rs | 78 ++++++++++++++------ 4 files changed, 78 insertions(+), 206 deletions(-) delete mode 100644 store/postgres/src/sql/formatter.rs diff --git a/store/postgres/src/sql/formatter.rs b/store/postgres/src/sql/formatter.rs deleted file mode 100644 index 0a2aafb6413..00000000000 --- a/store/postgres/src/sql/formatter.rs +++ /dev/null @@ -1,103 +0,0 @@ -use sqlparser::ast::{ObjectName, Statement, TableFactor, VisitMut, VisitorMut}; -use std::ops::ControlFlow; - -use crate::relational::{Layout, SqlName}; - -pub struct Formatter<'a> { - prelude: &'a str, - layout: &'a Layout, -} - 
-impl<'a> Formatter<'a> { - pub fn new(prelude: &'a str, layout: &'a Layout) -> Self { - Self { prelude, layout } - } - - fn prepend_prefix_to_object_name_mut(&self, name: &mut ObjectName) { - let table_identifier = &mut name.0; - // remove all but the last identifier - table_identifier.drain(0..table_identifier.len() - 1); - - // Ensure schema tables has quotation to match up with prelude generated cte. - if let Some(table_name) = table_identifier.last_mut() { - let sql_name = SqlName::verbatim(table_name.to_string()); - if self.layout.table(&sql_name).is_some() { - table_name.quote_style = Some('"'); - } - } - } - - pub fn format(&mut self, statement: &mut Statement) -> String { - statement.visit(self); - - format!( - "{} SELECT to_jsonb(sub.*) AS data FROM ( {} ) AS sub", - self.prelude, statement - ) - } -} - -impl VisitorMut for Formatter<'_> { - type Break = (); - - fn pre_visit_table_factor( - &mut self, - table_factor: &mut TableFactor, - ) -> ControlFlow<Self::Break> { - if let TableFactor::Table { name, .. } = table_factor { - self.prepend_prefix_to_object_name_mut(name); - } - ControlFlow::Continue(()) - } -} - -#[cfg(test)] -mod test { - use super::*; - use crate::sql::{constants::SQL_DIALECT, test::make_layout}; - - const GQL: &str = " - type Swap @entity { - id: ID! - amountIn: BigDecimal! - amountOut: BigDecimal! - tokenIn: Bytes! - tokenOut: Bytes! 
- } - "; - - const CTE_PREFIX: &str = "WITH \"swap\" AS ( - SELECT - id, - amount_in, - amount_out, - concat('0x',encode(token_in,'hex') as token_in, - concat('0x',token_out,'hex') AS token_out - FROM - sdg1.swap - )"; - - #[test] - fn format_sql() { - let layout = make_layout(GQL); - - let mut formatter = Formatter::new(CTE_PREFIX, &layout); - - let sql = "SELECT token_in, SUM(amount_in) AS amount FROM unknown.swap GROUP BY token_in"; - - let mut statements = sqlparser::parser::Parser::parse_sql(&SQL_DIALECT, sql).unwrap(); - - let mut statement = statements.get_mut(0).unwrap(); - - let result = formatter.format(&mut statement); - - assert_eq!( - result, - format!( - "{} SELECT to_jsonb(sub.*) AS data FROM ( {} ) AS sub", - CTE_PREFIX, - "SELECT token_in, SUM(amount_in) AS amount FROM \"swap\" GROUP BY token_in" - ) - ); - } -} diff --git a/store/postgres/src/sql/mod.rs b/store/postgres/src/sql/mod.rs index cf655050753..55917f854c4 100644 --- a/store/postgres/src/sql/mod.rs +++ b/store/postgres/src/sql/mod.rs @@ -1,5 +1,4 @@ mod constants; -mod formatter; mod parser; mod validation; diff --git a/store/postgres/src/sql/parser.rs b/store/postgres/src/sql/parser.rs index 7e972d96777..ded1eac9b45 100644 --- a/store/postgres/src/sql/parser.rs +++ b/store/postgres/src/sql/parser.rs @@ -1,92 +1,41 @@ -use super::{constants::SQL_DIALECT, formatter::Formatter, validation::Validator}; -use crate::relational::{ColumnType, Layout}; +use super::{constants::SQL_DIALECT, validation::Validator}; +use crate::relational::Layout; use anyhow::{anyhow, Ok, Result}; -use graph::components::store::BLOCK_NUMBER_MAX; -use itertools::Itertools; use std::sync::Arc; -pub fn generate_table_prelude_from_layout(layout: &Layout) -> String { - let schema = &layout.catalog.site.namespace; - let ctes = layout - .tables - .iter() - .filter(|(entity, _)| !entity.is_poi()) - .map(|(_, table)| { - let table_name = table.name.as_str(); - - let (block_column, filter) = if !table.immutable { - ( - 
"block_range", - Some(format!(" WHERE \"block_range\" @> {}", BLOCK_NUMBER_MAX)), - ) - } else { - ("block$", None) - }; - - let columns = table - .columns - .iter() - .map(|col| { - if !col.is_list() && col.column_type == ColumnType::Bytes { - format!( - r#"concat('0x', encode("{}", 'hex')) AS "{}""#, - col.name.as_str(), - col.name.as_str() - ) - } else { - format!(r#""{}""#, col.name.as_str()) - } - }) - .chain(std::iter::once(format!(r#""{}""#, block_column))) - .collect::<Vec<_>>() - .join(", "); - format!( - "\"{table_name}\" AS (SELECT {columns} FROM \"{schema}\".\"{table_name}\"{})", - filter.unwrap_or_default() - ) - }) - .sorted() - .collect::<Vec<_>>() - .join(",\n"); - format!("WITH {ctes}") -} - pub struct Parser { layout: Arc<Layout>, - prelude: String, } impl Parser { pub fn new(layout: Arc<Layout>) -> Self { - let prelude = generate_table_prelude_from_layout(&layout); - Self { layout, prelude } + Self { layout } } pub fn parse_and_validate(&self, sql: &str) -> Result<String> { let mut statements = sqlparser::parser::Parser::parse_sql(&SQL_DIALECT, sql)?; let mut validator = Validator::new(&self.layout); - validator.validate_statements(&statements)?; - - let mut formatter = Formatter::new(&self.prelude, &self.layout); + validator.validate_statements(&mut statements)?; let statement = statements .get_mut(0) .ok_or_else(|| anyhow!("No SQL statements found"))?; - let result = formatter.format(statement); - - Ok(result) + let sql = format!( + "select to_jsonb(sub.*) as data from ( {} ) as sub", + statement + ); + Ok(sql) } } #[cfg(test)] mod test { - use crate::layout_for_tests::{make_dummy_site, Catalog, Namespace}; + use crate::sql::test::make_layout; use super::*; - use graph::{data::subgraph::DeploymentHash, schema::InputSchema}; const TEST_GQL: &str = " type SwapMulti @entity(immutable: true) { @@ -111,8 +60,6 @@ mod test { } "; - const NAMESPACE: &str = "sgd0815"; - const SQL_QUERY: &str = " with tokens as ( select * from (values @@ -145,26 +92,15 
@@ mod test { "; - fn test_layout() -> Layout { - let subgraph = DeploymentHash::new("subgraph").unwrap(); - let schema = - InputSchema::parse_latest(TEST_GQL, subgraph.clone()).expect("Test schema invalid"); - let namespace = Namespace::new(NAMESPACE.to_owned()).unwrap(); - let site = Arc::new(make_dummy_site(subgraph, namespace, "anet".to_string())); - let catalog = - Catalog::for_tests(site.clone(), Default::default()).expect("Can not create catalog"); - Layout::new(site, &schema, catalog).expect("Failed to construct Layout") + fn parse_and_validate(sql: &str) -> Result<String, anyhow::Error> { + let parser = Parser::new(Arc::new(make_layout(TEST_GQL))); + + parser.parse_and_validate(sql) } #[test] fn parse_sql() { - let parser = Parser::new(Arc::new(test_layout())); - - let result = parser.parse_and_validate(SQL_QUERY); - - assert!(result.is_ok()); - - let query = result.unwrap(); + let query = parse_and_validate(SQL_QUERY).unwrap(); assert_eq!( query, @@ -172,4 +108,12 @@ mod test { "token" AS (SELECT "id", concat('0x', encode("address", 'hex')) AS "address", "symbol", "name", "decimals", "block_range" FROM "sgd0815"."token" WHERE "block_range" @> 2147483647) SELECT to_jsonb(sub.*) AS data FROM ( WITH tokens AS (SELECT * FROM (VALUES ('0x0000000000000000000000000000000000000000', 'ETH', 'Ethereum', 18), ('0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48', 'USDC', 'USD Coin', 6)) AS t (address, symbol, name, decimals)) SELECT date, t.symbol, SUM(amount) / pow(10, t.decimals) AS amount FROM (SELECT date(to_timestamp(block_timestamp) AT TIME ZONE 'utc') AS date, token, amount FROM "swap_multi" AS sm, UNNEST(sm.amounts_in, sm.tokens_in) AS smi (amount, token) UNION ALL SELECT date(to_timestamp(block_timestamp) AT TIME ZONE 'utc') AS date, token, amount FROM "swap_multi" AS sm, UNNEST(sm.amounts_out, sm.tokens_out) AS smo (amount, token)) AS tp JOIN tokens AS t ON t.address = '0x' || encode(tp.token, 'hex') GROUP BY tp.date, t.symbol, t.decimals ORDER BY tp.date DESC, 
amount DESC ) AS sub"# ); } + + #[test] + fn parse_simple_sql() { + let query = + parse_and_validate("select symbol, address from token where decimals > 10").unwrap(); + + println!("{}", query); + } } diff --git a/store/postgres/src/sql/validation.rs b/store/postgres/src/sql/validation.rs index 4d188e0cc38..6eb8606d128 100644 --- a/store/postgres/src/sql/validation.rs +++ b/store/postgres/src/sql/validation.rs @@ -1,10 +1,14 @@ -use sqlparser::ast::{Expr, ObjectName, Query, SetExpr, Statement, TableFactor, Visit, Visitor}; +use sqlparser::ast::{ + Expr, Ident, ObjectName, Query, SetExpr, Statement, TableAlias, TableFactor, VisitMut, + VisitorMut, +}; +use sqlparser::parser::Parser; use std::result::Result; use std::{collections::HashSet, ops::ControlFlow}; use crate::relational::Layout; -use super::constants::ALLOWED_FUNCTIONS; +use super::constants::{ALLOWED_FUNCTIONS, SQL_DIALECT}; #[derive(thiserror::Error, Debug, PartialEq)] pub enum Error { @@ -40,7 +44,7 @@ impl<'a> Validator<'a> { } } - pub fn validate_statements(&mut self, statements: &Vec<Statement>) -> Result<(), Error> { + pub fn validate_statements(&mut self, statements: &mut Vec<Statement>) -> Result<(), Error> { self.ctes.clear(); if statements.len() > 1 { @@ -53,29 +57,19 @@ impl<'a> Validator<'a> { Ok(()) } - - fn validate_table_name(&mut self, name: &ObjectName) -> ControlFlow<Error> { - if let Some(table_name) = name.0.last() { - let name = &table_name.value; - if !self.layout.table(name).is_some() && !self.ctes.contains(name) { - return ControlFlow::Break(Error::UnknownTable(name.to_string())); - } - } - ControlFlow::Continue(()) - } } -impl Visitor for Validator<'_> { +impl VisitorMut for Validator<'_> { type Break = Error; - fn pre_visit_statement(&mut self, _statement: &Statement) -> ControlFlow<Self::Break> { + fn pre_visit_statement(&mut self, _statement: &mut Statement) -> ControlFlow<Self::Break> { match _statement { Statement::Query(_) => ControlFlow::Continue(()), _ => 
ControlFlow::Break(Error::NotSelectQuery), } } - fn pre_visit_query(&mut self, _query: &Query) -> ControlFlow<Self::Break> { + fn pre_visit_query(&mut self, _query: &mut Query) -> ControlFlow<Self::Break> { // Add common table expressions to the set of known tables if let Some(ref with) = _query.with { self.ctes.extend( @@ -93,19 +87,57 @@ impl Visitor for Validator<'_> { /// Invoked for any table function in the AST. /// See [TableFactor::Table.args](sqlparser::ast::TableFactor::Table::args) for more details identifying a table function - fn pre_visit_table_factor(&mut self, table_factor: &TableFactor) -> ControlFlow<Self::Break> { - if let TableFactor::Table { name, args, .. } = table_factor { + fn post_visit_table_factor( + &mut self, + table_factor: &mut TableFactor, + ) -> ControlFlow<Self::Break> { + if let TableFactor::Table { + name, args, alias, .. + } = table_factor + { if args.is_some() { return self.validate_function_name(name); - } else { - return self.validate_table_name(name); } + let table = if let Some(table_name) = name.0.last() { + let name = &table_name.value; + let Some(table) = self.layout.table(name) else { + if !self.ctes.contains(name) { + return ControlFlow::Break(Error::UnknownTable(name.to_string())); + } else { + return ControlFlow::Continue(()); + } + }; + table + } else { + return ControlFlow::Continue(()); + }; + + // Change 'from table [as alias]' to 'from (select * from table) as alias' + let query = format!("select * from {}", table.qualified_name); + let Statement::Query(subquery) = Parser::parse_sql(&SQL_DIALECT, &query) + .unwrap() + .pop() + .unwrap() + else { + unreachable!(); + }; + let alias = alias.as_ref().map(|alias| alias.clone()).or_else(|| { + Some(TableAlias { + name: Ident::new(table.name.as_str()), + columns: vec![], + }) + }); + *table_factor = TableFactor::Derived { + lateral: false, + subquery, + alias, + }; } ControlFlow::Continue(()) } /// Invoked for any function expressions that appear in the AST - fn 
pre_visit_expr(&mut self, _expr: &Expr) -> ControlFlow<Self::Break> { + fn pre_visit_expr(&mut self, _expr: &mut Expr) -> ControlFlow<Self::Break> { if let Expr::Function(function) = _expr { return self.validate_function_name(&function.name); } @@ -119,7 +151,7 @@ mod test { use crate::sql::{constants::SQL_DIALECT, test::make_layout}; fn validate(sql: &str) -> Result<(), Error> { - let statements = sqlparser::parser::Parser::parse_sql(&SQL_DIALECT, sql).unwrap(); + let mut statements = sqlparser::parser::Parser::parse_sql(&SQL_DIALECT, sql).unwrap(); const GQL: &str = " type Swap @entity { @@ -140,7 +172,7 @@ mod test { let mut validator = Validator::new(&layout); - validator.validate_statements(&statements) + validator.validate_statements(&mut statements) } #[test] From bb3850b742b971d42ac13b40430c0ab510c8ac60 Mon Sep 17 00:00:00 2001 From: David Lutterkort <lutter@watzmann.net> Date: Fri, 24 Jan 2025 13:18:11 -0800 Subject: [PATCH 05/13] store: Revamp query rewriting 1. Do not use CTE's to inject a view of a table at a certain block. Instead rewrite the 'from' clause 2. 
Do not turn bytea columns into string columns since that is hugely wasteful --- store/postgres/src/query_store.rs | 2 +- store/postgres/src/sql/parser.rs | 16 ++++++++++++---- store/postgres/src/sql/validation.rs | 22 +++++++++++++++++++--- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/store/postgres/src/query_store.rs b/store/postgres/src/query_store.rs index f6b2a22712c..c0eeff16dab 100644 --- a/store/postgres/src/query_store.rs +++ b/store/postgres/src/query_store.rs @@ -30,7 +30,7 @@ impl QueryStore { ) -> Self { let sql_parser = store .find_layout(site.clone()) - .map(|layout| Parser::new(layout)); + .map(|layout| Parser::new(layout, BLOCK_NUMBER_MAX)); QueryStore { site, replica_id, diff --git a/store/postgres/src/sql/parser.rs b/store/postgres/src/sql/parser.rs index ded1eac9b45..7ea0d74f107 100644 --- a/store/postgres/src/sql/parser.rs +++ b/store/postgres/src/sql/parser.rs @@ -1,21 +1,23 @@ use super::{constants::SQL_DIALECT, validation::Validator}; use crate::relational::Layout; use anyhow::{anyhow, Ok, Result}; +use graph::prelude::BlockNumber; use std::sync::Arc; pub struct Parser { layout: Arc<Layout>, + block: BlockNumber, } impl Parser { - pub fn new(layout: Arc<Layout>) -> Self { - Self { layout } + pub fn new(layout: Arc<Layout>, block: BlockNumber) -> Self { + Self { layout, block } } pub fn parse_and_validate(&self, sql: &str) -> Result<String> { let mut statements = sqlparser::parser::Parser::parse_sql(&SQL_DIALECT, sql)?; - let mut validator = Validator::new(&self.layout); + let mut validator = Validator::new(&self.layout, self.block); validator.validate_statements(&mut statements)?; let statement = statements @@ -33,6 +35,8 @@ impl Parser { #[cfg(test)] mod test { + use graph::prelude::BLOCK_NUMBER_MAX; + use crate::sql::test::make_layout; use super::*; @@ -93,7 +97,7 @@ mod test { "; fn parse_and_validate(sql: &str) -> Result<String, anyhow::Error> { - let parser = Parser::new(Arc::new(make_layout(TEST_GQL))); + let parser = 
Parser::new(Arc::new(make_layout(TEST_GQL)), BLOCK_NUMBER_MAX); parser.parse_and_validate(sql) } @@ -114,6 +118,10 @@ mod test { let query = parse_and_validate("select symbol, address from token where decimals > 10").unwrap(); + assert_eq!( + query, + r#"select to_jsonb(sub.*) as data from ( SELECT symbol, address FROM (SELECT * FROM "sgd0815"."token" WHERE block_range @> 2147483647) AS token WHERE decimals > 10 ) as sub"# + ); println!("{}", query); } } diff --git a/store/postgres/src/sql/validation.rs b/store/postgres/src/sql/validation.rs index 6eb8606d128..7639216aa0a 100644 --- a/store/postgres/src/sql/validation.rs +++ b/store/postgres/src/sql/validation.rs @@ -1,3 +1,4 @@ +use graph::prelude::BlockNumber; use sqlparser::ast::{ Expr, Ident, ObjectName, Query, SetExpr, Statement, TableAlias, TableFactor, VisitMut, VisitorMut, @@ -6,6 +7,7 @@ use sqlparser::parser::Parser; use std::result::Result; use std::{collections::HashSet, ops::ControlFlow}; +use crate::block_range::{BLOCK_COLUMN, BLOCK_RANGE_COLUMN}; use crate::relational::Layout; use super::constants::{ALLOWED_FUNCTIONS, SQL_DIALECT}; @@ -25,13 +27,15 @@ pub enum Error { pub struct Validator<'a> { layout: &'a Layout, ctes: HashSet<String>, + block: BlockNumber, } impl<'a> Validator<'a> { - pub fn new(layout: &'a Layout) -> Self { + pub fn new(layout: &'a Layout, block: BlockNumber) -> Self { Self { layout, ctes: Default::default(), + block, } } @@ -113,7 +117,17 @@ impl VisitorMut for Validator<'_> { }; // Change 'from table [as alias]' to 'from (select * from table) as alias' - let query = format!("select * from {}", table.qualified_name); + let query = if table.immutable { + format!( + "select * from {} where {} <= {}", + table.qualified_name, BLOCK_COLUMN, self.block + ) + } else { + format!( + "select * from {} where {} @> {}", + table.qualified_name, BLOCK_RANGE_COLUMN, self.block + ) + }; let Statement::Query(subquery) = Parser::parse_sql(&SQL_DIALECT, &query) .unwrap() .pop() @@ -147,6 +161,8 @@ 
impl VisitorMut for Validator<'_> { #[cfg(test)] mod test { + use graph::prelude::BLOCK_NUMBER_MAX; + use super::*; use crate::sql::{constants::SQL_DIALECT, test::make_layout}; @@ -170,7 +186,7 @@ mod test { let layout = make_layout(GQL); - let mut validator = Validator::new(&layout); + let mut validator = Validator::new(&layout, BLOCK_NUMBER_MAX); validator.validate_statements(&mut statements) } From 82d9ec764ac5b4b426a262407450cdcd7c61a87c Mon Sep 17 00:00:00 2001 From: David Lutterkort <lutter@watzmann.net> Date: Fri, 24 Jan 2025 14:40:25 -0800 Subject: [PATCH 06/13] store: Extract SQL parsing tests into a YAML file That setup makes it much easier to add more tests that check that we scrub dangerous constructs from SQL --- store/postgres/src/sql/parser.rs | 160 ++++++++++++++--------- store/postgres/src/sql/parser_tests.yaml | 55 ++++++++ 2 files changed, 153 insertions(+), 62 deletions(-) create mode 100644 store/postgres/src/sql/parser_tests.yaml diff --git a/store/postgres/src/sql/parser.rs b/store/postgres/src/sql/parser.rs index 7ea0d74f107..4d19fd444fa 100644 --- a/store/postgres/src/sql/parser.rs +++ b/store/postgres/src/sql/parser.rs @@ -34,25 +34,32 @@ impl Parser { #[cfg(test)] mod test { + use std::sync::Arc; - use graph::prelude::BLOCK_NUMBER_MAX; + use crate::sql::{parser::SQL_DIALECT, test::make_layout}; + use graph::prelude::{lazy_static, serde_yaml, BLOCK_NUMBER_MAX}; + use serde::{Deserialize, Serialize}; - use crate::sql::test::make_layout; + use pretty_assertions::assert_eq; - use super::*; + use super::Parser; const TEST_GQL: &str = " - type SwapMulti @entity(immutable: true) { + type Swap @entity(immutable: true) { id: Bytes! - sender: Bytes! # address - amountsIn: [BigInt!]! # uint256[] - tokensIn: [Bytes!]! # address[] - amountsOut: [BigInt!]! # uint256[] - tokensOut: [Bytes!]! # address[] - referralCode: BigInt! # uint32 - blockNumber: BigInt! - blockTimestamp: BigInt! - transactionHash: Bytes! + timestamp: BigInt! + pool: Bytes! 
+ token0: Bytes! + token1: Bytes! + sender: Bytes! + recipient: Bytes! + origin: Bytes! # the EOA that initiated the txn + amount0: BigDecimal! + amount1: BigDecimal! + amountUSD: BigDecimal! + sqrtPriceX96: BigInt! + tick: BigInt! + logIndex: BigInt } type Token @entity { @@ -64,64 +71,93 @@ mod test { } "; - const SQL_QUERY: &str = " - with tokens as ( - select * from (values - ('0x0000000000000000000000000000000000000000','ETH','Ethereum',18), - ('0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48','USDC','USD Coin',6) - ) as t(address,symbol,name,decimals) - ) - - select - date, - t.symbol, - SUM(amount)/pow(10,t.decimals) as amount - from (select - date(to_timestamp(block_timestamp) at time zone 'utc') as date, - token, - amount - from swap_multi as sm - ,unnest(sm.amounts_in,sm.tokens_in) as smi(amount,token) - union all - select - date(to_timestamp(block_timestamp) at time zone 'utc') as date, - token, - amount - from sgd1.swap_multi as sm - ,unnest(sm.amounts_out,sm.tokens_out) as smo(amount,token) - ) as tp - inner join tokens as t on t.address = '0x' || encode(tp.token,'hex') - group by tp.date,t.symbol,t.decimals - order by tp.date desc ,amount desc - - "; - fn parse_and_validate(sql: &str) -> Result<String, anyhow::Error> { let parser = Parser::new(Arc::new(make_layout(TEST_GQL)), BLOCK_NUMBER_MAX); parser.parse_and_validate(sql) } - #[test] - fn parse_sql() { - let query = parse_and_validate(SQL_QUERY).unwrap(); + #[derive(Debug, Serialize, Deserialize)] + struct TestCase { + name: Option<String>, + sql: String, + ok: Option<String>, + err: Option<String>, + } - assert_eq!( - query, - r#"WITH "swap_multi" AS (SELECT concat('0x', encode("id", 'hex')) AS "id", concat('0x', encode("sender", 'hex')) AS "sender", "amounts_in", "tokens_in", "amounts_out", "tokens_out", "referral_code", "block_number", "block_timestamp", concat('0x', encode("transaction_hash", 'hex')) AS "transaction_hash", "block$" FROM "sgd0815"."swap_multi"), -"token" AS (SELECT "id", concat('0x', 
encode("address", 'hex')) AS "address", "symbol", "name", "decimals", "block_range" FROM "sgd0815"."token" WHERE "block_range" @> 2147483647) SELECT to_jsonb(sub.*) AS data FROM ( WITH tokens AS (SELECT * FROM (VALUES ('0x0000000000000000000000000000000000000000', 'ETH', 'Ethereum', 18), ('0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48', 'USDC', 'USD Coin', 6)) AS t (address, symbol, name, decimals)) SELECT date, t.symbol, SUM(amount) / pow(10, t.decimals) AS amount FROM (SELECT date(to_timestamp(block_timestamp) AT TIME ZONE 'utc') AS date, token, amount FROM "swap_multi" AS sm, UNNEST(sm.amounts_in, sm.tokens_in) AS smi (amount, token) UNION ALL SELECT date(to_timestamp(block_timestamp) AT TIME ZONE 'utc') AS date, token, amount FROM "swap_multi" AS sm, UNNEST(sm.amounts_out, sm.tokens_out) AS smo (amount, token)) AS tp JOIN tokens AS t ON t.address = '0x' || encode(tp.token, 'hex') GROUP BY tp.date, t.symbol, t.decimals ORDER BY tp.date DESC, amount DESC ) AS sub"# - ); + impl TestCase { + fn fail( + &self, + name: &str, + msg: &str, + exp: impl std::fmt::Display, + actual: impl std::fmt::Display, + ) { + panic!( + "case {name} failed: {}\n expected: {}\n actual: {}", + msg, exp, actual + ); + } + + fn run(&self, num: usize) { + fn normalize(query: &str) -> String { + sqlparser::parser::Parser::parse_sql(&SQL_DIALECT, query) + .unwrap() + .pop() + .unwrap() + .to_string() + } + + let name = self + .name + .as_ref() + .map(|name| format!("{num} ({name})")) + .unwrap_or_else(|| num.to_string()); + let result = parse_and_validate(&self.sql); + + match (&self.ok, &self.err, result) { + (Some(expected), None, Ok(actual)) => { + let actual = normalize(&actual); + let expected = normalize(expected); + assert_eq!(actual, expected, "case {} failed", name); + } + (None, Some(expected), Err(actual)) => { + let actual = actual.to_string(); + if !actual.contains(expected) { + self.fail(&name, "expected error message not found", expected, actual); + } + } + (Some(_), Some(_), _) 
=> { + panic!("case {} has both ok and err", name); + } + (None, None, _) => { + panic!("case {} has neither ok nor err", name) + } + (None, Some(exp), Ok(actual)) => { + self.fail(&name, "expected an error", exp, actual); + } + (Some(exp), None, Err(actual)) => self.fail(&name, "expected success", exp, actual), + } + } } - #[test] - fn parse_simple_sql() { - let query = - parse_and_validate("select symbol, address from token where decimals > 10").unwrap(); + lazy_static! { + static ref TESTS: Vec<TestCase> = { + let file = std::path::PathBuf::from_iter([ + env!("CARGO_MANIFEST_DIR"), + "src", + "sql", + "parser_tests.yaml", + ]); + let tests = std::fs::read_to_string(file).unwrap(); + serde_yaml::from_str(&tests).unwrap() + }; + } - assert_eq!( - query, - r#"select to_jsonb(sub.*) as data from ( SELECT symbol, address FROM (SELECT * FROM "sgd0815"."token" WHERE block_range @> 2147483647) AS token WHERE decimals > 10 ) as sub"# - ); - println!("{}", query); + #[test] + fn parse_sql() { + for (num, case) in TESTS.iter().enumerate() { + case.run(num); + } } } diff --git a/store/postgres/src/sql/parser_tests.yaml b/store/postgres/src/sql/parser_tests.yaml new file mode 100644 index 00000000000..11d2a976e21 --- /dev/null +++ b/store/postgres/src/sql/parser_tests.yaml @@ -0,0 +1,55 @@ +# Test cases for the SQL parser. 
Each test case has the following fields: +# name : an optional name for error messages +# sql : the SQL query to parse +# ok : the expected rewritten query +# err : a part of the error message if parsing should fail +# Of course, only one of ok and err can be specified + +- sql: select symbol, address from token where decimals > 10 + ok: > + select to_jsonb(sub.*) as data from ( + SELECT symbol, address FROM ( + SELECT * FROM "sgd0815"."token" WHERE block_range @> 2147483647) AS token + WHERE decimals > 10 ) as sub +- sql: > + with tokens as ( + select * from (values + ('0x0000000000000000000000000000000000000000','ETH','Ethereum',18), + ('0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48','USDC','USD Coin',6) + ) as t(address,symbol,name,decimals) + ) + + select date, t.symbol, SUM(amount)/pow(10,t.decimals) as amount + from (select + date(to_timestamp(block_timestamp) at time zone 'utc') as date, + token, amount + from swap as sm, + unnest(sm.amounts_in,sm.tokens_in) as smi(amount,token) + union all + select + date(to_timestamp(block_timestamp) at time zone 'utc') as date, + token, amount + from swap as sm, + unnest(sm.amounts_out,sm.tokens_out) as smo(amount,token)) as tp + inner join + tokens as t on t.address = tp.token + group by tp.date, t.symbol, t.decimals + order by tp.date desc, amount desc + ok: > + select to_jsonb(sub.*) as data from ( + WITH tokens AS ( + SELECT * FROM ( + VALUES ('0x0000000000000000000000000000000000000000', 'ETH', 'Ethereum', 18), + ('0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48', 'USDC', 'USD Coin', 6)) + AS t (address, symbol, name, decimals)) + SELECT date, t.symbol, SUM(amount) / pow(10, t.decimals) AS amount + FROM (SELECT date(to_timestamp(block_timestamp) AT TIME ZONE 'utc') AS date, token, amount + FROM (SELECT * FROM "sgd0815"."swap" WHERE block$ <= 2147483647) AS sm, + UNNEST(sm.amounts_in, sm.tokens_in) AS smi (amount, token) + UNION ALL + SELECT date(to_timestamp(block_timestamp) AT TIME ZONE 'utc') AS date, token, amount + FROM 
(SELECT * FROM "sgd0815"."swap" WHERE block$ <= 2147483647) AS sm, + UNNEST(sm.amounts_out, sm.tokens_out) AS smo (amount, token)) AS tp + JOIN tokens AS t ON t.address = tp.token + GROUP BY tp.date, t.symbol, t.decimals + ORDER BY tp.date DESC, amount DESC ) as sub From 35bd6fe796c5c8b54c1761f8ed8b7b03390910e4 Mon Sep 17 00:00:00 2001 From: David Lutterkort <lutter@watzmann.net> Date: Mon, 27 Jan 2025 17:19:59 -0800 Subject: [PATCH 07/13] graph, graphql, server: Add rudimentary SQL query support --- graph/src/components/graphql.rs | 10 ++++-- graph/src/data/query/mod.rs | 2 +- graph/src/data/query/query.rs | 26 +++++++++++++- graph/src/data/store/mod.rs | 1 + graphql/src/runner.rs | 43 +++++++++++++++++++++- server/http/src/service.rs | 63 +++++++++++++++++++++++++++++++-- server/http/tests/server.rs | 12 ++++++- 7 files changed, 149 insertions(+), 8 deletions(-) diff --git a/graph/src/components/graphql.rs b/graph/src/components/graphql.rs index b5fc4273860..8d42cecb9d8 100644 --- a/graph/src/components/graphql.rs +++ b/graph/src/components/graphql.rs @@ -1,6 +1,7 @@ -use crate::data::query::QueryResults; use crate::data::query::{Query, QueryTarget}; -use crate::prelude::DeploymentHash; +use crate::data::query::{QueryResults, SqlQueryReq}; +use crate::data::store::SqlQueryObject; +use crate::prelude::{DeploymentHash, QueryExecutionError}; use async_trait::async_trait; use std::sync::Arc; @@ -28,6 +29,11 @@ pub trait GraphQlRunner: Send + Sync + 'static { ) -> QueryResults; fn metrics(&self) -> Arc<dyn GraphQLMetrics>; + + async fn run_sql_query( + self: Arc<Self>, + req: SqlQueryReq, + ) -> Result<Vec<SqlQueryObject>, QueryExecutionError>; } pub trait GraphQLMetrics: Send + Sync + 'static { diff --git a/graph/src/data/query/mod.rs b/graph/src/data/query/mod.rs index 73a6f1fe220..407c2218525 100644 --- a/graph/src/data/query/mod.rs +++ b/graph/src/data/query/mod.rs @@ -6,6 +6,6 @@ mod trace; pub use self::cache_status::CacheStatus; pub use 
self::error::{QueryError, QueryExecutionError}; -pub use self::query::{Query, QueryTarget, QueryVariables}; +pub use self::query::{Query, QueryTarget, QueryVariables, SqlQueryMode, SqlQueryReq}; pub use self::result::{LatestBlockInfo, QueryResult, QueryResults}; pub use self::trace::Trace; diff --git a/graph/src/data/query/query.rs b/graph/src/data/query/query.rs index 2ca93f0cc43..5bb64a8a134 100644 --- a/graph/src/data/query/query.rs +++ b/graph/src/data/query/query.rs @@ -1,7 +1,8 @@ use serde::de::Deserializer; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; use std::collections::{BTreeMap, HashMap}; use std::convert::TryFrom; +use std::hash::{DefaultHasher, Hash as _, Hasher as _}; use std::ops::{Deref, DerefMut}; use std::sync::Arc; @@ -165,3 +166,26 @@ impl Query { } } } + +#[derive(Copy, Clone, Debug, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum SqlQueryMode { + Data, + Info, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SqlQueryReq { + pub deployment: DeploymentHash, + pub query: String, + pub mode: SqlQueryMode, +} + +impl SqlQueryReq { + pub fn query_hash(&self) -> u64 { + let mut hasher = DefaultHasher::new(); + self.deployment.hash(&mut hasher); + self.query.hash(&mut hasher); + hasher.finish() + } +} diff --git a/graph/src/data/store/mod.rs b/graph/src/data/store/mod.rs index cf464ff7b6b..f52c70b7ce3 100644 --- a/graph/src/data/store/mod.rs +++ b/graph/src/data/store/mod.rs @@ -1103,6 +1103,7 @@ pub struct QueryObject { } /// An object that is returned from a SQL query. 
It wraps an `r::Value` +#[derive(CacheWeight, Serialize)] pub struct SqlQueryObject(pub r::Value); impl CacheWeight for QueryObject { diff --git a/graphql/src/runner.rs b/graphql/src/runner.rs index 96f30e8bc9d..46169d2f678 100644 --- a/graphql/src/runner.rs +++ b/graphql/src/runner.rs @@ -4,12 +4,14 @@ use std::time::Instant; use crate::metrics::GraphQLMetrics; use crate::prelude::{QueryExecutionOptions, StoreResolver}; use crate::query::execute_query; +use graph::data::query::{CacheStatus, SqlQueryReq}; +use graph::data::store::SqlQueryObject; use graph::futures03::future; -use graph::prelude::MetricsRegistry; use graph::prelude::{ async_trait, o, CheapClone, DeploymentState, GraphQLMetrics as GraphQLMetricsTrait, GraphQlRunner as GraphQlRunnerTrait, Logger, Query, QueryExecutionError, ENV_VARS, }; +use graph::prelude::{ApiVersion, MetricsRegistry}; use graph::{data::graphql::load_manager::LoadManager, prelude::QueryStoreManager}; use graph::{ data::query::{LatestBlockInfo, QueryResults, QueryTarget}, @@ -251,4 +253,43 @@ where fn metrics(&self) -> Arc<dyn GraphQLMetricsTrait> { self.graphql_metrics.clone() } + + async fn run_sql_query( + self: Arc<Self>, + req: SqlQueryReq, + ) -> Result<Vec<SqlQueryObject>, QueryExecutionError> { + let store = self + .store + .query_store(QueryTarget::Deployment( + req.deployment.clone(), + ApiVersion::default(), + )) + .await?; + + let query_hash = req.query_hash(); + self.load_manager + .decide( + &store.wait_stats().map_err(QueryExecutionError::from)?, + store.shard(), + store.deployment_id(), + query_hash, + &req.query, + ) + .to_result()?; + + let query_start = Instant::now(); + let result = store + .execute_sql(&req.query) + .map_err(|e| QueryExecutionError::from(e)); + + self.load_manager.record_work( + store.shard(), + store.deployment_id(), + query_hash, + query_start.elapsed(), + CacheStatus::Miss, + ); + + result + } } diff --git a/server/http/src/service.rs b/server/http/src/service.rs index 8e2237b86ff..c69e6428983 
100644 --- a/server/http/src/service.rs +++ b/server/http/src/service.rs @@ -9,6 +9,8 @@ use graph::components::server::query::ServerResponse; use graph::components::server::query::ServerResult; use graph::components::versions::ApiVersion; use graph::data::query::QueryResult; +use graph::data::query::SqlQueryMode; +use graph::data::query::SqlQueryReq; use graph::data::subgraph::DeploymentHash; use graph::data::subgraph::SubgraphName; use graph::env::ENV_VARS; @@ -21,6 +23,8 @@ use graph::hyper::{body::Body, header::HeaderValue}; use graph::hyper::{Method, Request, Response, StatusCode}; use graph::prelude::serde_json; use graph::prelude::serde_json::json; +use graph::prelude::CacheWeight as _; +use graph::prelude::QueryError; use graph::semver::VersionReq; use graph::slog::error; use graph::slog::Logger; @@ -195,6 +199,51 @@ where Ok(result.as_http_response()) } + async fn handle_sql_query<T: Body>(&self, request: Request<T>) -> ServerResult { + let body = request + .collect() + .await + .map_err(|_| ServerError::InternalError("Failed to read request body".into()))? 
+ .to_bytes(); + let sql_req: SqlQueryReq = serde_json::from_slice(&body) + .map_err(|e| ServerError::ClientError(format!("{}", e)))?; + + let mode = sql_req.mode; + let result = self + .graphql_runner + .cheap_clone() + .run_sql_query(sql_req) + .await + .map_err(|e| ServerError::QueryError(QueryError::from(e))); + + use SqlQueryMode::*; + let response_obj = match (result, mode) { + (Ok(result), Info) => { + json!({ + "count": result.len(), + "bytes" : result.weight(), + }) + } + (Ok(result), Data) => { + json!({ + "data": result, + }) + } + (Err(e), _) => json!({ + "error": e.to_string(), + }), + }; + + let response_str = serde_json::to_string(&response_obj).unwrap(); + + Ok(Response::builder() + .status(200) + .header(ACCESS_CONTROL_ALLOW_ORIGIN, "*") + .header(CONTENT_TYPE, "application/json") + .body(Full::from(response_str)) + .unwrap()) + } + // Handles OPTIONS requests fn handle_graphql_options<T>(&self, _request: Request<T>) -> ServerResult { Ok(Response::builder() @@ -327,7 +376,9 @@ where let dest = format!("/{}/graphql", filtered_path); self.handle_temp_redirect(dest) } - + (Method::POST, &["subgraphs", "sql"] | &["subgraphs", "sql", ""]) => { + self.handle_sql_query(req).await + } (Method::POST, &["subgraphs", "id", subgraph_id]) => { self.handle_graphql_query_by_id(subgraph_id.to_owned(), req) .await @@ -395,6 +446,7 @@ where #[cfg(test)] mod tests { + use graph::data::store::SqlQueryObject; use graph::data::value::{Object, Word}; use graph::http_body_util::{BodyExt, Full}; use graph::hyper::body::Bytes; @@ -402,7 +454,7 @@ mod tests { use graph::hyper::{Method, Request, StatusCode}; use graph::prelude::serde_json::json; - use graph::data::query::{QueryResults, QueryTarget}; + use graph::data::query::{QueryResults, QueryTarget, SqlQueryReq}; use graph::prelude::*; use crate::test_utils; @@ -449,6 +501,13 @@ mod tests { fn metrics(&self) -> Arc<dyn GraphQLMetrics> { Arc::new(TestGraphQLMetrics) } + + async fn run_sql_query( + self: Arc<Self>, + _req: 
SqlQueryReq, + ) -> Result<Vec<SqlQueryObject>, QueryExecutionError> { + unimplemented!() + } } #[tokio::test] diff --git a/server/http/tests/server.rs b/server/http/tests/server.rs index 3ad78138437..b027a73764c 100644 --- a/server/http/tests/server.rs +++ b/server/http/tests/server.rs @@ -1,4 +1,7 @@ -use graph::http::StatusCode; +use graph::{ + data::{query::SqlQueryReq, store::SqlQueryObject}, + http::StatusCode, +}; use std::time::Duration; use graph::data::{ @@ -66,6 +69,13 @@ impl GraphQlRunner for TestGraphQlRunner { fn metrics(&self) -> Arc<dyn GraphQLMetrics> { Arc::new(TestGraphQLMetrics) } + + async fn run_sql_query( + self: Arc<Self>, + _req: SqlQueryReq, + ) -> Result<Vec<SqlQueryObject>, QueryExecutionError> { + unimplemented!(); + } } #[cfg(test)] From bd5817f6546226eb49e6e83683ccd7fdfeffc0c0 Mon Sep 17 00:00:00 2001 From: David Lutterkort <lutter@watzmann.net> Date: Mon, 27 Jan 2025 17:28:03 -0800 Subject: [PATCH 08/13] store: Restrict the selectable columns to actual attributes --- store/postgres/src/sql/parser_tests.yaml | 8 +++++--- store/postgres/src/sql/validation.rs | 12 +++++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/store/postgres/src/sql/parser_tests.yaml b/store/postgres/src/sql/parser_tests.yaml index 11d2a976e21..d6eacaa1e2a 100644 --- a/store/postgres/src/sql/parser_tests.yaml +++ b/store/postgres/src/sql/parser_tests.yaml @@ -9,7 +9,7 @@ ok: > select to_jsonb(sub.*) as data from ( SELECT symbol, address FROM ( - SELECT * FROM "sgd0815"."token" WHERE block_range @> 2147483647) AS token + SELECT id, address, symbol, name, decimals FROM "sgd0815"."token" WHERE block_range @> 2147483647) AS token WHERE decimals > 10 ) as sub - sql: > with tokens as ( @@ -44,11 +44,13 @@ AS t (address, symbol, name, decimals)) SELECT date, t.symbol, SUM(amount) / pow(10, t.decimals) AS amount FROM (SELECT date(to_timestamp(block_timestamp) AT TIME ZONE 'utc') AS date, token, amount - FROM (SELECT * FROM "sgd0815"."swap" WHERE 
block$ <= 2147483647) AS sm, + FROM (SELECT id, timestamp, pool, token_0, token_1, sender, recipient, origin, amount_0, amount_1, amount_usd, sqrt_price_x96, tick, log_index + FROM "sgd0815"."swap" WHERE block$ <= 2147483647) AS sm, UNNEST(sm.amounts_in, sm.tokens_in) AS smi (amount, token) UNION ALL SELECT date(to_timestamp(block_timestamp) AT TIME ZONE 'utc') AS date, token, amount - FROM (SELECT * FROM "sgd0815"."swap" WHERE block$ <= 2147483647) AS sm, + FROM (SELECT id, timestamp, pool, token_0, token_1, sender, recipient, origin, amount_0, amount_1, amount_usd, sqrt_price_x96, tick, log_index + FROM "sgd0815"."swap" WHERE block$ <= 2147483647) AS sm, UNNEST(sm.amounts_out, sm.tokens_out) AS smo (amount, token)) AS tp JOIN tokens AS t ON t.address = tp.token GROUP BY tp.date, t.symbol, t.decimals diff --git a/store/postgres/src/sql/validation.rs b/store/postgres/src/sql/validation.rs index 7639216aa0a..d2370576c22 100644 --- a/store/postgres/src/sql/validation.rs +++ b/store/postgres/src/sql/validation.rs @@ -116,15 +116,21 @@ impl VisitorMut for Validator<'_> { return ControlFlow::Continue(()); }; - // Change 'from table [as alias]' to 'from (select * from table) as alias' + // Change 'from table [as alias]' to 'from (select {columns} from table) as alias' + let columns = table + .columns + .iter() + .map(|column| column.name.as_str()) + .collect::<Vec<_>>() + .join(", "); let query = if table.immutable { format!( - "select * from {} where {} <= {}", + "select {columns} from {} where {} <= {}", table.qualified_name, BLOCK_COLUMN, self.block ) } else { format!( - "select * from {} where {} @> {}", + "select {columns} from {} where {} @> {}", table.qualified_name, BLOCK_RANGE_COLUMN, self.block ) }; From 2b1435a76b2aea483fb12e256671b33586f9c0d4 Mon Sep 17 00:00:00 2001 From: David Lutterkort <lutter@watzmann.net> Date: Tue, 28 Jan 2025 16:34:34 -0800 Subject: [PATCH 09/13] store: Defer wrapping of SQL queries until execution The wrapping with to_jsonb is 
closely tied to how we run the query --- store/postgres/src/deployment_store.rs | 2 + store/postgres/src/sql/parser.rs | 8 +--- store/postgres/src/sql/parser_tests.yaml | 50 +++++++++++------------- 3 files changed, 27 insertions(+), 33 deletions(-) diff --git a/store/postgres/src/deployment_store.rs b/store/postgres/src/deployment_store.rs index de342075d4d..028df2b09f5 100644 --- a/store/postgres/src/deployment_store.rs +++ b/store/postgres/src/deployment_store.rs @@ -292,6 +292,8 @@ impl DeploymentStore { conn: &mut PgConnection, query: &str, ) -> Result<Vec<SqlQueryObject>, QueryExecutionError> { + let query = format!("select to_jsonb(sub.*) as data from ({}) as sub", query); + let query = diesel::sql_query(query); // Execute the provided SQL query diff --git a/store/postgres/src/sql/parser.rs b/store/postgres/src/sql/parser.rs index 4d19fd444fa..a897c44e657 100644 --- a/store/postgres/src/sql/parser.rs +++ b/store/postgres/src/sql/parser.rs @@ -21,14 +21,10 @@ impl Parser { validator.validate_statements(&mut statements)?; let statement = statements - .get_mut(0) + .get(0) .ok_or_else(|| anyhow!("No SQL statements found"))?; - let sql = format!( - "select to_jsonb(sub.*) as data from ( {} ) as sub", - statement - ); - Ok(sql) + Ok(statement.to_string()) } } diff --git a/store/postgres/src/sql/parser_tests.yaml b/store/postgres/src/sql/parser_tests.yaml index d6eacaa1e2a..3650f0a3620 100644 --- a/store/postgres/src/sql/parser_tests.yaml +++ b/store/postgres/src/sql/parser_tests.yaml @@ -7,36 +7,32 @@ - sql: select symbol, address from token where decimals > 10 ok: > - select to_jsonb(sub.*) as data from ( SELECT symbol, address FROM ( SELECT id, address, symbol, name, decimals FROM "sgd0815"."token" WHERE block_range @> 2147483647) AS token - WHERE decimals > 10 ) as sub + WHERE decimals > 10 - sql: > - with tokens as ( - select * from (values - ('0x0000000000000000000000000000000000000000','ETH','Ethereum',18), - 
('0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48','USDC','USD Coin',6) - ) as t(address,symbol,name,decimals) - ) - - select date, t.symbol, SUM(amount)/pow(10,t.decimals) as amount - from (select - date(to_timestamp(block_timestamp) at time zone 'utc') as date, - token, amount - from swap as sm, - unnest(sm.amounts_in,sm.tokens_in) as smi(amount,token) - union all - select - date(to_timestamp(block_timestamp) at time zone 'utc') as date, - token, amount - from swap as sm, - unnest(sm.amounts_out,sm.tokens_out) as smo(amount,token)) as tp - inner join - tokens as t on t.address = tp.token - group by tp.date, t.symbol, t.decimals - order by tp.date desc, amount desc + with tokens as ( + select * from (values + ('0x0000000000000000000000000000000000000000','ETH','Ethereum',18), + ('0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48','USDC','USD Coin',6) + ) as t(address,symbol,name,decimals)) + select date, t.symbol, SUM(amount)/pow(10,t.decimals) as amount + from (select + date(to_timestamp(block_timestamp) at time zone 'utc') as date, + token, amount + from swap as sm, + unnest(sm.amounts_in,sm.tokens_in) as smi(amount,token) + union all + select + date(to_timestamp(block_timestamp) at time zone 'utc') as date, + token, amount + from swap as sm, + unnest(sm.amounts_out,sm.tokens_out) as smo(amount,token)) as tp + inner join + tokens as t on t.address = tp.token + group by tp.date, t.symbol, t.decimals + order by tp.date desc, amount desc ok: > - select to_jsonb(sub.*) as data from ( WITH tokens AS ( SELECT * FROM ( VALUES ('0x0000000000000000000000000000000000000000', 'ETH', 'Ethereum', 18), @@ -54,4 +50,4 @@ UNNEST(sm.amounts_out, sm.tokens_out) AS smo (amount, token)) AS tp JOIN tokens AS t ON t.address = tp.token GROUP BY tp.date, t.symbol, t.decimals - ORDER BY tp.date DESC, amount DESC ) as sub + ORDER BY tp.date DESC, amount DESC From 84349bab663ede9bd550f7635a80458adde3ce3f Mon Sep 17 00:00:00 2001 From: David Lutterkort <lutter@watzmann.net> Date: Tue, 28 Jan 2025 
17:02:05 -0800 Subject: [PATCH 10/13] store: Move SQL validation tests to YAML test file --- store/postgres/src/sql/parser_tests.yaml | 95 +++++++++--- store/postgres/src/sql/validation.rs | 177 ----------------------- 2 files changed, 71 insertions(+), 201 deletions(-) diff --git a/store/postgres/src/sql/parser_tests.yaml b/store/postgres/src/sql/parser_tests.yaml index 3650f0a3620..9640b74177f 100644 --- a/store/postgres/src/sql/parser_tests.yaml +++ b/store/postgres/src/sql/parser_tests.yaml @@ -7,16 +7,16 @@ - sql: select symbol, address from token where decimals > 10 ok: > - SELECT symbol, address FROM ( - SELECT id, address, symbol, name, decimals FROM "sgd0815"."token" WHERE block_range @> 2147483647) AS token - WHERE decimals > 10 + select symbol, address from ( + select id, address, symbol, name, decimals from "sgd0815"."token" where block_range @> 2147483647) as token + where decimals > 10 - sql: > with tokens as ( select * from (values - ('0x0000000000000000000000000000000000000000','ETH','Ethereum',18), - ('0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48','USDC','USD Coin',6) + ('0x0000000000000000000000000000000000000000','eth','ethereum',18), + ('0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48','usdc','usd coin',6) ) as t(address,symbol,name,decimals)) - select date, t.symbol, SUM(amount)/pow(10,t.decimals) as amount + select date, t.symbol, sum(amount)/pow(10,t.decimals) as amount from (select date(to_timestamp(block_timestamp) at time zone 'utc') as date, token, amount @@ -33,21 +33,68 @@ group by tp.date, t.symbol, t.decimals order by tp.date desc, amount desc ok: > - WITH tokens AS ( - SELECT * FROM ( - VALUES ('0x0000000000000000000000000000000000000000', 'ETH', 'Ethereum', 18), - ('0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48', 'USDC', 'USD Coin', 6)) - AS t (address, symbol, name, decimals)) - SELECT date, t.symbol, SUM(amount) / pow(10, t.decimals) AS amount - FROM (SELECT date(to_timestamp(block_timestamp) AT TIME ZONE 'utc') AS date, token, amount - 
FROM (SELECT id, timestamp, pool, token_0, token_1, sender, recipient, origin, amount_0, amount_1, amount_usd, sqrt_price_x96, tick, log_index - FROM "sgd0815"."swap" WHERE block$ <= 2147483647) AS sm, - UNNEST(sm.amounts_in, sm.tokens_in) AS smi (amount, token) - UNION ALL - SELECT date(to_timestamp(block_timestamp) AT TIME ZONE 'utc') AS date, token, amount - FROM (SELECT id, timestamp, pool, token_0, token_1, sender, recipient, origin, amount_0, amount_1, amount_usd, sqrt_price_x96, tick, log_index - FROM "sgd0815"."swap" WHERE block$ <= 2147483647) AS sm, - UNNEST(sm.amounts_out, sm.tokens_out) AS smo (amount, token)) AS tp - JOIN tokens AS t ON t.address = tp.token - GROUP BY tp.date, t.symbol, t.decimals - ORDER BY tp.date DESC, amount DESC + with tokens as ( + select * from ( + values ('0x0000000000000000000000000000000000000000', 'eth', 'ethereum', 18), + ('0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48', 'usdc', 'usd coin', 6)) + as t (address, symbol, name, decimals)) + select date, t.symbol, sum(amount) / pow(10, t.decimals) as amount + from (select date(to_timestamp(block_timestamp) at time zone 'utc') as date, token, amount + from (select id, timestamp, pool, token_0, token_1, sender, recipient, origin, amount_0, amount_1, amount_usd, sqrt_price_x96, tick, log_index + from "sgd0815"."swap" where block$ <= 2147483647) as sm, + unnest(sm.amounts_in, sm.tokens_in) as smi (amount, token) + union all + select date(to_timestamp(block_timestamp) at time zone 'utc') as date, token, amount + from (select id, timestamp, pool, token_0, token_1, sender, recipient, origin, amount_0, amount_1, amount_usd, sqrt_price_x96, tick, log_index + from "sgd0815"."swap" where block$ <= 2147483647) as sm, + unnest(sm.amounts_out, sm.tokens_out) as smo (amount, token)) as tp + join tokens as t on t.address = tp.token + group by tp.date, t.symbol, t.decimals + order by tp.date desc, amount desc +- name: pg_sleep forbidden + sql: select pool from swap where '' = (select 
cast(pg_sleep(5) as text)) + err: Unknown or unsupported function pg_sleep +- name: table functions forbidden + sql: > + select vid, k.sname + from swap, + lateral(select current_schemas as sname from current_schemas(true)) as k + err: Unknown or unsupported function current_schemas +- name: function without parens forbidden + sql: select input_token from swap where '' = (select user) + err: Unknown or unsupported function user +- name: aggregation allowed + sql: > + select token0, sum(amount0) as total_amount + from swap + group by token0 + having sum(amount0) > 1000 + ok: > + SELECT token0, sum(amount0) AS total_amount + FROM (SELECT id, timestamp, pool, token_0, token_1, sender, recipient, origin, amount_0, amount_1, amount_usd, sqrt_price_x96, tick, log_index + FROM "sgd0815"."swap" WHERE block$ <= 2147483647) AS swap + GROUP BY token0 + HAVING sum(amount0) > 1000 +- name: arbitrary function forbidden + sql: > + select token0 from swap + where '' = (select cast(do_strange_math(amount_in) as text)) + err: Unknown or unsupported function do_strange_math +- name: create table forbidden + sql: create table foo (id int primary key); + err: Only SELECT query is supported +- name: insert forbidden + sql: insert into foo values (1); + err: Only SELECT query is supported +- name: CTE allowed + sql: with foo as (select 1) select * from foo + ok: with foo as (select 1) select * from foo +- name: CTE with insert forbidden + sql: with foo as (insert into target values(1)) select * from bar + err: Only SELECT query is supported +- name: only single statement + sql: select 1; select 2; + err: Multi statement is not supported +- name: unknown tables forbidden + sql: select * from unknown_table + err: Unknown table unknown_table diff --git a/store/postgres/src/sql/validation.rs b/store/postgres/src/sql/validation.rs index d2370576c22..4309793dc0f 100644 --- a/store/postgres/src/sql/validation.rs +++ b/store/postgres/src/sql/validation.rs @@ -164,180 +164,3 @@ impl VisitorMut 
for Validator<'_> { ControlFlow::Continue(()) } } - -#[cfg(test)] -mod test { - use graph::prelude::BLOCK_NUMBER_MAX; - - use super::*; - use crate::sql::{constants::SQL_DIALECT, test::make_layout}; - - fn validate(sql: &str) -> Result<(), Error> { - let mut statements = sqlparser::parser::Parser::parse_sql(&SQL_DIALECT, sql).unwrap(); - - const GQL: &str = " - type Swap @entity { - id: ID! - sender: Bytes! - inputAmount: BigDecimal! - inputToken: Bytes! - amountOut: BigDecimal! - outputToken: Bytes! - slippage: BigDecimal! - referralCode: String - blockNumber: Int! - blockTimestamp: Timestamp! - transactionHash: Bytes! - }"; - - let layout = make_layout(GQL); - - let mut validator = Validator::new(&layout, BLOCK_NUMBER_MAX); - - validator.validate_statements(&mut statements) - } - - #[test] - fn test_function_disallowed() { - let result = validate( - " - SELECT - input_token - FROM swap - WHERE '' = ( - SELECT - CAST(pg_sleep(5) AS text - ) - )", - ); - assert_eq!(result, Err(Error::UnknownFunction("pg_sleep".to_owned()))); - } - - #[test] - fn test_table_function_disallowed() { - let result = validate( - " - SELECT - vid, - k.sname - FROM swap, - LATERAL( - SELECT - current_schemas as sname - FROM current_schemas(true) - ) as k", - ); - assert_eq!( - result, - Err(Error::UnknownFunction("current_schemas".to_owned())) - ); - } - - #[test] - fn test_function_disallowed_without_paranthesis() { - let result = validate( - " - SELECT - input_token - FROM swap - WHERE '' = ( - SELECT user - )", - ); - assert_eq!(result, Err(Error::UnknownFunction("user".to_owned()))); - } - - #[test] - fn test_function_allowed() { - let result = validate( - " - SELECT - input_token, - SUM(input_amount) AS total_amount - FROM swap - GROUP BY input_token - HAVING SUM(input_amount) > 1000 - ", - ); - assert_eq!(result, Ok(())); - } - - #[test] - fn test_function_unknown() { - let result = validate( - " - SELECT - input_token - FROM swap - WHERE '' = ( - SELECT - 
CAST(do_strange_math(amount_in) AS text - ) - )", - ); - assert_eq!( - result, - Err(Error::UnknownFunction("do_strange_math".to_owned())) - ); - } - - #[test] - fn test_not_select_ddl() { - let result = validate( - " - CREATE TABLE foo (id INT PRIMARY KEY); - ", - ); - assert_eq!(result, Err(Error::NotSelectQuery)); - } - - #[test] - fn test_not_select_insert() { - let result = validate( - " - INSERT INTO foo VALUES (1); - ", - ); - assert_eq!(result, Err(Error::NotSelectQuery)); - } - - #[test] - fn test_common_table_expression() { - let result = validate( - " - WITH foo AS (SELECT 1) SELECT * FROM foo; - ", - ); - assert_eq!(result, Ok(())); - } - - #[test] - fn test_common_table_expression_with_effect() { - let result = validate( - " - WITH foo AS (INSERT INTO target VALUES(1)) SELECT * FROM bar; - ", - ); - assert_eq!(result, Err(Error::NotSelectQuery)); - } - - #[test] - fn test_no_multi_statement() { - let result = validate( - " - SELECT 1; SELECT 2; - ", - ); - assert_eq!(result, Err(Error::MultiStatementUnSupported)); - } - - #[test] - fn test_table_unknown() { - let result = validate( - " - SELECT * FROM unknown_table; - ", - ); - assert_eq!(result, Err(Error::UnknownTable("unknown_table".to_owned()))); - } -} From 133cadbaf1aa9551b86e372778fc7ab4720e7322 Mon Sep 17 00:00:00 2001 From: David Lutterkort <lutter@watzmann.net> Date: Mon, 3 Feb 2025 15:16:18 -0800 Subject: [PATCH 11/13] store: Limit maximum execution time for SQL queries --- store/postgres/src/deployment_store.rs | 15 ++++++++++----- store/postgres/src/relational.rs | 2 +- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/store/postgres/src/deployment_store.rs b/store/postgres/src/deployment_store.rs index 028df2b09f5..057dcd88acd 100644 --- a/store/postgres/src/deployment_store.rs +++ b/store/postgres/src/deployment_store.rs @@ -54,7 +54,7 @@ use crate::detail::ErrorDetail; use crate::dynds::DataSourcesTable; use crate::primary::DeploymentId; use 
crate::relational::index::{CreateIndex, IndexList, Method}; -use crate::relational::{Layout, LayoutCache, SqlName, Table}; +use crate::relational::{Layout, LayoutCache, SqlName, Table, STATEMENT_TIMEOUT}; use crate::relational_queries::{FromEntityData, JSONData}; use crate::{advisory_lock, catalog, retry}; use crate::{connection_pool::ConnectionPool, detail}; @@ -293,12 +293,17 @@ impl DeploymentStore { query: &str, ) -> Result<Vec<SqlQueryObject>, QueryExecutionError> { let query = format!("select to_jsonb(sub.*) as data from ({}) as sub", query); - let query = diesel::sql_query(query); - // Execute the provided SQL query - let results = query - .load::<JSONData>(conn) + let results = conn + .transaction(|conn| { + if let Some(ref timeout_sql) = *STATEMENT_TIMEOUT { + conn.batch_execute(timeout_sql)?; + } + + // Execute the provided SQL query + query.load::<JSONData>(conn) + }) .map_err(|e| QueryExecutionError::SqlError(e.to_string()))?; Ok(results diff --git a/store/postgres/src/relational.rs b/store/postgres/src/relational.rs index c5c929e189a..d4561f23771 100644 --- a/store/postgres/src/relational.rs +++ b/store/postgres/src/relational.rs @@ -95,7 +95,7 @@ pub const STRING_PREFIX_SIZE: usize = 256; pub const BYTE_ARRAY_PREFIX_SIZE: usize = 64; lazy_static! 
{ - static ref STATEMENT_TIMEOUT: Option<String> = ENV_VARS + pub(crate) static ref STATEMENT_TIMEOUT: Option<String> = ENV_VARS .graphql .sql_statement_timeout .map(|duration| format!("set local statement_timeout={}", duration.as_millis())); From 804a20818ce89369dba5c924b5952b980f261a03 Mon Sep 17 00:00:00 2001 From: David Lutterkort <lutter@watzmann.net> Date: Mon, 3 Feb 2025 15:27:53 -0800 Subject: [PATCH 12/13] docs: Add some details on the SQL interface --- docs/implementation/sql-interface.md | 82 ++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 docs/implementation/sql-interface.md diff --git a/docs/implementation/sql-interface.md b/docs/implementation/sql-interface.md new file mode 100644 index 00000000000..6c8740d200e --- /dev/null +++ b/docs/implementation/sql-interface.md @@ -0,0 +1,82 @@ +# SQL Queries + +**This interface is extremely experimental. There is no guarantee that this +interface will ever be brought to production use. It's solely here to help +evaluate the utility of such an interface** + +SQL queries can be issued by posting a JSON document to +`/subgraphs/sql`. The server will respond with a JSON response that +contains the records matching the query in JSON form. + +The body of the request must contain the following keys: + +* `deployment`: the hash of the deployment against which the query should + be run +* `query`: the SQL query +* `mode`: either `info` or `data`. With `info`, only summary information + about the result (row count and size in bytes) is reported; with `data`, + the query result itself is sent in the response + +The SQL query can use all the tables of the given subgraph. Table and +attribute names are snake-cased from their form in the GraphQL schema, so +that data for `SomeDailyStuff` is stored in a table `some_daily_stuff`. + +The query can use fairly arbitrary SQL, including aggregations and most +functions built into PostgreSQL. 
+ +## Example + +For a subgraph whose schema defines an entity `Block`, the following query +```json +{ + "query": "select number, hash, parent_hash, timestamp from block order by number desc limit 2", + "deployment": "QmSoMeThInG", + "mode": "data" +} +``` + +might result in this response +```json +{ + "data": [ + { + "hash": "\\x5f91e535ee4d328725b869dd96f4c42059e3f2728dfc452c32e5597b28ce68d6", + "number": 5000, + "parent_hash": "\\x82e95c1ee3a98cd0646225b5ae6afc0b0229367b992df97aeb669c898657a4bb", + "timestamp": "2015-07-30T20:07:44+00:00" + }, + { + "hash": "\\x82e95c1ee3a98cd0646225b5ae6afc0b0229367b992df97aeb669c898657a4bb", + "number": 4999, + "parent_hash": "\\x875c9a0f8215258c3b17fd5af5127541121cca1f594515aae4fbe5a7fbef8389", + "timestamp": "2015-07-30T20:07:36+00:00" + } + ] +} +``` + +## Limitations/Ideas/Disclaimers + +Most of these are fairly easy to address: + +* queries must finish within `GRAPH_SQL_STATEMENT_TIMEOUT` (unlimited by + default) +* queries are always executed at the subgraph head. It would be easy to add + a way to specify a block at which the query should be executed +* the interface right now pretty much exposes the raw SQL schema for a + subgraph, though system columns like `vid` or `block_range` are made + inaccessible. +* it is not possible to join across subgraphs, though it would be possible + to add that. Implementing that would require some additional plumbing that + hides the effects of sharding. +* JSON as the response format is pretty terrible, and we should change that + to something that isn't so inefficient +* the response contains data that's pretty raw; as the example shows, + binary data uses Postgres' notation for hex strings +* because of how broad the supported SQL is, it is pretty easy to issue + queries that take a very long time. 
It will therefore not be hard to take + down a `graph-node`, especially when no query timeout is set + +Most importantly: while quite a bit of effort has been put into making this +interface safe, in particular, making sure it's not possible to write +through this interface, there's no guarantee that this works without bugs. From ef3ee188e235271365af0f40d6db047debfe37a5 Mon Sep 17 00:00:00 2001 From: David Lutterkort <lutter@watzmann.net> Date: Wed, 12 Mar 2025 11:57:22 +0000 Subject: [PATCH 13/13] store: Enforce existing GraphQL first/skip limits for SQL queries --- store/postgres/src/deployment_store.rs | 5 +- store/postgres/src/sql/parser.rs | 7 ++- store/postgres/src/sql/validation.rs | 72 +++++++++++++++++++++++--- 3 files changed, 73 insertions(+), 11 deletions(-) diff --git a/store/postgres/src/deployment_store.rs b/store/postgres/src/deployment_store.rs index 057dcd88acd..a1736521191 100644 --- a/store/postgres/src/deployment_store.rs +++ b/store/postgres/src/deployment_store.rs @@ -292,7 +292,10 @@ impl DeploymentStore { conn: &mut PgConnection, query: &str, ) -> Result<Vec<SqlQueryObject>, QueryExecutionError> { - let query = format!("select to_jsonb(sub.*) as data from ({}) as sub", query); + let query = format!( + "select to_jsonb(sub.*) as data from ({}) as sub limit {}", + query, ENV_VARS.graphql.max_first + ); let query = diesel::sql_query(query); let results = conn diff --git a/store/postgres/src/sql/parser.rs b/store/postgres/src/sql/parser.rs index a897c44e657..afdaef26292 100644 --- a/store/postgres/src/sql/parser.rs +++ b/store/postgres/src/sql/parser.rs @@ -1,7 +1,7 @@ use super::{constants::SQL_DIALECT, validation::Validator}; use crate::relational::Layout; use anyhow::{anyhow, Ok, Result}; -use graph::prelude::BlockNumber; +use graph::{env::ENV_VARS, prelude::BlockNumber}; use std::sync::Arc; pub struct Parser { @@ -17,7 +17,10 @@ impl Parser { pub fn parse_and_validate(&self, sql: &str) -> Result<String> { let mut statements = 
sqlparser::parser::Parser::parse_sql(&SQL_DIALECT, sql)?; - let mut validator = Validator::new(&self.layout, self.block); + let max_offset = ENV_VARS.graphql.max_skip; + let max_limit = ENV_VARS.graphql.max_first; + + let mut validator = Validator::new(&self.layout, self.block, max_limit, max_offset); validator.validate_statements(&mut statements)?; let statement = statements diff --git a/store/postgres/src/sql/validation.rs b/store/postgres/src/sql/validation.rs index 4309793dc0f..17e424bd947 100644 --- a/store/postgres/src/sql/validation.rs +++ b/store/postgres/src/sql/validation.rs @@ -1,7 +1,7 @@ use graph::prelude::BlockNumber; use sqlparser::ast::{ - Expr, Ident, ObjectName, Query, SetExpr, Statement, TableAlias, TableFactor, VisitMut, - VisitorMut, + Expr, Ident, ObjectName, Offset, Query, SetExpr, Statement, TableAlias, TableFactor, Value, + VisitMut, VisitorMut, }; use sqlparser::parser::Parser; use std::result::Result; @@ -22,20 +22,30 @@ pub enum Error { NotSelectQuery, #[error("Unknown table {0}")] UnknownTable(String), + #[error("Only constant numbers are supported for LIMIT and OFFSET.")] + UnsupportedLimitOffset, + #[error("The limit of {0} is greater than the maximum allowed limit of {1}.")] + UnsupportedLimit(u32, u32), + #[error("The offset of {0} is greater than the maximum allowed offset of {1}.")] + UnsupportedOffset(u32, u32), } pub struct Validator<'a> { layout: &'a Layout, ctes: HashSet<String>, block: BlockNumber, + max_limit: u32, + max_offset: u32, } impl<'a> Validator<'a> { - pub fn new(layout: &'a Layout, block: BlockNumber) -> Self { + pub fn new(layout: &'a Layout, block: BlockNumber, max_limit: u32, max_offset: u32) -> Self { Self { layout, ctes: Default::default(), block, + max_limit, + max_offset, } } @@ -61,6 +71,45 @@ impl<'a> Validator<'a> { Ok(()) } + + pub fn validate_limit_offset(&mut self, query: &mut Query) -> ControlFlow<Error> { + let Query { limit, offset, .. 
} = query; + + if let Some(limit) = limit { + match limit { + Expr::Value(Value::Number(s, _)) => match s.parse::<u32>() { + Err(_) => return ControlFlow::Break(Error::UnsupportedLimitOffset), + Ok(limit) => { + if limit > self.max_limit { + return ControlFlow::Break(Error::UnsupportedLimit( + limit, + self.max_limit, + )); + } + } + }, + _ => return ControlFlow::Break(Error::UnsupportedLimitOffset), + } + } + + if let Some(Offset { value, .. }) = offset { + match value { + Expr::Value(Value::Number(s, _)) => match s.parse::<u32>() { + Err(_) => return ControlFlow::Break(Error::UnsupportedLimitOffset), + Ok(offset) => { + if offset > self.max_offset { + return ControlFlow::Break(Error::UnsupportedOffset( + offset, + self.max_offset, + )); + } + } + }, + _ => return ControlFlow::Break(Error::UnsupportedLimitOffset), + } + } + ControlFlow::Continue(()) + } } impl VisitorMut for Validator<'_> { @@ -73,9 +122,9 @@ impl VisitorMut for Validator<'_> { } } - fn pre_visit_query(&mut self, _query: &mut Query) -> ControlFlow<Self::Break> { + fn pre_visit_query(&mut self, query: &mut Query) -> ControlFlow<Self::Break> { // Add common table expressions to the set of known tables - if let Some(ref with) = _query.with { + if let Some(ref with) = query.with { self.ctes.extend( with.cte_tables .iter() @@ -83,10 +132,17 @@ impl VisitorMut for Validator<'_> { ); } - match *_query.body { - SetExpr::Update(_) | SetExpr::Insert(_) => ControlFlow::Break(Error::NotSelectQuery), - _ => ControlFlow::Continue(()), + match *query.body { + SetExpr::Select(_) | SetExpr::Query(_) => { /* permitted */ } + SetExpr::SetOperation { .. } => { /* permitted */ } + SetExpr::Table(_) => { /* permitted */ } + SetExpr::Values(_) => { /* permitted */ } + SetExpr::Insert(_) | SetExpr::Update(_) => { + return ControlFlow::Break(Error::NotSelectQuery) + } } + + self.validate_limit_offset(query) } /// Invoked for any table function in the AST.