Skip to content

Commit 6f1787b

Browse files
committed
Replace parquet.js with hyparquet
1 parent 1e364ba commit 6f1787b

File tree

3 files changed

+14
-118
lines changed

3 files changed

+14
-118
lines changed

package.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,14 @@
2727
"fix": "./scripts/format"
2828
},
2929
"dependencies": {
30-
"parquetjs": "^0.11.2"
30+
"hyparquet": "1.14.0"
3131
},
3232
"devDependencies": {
3333
"@arethetypeswrong/cli": "^0.17.0",
3434
"@swc/core": "^1.3.102",
3535
"@swc/jest": "^0.2.29",
3636
"@types/jest": "^29.4.0",
3737
"@types/node": "^20.17.6",
38-
"@types/parquetjs": "^0.10.6",
3938
"@typescript-eslint/eslint-plugin": "8.31.1",
4039
"@typescript-eslint/parser": "8.31.1",
4140
"eslint": "^9.20.1",

src/lib/check-file.ts

Lines changed: 8 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -646,17 +646,6 @@ async function _check_parquet(
646646
file: string,
647647
purpose: FilePurpose | string,
648648
): Promise<Partial<CheckFileReport>> {
649-
let ParquetReader: any;
650-
try {
651-
// ParquetJS is optional as it's large and isn't compatible with older systems.
652-
const pkg = await import('parquetjs');
653-
ParquetReader = pkg.ParquetReader;
654-
} catch {
655-
throw new Error(
656-
'parquetjs is not installed and is required to use parquet files. Please install it via `npm install parquetjs`',
657-
);
658-
}
659-
660649
const report_dict: Partial<CheckFileReport> = {};
661650

662651
if (purpose === 'eval') {
@@ -666,14 +655,17 @@ async function _check_parquet(
666655
}
667656

668657
try {
669-
const reader = await ParquetReader.openFile(file);
670-
const schema = reader.schema;
671-
const column_names = Object.keys(schema.fields);
658+
const { asyncBufferFromFile, parquetMetadataAsync, parquetSchema } = await import('hyparquet');
659+
660+
const asyncBuffer = await asyncBufferFromFile(file);
661+
const metadata = await parquetMetadataAsync(asyncBuffer);
662+
const { children } = parquetSchema(metadata);
663+
664+
const column_names = children.map((child: any) => child.element.name);
672665

673666
if (!column_names.includes('input_ids')) {
674667
report_dict.load_parquet = `Parquet file ${file} does not contain the \`input_ids\` column.`;
675668
report_dict.is_check_passed = false;
676-
await reader.close();
677669
return report_dict;
678670
}
679671

@@ -683,24 +675,21 @@ async function _check_parquet(
683675
', ',
684676
)} are supported.`;
685677
report_dict.is_check_passed = false;
686-
await reader.close();
687678
return report_dict;
688679
}
689680
}
690681

691-
const num_samples = reader.getRowCount() as number;
682+
const num_samples = Number(metadata.num_rows);
692683

693684
if (num_samples < MIN_SAMPLES) {
694685
report_dict.has_min_samples = false;
695686
report_dict.message = `Processing ${file} resulted in only ${num_samples} samples. Our minimum is ${MIN_SAMPLES} samples. `;
696687
report_dict.is_check_passed = false;
697-
await reader.close();
698688
return report_dict;
699689
} else {
700690
report_dict.num_samples = num_samples;
701691
}
702692

703-
await reader.close();
704693
report_dict.is_check_passed = true;
705694
} catch (e) {
706695
const errorMessage = e instanceof Error ? e.message : String(e);

yarn.lock

Lines changed: 5 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -930,13 +930,6 @@
930930
resolved "https://registry.yarnpkg.com/@types/json-schema/-/json-schema-7.0.15.tgz#596a1747233694d50f6ad8a7869fcb6f56cf5841"
931931
integrity sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA==
932932

933-
"@types/node-int64@*":
934-
version "0.4.32"
935-
resolved "https://registry.yarnpkg.com/@types/node-int64/-/node-int64-0.4.32.tgz#a540bcb9e48816ca1b5329d1ab907d6ad134b856"
936-
integrity sha512-xf/JsSlnXQ+mzvc0IpXemcrO4BrCfpgNpMco+GLcXkFk01k/gW9lGJu+Vof0ZSvHK6DsHJDPSbjFPs36QkWXqw==
937-
dependencies:
938-
"@types/node" "*"
939-
940933
"@types/node@*":
941934
version "20.10.5"
942935
resolved "https://registry.yarnpkg.com/@types/node/-/node-20.10.5.tgz#47ad460b514096b7ed63a1dae26fad0914ed3ab2"
@@ -951,13 +944,6 @@
951944
dependencies:
952945
undici-types "~6.21.0"
953946

954-
"@types/parquetjs@^0.10.6":
955-
version "0.10.6"
956-
resolved "https://registry.yarnpkg.com/@types/parquetjs/-/parquetjs-0.10.6.tgz#7e4b54d9d336a8dda9c7a9091ec7f60db98744af"
957-
integrity sha512-ZCsD6j97YD0mGU8/VnVs3NjORXa7zeHvqlpJpCqy4jU8a1O21dalL+MFn9QNbdEfy8rszR1N7NHeT7/LdtHf+A==
958-
dependencies:
959-
"@types/node-int64" "*"
960-
961947
"@types/stack-utils@^2.0.0":
962948
version "2.0.3"
963949
resolved "https://registry.yarnpkg.com/@types/stack-utils/-/stack-utils-2.0.3.tgz#6209321eb2c1712a7e7466422b8cb1fc0d9dd5d8"
@@ -1232,16 +1218,6 @@ balanced-match@^1.0.0:
12321218
resolved "https://registry.yarnpkg.com/balanced-match/-/balanced-match-1.0.2.tgz#e83e3a7e3f300b34cb9d87f615fa0cbf357690ee"
12331219
integrity sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==
12341220

1235-
base64-js@^1.1.2:
1236-
version "1.5.1"
1237-
resolved "https://registry.yarnpkg.com/base64-js/-/base64-js-1.5.1.tgz#1b1b440160a5bf7ad40b650f095963481903930a"
1238-
integrity sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==
1239-
1240-
bindings@~1.2.1:
1241-
version "1.2.1"
1242-
resolved "https://registry.yarnpkg.com/bindings/-/bindings-1.2.1.tgz#14ad6113812d2d37d72e67b4cacb4bb726505f11"
1243-
integrity sha512-u4cBQNepWxYA55FunZSM7wMi55yQaN0otnhhilNoWHq0MfOfJeQx0v0mRRpolGOExPjZcl6FtB0BB8Xkb88F0g==
1244-
12451221
brace-expansion@^1.1.7:
12461222
version "1.1.11"
12471223
resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-1.1.11.tgz#3c7fcbf529d87226f3d2f52b966ff5271eb441dd"
@@ -1264,13 +1240,6 @@ braces@^3.0.3:
12641240
dependencies:
12651241
fill-range "^7.1.1"
12661242

1267-
brotli@^1.3.0:
1268-
version "1.3.3"
1269-
resolved "https://registry.yarnpkg.com/brotli/-/brotli-1.3.3.tgz#7365d8cc00f12cf765d2b2c898716bcf4b604d48"
1270-
integrity sha512-oTKjJdShmDuGW94SyyaoQvAjf30dZaHnjJ8uAF+u2/vGJkJbJPJAT1gDiOJP5v1Zb6f9KEyW/1HpuaWIXtGHPg==
1271-
dependencies:
1272-
base64-js "^1.1.2"
1273-
12741243
browserslist@^4.22.2:
12751244
version "4.22.2"
12761245
resolved "https://registry.yarnpkg.com/browserslist/-/browserslist-4.22.2.tgz#704c4943072bd81ea18997f3bd2180e89c77874b"
@@ -1295,11 +1264,6 @@ bser@2.1.1:
12951264
dependencies:
12961265
node-int64 "^0.4.0"
12971266

1298-
bson@^1.0.4:
1299-
version "1.1.6"
1300-
resolved "https://registry.yarnpkg.com/bson/-/bson-1.1.6.tgz#fb819be9a60cd677e0853aee4ca712a785d6618a"
1301-
integrity sha512-EvVNVeGo4tHxwi8L6bPj3y3itEvStdwvvlojVxxbyYfoaxJ6keLgrTuKdyfEAszFK+H3olzBuafE0yoh0D1gdg==
1302-
13031267
buffer-from@^1.0.0:
13041268
version "1.1.2"
13051269
resolved "https://registry.yarnpkg.com/buffer-from/-/buffer-from-1.1.2.tgz#2b146a6fd72e80b4f55d255f35ed59a3a9a41bd5"
@@ -1945,6 +1909,11 @@ human-signals@^2.1.0:
19451909
resolved "https://registry.yarnpkg.com/human-signals/-/human-signals-2.1.0.tgz#dc91fcba42e4d06e4abaed33b3e7a3c02f514ea0"
19461910
integrity sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw==
19471911

1912+
hyparquet@1.14.0:
1913+
version "1.14.0"
1914+
resolved "https://registry.yarnpkg.com/hyparquet/-/hyparquet-1.14.0.tgz#9339d06dc52ee9edc606e74ce6d65c32ff2ed50f"
1915+
integrity sha512-qhDmkQwDrpd+7UESp0gkDoCgJ3m2uyy754Xm49xzZnn49FEvNC2Sm2/oKhbSkmfs0rNepcMh5E2KUiRKE64N0w==
1916+
19481917
iconv-lite@^0.6.3:
19491918
version "0.6.3"
19501919
resolved "https://registry.yarnpkg.com/iconv-lite/-/iconv-lite-0.6.3.tgz#a52f80bf38da1952eb5c681790719871a1a72501"
@@ -2003,11 +1972,6 @@ inherits@2, inherits@^2.0.3:
20031972
resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.4.tgz#0fa2c64f932917c3433a0ded55363aae37416b7c"
20041973
integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==
20051974

2006-
int53@^0.2.4:
2007-
version "0.2.4"
2008-
resolved "https://registry.yarnpkg.com/int53/-/int53-0.2.4.tgz#5ed8d7aad6c5c6567cae69aa7ffc4a109ee80f86"
2009-
integrity sha512-a5jlKftS7HUOhkUyYD7j2sJ/ZnvWiNlZS1ldR+g1ifQ+/UuZXIE+YTc/lK1qGj/GwAU5F8Z0e1eVq2t1J5Ob2g==
2010-
20111975
is-arrayish@^0.2.1:
20121976
version "0.2.1"
20131977
resolved "https://registry.yarnpkg.com/is-arrayish/-/is-arrayish-0.2.1.tgz#77c99840527aa8ecb1a8ba697b80645a7a926a9d"
@@ -2596,13 +2560,6 @@ lru-cache@^6.0.0:
25962560
dependencies:
25972561
yallist "^4.0.0"
25982562

2599-
lzo@^0.4.0:
2600-
version "0.4.11"
2601-
resolved "https://registry.yarnpkg.com/lzo/-/lzo-0.4.11.tgz#0e76d582567b29e285cb84a6aa392cb94c6283f8"
2602-
integrity sha512-apQHNoW2Alg72FMqaC/7pn03I7umdgSVFt2KRkCXXils4Z9u3QBh1uOtl2O5WmZIDLd9g6Lu4lIdOLmiSTFVCQ==
2603-
dependencies:
2604-
bindings "~1.2.1"
2605-
26062563
make-dir@^4.0.0:
26072564
version "4.0.0"
26082565
resolved "https://registry.yarnpkg.com/make-dir/-/make-dir-4.0.0.tgz#c3c2307a771277cd9638305f915c29ae741b614e"
@@ -2777,11 +2734,6 @@ object-assign@^4.0.1:
27772734
resolved "https://registry.yarnpkg.com/object-assign/-/object-assign-4.1.1.tgz#2109adc7965887cfc05cbbd442cac8bfbb360863"
27782735
integrity sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==
27792736

2780-
object-stream@0.0.1:
2781-
version "0.0.1"
2782-
resolved "https://registry.yarnpkg.com/object-stream/-/object-stream-0.0.1.tgz#3a03a26e94fd112c9abffeb4651e07a5e23cf840"
2783-
integrity sha512-+NPJnRvX9RDMRY9mOWOo/NDppBjbZhXirNNSu2IBnuNboClC9h1ZGHXgHBLDbJMHsxeJDq922aVmG5xs24a/cA==
2784-
27852737
once@^1.3.0:
27862738
version "1.4.0"
27872739
resolved "https://registry.yarnpkg.com/once/-/once-1.4.0.tgz#583b1aa775961d4b113ac17d9c50baef9dd76bd1"
@@ -2862,21 +2814,6 @@ parent-module@^1.0.0:
28622814
dependencies:
28632815
callsites "^3.0.0"
28642816

2865-
parquetjs@^0.11.2:
2866-
version "0.11.2"
2867-
resolved "https://registry.yarnpkg.com/parquetjs/-/parquetjs-0.11.2.tgz#ea13221b3583cb1277f8b4b879776420f8863660"
2868-
integrity sha512-Y6FOc3Oi2AxY4TzJPz7fhICCR8tQNL3p+2xGQoUAMbmlJBR7+JJmMrwuyMjIpDiM7G8Wj/8oqOH4UDUmu4I5ZA==
2869-
dependencies:
2870-
brotli "^1.3.0"
2871-
bson "^1.0.4"
2872-
int53 "^0.2.4"
2873-
object-stream "0.0.1"
2874-
snappyjs "^0.6.0"
2875-
thrift "^0.11.0"
2876-
varint "^5.0.0"
2877-
optionalDependencies:
2878-
lzo "^0.4.0"
2879-
28802817
parse-json@^5.2.0:
28812818
version "5.2.0"
28822819
resolved "https://registry.yarnpkg.com/parse-json/-/parse-json-5.2.0.tgz#c76fc66dee54231c962b22bcc8a72cf2f99753cd"
@@ -3004,11 +2941,6 @@ pure-rand@^6.0.0:
30042941
resolved "https://registry.yarnpkg.com/pure-rand/-/pure-rand-6.0.4.tgz#50b737f6a925468679bff00ad20eade53f37d5c7"
30052942
integrity sha512-LA0Y9kxMYv47GIPJy6MI84fqTd2HmYZI83W/kM/SkKfDlajnZYfmXFTxkbY+xSBPkLJxltMa9hIkmdc29eguMA==
30062943

3007-
q@^1.5.0:
3008-
version "1.5.1"
3009-
resolved "https://registry.yarnpkg.com/q/-/q-1.5.1.tgz#7e32f75b41381291d04611f1bf14109ac00651d7"
3010-
integrity sha512-kV/CThkXo6xyFEZUugw/+pIOywXcDbFYgSct5cT3gqlbkBE1SJdwy6UQoZvodiWF/ckQLZyDE/Bu1M6gVu5lVw==
3011-
30122944
queue-microtask@^1.2.2:
30132945
version "1.2.3"
30142946
resolved "https://registry.yarnpkg.com/queue-microtask/-/queue-microtask-1.2.3.tgz#4929228bbc724dfac43e0efb058caf7b6cfb6243"
@@ -3149,11 +3081,6 @@ slash@^3.0.0:
31493081
resolved "https://registry.yarnpkg.com/slash/-/slash-3.0.0.tgz#6539be870c165adbd5240220dbe361f1bc4d4634"
31503082
integrity sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==
31513083

3152-
snappyjs@^0.6.0:
3153-
version "0.6.1"
3154-
resolved "https://registry.yarnpkg.com/snappyjs/-/snappyjs-0.6.1.tgz#9bca9ff8c54b133a9cc84a71d22779e97fc51878"
3155-
integrity sha512-YIK6I2lsH072UE0aOFxxY1dPDCS43I5ktqHpeAsuLNYWkE5pGxRGWfDM4/vSUfNzXjC1Ivzt3qx31PCLmc9yqg==
3156-
31573084
source-map-support@0.5.13:
31583085
version "0.5.13"
31593086
resolved "https://registry.yarnpkg.com/source-map-support/-/source-map-support-0.5.13.tgz#31b24a9c2e73c2de85066c0feb7d44767ed52932"
@@ -3306,15 +3233,6 @@ thenify-all@^1.0.0:
33063233
dependencies:
33073234
any-promise "^1.0.0"
33083235

3309-
thrift@^0.11.0:
3310-
version "0.11.0"
3311-
resolved "https://registry.yarnpkg.com/thrift/-/thrift-0.11.0.tgz#256115e4ff87871e12537f4b510bd2b425e13990"
3312-
integrity sha512-UpsBhOC45a45TpeHOXE4wwYwL8uD2apbHTbtBvkwtUU4dNwCjC7DpQTjw2Q6eIdfNtw+dKthdwq94uLXTJPfFw==
3313-
dependencies:
3314-
node-int64 "^0.4.0"
3315-
q "^1.5.0"
3316-
ws ">= 2.2.3"
3317-
33183236
tmpl@1.0.5:
33193237
version "1.0.5"
33203238
resolved "https://registry.yarnpkg.com/tmpl/-/tmpl-1.0.5.tgz#8683e0b902bb9c20c4f726e3c0b69f36518c07cc"
@@ -3489,11 +3407,6 @@ validate-npm-package-name@^5.0.0:
34893407
resolved "https://registry.yarnpkg.com/validate-npm-package-name/-/validate-npm-package-name-5.0.1.tgz#a316573e9b49f3ccd90dbb6eb52b3f06c6d604e8"
34903408
integrity sha512-OljLrQ9SQdOUqTaQxqL5dEfZWrXExyyWsozYlAWFawPVNuD83igl7uJD2RTkNMbniIYgt8l81eCJGIdQF7avLQ==
34913409

3492-
varint@^5.0.0:
3493-
version "5.0.2"
3494-
resolved "https://registry.yarnpkg.com/varint/-/varint-5.0.2.tgz#5b47f8a947eb668b848e034dcfa87d0ff8a7f7a4"
3495-
integrity sha512-lKxKYG6H03yCZUpAGOPOsMcGxd1RHCu1iKvEHYDPmTyq2HueGhD73ssNBqqQWfvYs04G9iUFRvmAVLW20Jw6ow==
3496-
34973410
walker@^1.0.8:
34983411
version "1.0.8"
34993412
resolved "https://registry.yarnpkg.com/walker/-/walker-1.0.8.tgz#bd498db477afe573dc04185f011d3ab8a8d7653f"
@@ -3530,11 +3443,6 @@ write-file-atomic@^4.0.2:
35303443
imurmurhash "^0.1.4"
35313444
signal-exit "^3.0.7"
35323445

3533-
"ws@>= 2.2.3":
3534-
version "8.18.0"
3535-
resolved "https://registry.yarnpkg.com/ws/-/ws-8.18.0.tgz#0d7505a6eafe2b0e712d232b42279f53bc289bbc"
3536-
integrity sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==
3537-
35383446
y18n@^5.0.5:
35393447
version "5.0.8"
35403448
resolved "https://registry.yarnpkg.com/y18n/-/y18n-5.0.8.tgz#7f4934d0f7ca8c56f95314939ddcd2dd91ce1d55"

0 commit comments

Comments
 (0)