-
Notifications
You must be signed in to change notification settings - Fork 2.2k
/
Copy pathparse_jsonl_custom.py
148 lines (96 loc) · 6.12 KB
/
parse_jsonl_custom.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
""" This script illustrates options for parsing JSON and JSONL files into a Library in LLMWare, including the ability
to provide a custom configured mapping intended for use with 'pseudo-db' structured JSON and JSONL files
Option # 1 - Standard JSON/JSONL parsing -
-- when using a bulk ingest Parsing method, the parser will route json and jsonl files to the 'standard'
TextParser - which will look for a "text" key in the JSON/JSONL to extract as the intended text
-- if no 'text' key is found, then the parser will return an empty output list []
-- you can provide an optional key_list parameter to TextParser, which will then by default capture the
selected fields and aggregate to form the text input, e.g.,
TextParser().jsonl_file_handler(fp,fn, key_list=["context", "source", "ID"])
... where "context", "source" and "ID" represent keys found in the source json/jsonl file
-- the standard TextParser() is designed for ad hoc extraction of text content, not to preserve the keys
-- use of this method is shown in the first example below
Option # 2- Custom Configured JSON/JSONL parsing -
-- in addition to the standard parsing method, there is the ability to customize the mappings for a
JSON/JSONL file in which the keys are intended to be used for follow-up lookup and retrieval
-- Parser().parse_json_config method - this is shown in the second and third examples below
"""
from llmware.parsers import Parser, TextParser
from llmware.library import Library
from llmware.retrieval import Query
from llmware.configs import LLMWareConfig
import time
import ast
# All three text databases are supported (mongo, postgres, and sqlite)
# -- if it is highly varied unstructured content, we would recommend Mongo given its flexibility
# -- if there are any validation errors with Postgres or SQLite, then we would recommend either
#    preprocessing the json or trying with Mongo
# module-level setting: applies to every example function defined below in this script
LLMWareConfig().set_active_db("mongo")
def standard_json_parsing(fp, fn):

    """ Demonstrates the 'standard' TextParser handler for json/jsonl files - fp and fn are handed
    unchanged to TextParser().jsonl_file_handler, and its output is returned as-is. """

    # key names to pull from each JSON/JSONL entry - replace with keys that actually exist in the
    # source file, e.g., "context", "source", "ID"
    # -- if no keys are passed, the parser only looks for a "text" key by default
    key_names = ["key1", "key2", "key3"]

    # the goal here is extracting/aggregating the file content, not using the 'structure' of the keys
    # -- interpret_as_table=True -> each row comes back as a LIST of elements, one per selected key
    # -- interpret_as_table=False -> each row comes back as one text string, built by joining the
    #    text of each key with the separator, e.g., value1 + separator + value2 + separator ...
    handler_options = {
        "key_list": key_names,
        "interpret_as_table": False,
        "separator": "\n",
    }

    text_parser = TextParser()

    return text_parser.jsonl_file_handler(fp, fn, **handler_options)
def configured_json_parsing(fp, fn, library_name, metadata=None):

    """ This example shows how to use mappings for a customized json/jsonl

    -- fp, fn: passed through unchanged to Parser.parse_json_config (presumably the folder path and
       file name of the source json/jsonl - confirm against the Parser API)
    -- library_name: name of the new library that will be created to hold the parsed content
    -- metadata: optional dictionary mapping of library key names to keys in the json file - if
       omitted, an illustrative placeholder mapping is used, which should be replaced with keys
       that exist in your own source file

    Returns the output of parse_json_config.

    Raises ValueError if the metadata mapping does not contain the required "text" key.
    """

    #   metadata is a dictionary mapping of key names to keys in the json file
    #       the 'keys' correspond to the keys that will be added to the library
    #       the 'values' correspond to the keys that will be found in the JSON/JSONL source file

    #   metadata must have "text" mapping

    #   if "doc_ID" or "block_ID" mapping provided, then will "over-write" the default doc_ID and block_ID and
    #   use the mapping provided in the source JSON/JSONL

    #   for all other attributes (e.g., not text, doc_ID, block_ID), the keys will be stored in "special_field1" of
    #   the database.  For Mongo, the keys will be stored directly as a dictionary, while for Postgres and SQLite,
    #   it will be stored as text string, which must be converted upon use back into a dictionary (see below for
    #   retrieval example)

    #   step 1 - create metadata mapping
    #   -- must have 'text' key mapped to key in json source
    #   -- all other keys are 'optional' and can be any number from 0 - N
    #   -- generally, key2, etc. should map to the name of the key in the JSON file, although you are free to re-name

    if metadata is None:
        # placeholder mapping kept as the default so that existing three-argument calls behave as before
        metadata = {"text": "json_source_key_mapping_to_main_text_input",
                    "key2": "json_source_key2",
                    "key3": "key3"}

    # fail fast, before a library is created, if the one mandatory mapping is missing
    if "text" not in metadata:
        raise ValueError("metadata mapping must include a 'text' key mapped to a key in the json source")

    #   step 2 - create new library
    lib = Library().create_new_library(library_name)
    parser = Parser(lib)

    #   step 3 - invoke parse_json_config method
    print("step 1 - parsing")

    t0 = time.time()

    parser_output = parser.parse_json_config(fp, fn, mapping_dict=metadata)

    print(f"done parsing - time - {time.time() - t0} - summary - {parser_output}")

    return parser_output
def run_query_configured_input(library_name=None, query=""):

    """ Once the custom json/jsonl is parsed into a Library, it can be used like any other content with the
    additional json/jsonl attributes available in special_field1 - which can be retrieved as demonstrated below.

    -- note: the example below illustrates a 'text_query' but will apply exactly the same for a 'semantic_query'

    -- library_name: name of the library to load and query
    -- query: text passed to Query.text_query

    Prints the text, doc_ID and metadata of every result, and returns 0 when finished.
    """

    #   run query
    lib = Library().load_library(library_name)
    q = Query(lib).text_query(query)

    for j, results in enumerate(q):

        # defaults used when a result has no special_field1 / doc_ID entry
        meta = ""
        doc_id = -1

        #   the mapped keys from the json file are all stored in "special_field1" of the library dictionary entry
        #   and can be retrieved as a string that can be mapped back into a dictionary as outlined below

        if "special_field1" in results:
            meta = results["special_field1"]
            if isinstance(meta, str):
                # Postgres and SQLite store the mapping as a text string, so convert it back to a dictionary
                try:
                    meta = ast.literal_eval(meta)
                except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                    # best effort - only the errors literal_eval raises on malformed input are caught,
                    # so KeyboardInterrupt / SystemExit still propagate; meta is left as the raw string
                    print(f"could not convert meta string back into dictionary - {meta}")

        if "doc_ID" in results:
            doc_id = results["doc_ID"]

        text = results["text"]

        print(f"\nresults - {j} - query - {query}")
        print(f"results - text - {text}")
        print(f"results - doc_ID - {doc_id} - metadata - {meta}")

    print("done")

    return 0