How to reproduce
1. Import any dataset. The issue seems to sit in the inverted index implementation, so it is easiest to reproduce with the vector index skipped, because then most of the import time is spent on the inverted index. See below for an example of how to import the wiki dataset without a vector index.
2. Query the inverted index while the import is running. It won't fail on the first query, but I've been able to reliably reproduce the crash in under 2 minutes (see video); a small query loop such as the sketch below works.
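For step 2, any query that hits the inverted index will do. Below is a minimal sketch using the Python client; the class and property names match the schema in the import script further down, while the search term and polling interval are arbitrary choices and not part of the original report.

import time
import weaviate

client = weaviate.Client("http://localhost:8080")

# Filter on the "content" text property of the Paragraph class; the wildcard
# term is only an example to force an inverted-index lookup.
where_filter = {
    "path": ["content"],
    "operator": "Like",
    "valueText": "*wiki*"
}

# Keep querying in a loop while the import script is running.
while True:
    result = client.query.get("Paragraph", ["title"]) \
        .with_where(where_filter) \
        .with_limit(10) \
        .do()
    print(result.get("errors") or "query ok")
    time.sleep(0.5)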
Expected Behavior
No error
Actual Behavior
Weaviate crashes with a segfault.
Notes
Producing SEGFAULTs in Go is rare, so an immediate suspect is the disk segments that are mmapped. We seem to be leaking memory addresses that are still in use even after the segment is gone.
I think this happens when a disk segment is removed after a compaction, but its memory address is somehow still referenced.
Possible solutions
Of the lock vs. copy solutions, I have a preference for copying, i.e. copying the bytes out of the mmapped segment before returning (e.g. from lsmkv.MapList). I think this is cleaner and more performant at scale than introducing yet another lock holder (locks already show up in profiling). Feel free to message me (@etiennedi) about how I imagine the copying. But first narrow down whether the issue is really caused by this; it could also be something else.
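To illustrate the copy idea only, here is a Python sketch with a made-up file name; this is not Weaviate's Go code. Handing out a reference into an mmapped segment is only safe while the mapping exists, whereas copying the bytes out survives the unmap. Python refuses to unmap while a view is still exported; Go has no such guard, which is why a stale slice into an unmapped segment can segfault.

import mmap

# Stand-in for a disk segment file (hypothetical name and contents).
with open("segment.db", "wb") as f:
    f.write(b"x" * 64)

with open("segment.db", "rb") as f:
    mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)

view = memoryview(mm)    # a reference into the mapping, valid only while mapped
copied = bytes(mm[:16])  # an independent copy, survives the unmap

try:
    mm.close()           # "compaction removes the segment"
except BufferError as err:
    # Python blocks the unmap while a reference is live; Go would not.
    print("cannot unmap, a reference is still exported:", err)

view.release()
mm.close()
print(copied)            # the copy is still valid after the mapping is gone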
Wikipedia dataset without vectors
Obtain the raw data: curl -O https://storage.googleapis.com/semi-technologies-public-data/wikipedia-en-articles.json.gz
Unzip it: gunzip wikipedia-en-articles.json.gz
Install the Python requirements:
weaviate_client==3.2.0
loguru==0.5.3
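For example (package names as published on PyPI): pip install weaviate-client==3.2.0 loguru==0.5.3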
Run the following import script. It has already been modified to remove the vector index and the vectorizer, so it is a pure inverted-index import:
"""
Imports the complete Wiki dataset into Weaviate
"""
import json
import weaviate
from uuid import uuid3, NAMESPACE_DNS
from loguru import logger
def create_weaviate_schema(client):
"""
...
"""
# flush the schema and data
client.schema.delete_all()
# create schema
schema = {
"classes": [
{
"class": "Article",
"description": "A wikipedia article with a title and crefs",
"vectorizer": "none",
"vectorIndexConfig": {
"skip": True
},
"properties": [
{
"dataType": [
"string"
],
"description": "Title of the article",
"name": "title",
"indexInverted": True
},
{
"dataType": [
"Paragraph"
],
"description": "List of paragraphs this article has",
"name": "hasParagraphs",
"indexInverted": True
},
{
"dataType": [
"Article"
],
"description": "Articles this page links to",
"name": "linksToArticles",
"indexInverted": True
}
]
},
{
"class": "Paragraph",
"description": "A wiki paragraph",
"vectorizer": "none",
"vectorIndexConfig": {
"skip": True
},
"properties": [
{
"dataType": [
"string"
],
"description": "Title of the paragraph",
"name": "title",
"indexInverted": True,
"moduleConfig": {
"text2vec-transformers": {
"skip": True,
"vectorizePropertyName": False,
}
}
},
{
"dataType": [
"text"
],
"description": "The content of the paragraph",
"name": "content",
"indexInverted": True,
"moduleConfig": {
"text2vec-transformers": {
"skip": False,
"vectorizePropertyName": False,
}
}
},
{
"dataType": [
"int"
],
"description": "Order of the paragraph",
"name": "order",
"indexInverted": True,
"moduleConfig": {
"text2vec-transformers": {
"skip": True,
"vectorizePropertyName": False,
}
}
},
{
"dataType": [
"Article"
],
"description": "Article this paragraph is in",
"name": "inArticle",
"moduleConfig": {
"text2vec-transformers": {
"skip": True,
"vectorizePropertyName": False,
}
}
}
]
}
]
}
#
# add schema
#
client.schema.create(schema)
def add_article_to_batch(parsed_line):
return [
{
"title": parsed_line["title"]
},
"Article",
str(uuid3(NAMESPACE_DNS, parsed_line["title"].replace(" ", "_")))
]
def add_paragraph_to_batch(parsed_line):
return_array = []
for paragraph in parsed_line["paragraphs"]:
add_object = {
"content": paragraph["content"],
"order": paragraph["count"],
"inArticle": [{
"beacon": "weaviate://localhost/" + str(uuid3(NAMESPACE_DNS, parsed_line["title"].replace(" ", "_")))
}]
}
if "title" in paragraph:
# Skip if wiki paragraph
if ":" in paragraph["title"]:
continue
add_object["title"] = paragraph["title"]
# add to batch
return_array.append([
add_object,
"Paragraph",
str(uuid3(NAMESPACE_DNS, parsed_line["title"].replace(" ", "_") + str(paragraph["count"])))
])
return return_array
def handle_results(results):
if results is not None:
for result in results:
if 'result' in result and 'errors' in result['result'] and 'error' in result['result']['errors']:
for message in result['result']['errors']['error']:
logger.debug(message['message'])
def import_data_without_crefs(wiki_data_file):
counter = 1
counter_article = 1
with open(wiki_data_file) as f:
for line in f:
parsed_line = json.loads(line)
if len(parsed_line["paragraphs"]) > 0:
try:
article_obj = add_article_to_batch(parsed_line)
counter_article += 1
# skip if it is a standard wiki category
if ":" in article_obj[2]:
continue
else:
# add the article obj
client.data_object.create(article_obj[0], article_obj[1], article_obj[2])
counter += 1
# add the paragraphs
for item in add_paragraph_to_batch(parsed_line):
# add data object to batch
client.batch.add_data_object(item[0], item[1], item[2])
# add ref to batch
client.batch.add_reference(article_obj[2], "Article", "hasParagraphs", item[2])
logger.info("Imported (" + str(counter) + " / " + str(counter_article) + ") – " + parsed_line["title"] + " with # of paragraphs " + str(len(parsed_line["paragraphs"])))
counter += 1
if (counter % 500) == 0:
result = client.batch.create_objects()
result_refs = client.batch.create_references()
handle_results(result)
except Exception as e:
counter += 1
logger.debug("Skipping: " + article_obj[2])
logger.debug(e)
pass
client.batch.create_objects()
def import_data_crefs(wiki_data_file):
counter = 1
with open(wiki_data_file) as f:
for line in f:
parsed_line = json.loads(line)
# skip if it is a standard wiki category
if ":" in parsed_line["title"]:
continue
else:
for cref in parsed_line["crefs"]:
article_uuid = str(uuid3(NAMESPACE_DNS, parsed_line["title"].replace(" ", "_")))
link_uuid = str(uuid3(NAMESPACE_DNS, cref))
with client.batch(batch_size=12000, dynamic=True) as batch:
results = client.batch.add_reference(article_uuid, "Article", "linksToArticles", link_uuid)
handle_results(results)
counter += 1
logger.info("Crefs set (" + str(counter) + ") – " + parsed_line["title"])
if __name__ == "__main__":
logger.info("Start import")
# wiki data file
wiki_data_file = "wikipedia-en-articles.json"
# connect Weaviate
client = weaviate.Client("http://localhost:8080")
# create schema
create_weaviate_schema(client)
# import data objects without CREFs
import_data_without_crefs(wiki_data_file)
# import crefs
import_data_crefs(wiki_data_file)
# done
logger.info("Done")
You do not need to import the whole dataset (which would take hours); as soon as the import is running you should be able to reproduce the bug.