added a more meaningful status output for every 10k items pre-processed
athalhammer committed Sep 5, 2022
1 parent e7779de commit e9ca50c
Showing 2 changed files with 8 additions and 3 deletions.
docs/indexing.rst: 4 additions & 1 deletion
@@ -37,7 +37,10 @@ matrix and a pagerank vector in four steps:
 1. preprocess the dump, only
    extracting the information we need: this creates a TSV file containing
    on each line the item id (without leading Q), the list of ids this item
-   points to, and the number of occurrences of such links.
+   points to, and the number of occurrences of such links. There will be an
+   output for every 10'000 items that have been processed. For a rough
+   estimate of the total number of pages, please consult the "Content pages"
+   figure on https://www.wikidata.org/wiki/Special:Statistics

 ::

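As a rough illustration of what this preprocessing step produces, here is a minimal, self-contained sketch of the loop described above. It is not OpenTapioca's actual implementation: the dump reader is replaced by a plain iterable of (qid, outgoing-ids) pairs, the function name preprocess_items is hypothetical, and the TSV columns follow the wording of the docs rather than the exact fields written by the real code.

    import json

    def preprocess_items(items, output_fname, report_every=10000):
        # items: iterable of (qid, outgoing_ids) pairs, e.g. ('Q42', [5, 76])
        counter = 0
        with open(output_fname, 'w') as output_file:
            for qid, outgoing in items:
                if not qid.startswith('Q'):
                    continue  # skip non-item entities such as properties (P...)
                if counter % report_every == 0:
                    print(counter)  # status output for every 10'000 processed items
                # one row per item: numeric id, outgoing ids, number of such links
                fields = [qid[1:], json.dumps(outgoing), str(len(outgoing))]
                output_file.write('\t'.join(fields) + '\n')
                counter += 1

    # toy usage with an in-memory "dump"
    preprocess_items([('Q42', [5, 76]), ('P31', []), ('Q5', [42])], 'sample_graph.tsv')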
opentapioca/wikidatagraph.py: 4 additions & 2 deletions
@@ -31,14 +31,15 @@ def preprocess_dump(cls, fname, output_fname):
         output_file = open(output_fname, 'w')

         with WikidataDumpReader(fname) as reader:
+            counter = 0
             for item in reader:
                 qid = item.get('id')
                 if qid[0] != 'Q':
                     continue

                 rowid = int(qid[1:])
-                if rowid % 10000 == 0:
-                    print(str(rowid))
+                if counter % 10000 == 0:
+                    print(str(counter))

                 edges = item.get_outgoing_edges()
                 nb_edges = len(edges)
@@ -56,6 +57,7 @@ def preprocess_dump(cls, fname, output_fname):
                     json.dumps(cur_data),
                 ]
                 output_file.write('\t'.join(fields)+'\n')
+                counter = counter + 1

     def load_from_preprocessed_dump(self, fname, batch_size=1000000):
         """
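For orientation, the patched classmethod would typically be invoked as below. This is a sketch under assumptions: the class name WikidataGraph is inferred from the module name, and both file names are placeholders, not values taken from this commit.

    from opentapioca.wikidatagraph import WikidataGraph

    # Preprocess a Wikidata JSON dump into the TSV edge file; with this commit,
    # a status line is printed for every 10'000 items that were actually processed.
    WikidataGraph.preprocess_dump('latest-all.json.bz2', 'wikidata_graph.tsv')

Counting processed items (counter) rather than testing the numeric Q-id (rowid) presumably gives evenly spaced progress messages: Q-ids in a dump are sparse and include skipped entities, so rowid % 10000 == 0 fires at irregular intervals, whereas the new counter emits exactly one line per 10'000 items handled.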
