Skip to content

wikidata_query: cleanup script #2

Merged
BobBorges merged 1 commit into dev from
wd-query
Sep 12, 2024
Merged

wikidata_query: cleanup script #2
BobBorges merged 1 commit into dev from
wd-query

Conversation

@BobBorges
Copy link
Copy Markdown
Contributor

sorry about the diff

@BobBorges BobBorges requested a review from ninpnin May 14, 2024 08:43
@ninpnin
Copy link
Copy Markdown
Contributor

ninpnin commented May 15, 2024

What has actually happened here? Tabs -> spaces? I don't really know what has changed.

@BobBorges
Copy link
Copy Markdown
Contributor Author

diff --git a/src/wikidata_query.py b/src/wikidata_query.py
index edc0833..3309873 100755
--- a/src/wikidata_query.py
+++ b/src/wikidata_query.py
@@ -1,19 +1,25 @@
 """
 Query wikidata for metadata, process it and save it in corpus/metadata
 """
-from SPARQLWrapper import SPARQLWrapper, JSON
-import numpy as np
-import pandas as pd
-import argparse, os, re, time
+from pathlib import Path
 from pyriksdagen.data import queries as pyriksdagen_queries
+from pyriksdagen.db import clean_person_duplicates
+from pyriksdagen.utils import get_data_location
 from pyriksdagen.wikidata import (
     query2df,
     separate_name_location,
     move_party_to_party_df,
     elongate_external_ids,
 )
-from pyriksdagen.db import clean_person_duplicates
-from pathlib import Path
+from SPARQLWrapper import SPARQLWrapper, JSON
+import argparse
+import numpy as np
+import os
+import pandas as pd
+import re, time
+
+
+
 
 def track_missing_id(df, l, id_map=None):
     no_id = df.loc[pd.isna(df["person_id"])]
@@ -36,23 +42,24 @@ def track_missing_id(df, l, id_map=None):
     df.drop(columns=["wiki_id"], inplace=True)
     return df.reset_index(drop=True), l
 
-def main(args):
-       # Change query path to be from module!
 
-       if args.metadata_folder:
-               metadata_folder = args.metadata_folder
-       else:
-               metadata_folder = get_data_location("metadata")
 
-       if args.query_folder:
-               query_folder = args.query_folder
+
+def main(args):
+    if args.metadata_folder is None:
+        metadata_folder = get_data_location("metadata")
     else:
+        metadata_folder = args.metadata_folder
+
+    if args.query_folder is None:
         query_folder = pyriksdagen_queries.__path__._path[0]
+    else:
+        query_folder = args.query_folder
 
-       if args.queries:
-               queries = args.queries
+    if args.queries is None:
+        queries = sorted([q.stem for q in Path(query_folder).glob('*.rq')])
     else:
-               queries = sorted([q.stem for q in Path(args.query_folder).glob('*.rq')])
+        queries = args.queries
     input_folders = ['name_location_specifier', 'alias', "member_of_parliament", "party_affiliation"]
 
     # Query for and store cleaned versions of metadata
@@ -64,9 +71,10 @@ def main(args):
         id_map = query2df("wiki_id", args.source)
         print(type(id_map))
         id_map = id_map.drop_duplicates()
-               id_map.to_csv(f'{args.metadata_folder}/wiki_id.csv', index=False)
+        id_map.to_csv(f'{metadata_folder}/wiki_id.csv', index=False)
 
     for q in queries:
+        print(q)
         if q == "wiki_id":
             continue
         print(f"Query {q} started.")
@@ -94,7 +102,7 @@ def main(args):
             df = elongate_external_ids(df)
 
         # Store files needing additional preprocessing in input
-               folder = args.metadata_folder if not q in input_folders else args.input_metadata_folder
+        folder = metadata_folder if not q in input_folders else args.input_metadata_folder
         if folder == args.input_metadata_folder:
             d[q] = df
 
@@ -110,12 +118,12 @@ def main(args):
             if key not in queries:
                 d['key'] = pd.read_csv(f'{args.input_metadata_folder}/{key}.csv')
         name, loc = separate_name_location(d['name_location_specifier'], d['alias'])
-               name.to_csv(f'{args.metadata_folder}/name.csv', index=False)
-               loc.to_csv(f'{args.metadata_folder}/location_specifier.csv', index=False)
+        name.to_csv(f'{metadata_folder}/name.csv', index=False)
+        loc.to_csv(f'{metadata_folder}/location_specifier.csv', index=False)
 
         mp_df, party_df = move_party_to_party_df(d['member_of_parliament'], d['party_affiliation'])
-               mp_df.to_csv(f'{args.metadata_folder}/member_of_parliament.csv', index=False)
-               party_df.to_csv(f'{args.metadata_folder}/party_affiliation.csv', index=False)
+        mp_df.to_csv(f'{metadata_folder}/member_of_parliament.csv', index=False)
+        party_df.to_csv(f'{metadata_folder}/party_affiliation.csv', index=False)
 
     if len(no_swerik_id) > 0:
         print("Some entities returned in the queries seem not to have a swerik ID. Check and add an ID, then requery.")
@@ -123,12 +131,20 @@ def main(args):
         [outf.write(f"{_}\n") for _ in no_swerik_id]
 
 
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument('--input_metadata_folder', type=str, default="input/metadata")
-    parser.add_argument('--metadata_folder', type=str, default="corpus/metadata")
-    parser.add_argument('--query_folder', type=str, default="pyriksdagen/data/queries")
-    parser.add_argument('-q', '--queries', default=None, nargs='+', help='One or more sparql query files (separated by space)')
-    parser.add_argument('-s', '--source', default=None, nargs='+', help='One or more of member_of_parliament | minister | speaker (separated by space)')
+    parser.add_argument('--metadata_folder', type=str, default=None)
+    parser.add_argument('--query_folder', type=str, default=None)
+    parser.add_argument('-q', '--queries',
+                        default=None,
+                        nargs='+',
+                        help='One or more sparql query files (separated by space)')
+    parser.add_argument('-s', '--source',
+                        default=None,
+                        nargs='+',
+                        help='One or more of member_of_parliament | minister | speaker (separated by space)')
     args = parser.parse_args()
     main(args)

Copy link
Copy Markdown
Contributor

@ninpnin ninpnin left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@BobBorges BobBorges merged commit 960ae71 into dev Sep 12, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants