Merge pull request #1389 from henrykironde/auto-and-shell

Repair auto detect and fetch upstream scripts
weecology · Sep 25, 2019 · d0f81cb
2 parents ceefefb + 2f7e141
commit d0f81cb
Show file tree

Hide file tree

Showing 4 changed files with 43 additions and 19 deletions.
diff --git a/retriever/lib/create_scripts.py b/retriever/lib/create_scripts.py
@@ -72,12 +72,27 @@ def create_resources(file, skip_lines):
     clean_table = table.__dict__
     resource_dict = {}
     path_to_table = os.path.basename(clean_table["name"])
-    resource_dict["name"] = os.path.splitext(path_to_table)[0]
+    print("Processing... {file_name}".format(file_name=path_to_table))
+    resource_dict["name"] = os.path.splitext(path_to_table)[0].lower()
+    resource_dict["path"] = path_to_table
     resource_dict["schema"] = {}
-    resource_dict["dialect"] = {}
+    resource_dict["dialect"] = {"delimiter": ","}
     resource_dict["schema"]["fields"] = []
     for cname, ctuple in clean_table["columns"]:
-        resource_dict["schema"]["fields"].append({"name": cname, "type": ctuple[0]})
+        if len(ctuple) >= 2:
+            if ctuple[0] == 'char':
+                # char sizes need quotes
+                char_size = "{a}".format(a=ctuple[1])
+                resource_dict["schema"]["fields"].append({"name": cname,
+                                                          "type": ctuple[0],
+                                                          "size": char_size})
+            else:
+                resource_dict["schema"]["fields"].append({"name": cname,
+                                                          "type": ctuple[0],
+                                                          "size": ctuple[1]})
+        else:
+            resource_dict["schema"]["fields"].append({"name": cname,
+                                                      "type": ctuple[0]})
     resource_dict["url"] = "FILL"
     return resource_dict
 
@@ -90,6 +105,7 @@ def create_script_dict(allpacks, path, file, skip_lines):
     allpacks["citation"] = "FILL"
     allpacks["licenses"] = [{"name": "FILL"}]
     allpacks["keywords"] = []
+    allpacks["archived"] = "fill or remove this field if not archived"
     allpacks["homepage"] = "FILL"
     allpacks["version"] = "1.0.0"
     try:
@@ -132,9 +148,14 @@ def process_singles(single_files_path, out_path, skip_lines):
     If the filepath is a directory, creates a single script for each file in the
     directory.
     """
+    if single_files_path.startswith("."):
+        return
+
     if os.path.isdir(single_files_path):
         for path, _, files in os.walk(single_files_path):
             for file_n in files:
+                if file_n.endswith(".json"):
+                    continue
                 allpacks = collections.OrderedDict()
                 if file_n:
                     allpacks = create_script_dict(allpacks, path, file_n, skip_lines)
@@ -150,7 +171,8 @@ def process_singles(single_files_path, out_path, skip_lines):
 
 def write_out_scripts(script_dict, path, out_path):
     """Writes scripts out to a given path"""
-    file_name = os.path.basename(path).split(".")[0] + ".json"
+    names = os.path.basename(path).split(".")[0] + ".json"
+    file_name = names.lower().replace("-", "_")
     path_dir = get_directory(os.path.expanduser(path))
     if out_path is not None:
         path_dir = os.path.expanduser(out_path)

diff --git a/retriever/lib/engine.py b/retriever/lib/engine.py
@@ -256,15 +256,18 @@ def auto_get_datatypes(self, pk, source, columns):
             if values:
                 for i in range(len(columns)):
                     try:
-                        val = u"{}".format(values[i])
-
+                        val = str(values[i]).strip()
+                        if not val:
+                            continue
                         if self.table.cleanup.function != no_cleanup:
                             val = self.table.cleanup.function(
                                 val, self.table.cleanup.args)
 
                         if val and val.strip():
-                            if len(str(val)) + 100 > max_lengths[i]:
-                                max_lengths[i] = len(str(val)) + 100
+                            # Measure the length in encoded bytes (val.encode()) so that
+                            # multi-byte characters are fully counted for `char` types
+                            if len(val.encode()) > max_lengths[i]:
+                                max_lengths[i] = len(val.encode())
 
                             if column_types[i][0] in ('int', 'bigint'):
                                 try:
@@ -274,19 +277,20 @@ def auto_get_datatypes(self, pk, source, columns):
                                             val > self.max_int:
                                         column_types[i] = ['bigint', ]
                                 except Exception as _:
-                                    column_types[i] = ['double', ]
+                                    column_types[i] = ('double', )
                             if column_types[i][0] == 'double':
                                 try:
                                     val = float(val)
-                                    if "e" in str(val) or ("." in str(val) and len(str(val).split(".")[1]) > 10):
-                                        column_types[i] = ["decimal", "50,30"]
+                                    if "e" in str(val) or \
+                                            ("." in str(val) and len(str(val).split(".")[1]) > 10):
+                                        column_types[i] = ("decimal", "50,30")
                                 except Exception as _:
-                                    column_types[i] = ['char', max_lengths[i]]
+                                    column_types[i] = ('char', max_lengths[i])
                             if column_types[i][0] == 'char':
-                                if len(str(val)) + 100 > column_types[i][1]:
-                                    column_types[i][1] = max_lengths[i]
+                                if len(val.encode()) > column_types[i][1]:
+                                    column_types[i] = ('char', max_lengths[i])
                     except IndexError:
-                        pass
+                        continue
         for i, value in enumerate(columns):
             column = value
             column[1] = column_types[i]

diff --git a/retriever/lib/scripts.py b/retriever/lib/scripts.py
@@ -318,14 +318,11 @@ def get_dataset_names_upstream(keywords=None, licenses=None, repo=REPOSITORY):
         version_file = version_file_request.text.splitlines()[1:]
 
         scripts = []
-        max_scripts = 100
         for line in version_file:
             script = line.strip('\n').split(',')[0]
             script = '.'.join(script.split('.')[:-1])
             script = script.replace('_', '-')
             scripts.append(script)
-            if len(scripts) == max_scripts:
-                break
         return sorted(scripts)
 
     result_scripts = set()

diff --git a/test/test_retriever.py b/test/test_retriever.py
@@ -102,8 +102,9 @@ def test_auto_get_datatypes():
                                    [["ö", 'bb', 'Löve']],
                                    [['a', None], ['b', None], ['c', None]])
     length = test_engine.table.columns
+    # lengths are encoded byte counts: "ö" encodes to 2 bytes, so "Löve" is 5
     assert [length[0][1][1], length[1][1][1], length[2][1][1]] == \
-           [101, 102, 104]
+           [2, 2, 5]
 
 
 def test_auto_get_columns_extra_whitespace():