Skip to content

Commit

Permalink
Merge pull request #1389 from henrykironde/auto-and-shell
Browse files Browse the repository at this point in the history
Repair auto detect and fetch upstream scripts
  • Loading branch information
ethanwhite committed Sep 25, 2019
2 parents ceefefb + 2f7e141 commit d0f81cb
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 19 deletions.
30 changes: 26 additions & 4 deletions retriever/lib/create_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,27 @@ def create_resources(file, skip_lines):
clean_table = table.__dict__
resource_dict = {}
path_to_table = os.path.basename(clean_table["name"])
resource_dict["name"] = os.path.splitext(path_to_table)[0]
print("Processing... {file_name}".format(file_name=path_to_table))
resource_dict["name"] = os.path.splitext(path_to_table)[0].lower()
resource_dict["path"] = path_to_table
resource_dict["schema"] = {}
resource_dict["dialect"] = {}
resource_dict["dialect"] = {"delimiter": ","}
resource_dict["schema"]["fields"] = []
for cname, ctuple in clean_table["columns"]:
resource_dict["schema"]["fields"].append({"name": cname, "type": ctuple[0]})
if len(ctuple) >= 2:
if ctuple[0] == 'char':
# char sizes need quotes
char_size = "{a}".format(a=ctuple[1])
resource_dict["schema"]["fields"].append({"name": cname,
"type": ctuple[0],
"size": char_size})
else:
resource_dict["schema"]["fields"].append({"name": cname,
"type": ctuple[0],
"size": ctuple[1]})
else:
resource_dict["schema"]["fields"].append({"name": cname,
"type": ctuple[0]})
resource_dict["url"] = "FILL"
return resource_dict

Expand All @@ -90,6 +105,7 @@ def create_script_dict(allpacks, path, file, skip_lines):
allpacks["citation"] = "FILL"
allpacks["licenses"] = [{"name": "FILL"}]
allpacks["keywords"] = []
allpacks["archived"] = "fill or remove this field if not archived"
allpacks["homepage"] = "FILL"
allpacks["version"] = "1.0.0"
try:
Expand Down Expand Up @@ -132,9 +148,14 @@ def process_singles(single_files_path, out_path, skip_lines):
If the filepath is a directory, creates a single script for each file in the
directory.
"""
if single_files_path.startswith("."):
return

if os.path.isdir(single_files_path):
for path, _, files in os.walk(single_files_path):
for file_n in files:
if file_n.endswith(".json"):
continue
allpacks = collections.OrderedDict()
if file_n:
allpacks = create_script_dict(allpacks, path, file_n, skip_lines)
Expand All @@ -150,7 +171,8 @@ def process_singles(single_files_path, out_path, skip_lines):

def write_out_scripts(script_dict, path, out_path):
"""Writes scripts out to a given path"""
file_name = os.path.basename(path).split(".")[0] + ".json"
names = os.path.basename(path).split(".")[0] + ".json"
file_name = names.lower().replace("-", "_")
path_dir = get_directory(os.path.expanduser(path))
if out_path is not None:
path_dir = os.path.expanduser(out_path)
Expand Down
26 changes: 15 additions & 11 deletions retriever/lib/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,15 +256,18 @@ def auto_get_datatypes(self, pk, source, columns):
if values:
for i in range(len(columns)):
try:
val = u"{}".format(values[i])

val = str(values[i]).strip()
if not val:
continue
if self.table.cleanup.function != no_cleanup:
val = self.table.cleanup.function(
val, self.table.cleanup.args)

if val and val.strip():
if len(str(val)) + 100 > max_lengths[i]:
max_lengths[i] = len(str(val)) + 100
# Measure the length of the encoded value (val.encode()) so that
# multi-byte characters are fully counted when sizing `char` types
if len(val.encode()) > max_lengths[i]:
max_lengths[i] = len(val.encode())

if column_types[i][0] in ('int', 'bigint'):
try:
Expand All @@ -274,19 +277,20 @@ def auto_get_datatypes(self, pk, source, columns):
val > self.max_int:
column_types[i] = ['bigint', ]
except Exception as _:
column_types[i] = ['double', ]
column_types[i] = ('double', )
if column_types[i][0] == 'double':
try:
val = float(val)
if "e" in str(val) or ("." in str(val) and len(str(val).split(".")[1]) > 10):
column_types[i] = ["decimal", "50,30"]
if "e" in str(val) or \
("." in str(val) and len(str(val).split(".")[1]) > 10):
column_types[i] = ("decimal", "50,30")
except Exception as _:
column_types[i] = ['char', max_lengths[i]]
column_types[i] = ('char', max_lengths[i])
if column_types[i][0] == 'char':
if len(str(val)) + 100 > column_types[i][1]:
column_types[i][1] = max_lengths[i]
if len(val.encode()) > column_types[i][1]:
column_types[i] = ('char', max_lengths[i])
except IndexError:
pass
continue
for i, value in enumerate(columns):
column = value
column[1] = column_types[i]
Expand Down
3 changes: 0 additions & 3 deletions retriever/lib/scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,14 +318,11 @@ def get_dataset_names_upstream(keywords=None, licenses=None, repo=REPOSITORY):
version_file = version_file_request.text.splitlines()[1:]

scripts = []
max_scripts = 100
for line in version_file:
script = line.strip('\n').split(',')[0]
script = '.'.join(script.split('.')[:-1])
script = script.replace('_', '-')
scripts.append(script)
if len(scripts) == max_scripts:
break
return sorted(scripts)

result_scripts = set()
Expand Down
3 changes: 2 additions & 1 deletion test/test_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,9 @@ def test_auto_get_datatypes():
[["ö", 'bb', 'Löve']],
[['a', None], ['b', None], ['c', None]])
length = test_engine.table.columns
# a non-ASCII char such as "ö" is 2 bytes once encoded, so its length is 2
assert [length[0][1][1], length[1][1][1], length[2][1][1]] == \
[101, 102, 104]
[2, 2, 5]


def test_auto_get_columns_extra_whitespace():
Expand Down

0 comments on commit d0f81cb

Please sign in to comment.