Skip to content

Commit

Permalink
Merge pull request #1504 from ashishpriyadarshiCIC/test-script-geojson
Browse files Browse the repository at this point in the history
Geojson to csv conversion test
  • Loading branch information
henrykironde committed Sep 1, 2020
2 parents 70872d8 + ee1b5a3 commit fb3f418
Show file tree
Hide file tree
Showing 8 changed files with 126 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ services:

# command to run tests using py.test
script:
- docker-compose run --service-ports python_retriever pytest -v --cov=retriever
- docker-compose run --service-ports python_retriever pytest -v -k "not test_geojson2csv" --cov=retriever
- docker-compose run python_retriever flake8 --ignore=E501,W503,E402,F401,F403,E722,F841,W504 retriever --max-line-length=90 2>&1
- docker-compose run python_retriever yapf -d --recursive retriever/ --style=.style.yapf 2>&1
# - docker-compose run python_retriever pylint -rn retriever/ -f colorized --rcfile=.pylintrc > /dev/null 2>&1
Expand Down
4 changes: 4 additions & 0 deletions retriever/lib/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
walk_relative_path,
excel_csv,
)
from retriever.lib.engine_tools import geojson2csv
from retriever.lib.warning import Warning


Expand Down Expand Up @@ -581,6 +582,9 @@ def excel_to_csv(self, src_path, path_to_csv, excel_info=None, encoding=ENCODING
if self.find_file(src_path) and excel_info:
excel_csv(src_path, path_to_csv, excel_info, encoding)

def process_geojson2csv(self, src_path, path_to_csv, encoding=ENCODING):
    """Convert the GeoJSON file at ``src_path`` into a CSV at ``path_to_csv``.

    Thin wrapper that forwards its arguments to ``geojson2csv``. The value
    returned by that helper is discarded, so this method returns ``None``.
    """
    geojson2csv(input_file=src_path, output_file=path_to_csv, encoding=encoding)

def extract_gz(
self,
archive_path,
Expand Down
16 changes: 16 additions & 0 deletions retriever/lib/engine_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
import xml.etree.ElementTree as ET
import os
import csv
try:
# Geopandas installation is not smooth on the CI test platforms
import geopandas
except ModuleNotFoundError:
pass

warnings.filterwarnings("ignore")
from retriever.lib.tools import open_fr, open_csvw, open_fw
Expand Down Expand Up @@ -162,6 +167,17 @@ def xml2csv(input_file, outputfile=None, header_values=None, row_tag="row"):
return outputfile


def geojson2csv(input_file, output_file, encoding):
    """Convert a GeoJSON file to CSV.

    Function is used for testing only.

    Args:
        input_file: Path of the GeoJSON file to read.
        output_file: Path of the CSV file to write.
        encoding: Accepted so callers can pass it, but not used by the
            conversion itself.

    Returns:
        ``output_file``, exactly as it was passed in.

    Note:
        ``geopandas`` is an optional module-level import; if that import
        failed, calling this function raises ``NameError``.
    """
    # Use a context manager so the input handle is closed as soon as the
    # data frame has been built, instead of leaking it.
    with open(input_file) as geojson_file:
        df = geopandas.read_file(geojson_file)
    df.to_csv(output_file, index=False)
    return output_file


def getmd5(data, data_type='lines', encoding='utf-8'):
"""Get MD5 of a data source."""
checksum = md5()
Expand Down
2 changes: 1 addition & 1 deletion retriever/lib/load_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def read_json(json_file):

if isinstance(json_object, dict) and "resources" in json_object.keys():
# Note::formats described by frictionless data may need to change
tabular_exts = {"csv", "tab"}
tabular_exts = {"csv", "tab", "geojson"}
vector_exts = {"shp", "kmz"}
raster_exts = {"tif", "tiff", "bil", "hdr", "h5", "hdf5", "hr", "image"}
for resource_item in json_object["resources"]:
Expand Down
6 changes: 6 additions & 0 deletions retriever/lib/templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,12 @@ def process_tables(self, table_obj, url):
self.engine.excel_to_csv(src_path, path_to_csv, table_obj.xls_sheets,
self.encoding)

if hasattr(table_obj, "geojson_data"):
src_path = self.engine.format_filename(table_obj.geojson_data)
path_to_csv = self.engine.format_filename(table_obj.path)
self.engine.download_file(url, table_obj.geojson_data)
self.engine.process_geojson2csv(src_path, path_to_csv)

if hasattr(table_obj, "path"):
self.engine.auto_create_table(table_obj, url=url, filename=table_obj.path)
else:
Expand Down
71 changes: 71 additions & 0 deletions scripts/lake_county_illinois_cancer_rates.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
{
"citation": "",
"description": "geospatial data of cancer rates in Lake County, Illinois",
"encoding": "latin-1",
"homepage": "https://catalog.data.gov/dataset/cancer-rates",
"keywords": [],
"licenses": [],
"name": "lake-county-illinois-cancer-rates",
"resources": [
{
"dialect": {
"delimiter": ","
},
"name": "lakecounty_health",
"path": "LakeCounty_Health.csv",
"format": "tabular",
"geojson_data": "mytest.geojson",
"schema": {
"fields": [
{
"name": "fid",
"type": "int"
},
{
"name": "zip",
"type": "int"
},
{
"name": "colorectal",
"type": "double"
},
{
"name": "lung_bronc",
"type": "double"
},
{
"name": "breast_can",
"type": "double"
},
{
"name": "prostate_c",
"type": "double"
},
{
"name": "urinary_sy",
"type": "double"
},
{
"name": "all_cancer",
"type": "double"
},
{
"name": "shape_length",
"size": "50,30",
"type": "decimal"
},
{
"name": "shape_area",
"size": "50,30",
"type": "decimal"
}
]
},
"url": "http://data-lakecountyil.opendata.arcgis.com/datasets/cd63911cc52841f38b289aeeeff0f300_1.geojson"
}
],
"retriever": "True",
"retriever_minimum_version": "2.1.0",
"title": " Cancer Rates Lake County",
"version": "1.0.0"
}
2 changes: 1 addition & 1 deletion test/test_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ def test_interface_table_registry(tmpdir):
# Test if script_table_registry keeps only the latest
# table names of the installed data packages in
# script_table_registry

workdir = tmpdir.mkdtemp()
workdir.chdir()
rt.install_csv("iris")
Expand Down
26 changes: 26 additions & 0 deletions test/test_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import os
import subprocess
import random
import pytest
import requests

import retriever as rt
from retriever.lib.engine import Engine
Expand All @@ -12,6 +14,12 @@
from retriever.lib.engine_tools import getmd5
from retriever.lib.engine_tools import xml2csv
from retriever.lib.engine_tools import json2csv

try:
from retriever.lib.engine_tools import geojson2csv
except ModuleNotFoundError:
pass

from retriever.lib.engine_tools import sort_file
from retriever.lib.engine_tools import sort_csv
from retriever.lib.engine_tools import create_file
Expand All @@ -26,6 +34,10 @@
**{"tables": test_engine.table, "name": "test"})
test_engine.opts = {'database_name': '{db}_abc'}

# Parametrization cases for test_geojson2csv; each tuple is
# (test_name, table_name, geojson_data_url, expected).
geojson2csv_dataset = [
    (
        "simple_geojson2csv",
        "lake_county.geojson",
        "http://data-lakecountyil.opendata.arcgis.com/datasets/cd63911cc52841f38b289aeeeff0f300_1.geojson",
        'fid,zip,colorectal,lung_bronc,breast_can,prostate_c,urinary_sy,all_cancer,shape_length,shape_area,geometry',
    ),
]

# Main paths
HOMEDIR = os.path.expanduser('~')
file_location = os.path.dirname(os.path.realpath(__file__))
Expand Down Expand Up @@ -534,6 +546,20 @@ def test_json2csv():
assert obs_out == ['User,Country,Age', 'Alex,US,25']


@pytest.mark.parametrize("test_name, table_name, geojson_data_url, expected", geojson2csv_dataset)
def test_geojson2csv(test_name, table_name, geojson_data_url, expected):
    """Download a GeoJSON file, convert it to CSV and check the CSV header.

    The whole body is skipped when the ``CI`` environment variable is set
    to a non-empty value; in that case the test passes without doing any
    work.
    """
    if not os.environ.get("CI"):
        r = requests.get(geojson_data_url, allow_redirects=True)
        # Write the download through a context manager so the handle is
        # flushed and closed before the converter reads the file.
        with open(table_name, 'wb') as geojson_file:
            geojson_file.write(r.content)
        output_geojson = geojson2csv(table_name,
                                     "output_file_geojson.csv",
                                     encoding=test_engine.encoding)
        with open(output_geojson, 'r') as fh:
            # First whitespace-delimited token of the first line is the
            # comma-joined header row.
            header_val = fh.readline().split()[0].lower()
        # Remove both artifacts before asserting so a failed comparison
        # does not leave them behind.
        os.remove(output_geojson)
        os.remove(table_name)
        assert header_val == expected

def test_xml2csv():
"""Test xml2csv function.
Expand Down

0 comments on commit fb3f418

Please sign in to comment.