## Read CSV file and return list of shapes (as dicts)

Source: [csvreader.py](https://github.com/tombaker/csv2shex/blob/master/csv2shex/csvreader.py)

Shape elements
- shapeID (URI)
- shapeClosed (later)
- shapeLabel (later: annotation)
- start (later)

Triple constraint elements
- propertyID (URI)
- propertyLabel (later: annotation)
- mandatory
- repeatable
- valueNodeType
- valueDataType (URI)
- valueConstraint (URI - maybe)
- valueConstraintType
- valueShape (URI)
- note (later: annotation)

```
def csvreader(csvfile):
    """Return the CSV shapes read from *csvfile* as a list, one item per shape.

    The work is done in three stages: read the raw rows, correct them, then
    aggregate the corrected rows into shapes.
    """
    raw_rows = _get_csvrow_dicts_list(csvfile)
    corrected_rows = _get_corrected_csvrows_list(raw_rows)
    return _get_csvshape_dicts_list(corrected_rows)
```

In [28]:
# from csv2shex.csvreader import (
#     csvreader, 
#     _get_csvrow_dicts_list,
#     _get_corrected_csvrows_list, 
#     _get_csvshape_dicts_list, 
# )
from csv2shex.csvrow import CSVRow
from csv2shex.utils import pprint_df
import pandas as pd

In [29]:
from csv import DictReader
from pathlib import Path
def _get_csvrow_dicts_list(csvfile):
    """Read CSV file and return list of header:value dicts, one per row."""
    csv_dictreader = DictReader(Path(csvfile).open(newline="", encoding="utf-8-sig"))
    csvrow_dicts_list = list(csv_dictreader)
    if "propertyID" not in list(csvrow_dicts_list[0].keys()):
        raise CsvError("Valid DCAP CSV must have a 'propertyID' column.")
    return csvrow_dicts_list

In [34]:
# Read 'minimal.csv' into a list of header:value dicts, one per CSV row,
# then show the list as the cell's output.
csvrow_dicts_list = _get_csvrow_dicts_list('minimal.csv')
csvrow_dicts_list

[{'shapeID': ':book',
  'propertyID': 'dc:creator',
  'valueConstraint': '',
  'valueShape': ':author'},
 {'shapeID': '',
  'propertyID': 'dc:type',
  'valueConstraint': 'so:Book',
  'valueShape': ''},
 {'shapeID': ':author',
  'propertyID': 'foaf:name',
  'valueConstraint': '',
  'valueShape': ''}]

In [35]:
from csv2shex.config import CSV_MODEL
from dataclasses import asdict
import ruamel.yaml as yaml

# Parse CSV_MODEL once; the result is the default csv_model_dict argument of
# _get_corrected_csvrows_list, which reads its "shape_elements" and
# "tconstraint_elements" entries.
CSV_MODEL_DICT = yaml.safe_load(CSV_MODEL)

def _get_corrected_csvrows_list(csvrow_dicts_list=None, csv_model_dict=CSV_MODEL_DICT):
    """Turn list of raw CSV row dicts into list of corrected CSV row dicts.

    Each input row is copied into a CSVRow, which is then normalized,
    validated, and converted back to a plain dict with dataclasses.asdict.

    Shape IDs are resolved as follows:
    - A row with a shapeID but no propertyID only announces a shape: it sets
      the current shape for the rows that follow and produces no output row.
    - A row with a shapeID uses it and makes it the current shape.
    - A row without a shapeID inherits the current shape, or ":default" when
      no shape has been seen yet.

    Args:
        csvrow_dicts_list: Iterable of header:value dicts, one per CSV row.
            None is treated as an empty list.
        csv_model_dict: Mapping with "shape_elements" and
            "tconstraint_elements" lists naming the attributes to copy from
            each row onto the CSVRow.

    Returns:
        List of dicts, one per row that was not a shape-only row.
    """
    if csvrow_dicts_list is None:
        return []

    # Every model element except shapeID, which is resolved separately below.
    # Filtering (rather than list.remove) also tolerates a model without it.
    keys = [
        key
        for key in csv_model_dict["shape_elements"] + csv_model_dict["tconstraint_elements"]
        if key != "shapeID"
    ]

    corrected_csvrow_dicts_list = []
    # Shape that rows with a blank shapeID belong to. Tracking the most
    # recently *named* shape (instead of the most recently first-seen one)
    # keeps inheritance correct when a shape ID reappears later in the file.
    current_shapeid = None
    for row in csvrow_dicts_list:
        row_shapeid = row.get("shapeID")
        if row_shapeid:
            current_shapeid = row_shapeid
            if not row.get("propertyID"):
                # Shape-only row: nothing to emit.
                continue
        elif current_shapeid is None:
            current_shapeid = ":default"

        stat = CSVRow()
        stat.shapeID = current_shapeid
        for key in keys:
            if key in row:
                setattr(stat, key, row[key])

        stat.normalize()
        stat.validate()
        corrected_csvrow_dicts_list.append(asdict(stat))
    return corrected_csvrow_dicts_list

In [36]:
# Pass the raw row dicts through _get_corrected_csvrows_list and show the
# resulting list as the cell's output.
corrected_csvrows_list = _get_corrected_csvrows_list(csvrow_dicts_list)
corrected_csvrows_list

[{'shapeID': ':book',
  'shapeLabel': None,
  'shapeClosed': False,
  'propertyID': 'dc:creator',
  'propertyLabel': None,
  'mandatory': False,
  'repeatable': False,
  'valueNodeType': None,
  'valueDataType': None,
  'valueConstraint': '',
  'valueConstraintType': None,
  'valueShape': ':author',
  'note': None},
 {'shapeID': ':book',
  'shapeLabel': None,
  'shapeClosed': False,
  'propertyID': 'dc:type',
  'propertyLabel': None,
  'mandatory': False,
  'repeatable': False,
  'valueNodeType': None,
  'valueDataType': None,
  'valueConstraint': 'so:Book',
  'valueConstraintType': None,
  'valueShape': '',
  'note': None},
 {'shapeID': ':author',
  'shapeLabel': None,
  'shapeClosed': False,
  'propertyID': 'foaf:name',
  'propertyLabel': None,
  'mandatory': False,
  'repeatable': False,
  'valueNodeType': None,
  'valueDataType': None,
  'valueConstraint': '',
  'valueConstraintType': None,
  'valueShape': '',
  'note': None}]

In [24]:
from typing import List
from collections import defaultdict
from csv2shex.csvshape import CSVShape, CSVTripleConstraint

def _get_csvshape_dicts_list(csvrow_dicts_list, csv_model=CSV_MODEL) -> List[dict]:
    """Get list of CSV shapes from list of corrected csvrow dicts.

    Rows are grouped by their "shapeID". For each distinct shapeID one
    CSVShape is created, taking its shapeLabel from the first row seen for
    that shape; the first shape created is flagged with start=True. Every
    row contributes one dict of triple-constraint elements to the
    tripleconstraints_list of the shape named by that row's shapeID.

    Args:
        csvrow_dicts_list: Iterable of row dicts, each with "shapeID",
            "shapeLabel", and every key listed under "tconstraint_elements".
        csv_model: Model definition parsed with yaml.safe_load; its
            "tconstraint_elements" entry lists the keys to copy per row.

    Returns:
        List of CSVShape objects (not plain dicts, despite the name), in
        order of first appearance of each shapeID.
    """
    csv_model_dict = yaml.safe_load(csv_model)
    shapes_by_id = {}

    for csvrow_dict in csvrow_dicts_list:
        shape_id = csvrow_dict["shapeID"]
        if shape_id not in shapes_by_id:
            shap_obj = CSVShape()
            shap_obj.shapeID = shape_id
            shap_obj.shapeLabel = csvrow_dict["shapeLabel"]
            # Only the very first shape created is the start shape.
            shap_obj.start = not shapes_by_id
            shap_obj.tripleconstraints_list = []
            shapes_by_id[shape_id] = shap_obj

        # Look the shape up by the row's own shapeID, so rows of a shape that
        # reappears later are not attached to the most recently created one.
        shapes_by_id[shape_id].tripleconstraints_list.append(
            {key: csvrow_dict[key] for key in csv_model_dict["tconstraint_elements"]}
        )

    return list(shapes_by_id.values())

In [27]:
# Aggregate the corrected rows into shape objects, then show each object's
# attributes as a dict via vars().
csvshapes_list = _get_csvshape_dicts_list(corrected_csvrows_list)
[vars(x) for x in csvshapes_list]

[{'shapeID': 'http://example.org/book',
  'shapeLabel': None,
  'shapeClosed': None,
  'start': True,
  'tripleconstraints_list': [{'propertyID': 'http://purl.org/dc/terms/creator',
    'propertyLabel': None,
    'mandatory': True,
    'repeatable': False,
    'valueNodeType': '',
    'valueDataType': '',
    'valueConstraint': '',
    'valueConstraintType': '',
    'valueShape': 'http://example.org/author',
    'note': None},
   {'propertyID': 'http://purl.org/dc/terms/subject',
    'propertyLabel': None,
    'mandatory': False,
    'repeatable': False,
    'valueNodeType': '',
    'valueDataType': '',
    'valueConstraint': 'http://id.loc.gov/',
    'valueConstraintType': 'IriStem',
    'valueShape': '',
    'note': None}]},
 {'shapeID': 'http://example.org/author',
  'shapeLabel': None,
  'shapeClosed': None,
  'start': False,
  'tripleconstraints_list': [{'propertyID': 'http://xmlns.com/foaf/0.1/name',
    'propertyLabel': None,
    'mandatory': False,
    'repeatable': False,
    

In [5]:
from ShExJSG.SchemaWithContext import Schema
from pyjsg.jsglib import loader
from pyjsg.jsglib.jsg_array import JSGArray
from pyjsg.jsglib.loader import is_valid, StringIO
from pyshex.utils.schema_loader import SchemaLoader
from typing import cast, Union, List, Optional
from ShExJSG import ShExC, ShExJ
from ShExJSG.ShExJ import (
    EachOf,
    IRIREF,
    IriStem,
    NodeConstraint,
    Shape,
    TripleConstraint,
    shapeExpr,
)
from csv2shex.mkshex import generate_node_constraint, add_statement, shape_to_shex

In [6]:
# Load book_ap.csv into a pandas DataFrame, skipping blank lines.
df = pd.read_csv("book_ap.csv", skip_blank_lines=True)

In [7]:
# Display the DataFrame using the csv2shex.utils.pprint_df helper.
pprint_df(df)

Unnamed: 0,shapeID,propertyID,mandatory,repeatable,valueNodeType,valueDataType,valueConstraint,valueConstraintType,valueShape
0,http://example.org/book,http://purl.org/dc/terms/creator,Y,N,,,,,http://example.org/author
1,,http://purl.org/dc/terms/subject,,,,,http://id.loc.gov/,IriStem,
2,http://example.org/author,http://xmlns.com/foaf/0.1/name,,,literal,http://www.w3.org/2001/XMLSchema#string,,,
