
Commit

Merge pull request #594 from usc-isi-i2/dev
Dev
saggu committed Dec 17, 2021
2 parents f74001a + 2214592 commit 1ceda22
Showing 10 changed files with 1,716 additions and 16 deletions.
5 changes: 4 additions & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,7 @@ to keep features modular.
* Make commits of logical and atomic units.
* Do pull requests **only** against the `dev` branch. Please avoid working directly
on the `main` branch.
* Specify which issues your pull request addresses.
* Specify which issues your pull request addresses.
* Add/Update unit tests for the new changes.
* If your changes affect the documentation, update the corresponding file under the `docs` folder.
* Add a link to any new documentation file in the `mkdocs.yml` file; otherwise it will not show up at https://kgtk.readthedocs.io/en/latest/
1 change: 0 additions & 1 deletion docker/Dockerfile
Expand Up @@ -14,7 +14,6 @@ RUN apt-get update && apt-get install -y \

RUN apt-get install --reinstall build-essential -y

RUN pip install huggingface-hub==0.0.17

RUN git clone https://github.com/usc-isi-i2/kgtk/

Expand Down
2 changes: 0 additions & 2 deletions docker/dev/Dockerfile
Expand Up @@ -14,8 +14,6 @@ RUN apt-get update && apt-get install -y \

RUN apt-get install --reinstall build-essential -y

RUN pip install huggingface-hub==0.0.17

RUN git clone https://github.com/usc-isi-i2/kgtk/ --branch dev

RUN cd /kgtk && python setup.py install
Expand Down
8 changes: 4 additions & 4 deletions docs/analysis/community-detection.md
Expand Up @@ -105,7 +105,7 @@ optional arguments:
The following file will be used to illustrate some of the capabilities of `kgtk community-detection`.

```bash
head arnold_family.tsv
head examples/docs/community-detection-arnold.tsv
```

| node1 | label | node2 |
Expand Down Expand Up @@ -135,7 +135,7 @@ head arnold_family.tsv
Find the communities using blockmodel.

```bash
kgtk community-detection -i arnold_family.tsv --method blockmodel
kgtk community-detection -i examples/docs/community-detection-arnold.tsv --method blockmodel
```

|node1 |label|node2 |
Expand Down Expand Up @@ -218,7 +218,7 @@ kgtk community-detection -i arnold_family.tsv --method blockmodel
### nested model

```bash
kgtk community-detection -i arnold_family.tsv --method nested
kgtk community-detection -i examples/docs/community-detection-arnold.tsv --method nested
```

|node1 |label|node2 |
Expand Down Expand Up @@ -301,7 +301,7 @@ kgtk community-detection -i arnold_family.tsv --method nested
### MCMC model

```bash
kgtk community-detection -i arnold_family.tsv --method mcmc
kgtk community-detection -i examples/docs/community-detection-arnold.tsv --method mcmc
```
|node1 |label|node2 |node2;prob |
|---------------------------------------------|-----|---------|-------------------|
Expand Down
21 changes: 21 additions & 0 deletions docs/analysis/reachable_nodes.md
Expand Up @@ -239,6 +239,8 @@ usage: kgtk reachable-nodes [-h] [-i INPUT_FILE] [-o OUTPUT_FILE]
[--show-properties [True|False]]
[--breadth-first [True|False]]
[--depth-limit DEPTH_LIMIT]
[--show-distance [True|False]]
[--dist-col-name DIST_COL_NAME]
[-v [optional True|False]]
optional arguments:
Expand Down Expand Up @@ -321,6 +323,13 @@ optional arguments:
-v [optional True|False], --verbose [optional True|False]
Print additional progress messages (default=False).
--show-distance [True|False]
When True, and --breadth-first is also True, append a
column showing the shortest distance from the root to
each reachable node (default column name: distance).
--dist-col-name DIST_COL_NAME
The name of the distance column (default: distance).
```

## Examples
Expand Down Expand Up @@ -882,3 +891,15 @@ kgtk reachable-nodes -i examples/docs/reachable-nodes-depth-limit.tsv \
| node1 | label | node2 |
| -- | -- | -- |
| red_top | reachable | red_one |


```bash
kgtk reachable-nodes -i examples/docs/reachable-nodes-blocks.tsv --root metal-block \
--prop isa --breadth-first True --show-distance True --depth-limit 1 --undirected
```

| node1 | label | node2 | distance |
| -- | -- | -- | -- |
| metal-block | reachable | block | 1 |
| metal-block | reachable | gold-block | 1 |
| metal-block | reachable | silver-block | 1 |
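
For intuition, the shortest-distance semantics of `--breadth-first --show-distance` can be sketched with a plain BFS over an undirected edge list. This is an illustrative stand-in, not KGTK's actual implementation (which uses graph-tool); the edge data mirrors what the `reachable-nodes-blocks.tsv` example appears to contain.

```python
from collections import deque

# Toy undirected edge list in KGTK (node1, label, node2) form,
# assumed to resemble examples/docs/reachable-nodes-blocks.tsv.
edges = [
    ("metal-block", "isa", "block"),
    ("gold-block", "isa", "metal-block"),
    ("silver-block", "isa", "metal-block"),
]

def bfs_distances(edges, root, depth_limit=None):
    """Return {node: shortest distance from root}, honoring an optional depth limit."""
    adj = {}
    for n1, _label, n2 in edges:
        adj.setdefault(n1, set()).add(n2)
        adj.setdefault(n2, set()).add(n1)  # --undirected: traverse both ways
    dist = {root: 0}
    queue = deque([root])
    while queue:
        node = queue.popleft()
        if depth_limit is not None and dist[node] >= depth_limit:
            continue  # --depth-limit: do not expand beyond this depth
        for nbr in adj.get(node, ()):
            if nbr not in dist:
                dist[nbr] = dist[node] + 1
                queue.append(nbr)
    return dist

print(bfs_distances(edges, "metal-block", depth_limit=1))
```

With root `metal-block` and depth limit 1, every neighbor gets distance 1, matching the `distance` column in the table above.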
495 changes: 495 additions & 0 deletions examples/docs/community-detection-arnold.tsv

Large diffs are not rendered by default.

9 changes: 7 additions & 2 deletions kgtk/io/kgtkbase.py
Expand Up @@ -110,14 +110,19 @@ def check_column_names(cls,
who: str,
error_action: ValidationAction,
error_file: typing.TextIO = sys.stderr,
prohibit_whitespace_in_column_names=False,
prohibit_whitespace_in_column_names: bool = False,
supply_missing_column_names: bool = False
)->bool:
"""
Returns True if the column names are OK.
"""
complaints: typing.List[str] = [ ]
column_idx: int
column_name: str
for column_name in column_names:
for column_idx, column_name in enumerate(column_names):
if supply_missing_column_names and (column_name is None or len(column_name) == 0):
column_name = 'COL' + str(column_idx + 1)
column_names[column_idx] = column_name
gripes: typing.List[str] = cls.check_column_name(column_name, header_line, error_action, error_file,
prohibit_whitespace_in_column_names=prohibit_whitespace_in_column_names)
complaints.extend(gripes)
Expand Down
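
The missing-column-name repair added to `check_column_names` above can be sketched as a standalone helper. This is a simplified illustration of the diff's logic, not the KGTK API itself; note the condition needs parentheses around the `or` so that the `None` check short-circuits before `len()` is called.

```python
import typing

def supply_missing_column_names(column_names: typing.List[str]) -> typing.List[str]:
    """Replace empty or None header names in place with COL<n> (1-based position)."""
    for column_idx, column_name in enumerate(column_names):
        if column_name is None or len(column_name) == 0:
            column_names[column_idx] = 'COL' + str(column_idx + 1)
    return column_names

print(supply_missing_column_names(['node1', '', 'node2', '']))
# → ['node1', 'COL2', 'node2', 'COL4']
```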
129 changes: 123 additions & 6 deletions kgtk/io/kgtkreader.py
Expand Up @@ -72,6 +72,12 @@ class KgtkReaderOptions():
iterable_validator=attr.validators.instance_of(list))),
default=None)
no_input_header: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)
supply_missing_column_names: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)
number_of_columns: typing.Optional[int] = attr.ib(validator=attr.validators.optional(attr.validators.instance_of(int)), default=None)
require_column_names: typing.Optional[typing.List[str]] = attr.ib(validator=attr.validators.optional(attr.validators.deep_iterable(member_validator=attr.validators.instance_of(str),
iterable_validator=attr.validators.instance_of(list))),
default=None)
no_additional_columns: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)

# Data record sampling, pre-validation.
#
Expand Down Expand Up @@ -268,6 +274,32 @@ def d(default: typing.Any)->typing.Mapping[str, typing.Any]:
"has not been specified. (default=%(default)s)."),
type=optional_bool, nargs='?', const=True, **d(default=False))

hgroup.add_argument(prefix1 + "supply-missing-column-names",
dest=prefix2 + "supply_missing_column_names",
metavar="optional True|False",
help=h(prefix3 + "Supply column names that are missing. (default=%(default)s)."),
type=optional_bool, nargs='?', const=True, **d(default=False))

hgroup.add_argument(prefix1 + "number-of-columns",
dest=prefix2 + "number_of_columns",
metavar="COUNT",
help=h(prefix3 + "The expected number of columns in the header. (default=%(default)s)."),
type=int, **d(default=None))

hgroup.add_argument(prefix1 + "require-column-names",
dest=prefix2 + "require_column_names",
help=h(prefix3 + "The list of column names required in the input file. (default=None)."),
nargs='+')

hgroup.add_argument(prefix1 + "no-additional-columns",
dest=prefix2 + "no_additional_columns",
metavar="optional True|False",
help=h(prefix3 + "When True, do not allow any column names other than the required " +
"column names. When --require-column-names is not specified, then " +
"disallow columns other than [node1, label, node2, id] (or aliases) " +
"for an edge file, and [id] for a node file. (default=%(default)s)."),
type=optional_bool, nargs='?', const=True, **d(default=False))

hgroup.add_argument(prefix1 + "header-error-action",
dest=prefix2 + "header_error_action",
help=h(prefix3 + "The action to take when a header error is detected. Only ERROR or EXIT are supported (default=%(default)s)."),
Expand Down Expand Up @@ -413,6 +445,10 @@ def lookup(name: str, default):
fill_short_lines=lookup("fill_short_lines", False),
force_column_names=lookup("force_column_names", None),
no_input_header=lookup("no_input_header", False),
supply_missing_column_names=lookup("supply_missing_column_names", False),
number_of_columns=lookup("number_of_columns", None),
require_column_names=lookup("require_column_names", None),
no_additional_columns=lookup("no_additional_columns", False),
use_mgzip=lookup("use_mgzip", False),
mgzip_threads=lookup("mgzip_threads", cls.MGZIP_THREAD_COUNT_DEFAULT),
gzip_in_parallel=lookup("gzip_in_parallel", False),
Expand Down Expand Up @@ -456,6 +492,12 @@ def show(self, who: str="", out: typing.TextIO=sys.stderr):
if self.force_column_names is not None:
print("%sforce-column-names=%s" % (prefix, " ".join(self.force_column_names)), file=out)
print("%sno-input-header=%s" % (prefix, str(self.no_input_header)), file=out)
print("%ssupply-missing-column-names=%s" % (prefix, str(self.supply_missing_column_names)), file=out)
if self.number_of_columns is not None:
print("%snumber-of-columns=%d" % (prefix, self.number_of_columns), file=out)
if self.require_column_names is not None:
print("%srequire-column-names=%s" % (prefix, " ".join(self.require_column_names)), file=out)
print("%sno-additional-columns=%s" % (prefix, str(self.no_additional_columns)), file=out)
print("%serror-limit=%s" % (prefix, str(self.error_limit)), file=out)
print("%srepair-and-validate-lines=%s" % (prefix, str(self.repair_and_validate_lines)), file=out)
print("%srepair-and-validate-values=%s" % (prefix, str(self.repair_and_validate_values)), file=out)
Expand Down Expand Up @@ -662,10 +704,6 @@ def open(cls,
if verbose:
print("input format: %s" % input_format, file=error_file, flush=True)

# If an input_filter has been supplied, check it for validity:
if input_filter is not None:
cls._validate_input_filter(input_filter, column_names)

# Get the graph cache from the options or an envar:
graph_cache: typing.Optional[str] = options.graph_cache
if options.use_graph_cache_envar and graph_cache is None:
Expand All @@ -675,6 +713,7 @@ def open(cls,

source: ClosableIter[str]
header: str
column_name: str
column_names: typing.List[str]

# Decide whether or not to use the graph cache or the fast read path.
Expand Down Expand Up @@ -726,18 +765,28 @@ def open(cls,
# header back, too, for use in debugging and error messages.
(header, column_names) = cls._build_column_names(source, options, input_format, error_file=error_file, verbose=verbose)

if options.number_of_columns is not None and len(column_names) != options.number_of_columns:
cls._yelp("Expected %d columns, got %d in the header" % (options.number_of_columns, len(column_names)),
header_line=header,
who=who,
error_action=options.header_error_action,
error_file=error_file)

# If there's an implied label, add the column to the end. If a label column
# already exists, then later we'll detect a duplicate column name.
if options.implied_label is not None:
column_names.append(cls.LABEL)


# Check for unsafe column names.
cls.check_column_names(column_names,
header_line=header,
who=who,
error_action=options.unsafe_column_name_action,
error_file=error_file,
prohibit_whitespace_in_column_names=options.prohibit_whitespace_in_column_names)
prohibit_whitespace_in_column_names=options.prohibit_whitespace_in_column_names,
supply_missing_column_names=options.supply_missing_column_names)


# Build a map from column name to column index.
column_name_map: typing.Mapping[str, int] = cls.build_column_name_map(column_names,
Expand All @@ -746,6 +795,19 @@ def open(cls,
error_action=options.header_error_action,
error_file=error_file)

# If there is a list of required columns names, are they all present?
if options.require_column_names is not None and len(options.require_column_names) > 0:
missing_column_names: typing.List[str] = list()
for column_name in options.require_column_names:
if column_name not in column_name_map:
missing_column_names.append(column_name)
if len(missing_column_names) > 0:
cls._yelp("The following required columns were missing: %s" % repr(missing_column_names),
header_line=header,
who=who,
error_action=options.header_error_action,
error_file=error_file)

# Should we automatically determine if this is an edge file or a node file?
if mode is None:
mode = options.mode
Expand Down Expand Up @@ -778,6 +840,10 @@ def open(cls,
elif mode is KgtkReaderMode.NONE:
pass

if verbose:
print("KgtkReader: is_edge_file=%s is_node_file=%s" % (repr(is_edge_file), repr(is_node_file)),
file=error_file, flush=True)

# Get the indices of the special columns.
node1_column_idx: int
label_column_idx: int
Expand All @@ -798,7 +864,52 @@ def open(cls,
print("KgtkReader: Special columns: node1=%d label=%d node2=%d id=%d" % (node1_column_idx,
label_column_idx,
node2_column_idx,
id_column_idx), file=error_file, flush=True)
id_column_idx),
file=error_file, flush=True)

# Are additional columns allowed?
if options.no_additional_columns:
unexpected_column_names: typing.List[str] = list()
if options.require_column_names is not None and len(options.require_column_names) > 0:
if verbose:
print("KgtkReader: disallowing additional columns based on required column names",
file=error_file, flush=True)
for column_name in column_names:
if column_name not in options.require_column_names:
unexpected_column_names.append(column_name)

elif is_edge_file:
if verbose:
print("KgtkReader: disallowing additional columns: edge file", file=error_file, flush=True)
for column_name in column_names:
idx: int = column_name_map[column_name]
if idx not in (node1_column_idx,
label_column_idx,
node2_column_idx,
id_column_idx):
unexpected_column_names.append(column_name)

elif is_node_file:
if verbose:
print("KgtkReader: disallowing additional columns: node file", file=error_file, flush=True)
for column_name in column_names:
if column_name_map[column_name] != id_column_idx:
unexpected_column_names.append(column_name)
else:
if verbose:
print("KgtkReader: disallowing additional columns: neither edge nor node file", file=error_file, flush=True)

if len(unexpected_column_names) > 0:
cls._yelp("The following additional columns are unexpected: %s" % repr(unexpected_column_names),
header_line=header,
who=who,
error_action=options.header_error_action,
error_file=error_file)


# If an input_filter has been supplied, check it for validity:
if input_filter is not None:
cls._validate_input_filter(input_filter, column_names)

# Select the best implementation class.
if use_graph_cache and gca is not None:
Expand Down Expand Up @@ -1003,6 +1114,12 @@ def _build_column_names(cls,
"""
column_names: typing.List[str]
if options.force_column_names is None:
if options.no_input_header:
if options.number_of_columns is not None:
column_names = [ "" ] * options.number_of_columns
return options.column_separator.join(column_names), column_names
else:
raise ValueError("Cannot read a file with no header and an unknown number of columns.")
# Read the column names from the first line, stripping end-of-line characters.
#
# TODO: if the read fails, throw a more useful exception with the line number.
Expand Down
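
Taken together, the new `--require-column-names` and `--no-additional-columns` checks in `KgtkReader.open` amount to two set-membership tests over the parsed header. The sketch below is a hedged, self-contained illustration of that semantics; the function name and return convention are hypothetical, not the actual KGTK interface.

```python
import typing

def check_header(column_names: typing.List[str],
                 require_column_names: typing.Optional[typing.List[str]] = None,
                 no_additional_columns: bool = False) -> typing.List[str]:
    """Return a list of complaints; an empty list means the header passed."""
    complaints: typing.List[str] = []
    required = require_column_names or []
    # --require-column-names: every required name must appear in the header.
    missing = [c for c in required if c not in column_names]
    if missing:
        complaints.append("missing required columns: %s" % repr(missing))
    # --no-additional-columns: with a required list, nothing beyond it is allowed.
    # (KGTK also handles the edge-file/node-file default cases shown in the diff.)
    if no_additional_columns and required:
        unexpected = [c for c in column_names if c not in required]
        if unexpected:
            complaints.append("unexpected additional columns: %s" % repr(unexpected))
    return complaints

print(check_header(["node1", "label", "node2", "weight"],
                   require_column_names=["node1", "label", "node2"],
                   no_additional_columns=True))
```

In the reader itself, a non-empty complaint list is routed through `_yelp` with the configured `header_error_action`, so the caller decides whether a bad header warns, errors, or exits.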
1 change: 1 addition & 0 deletions mkdocs.yml
Expand Up @@ -62,6 +62,7 @@ nav:
- 'validate': 'curate/validate.md'
- 'validate-properties': 'curate/validate_properties.md'
- 'Analysis commands':
- 'community-detection': 'analysis/community-detection.md'
- 'connected-components': 'analysis/connected_components.md'
- 'graph-embeddings': 'analysis/graph_embeddings.md'
- 'graph-statistics': 'analysis/graph_statistics.md'
Expand Down