MRG: #169 from vocalpy/rename-annot-notated-path

ENH: Rename `Annotation.audio_path` -> `Annotation.notated_path`
vocalpy · May 14, 2022 · d80715b · d80715b
2 parents de1bfba + 474da86
commit d80715b
Show file tree

Hide file tree

Showing 20 changed files with 83 additions and 74 deletions.
diff --git a/src/crowsetta/annotation.py b/src/crowsetta/annotation.py
@@ -17,8 +17,10 @@ class Annotation:
     ----------
     annot_path : str, pathlib.Path
         path to file from which annotations were loaded
-    audio_path : str, pathlib.Path
-        path to audio file that ``annot_path`` annotates.
+    notated_path : str, pathlib.Path
+        path to file that ``annot_path`` annotates.
+        E.g., an audio file, or an array file
+        that contains a spectrogram generated from audio.
         Optional, default is None.
     seq : crowsetta.Sequence
         a sequence of annotated segments,
@@ -38,7 +40,7 @@ class Annotation:
     """
     def __init__(self,
                  annot_path: PathLike,
-                 audio_path: Optional[PathLike] = None,
+                 notated_path: Optional[PathLike] = None,
                  seq: Optional[Sequence] = None,
                  bboxes: Optional[List[BBox]] = None):
         if seq is None and bboxes is None:
@@ -73,13 +75,13 @@ def __init__(self,
             self.bboxes = bboxes
 
         self.annot_path = Path(annot_path)
-        if audio_path:
-            self.audio_path = Path(audio_path)
+        if notated_path:
+            self.notated_path = Path(notated_path)
         else:
-            self.audio_path = audio_path
+            self.notated_path = notated_path
 
     def __repr__(self):
-        repr_ = f'Annotation(annot_path={repr(self.annot_path)}, audio_path={repr(self.audio_path)}, '
+        repr_ = f'Annotation(annot_path={repr(self.annot_path)}, notated_path={repr(self.notated_path)}, '
         if hasattr(self, 'seq'):
             repr_ += f'seq={self.seq})'
         elif hasattr(self, 'bboxes'):
@@ -88,7 +90,7 @@ def __repr__(self):
 
     def __eq__(self, other):
         is_annot_and_audio_eq = (self.annot_path == other.annot_path and
-                                 self.audio_path == other.audio_path)
+                                 self.notated_path == other.notated_path)
         if hasattr(self, 'seq') and hasattr(other, 'seq'):
             return is_annot_and_audio_eq and self.seq == other.seq
         elif hasattr(self, 'bboxes') and hasattr(other, 'bboxes'):

diff --git a/src/crowsetta/formats/bbox/raven.py b/src/crowsetta/formats/bbox/raven.py
@@ -147,7 +147,7 @@ def to_annot(self) -> crowsetta.Annotation:
         """
         bboxes = self.to_bbox()
         return crowsetta.Annotation(annot_path=self.annot_path,
-                                    audio_path=self.audio_path,
+                                    notated_path=self.audio_path,
                                     bboxes=bboxes)
 
     def to_file(self,

diff --git a/src/crowsetta/formats/seq/birdsongrec.py b/src/crowsetta/formats/seq/birdsongrec.py
@@ -288,6 +288,6 @@ def to_annot(self,
         annot_list = []
         for seq, wav_filename in zip(seqs, wav_filenames):
             annot_list.append(
-                crowsetta.Annotation(seq=seq, annot_path=self.xml_path, audio_path=wav_filename)
+                crowsetta.Annotation(seq=seq, annot_path=self.xml_path, notated_path=wav_filename)
             )
         return annot_list
diff --git a/src/crowsetta/formats/seq/generic.py b/src/crowsetta/formats/seq/generic.py
@@ -47,7 +47,7 @@ class GenericSeqSchema(pandera.SchemaModel):
     onset_ind: Optional[Series[int]] = pandera.Field()
     offset_ind: Optional[Series[int]] = pandera.Field()
 
-    audio_path: Series[str] = pandera.Field()
+    notated_path: Series[str] = pandera.Field()
     annot_path: Series[str] = pandera.Field()
     sequence: Series[int] = pandera.Field()
     annotation: Series[int] = pandera.Field()
@@ -144,20 +144,20 @@ def annot2csv(annot: Union[crowsetta.Annotation, List[crowsetta.Annotation]],
                     if not(val is None and any([key.startswith(prefix) for prefix in ('onset', 'offset')]))
                 })  # OrderedDict is default; being extra explicit here
                 annot_path = annot_.annot_path
-                audio_path = annot_.audio_path
+                notated_path = annot_.notated_path
                 if abspath:
                     annot_path = os.path.abspath(annot_path)
-                    if audio_path is not None:
-                        audio_path = os.path.abspath(audio_path)
+                    if notated_path is not None:
+                        notated_path = os.path.abspath(notated_path)
                 elif basename:
                     annot_path = os.path.basename(annot_path)
-                    if audio_path is not None:
-                        audio_path = os.path.basename(audio_path)
-                # need to put in audio_path before annot_path
-                if audio_path is not None:
-                    row['audio_path'] = audio_path
+                    if notated_path is not None:
+                        notated_path = os.path.basename(notated_path)
+                # need to put in notated_path before annot_path
+                if notated_path is not None:
+                    row['notated_path'] = notated_path
                 else:
-                    row['audio_path'] = 'None'
+                    row['notated_path'] = 'None'
                 row['annot_path'] = annot_path
                 # we use 'sequence' and 'annotation' fields when we are
                 # loading back into Annotations
@@ -203,14 +203,14 @@ def csv2annot(csv_path: PathLike) -> List[crowsetta.Annotation]:
                 f"\n{annot_path}"
             )
         annot_path = annot_path[0]
-        # 2. audio_path
-        audio_path = df_annot.audio_path.unique()
-        if len(audio_path) > 1:
+        # 2. notated_path
+        notated_path = df_annot.notated_path.unique()
+        if len(notated_path) > 1:
             raise ValueError(
-                f"found multiple values for 'audio_path' for annotation #{annotation_ind}:"
-                f"\n{audio_path}"
+                f"found multiple values for 'notated_path' for annotation #{annotation_ind}:"
+                f"\n{notated_path}"
             )
-        audio_path = audio_path[0]
+        notated_path = notated_path[0]
         # 3. Sequence
         seq_uniq = df_annot.sequence.unique()
         assert len(seq_uniq) > 0
@@ -240,7 +240,7 @@ def csv2annot(csv_path: PathLike) -> List[crowsetta.Annotation]:
         )
         annot = crowsetta.Annotation(
             annot_path=annot_path,
-            audio_path=audio_path,
+            notated_path=notated_path,
             seq=seq
         )
         annot_list.append(annot)

diff --git a/src/crowsetta/formats/seq/notmat.py b/src/crowsetta/formats/seq/notmat.py
@@ -150,7 +150,7 @@ def to_annot(self,
         seq = self.to_seq(round_times=round_times, decimals=decimals)
 
         return crowsetta.Annotation(annot_path=self.notmat_path,
-                                    audio_path=self.audio_path,
+                                    notated_path=self.audio_path,
                                     seq=seq)
 
     def to_file(self,

diff --git a/src/crowsetta/formats/seq/simple.py b/src/crowsetta/formats/seq/simple.py
@@ -87,8 +87,11 @@ class SimpleSeq:
         or a single phonetic transcription code.
     annot_path : str, pathlib.Path
         Path to file from which annotations were loaded.
-    audio_path : str. pathlib.Path
-        Path to audio file that the ``annot_path`` annotates.
+    notated_path : str, pathlib.Path
+        path to file that ``annot_path`` annotates.
+        E.g., an audio file, or an array file
+        that contains a spectrogram generated from audio.
+        Optional, default is None.
     """
     name: ClassVar[str] = 'simple-seq'
     ext: ClassVar[str] = ('.csv', '.txt')
@@ -97,13 +100,13 @@ class SimpleSeq:
     offsets_s: np.ndarray = attr.field(eq=attr.cmp_using(eq=np.array_equal))
     labels: np.ndarray = attr.field(eq=attr.cmp_using(eq=np.array_equal))
     annot_path: pathlib.Path
-    audio_path: Optional[pathlib.Path] = attr.field(default=None,
-                                                    converter=attr.converters.optional(pathlib.Path))
+    notated_path: Optional[pathlib.Path] = attr.field(default=None,
+                                                      converter=attr.converters.optional(pathlib.Path))
 
     @classmethod
     def from_file(cls,
                   annot_path: PathLike,
-                  audio_path: Optional[PathLike] = None,
+                  notated_path: Optional[PathLike] = None,
                   columns_map: Optional[Mapping] = None,
                   read_csv_kwargs: Optional[Mapping] = None
                   ) -> 'Self':
@@ -114,14 +117,18 @@ def from_file(cls,
         annot_path : str, pathlib.Path
             Path to an annotation file,
             with one of the extensions {'.csv', '.txt'}.
-        audio_path : str, pathlib.Path
-            Optional, path to audio file
-            that ``annot_path`` annotates.
+        notated_path : str, pathlib.Path
+            path to file that ``annot_path`` annotates.
+            E.g., an audio file, or an array file
+            that contains a spectrogram generated from audio.
+            Optional, default is None.
         columns_map : dict-like
             Maps column names in header of ``annot_path``
             to the standardized names
             used by this format.
-            E.g., ``{'begin_time': 'onset_s', 'end_time': 'offset_s', 'text': 'label'}``
+            E.g., ``{'begin_time': 'onset_s', 'end_time': 'offset_s', 'text': 'label'}``.
+            Optional, default is None, in which case the
+            columns are assumed to have the standardized names.
         read_csv_kwargs : dict
             keyword arguments passed to
             ``pandas.read_csv``. Default is None,
@@ -147,7 +154,7 @@ def from_file(cls,
             offsets_s=df['offset_s'].values,
             labels=df['label'].values,
             annot_path=annot_path,
-            audio_path=audio_path,
+            notated_path=notated_path,
         )
 
     def to_seq(self,
@@ -217,7 +224,7 @@ def to_annot(self,
         the result should be the same on Windows and Linux.
         """
         seq = self.to_seq(round_times, decimals)
-        return crowsetta.Annotation(annot_path=self.annot_path, audio_path=self.audio_path, seq=seq)
+        return crowsetta.Annotation(annot_path=self.annot_path, notated_path=self.notated_path, seq=seq)
 
     def to_file(self,
                 annot_path: PathLike,

diff --git a/src/crowsetta/formats/seq/textgrid.py b/src/crowsetta/formats/seq/textgrid.py
@@ -189,5 +189,5 @@ def to_annot(self,
                           decimals=decimals)
 
         return crowsetta.Annotation(annot_path=self.textgrid_path,
-                                    audio_path=self.audio_path,
+                                    notated_path=self.audio_path,
                                     seq=seq)
diff --git a/src/crowsetta/formats/seq/timit.py b/src/crowsetta/formats/seq/timit.py
@@ -221,7 +221,7 @@ def to_annot(self,
         the result should be the same on Windows and Linux.
         """
         phn_seq = self.to_seq(round_times, decimals, samplerate)
-        return crowsetta.Annotation(annot_path=self.transcript_path, audio_path=self.audio_path, seq=phn_seq)
+        return crowsetta.Annotation(annot_path=self.transcript_path, notated_path=self.audio_path, seq=phn_seq)
 
     def to_file(self,
                 transcript_path: PathLike) -> None:

diff --git a/src/crowsetta/formats/seq/yarden.py b/src/crowsetta/formats/seq/yarden.py
@@ -213,7 +213,7 @@ def to_annot(self,
         for audio_path, seq in zip(self.audio_paths, seqs):
             annots.append(
                 crowsetta.Annotation(annot_path=self.annot_path,
-                                     audio_path=audio_path,
+                                     notated_path=audio_path,
                                      seq=seq)
             )
         return annots
diff --git a/tests/data_for_tests/csv/birdsongrec_Bird0_Annotation.csv b/tests/data_for_tests/csv/birdsongrec_Bird0_Annotation.csv
@@ -1,4 +1,4 @@
-label,onset_s,offset_s,onset_ind,offset_ind,audio_path,annot_path,sequence,annotation
+label,onset_s,offset_s,onset_ind,offset_ind,notated_path,annot_path,sequence,annotation
 0,1.07,1.154,34240,36928,0.wav,Annotation.xml,0,0
 0,1.258,1.345,40256,43040,0.wav,Annotation.xml,0,0
 0,1.467,1.555,46944,49760,0.wav,Annotation.xml,0,0

diff --git a/tests/data_for_tests/csv/example_user_annotation.csv b/tests/data_for_tests/csv/example_user_annotation.csv
@@ -1,4 +1,4 @@
-label,onset_s,offset_s,audio_path,annot_path,sequence,annotation
+label,onset_s,offset_s,notated_path,annot_path,sequence,annotation
 1,0.0029761904761904,0.141504329004329,lbr3009_0005_2017_04_27_06_14_46.wav,../test_data/example_user_format/bird1_annotation.mat,0,0
 1,0.279125,0.504625,lbr3009_0005_2017_04_27_06_14_46.wav,../test_data/example_user_format/bird1_annotation.mat,0,0
 5,0.5556472915365209,0.5962916666666667,lbr3009_0005_2017_04_27_06_14_46.wav,../test_data/example_user_format/bird1_annotation.mat,0,0

diff --git a/tests/data_for_tests/csv/invalid_fields_in_header.csv b/tests/data_for_tests/csv/invalid_fields_in_header.csv
@@ -1,4 +1,4 @@
-label,onset_s,offset_s,audio_path,annot_path,sequence,annotation,invalid
+label,onset_s,offset_s,notated_path,annot_path,sequence,annotation,invalid
 i,1.278,1.351,gy6or6_baseline_230312_0808.138.cbin,gy6or6_baseline_230312_0808.138.cbin.not.mat,0,0,i
 i,1.452,1.536,gy6or6_baseline_230312_0808.138.cbin,gy6or6_baseline_230312_0808.138.cbin.not.mat,0,0,i
 i,1.605,1.712,gy6or6_baseline_230312_0808.138.cbin,gy6or6_baseline_230312_0808.138.cbin.not.mat,0,0,i

diff --git a/tests/data_for_tests/csv/missing_fields_in_header.csv b/tests/data_for_tests/csv/missing_fields_in_header.csv
@@ -1,4 +1,4 @@
-onset_s,offset_s,audio_path,annot_path,sequence,annotation
+onset_s,offset_s,notated_path,annot_path,sequence,annotation
 1.278,1.351,gy6or6_baseline_230312_0808.138.cbin,gy6or6_baseline_230312_0808.138.cbin.not.mat,0,0
 1.452,1.536,gy6or6_baseline_230312_0808.138.cbin,gy6or6_baseline_230312_0808.138.cbin.not.mat,0,0
 1.605,1.712,gy6or6_baseline_230312_0808.138.cbin,gy6or6_baseline_230312_0808.138.cbin.not.mat,0,0

diff --git a/tests/data_for_tests/csv/no_onset_or_offset_column.csv b/tests/data_for_tests/csv/no_onset_or_offset_column.csv
@@ -1,4 +1,4 @@
-label,audio_path,annot_path,sequence,annotation
+label,notated_path,annot_path,sequence,annotation
 i,gy6or6_baseline_230312_0808.138.cbin,gy6or6_baseline_230312_0808.138.cbin.not.mat,0,0
 i,gy6or6_baseline_230312_0808.138.cbin,gy6or6_baseline_230312_0808.138.cbin.not.mat,0,0
 i,gy6or6_baseline_230312_0808.138.cbin,gy6or6_baseline_230312_0808.138.cbin.not.mat,0,0

diff --git a/tests/data_for_tests/csv/notmat_gy6or6_032312.csv b/tests/data_for_tests/csv/notmat_gy6or6_032312.csv
@@ -1,4 +1,4 @@
-label,onset_s,offset_s,audio_path,annot_path,sequence,annotation
+label,onset_s,offset_s,notated_path,annot_path,sequence,annotation
 i,1.278,1.351,gy6or6_baseline_230312_0808.138.cbin,gy6or6_baseline_230312_0808.138.cbin.not.mat,0,0
 i,1.452,1.536,gy6or6_baseline_230312_0808.138.cbin,gy6or6_baseline_230312_0808.138.cbin.not.mat,0,0
 i,1.605,1.712,gy6or6_baseline_230312_0808.138.cbin,gy6or6_baseline_230312_0808.138.cbin.not.mat,0,0

diff --git a/tests/data_for_tests/csv/onset_ind_column_but_no_offset_ind_column.csv b/tests/data_for_tests/csv/onset_ind_column_but_no_offset_ind_column.csv
@@ -1,4 +1,4 @@
-label,onset_s,offset_s,onset_ind,audio_path,annot_path,sequence,annotation
+label,onset_s,offset_s,onset_ind,notated_path,annot_path,sequence,annotation
 h#,0.0,0.488,0,sa1.wav,sa1.phn,0,0
 sh,0.488,0.594,7812,sa1.wav,sa1.phn,0,0
 iy,0.594,0.663,9507,sa1.wav,sa1.phn,0,0

diff --git a/tests/data_for_tests/csv/onset_s_column_but_no_offset_s_column.csv b/tests/data_for_tests/csv/onset_s_column_but_no_offset_s_column.csv
@@ -1,4 +1,4 @@
-label,onset_s,audio_path,annot_path,sequence,annotation
+label,onset_s,notated_path,annot_path,sequence,annotation
 i,1.278,gy6or6_baseline_230312_0808.138.cbin,gy6or6_baseline_230312_0808.138.cbin.not.mat,0,0
 i,1.452,gy6or6_baseline_230312_0808.138.cbin,gy6or6_baseline_230312_0808.138.cbin.not.mat,0,0
 i,1.605,gy6or6_baseline_230312_0808.138.cbin,gy6or6_baseline_230312_0808.138.cbin.not.mat,0,0

diff --git a/tests/data_for_tests/csv/timit-dr1-fvmh0-phn.csv b/tests/data_for_tests/csv/timit-dr1-fvmh0-phn.csv
@@ -1,4 +1,4 @@
-label,onset_s,offset_s,onset_ind,offset_ind,audio_path,annot_path,sequence,annotation
+label,onset_s,offset_s,onset_ind,offset_ind,notated_path,annot_path,sequence,annotation
 h#,0.0,0.488,0,7812,sa1.wav,sa1.phn,0,0
 sh,0.488,0.594,7812,9507,sa1.wav,sa1.phn,0,0
 iy,0.594,0.663,9507,10610,sa1.wav,sa1.phn,0,0

diff --git a/tests/scripts/remake_test_csv.py b/tests/scripts/remake_test_csv.py
@@ -14,43 +14,43 @@
 def remake_notmat_as_generic_seq_csv():
     cbin_dir = TEST_DATA.joinpath('cbins/gy6or6/032312/')
     notmat_paths = sorted(cbin_dir.glob('*.not.mat'))
-    annots = [crowsetta.formats.NotMat.from_file(notmat_path).to_annot()
+    annots = [crowsetta.formats.seq.NotMat.from_file(notmat_path).to_annot()
               for notmat_path in notmat_paths]
-    notmat_generic_seq = crowsetta.formats.GenericSeq(annots=annots)
+    notmat_generic_seq = crowsetta.formats.seq.GenericSeq(annots=annots)
     csv_path = TEST_DATA / 'csv' / 'notmat_gy6or6_032312.csv'
     print(
         f'saving csv: {csv_path}'
     )
-    notmat_generic_seq.to_csv(csv_path=csv_path, basename=True)
+    notmat_generic_seq.to_file(csv_path=csv_path, basename=True)
 
 
 def remake_birdsongrec_as_generic_seq_csv():
     birdsongrec_dir = TEST_DATA / 'birdsongrec' / 'Bird0'
     birdsongrec_xml_file = birdsongrec_dir / 'Annotation.xml'
     birdsongrec_wavpath = birdsongrec_dir / 'Wave'
-    birdsongrec = crowsetta.formats.BirdsongRec.from_file(xml_path=birdsongrec_xml_file,
-                                                          wav_path=birdsongrec_wavpath,
-                                                          concat_seqs_into_songs=True)
+    birdsongrec = crowsetta.formats.seq.BirdsongRec.from_file(xml_path=birdsongrec_xml_file,
+                                                              wav_path=birdsongrec_wavpath,
+                                                              concat_seqs_into_songs=True)
     annots = birdsongrec.to_annot()
-    generic_seq = crowsetta.formats.GenericSeq(annots=annots)
+    generic_seq = crowsetta.formats.seq.GenericSeq(annots=annots)
     csv_path = TEST_DATA / 'csv' / 'birdsongrec_Bird0_Annotation.csv'
     print(
         f'saving csv: {csv_path}'
     )
-    generic_seq.to_csv(csv_path=csv_path, basename=True)
+    generic_seq.to_file(csv_path=csv_path, basename=True)
 
 
 def remake_timit_phn_as_generic_seq_csv():
     timit_kaggle_dir = TEST_DATA / 'timit_kaggle' / 'dr1-fvmh0'
     phn_paths = sorted(timit_kaggle_dir.glob('*.phn'))
-    annots = [crowsetta.formats.Timit.from_file(phn_path).to_annot()
+    annots = [crowsetta.formats.seq.Timit.from_file(phn_path).to_annot()
               for phn_path in phn_paths]
-    timit_generic_seq = crowsetta.formats.GenericSeq(annots=annots)
+    timit_generic_seq = crowsetta.formats.seq.GenericSeq(annots=annots)
     csv_path = TEST_DATA / 'csv' / 'timit-dr1-fvmh0-phn.csv'
     print(
         f'saving csv: {csv_path}'
     )
-    timit_generic_seq.to_csv(csv_path=csv_path, basename=True)
+    timit_generic_seq.to_file(csv_path=csv_path, basename=True)
 
 
 def remake_invalid_fields_in_header_csv(source_csv_path):