
Commit

automatic onset/duration/order management
qmac committed Feb 16, 2018
1 parent ece187e commit 21f085e
Showing 12 changed files with 57 additions and 64 deletions.
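The recurring change across these files is that converters, filters, and stimuli stop copying onset, duration, and order onto their outputs by hand; the base Transformer (see pliers/transformers/base.py below) now fills in any of those fields that are still None after _convert()/_filter() returns. A minimal before/after sketch of the pattern, using a TextStim-returning transformer as the example:

    # before: timing forwarded explicitly by every converter/filter
    return TextStim(text=text, onset=stim.onset, duration=stim.duration, order=stim.order)

    # after: the output is created bare; Transformer.transform() propagates
    # onset/duration/order from the input stim wherever they are still None
    return TextStim(text=text)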
2 changes: 1 addition & 1 deletion pliers/converters/api.py
@@ -52,7 +52,7 @@ def _convert(self, audio):

text = getattr(self.recognizer, self.recognize_method)(clip, self.api_key)

return ComplexTextStim(text=text, onset=audio.onset)
return ComplexTextStim(text=text)


class WitTranscriptionConverter(SpeechRecognitionAPIConverter):
18 changes: 6 additions & 12 deletions pliers/converters/google.py
@@ -89,7 +89,7 @@ def _convert(self, stim):
onset=offset + onset,
duration=duration))

return ComplexTextStim(elements=words, onset=stim.onset)
return ComplexTextStim(elements=words)


class GoogleVisionAPITextConverter(GoogleVisionAPITransformer,
@@ -121,30 +121,24 @@ def _convert(self, stims):
responses = self._query_api(request)
texts = []

for i, response in enumerate(responses):
stim = stims[i]
for response in responses:
if response and self.response_object in response:
annotations = response[self.response_object]
# Combine the annotations
if self.handle_annotations == 'first':
text = annotations[0]['description']
texts.append(TextStim(text=text, onset=stim.onset,
duration=stim.duration))
texts.append(TextStim(text=text))
elif self.handle_annotations == 'concatenate':
text = ''
for annotation in annotations:
text = ' '.join([text, annotation['description']])
texts.append(TextStim(text=text, onset=stim.onset,
duration=stim.duration))
texts.append(TextStim(text=text))
elif self.handle_annotations == 'list':
for annotation in annotations:
texts.append(TextStim(text=annotation['description'],
onset=stim.onset,
duration=stim.duration))
texts.append(TextStim(text=annotation['description']))
elif 'error' in response:
raise Exception(response['error']['message'])
else:
texts.append(TextStim(text='', onset=stim.onset,
duration=stim.duration))
texts.append(TextStim(text=''))

return texts
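For reference, the three handle_annotations modes differ only in how a response's annotations are collapsed into TextStims; timing is attached afterwards by the base Transformer. A rough sketch of the outputs, assuming a single response whose annotations carry the descriptions 'STOP' and 'AHEAD':

    # handle_annotations='first'        -> [TextStim(text='STOP')]
    # handle_annotations='concatenate'  -> [TextStim(text=' STOP AHEAD')]  # join starts from ''
    # handle_annotations='list'         -> [TextStim(text='STOP'), TextStim(text='AHEAD')]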
3 changes: 1 addition & 2 deletions pliers/converters/image.py
@@ -26,5 +26,4 @@ class TesseractConverter(ImageToTextConverter):
def _convert(self, stim):
verify_dependencies(['pytesseract'])
text = pytesseract.image_to_string(Image.fromarray(stim.data))
return TextStim(text=text, onset=stim.onset, duration=stim.duration,
order=stim.order)
return TextStim(text=text)
2 changes: 1 addition & 1 deletion pliers/converters/microsoft.py
@@ -29,4 +29,4 @@ def _convert(self, stim):
lines.append(' '.join([w['text'] for w in l['words']]))

text = '\n'.join(lines)
return TextStim(text=text, onset=stim.onset, duration=stim.duration)
return TextStim(text=text)
3 changes: 1 addition & 2 deletions pliers/converters/video.py
@@ -16,5 +16,4 @@ class VideoToAudioConverter(Converter):
def _convert(self, video):
fps = AudioStim.get_sampling_rate(video.filename)
return AudioStim(sampling_rate=fps,
clip=video.clip.audio,
onset=video.onset)
clip=video.clip.audio)
33 changes: 14 additions & 19 deletions pliers/extractors/base.py
@@ -66,25 +66,16 @@ def __init__(self, data, stim, extractor, features=None, onsets=None,
self.features = features
self.raw = raw
self._history = None
self.onset = onsets
self.duration = durations
self.order = orders

# Eventually, the goal is to make raw mandatory, and always
# generate the .data property via calls to to_array() or to_df()
# implemented in the Extractor. But to avoid breaking the API without
# warning, we provide a backward-compatible version for the time being.
self.data = np.array(data)

if onsets is None:
onsets = stim.onset
self.onsets = onsets if onsets is not None else np.nan

if durations is None:
durations = stim.duration
self.durations = durations if durations is not None else np.nan

if orders is None:
orders = stim.order
self.orders = orders if orders is not None else np.nan

def to_df(self, timing=True, metadata=False, format='wide',
extractor_name=False, object_id=True, **to_df_kwargs):
''' Convert current instance to a pandas DataFrame.
@@ -130,6 +121,10 @@ def to_df(self, timing=True, metadata=False, format='wide',
for i in range(self.data.shape[1])]
df = pd.DataFrame(self.data, columns=features)

onsets = np.nan if self.onset is None else self.onset
durations = np.nan if self.duration is None else self.duration
orders = np.nan if self.order is None else self.order

index_cols = []

# Generally we leave it to Extractors to properly track the number of
@@ -139,8 +134,8 @@ def to_df(self, timing=True, metadata=False, format='wide',
# counter for any row in the DF that cannot be uniquely distinguished
# from other rows by onset and duration.
if object_id and 'object_id' not in df.columns:
index = pd.Series(self.onsets).astype(str) + '_' + \
pd.Series(self.durations).astype(str)
index = pd.Series(onsets).astype(str) + '_' + \
pd.Series(durations).astype(str)
if object_id is True or (object_id == 'auto' and
len(set(index)) > 1):
ids = np.arange(len(df)) if len(index) == 1 \
@@ -149,11 +144,11 @@ def to_df(self, timing=True, metadata=False, format='wide',
index_cols = ['object_id']

if timing is True or (timing == 'auto' and
(np.isfinite(self.durations).any() or
np.isfinite(self.orders).any())):
df.insert(0, 'duration', self.durations)
df.insert(0, 'order', self.orders)
df.insert(0, 'onset', self.onsets)
(np.isfinite(durations).any() or
np.isfinite(orders).any())):
df.insert(0, 'onset', onsets)
df.insert(0, 'duration', durations)
df.insert(0, 'order', orders)
index_cols.extend(['onset', 'order', 'duration'])

if format == 'long':
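ExtractorResult now keeps the singular onset/duration/order attributes set in __init__, and to_df() substitutes NaN for whichever of them are missing so that the isfinite() checks and the onset_duration object_id key still behave. A small sketch of that substitution under the 'auto' timing rule, assuming a result with a known onset but no duration or order:

    import numpy as np

    onset, duration, order = 1.5, None, None             # as stored on the result
    onsets = np.nan if onset is None else onset
    durations = np.nan if duration is None else duration
    orders = np.nan if order is None else order

    # timing='auto' only inserts the timing columns when some duration or
    # order value is finite; here both are NaN, so they would be omitted.
    needs_timing = np.isfinite(durations).any() or np.isfinite(orders).any()
    print(needs_timing)                                   # -> False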
8 changes: 2 additions & 6 deletions pliers/filters/image.py
@@ -40,9 +40,7 @@ def _filter(self, stim):
x0, y0, x1, y1 = pillow_img.getbbox()
new_img = stim.data[y0:y1, x0:x1]
return ImageStim(stim.filename,
data=new_img,
onset=stim.onset,
duration=stim.duration)
data=new_img)


class PillowImageFilter(ImageFilter):
@@ -87,6 +85,4 @@ def _filter(self, stim):
pillow_img = Image.fromarray(stim.data)
new_img = np.array(pillow_img.filter(self.filter))
return ImageStim(stim.filename,
data=new_img,
onset=stim.onset,
duration=stim.duration)
data=new_img)
12 changes: 4 additions & 8 deletions pliers/filters/text.py
@@ -67,8 +67,7 @@ def _filter(self, stim):
stemmed = ' '.join([self.stemmer.stem(tok) for tok in tokens])
else:
stemmed = self.stemmer.stem(stim.text)
return TextStim(stim.filename, stemmed, onset=stim.onset,
duration=stim.duration, order=stim.order)
return TextStim(stim.filename, stemmed)


class TokenizingFilter(TextFilter):
@@ -97,8 +96,7 @@ def _filter(self, stim):
tokens = self.tokenizer.tokenize(stim.text)
else:
tokens = word_tokenize(stim.text)
stims = [TextStim(stim.filename, token, onset=stim.onset,
duration=stim.duration, order=i)
stims = [TextStim(stim.filename, token, order=i)
for i, token in enumerate(tokens)]
return stims

@@ -134,8 +132,7 @@ def _filter(self, stim):
tokens = word_tokenize(stim.text)
tokens = [tok for tok in tokens if tok not in self.tokens]
text = ' '.join(tokens)
return TextStim(stim.filename, text, onset=stim.onset,
duration=stim.duration, order=stim.order)
return TextStim(stim.filename, text)


class PunctuationRemovalFilter(TokenRemovalFilter):
@@ -151,5 +148,4 @@ class LowerCasingFilter(TextFilter):
''' Lower cases the text in a TextStim. '''

def _filter(self, stim):
return TextStim(stim.filename, stim.text.lower(), onset=stim.onset,
duration=stim.duration, order=stim.order)
return TextStim(stim.filename, stim.text.lower())
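Since the filters above no longer forward timing themselves, the behaviour callers see relies on the propagation added in pliers/transformers/base.py further down in this commit. A minimal sketch of that invariant, assuming LowerCasingFilter is importable from pliers.filters as in the test suite:

    from pliers.stimuli import TextStim
    from pliers.filters import LowerCasingFilter

    stim = TextStim(text='Hello World', onset=3.0, duration=1.5)
    result = LowerCasingFilter().transform(stim)

    # _filter() returned a bare TextStim, but transform() copied the timing
    # over because the new stim's onset/duration were still None.
    assert result.text == 'hello world'
    assert result.onset == 3.0 and result.duration == 1.5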
3 changes: 1 addition & 2 deletions pliers/filters/video.py
@@ -57,8 +57,7 @@ def _filter(self, video):
frame_index = sorted(list(set(video.frame_index).intersection(new_idx)))

return VideoFrameCollectionStim(filename=video.filename,
frame_index=frame_index,
onset=video.onset)
frame_index=frame_index)


class VideoTrimmingFilter(TemporalTrimmingFilter, VideoFilter):
17 changes: 7 additions & 10 deletions pliers/stimuli/text.py
@@ -121,7 +121,7 @@ def __init__(self, filename=None, onset=None, duration=None, columns=None,

@property
def elements(self):
return self._elements
return [f for f in self]

def _from_file(self, filename, columns, default_duration):
tod_names = {'t': 'text', 'o': 'onset', 'd': 'duration'}
@@ -142,7 +142,7 @@ def _from_file(self, filename, columns, default_duration):
if duration is None:
duration = default_duration
elem = TextStim(filename, r['text'], r['onset'], duration)
self.add_elem(elem)
self._elements.append(elem)

def save(self, path):
with open(path, 'w') as f:
@@ -173,16 +173,13 @@ def _from_srt(self, filename):
for i, r in df.iterrows():
elem = TextStim(filename, text=r['text'], onset=r['onset'],
duration=r['duration'], order=i)
self.add_elem(elem)

def add_elem(self, elem):
offset = 0.0 if self.onset is None else self.onset
elem.onset = offset if elem.onset is None else offset + elem.onset
self._elements.append(elem)
self._elements.append(elem)

def __iter__(self):
""" Iterate text elements. """
for elem in self._elements:
offset = 0.0 if self.onset is None else self.onset
elem.onset = offset if elem.onset is None else offset + elem.onset
yield elem

def _to_sec(self, tup):
@@ -215,5 +212,5 @@ def tokenize_text(text):
tokens = tokenize_text(text)

for i, t in enumerate(tokens):
self.add_elem(TextStim(text=t, onset=None, duration=None,
order=i))
self._elements.append(TextStim(text=t, onset=None, duration=None,
order=i))
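With add_elem removed, a ComplexTextStim's own onset is applied as an offset when its elements are iterated, and the elements property simply materialises that iteration. A short sketch of the resulting behaviour, assuming elements are constructed with onsets relative to the parent stim and stored as given:

    from pliers.stimuli import TextStim, ComplexTextStim

    words = [TextStim(text='hello', onset=0.0, duration=0.5),
             TextStim(text='world', onset=0.5, duration=0.5)]
    stim = ComplexTextStim(elements=words, onset=10.0)

    # __iter__ shifts each element by the parent onset, so .elements reports
    # absolute times rather than the stored relative ones.
    print([w.onset for w in stim.elements])               # -> [10.0, 10.5]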
6 changes: 5 additions & 1 deletion pliers/tests/filters/test_text_filters.py
@@ -54,11 +54,15 @@ def test_word_stemming_filter():


def test_tokenizing_filter():
stim = TextStim(join(TEXT_DIR, 'scandal.txt'))
stim = TextStim(join(TEXT_DIR, 'scandal.txt'), onset=4.2)
filt = TokenizingFilter()
words = filt.transform(stim)
assert len(words) == 231
assert words[0].text == 'To'
assert words[0].onset == 4.2
assert words[0].order == 0
assert words[1].onset == 4.2
assert words[1].order == 1

custom_tokenizer = PunktSentenceTokenizer()
filt = TokenizingFilter(tokenizer=custom_tokenizer)
14 changes: 14 additions & 0 deletions pliers/transformers/base.py
@@ -129,6 +129,7 @@ def transform(self, stims, validation='strict', *args, **kwargs):
result = _log_transformation(validated_stim, result, self)
if isgenerator(result):
result = list(result)
self._propagate_context(validated_stim, result)
return result

def _validate(self, stim):
@@ -186,6 +187,18 @@ def _transform(s):
return (t for t in (self.transform(s, *args, **kwargs)
for s in stims) if t)

def _propagate_context(self, stim, result):
if isiterable(result):
for r in result:
self._propagate_context(stim, r)
else:
if result.onset is None:
result.onset = stim.onset
if result.duration is None:
result.duration = stim.duration
if result.order is None:
result.order = stim.order

@abstractmethod
def _transform(self, stim):
pass
@@ -223,6 +236,7 @@ def _iterate(self, stims, validation='strict', *args, **kwargs):
res = self._transform(batch, *args, **kwargs)
for i, stim in enumerate(batch):
res[i] = _log_transformation(stim, res[i], self)
self._propagate_context(stim, res[i])
results.extend(res)
return results

