
Commit

automatic onset/duration/order management
qmac committed Feb 16, 2018
1 parent ece187e commit 21f085e
Showing 12 changed files with 57 additions and 64 deletions.
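The recurring change across these files is that converters, filters, and stimuli stop copying onset, duration, and order onto their outputs by hand; the base Transformer (see pliers/transformers/base.py below) now fills in any of those fields that are still None after _convert()/_filter() returns. A minimal before/after sketch of the pattern, using a TextStim-returning transformer as the example:

    # before: timing forwarded explicitly by every converter/filter
    return TextStim(text=text, onset=stim.onset, duration=stim.duration, order=stim.order)

    # after: the output is created bare; Transformer.transform() propagates
    # onset/duration/order from the input stim wherever they are still None
    return TextStim(text=text)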
2 changes: 1 addition & 1 deletion pliers/converters/api.py
@@ -52,7 +52,7 @@ def _convert(self, audio):

text = getattr(self.recognizer, self.recognize_method)(clip, self.api_key)

return ComplexTextStim(text=text, onset=audio.onset)
return ComplexTextStim(text=text)


class WitTranscriptionConverter(SpeechRecognitionAPIConverter):
18 changes: 6 additions & 12 deletions pliers/converters/google.py
@@ -89,7 +89,7 @@ def _convert(self, stim):
onset=offset + onset,
duration=duration))

return ComplexTextStim(elements=words, onset=stim.onset)
return ComplexTextStim(elements=words)


class GoogleVisionAPITextConverter(GoogleVisionAPITransformer,
@@ -121,30 +121,24 @@ def _convert(self, stims):
responses = self._query_api(request)
texts = []

for i, response in enumerate(responses):
stim = stims[i]
for response in responses:
if response and self.response_object in response:
annotations = response[self.response_object]
# Combine the annotations
if self.handle_annotations == 'first':
text = annotations[0]['description']
texts.append(TextStim(text=text, onset=stim.onset,
duration=stim.duration))
texts.append(TextStim(text=text))
elif self.handle_annotations == 'concatenate':
text = ''
for annotation in annotations:
text = ' '.join([text, annotation['description']])
texts.append(TextStim(text=text, onset=stim.onset,
duration=stim.duration))
texts.append(TextStim(text=text))
elif self.handle_annotations == 'list':
for annotation in annotations:
texts.append(TextStim(text=annotation['description'],
onset=stim.onset,
duration=stim.duration))
texts.append(TextStim(text=annotation['description']))
elif 'error' in response:
raise Exception(response['error']['message'])
else:
texts.append(TextStim(text='', onset=stim.onset,
duration=stim.duration))
texts.append(TextStim(text=''))

return texts
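For reference, the three handle_annotations modes differ only in how a response's annotations are collapsed into TextStims; timing is attached afterwards by the base Transformer. A rough sketch of the outputs, assuming a single response whose annotations carry the descriptions 'STOP' and 'AHEAD':

    # handle_annotations='first'        -> [TextStim(text='STOP')]
    # handle_annotations='concatenate'  -> [TextStim(text=' STOP AHEAD')]  # join starts from ''
    # handle_annotations='list'         -> [TextStim(text='STOP'), TextStim(text='AHEAD')]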
3 changes: 1 addition & 2 deletions pliers/converters/image.py
@@ -26,5 +26,4 @@ class TesseractConverter(ImageToTextConverter):
def _convert(self, stim):
verify_dependencies(['pytesseract'])
text = pytesseract.image_to_string(Image.fromarray(stim.data))
return TextStim(text=text, onset=stim.onset, duration=stim.duration,
order=stim.order)
return TextStim(text=text)
2 changes: 1 addition & 1 deletion pliers/converters/microsoft.py
@@ -29,4 +29,4 @@ def _convert(self, stim):
lines.append(' '.join([w['text'] for w in l['words']]))

text = '\n'.join(lines)
return TextStim(text=text, onset=stim.onset, duration=stim.duration)
return TextStim(text=text)
3 changes: 1 addition & 2 deletions pliers/converters/video.py
@@ -16,5 +16,4 @@ class VideoToAudioConverter(Converter):
def _convert(self, video):
fps = AudioStim.get_sampling_rate(video.filename)
return AudioStim(sampling_rate=fps,
clip=video.clip.audio,
onset=video.onset)
clip=video.clip.audio)
33 changes: 14 additions & 19 deletions pliers/extractors/base.py
@@ -66,25 +66,16 @@ def __init__(self, data, stim, extractor, features=None, onsets=None,
self.features = features
self.raw = raw
self._history = None
self.onset = onsets
self.duration = durations
self.order = orders

# Eventually, the goal is to make raw mandatory, and always
# generate the .data property via calls to to_array() or to_df()
# implemented in the Extractor. But to avoid breaking the API without
# warning, we provide a backward-compatible version for the time being.
self.data = np.array(data)

if onsets is None:
onsets = stim.onset
self.onsets = onsets if onsets is not None else np.nan

if durations is None:
durations = stim.duration
self.durations = durations if durations is not None else np.nan

if orders is None:
orders = stim.order
self.orders = orders if orders is not None else np.nan

def to_df(self, timing=True, metadata=False, format='wide',
extractor_name=False, object_id=True, **to_df_kwargs):
''' Convert current instance to a pandas DataFrame.
@@ -130,6 +121,10 @@ def to_df(self, timing=True, metadata=False, format='wide',
for i in range(self.data.shape[1])]
df = pd.DataFrame(self.data, columns=features)

onsets = np.nan if self.onset is None else self.onset
durations = np.nan if self.duration is None else self.duration
orders = np.nan if self.order is None else self.order

index_cols = []

# Generally we leave it to Extractors to properly track the number of
@@ -139,8 +134,8 @@ def to_df(self, timing=True, metadata=False, format='wide',
# counter for any row in the DF that cannot be uniquely distinguished
# from other rows by onset and duration.
if object_id and 'object_id' not in df.columns:
index = pd.Series(self.onsets).astype(str) + '_' + \
pd.Series(self.durations).astype(str)
index = pd.Series(onsets).astype(str) + '_' + \
pd.Series(durations).astype(str)
if object_id is True or (object_id == 'auto' and
len(set(index)) > 1):
ids = np.arange(len(df)) if len(index) == 1 \
@@ -149,11 +144,11 @@ def to_df(self, timing=True, metadata=False, format='wide',
index_cols = ['object_id']

if timing is True or (timing == 'auto' and
(np.isfinite(self.durations).any() or
np.isfinite(self.orders).any())):
df.insert(0, 'duration', self.durations)
df.insert(0, 'order', self.orders)
df.insert(0, 'onset', self.onsets)
(np.isfinite(durations).any() or
np.isfinite(orders).any())):
df.insert(0, 'onset', onsets)
df.insert(0, 'duration', durations)
df.insert(0, 'order', orders)
index_cols.extend(['onset', 'order', 'duration'])

if format == 'long':
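ExtractorResult now keeps the singular onset/duration/order attributes set in __init__, and to_df() substitutes NaN for whichever of them are missing so that the isfinite() checks and the onset_duration object_id key still behave. A small sketch of that substitution under the 'auto' timing rule, assuming a result with a known onset but no duration or order:

    import numpy as np

    onset, duration, order = 1.5, None, None             # as stored on the result
    onsets = np.nan if onset is None else onset
    durations = np.nan if duration is None else duration
    orders = np.nan if order is None else order

    # timing='auto' only inserts the timing columns when some duration or
    # order value is finite; here both are NaN, so they would be omitted.
    needs_timing = np.isfinite(durations).any() or np.isfinite(orders).any()
    print(needs_timing)                                   # -> False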
8 changes: 2 additions & 6 deletions pliers/filters/image.py
@@ -40,9 +40,7 @@ def _filter(self, stim):
x0, y0, x1, y1 = pillow_img.getbbox()
new_img = stim.data[y0:y1, x0:x1]
return ImageStim(stim.filename,
data=new_img,
onset=stim.onset,
duration=stim.duration)
data=new_img)


class PillowImageFilter(ImageFilter):
@@ -87,6 +85,4 @@ def _filter(self, stim):
pillow_img = Image.fromarray(stim.data)
new_img = np.array(pillow_img.filter(self.filter))
return ImageStim(stim.filename,
data=new_img,
onset=stim.onset,
duration=stim.duration)
data=new_img)
12 changes: 4 additions & 8 deletions pliers/filters/text.py
@@ -67,8 +67,7 @@ def _filter(self, stim):
stemmed = ' '.join([self.stemmer.stem(tok) for tok in tokens])
else:
stemmed = self.stemmer.stem(stim.text)
return TextStim(stim.filename, stemmed, onset=stim.onset,
duration=stim.duration, order=stim.order)
return TextStim(stim.filename, stemmed)


class TokenizingFilter(TextFilter):
@@ -97,8 +96,7 @@ def _filter(self, stim):
tokens = self.tokenizer.tokenize(stim.text)
else:
tokens = word_tokenize(stim.text)
stims = [TextStim(stim.filename, token, onset=stim.onset,
duration=stim.duration, order=i)
stims = [TextStim(stim.filename, token, order=i)
for i, token in enumerate(tokens)]
return stims

@@ -134,8 +132,7 @@ def _filter(self, stim):
tokens = word_tokenize(stim.text)
tokens = [tok for tok in tokens if tok not in self.tokens]
text = ' '.join(tokens)
return TextStim(stim.filename, text, onset=stim.onset,
duration=stim.duration, order=stim.order)
return TextStim(stim.filename, text)


class PunctuationRemovalFilter(TokenRemovalFilter):
@@ -151,5 +148,4 @@ class LowerCasingFilter(TextFilter):
''' Lower cases the text in a TextStim. '''

def _filter(self, stim):
return TextStim(stim.filename, stim.text.lower(), onset=stim.onset,
duration=stim.duration, order=stim.order)
return TextStim(stim.filename, stim.text.lower())
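Since the filters above no longer forward timing themselves, the behaviour callers see relies on the propagation added in pliers/transformers/base.py further down in this commit. A minimal sketch of that invariant, assuming LowerCasingFilter is importable from pliers.filters as in the test suite:

    from pliers.stimuli import TextStim
    from pliers.filters import LowerCasingFilter

    stim = TextStim(text='Hello World', onset=3.0, duration=1.5)
    result = LowerCasingFilter().transform(stim)

    # _filter() returned a bare TextStim, but transform() copied the timing
    # over because the new stim's onset/duration were still None.
    assert result.text == 'hello world'
    assert result.onset == 3.0 and result.duration == 1.5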
3 changes: 1 addition & 2 deletions pliers/filters/video.py
@@ -57,8 +57,7 @@ def _filter(self, video):
frame_index = sorted(list(set(video.frame_index).intersection(new_idx)))

return VideoFrameCollectionStim(filename=video.filename,
frame_index=frame_index,
onset=video.onset)
frame_index=frame_index)


class VideoTrimmingFilter(TemporalTrimmingFilter, VideoFilter):
17 changes: 7 additions & 10 deletions pliers/stimuli/text.py
@@ -121,7 +121,7 @@ def __init__(self, filename=None, onset=None, duration=None, columns=None,

@property
def elements(self):
return self._elements
return [f for f in self]

def _from_file(self, filename, columns, default_duration):
tod_names = {'t': 'text', 'o': 'onset', 'd': 'duration'}
@@ -142,7 +142,7 @@ def _from_file(self, filename, columns, default_duration):
if duration is None:
duration = default_duration
elem = TextStim(filename, r['text'], r['onset'], duration)
self.add_elem(elem)
self._elements.append(elem)

def save(self, path):
with open(path, 'w') as f:
@@ -173,16 +173,13 @@ def _from_srt(self, filename):
for i, r in df.iterrows():
elem = TextStim(filename, text=r['text'], onset=r['onset'],
duration=r['duration'], order=i)
self.add_elem(elem)

def add_elem(self, elem):
offset = 0.0 if self.onset is None else self.onset
elem.onset = offset if elem.onset is None else offset + elem.onset
self._elements.append(elem)
self._elements.append(elem)

def __iter__(self):
""" Iterate text elements. """
for elem in self._elements:
offset = 0.0 if self.onset is None else self.onset
elem.onset = offset if elem.onset is None else offset + elem.onset
yield elem

def _to_sec(self, tup):
@@ -215,5 +212,5 @@ def tokenize_text(text):
tokens = tokenize_text(text)

for i, t in enumerate(tokens):
self.add_elem(TextStim(text=t, onset=None, duration=None,
order=i))
self._elements.append(TextStim(text=t, onset=None, duration=None,
order=i))
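With add_elem removed, a ComplexTextStim's own onset is applied as an offset when its elements are iterated, and the elements property simply materialises that iteration. A short sketch of the resulting behaviour, assuming elements are constructed with onsets relative to the parent stim and stored as given:

    from pliers.stimuli import TextStim, ComplexTextStim

    words = [TextStim(text='hello', onset=0.0, duration=0.5),
             TextStim(text='world', onset=0.5, duration=0.5)]
    stim = ComplexTextStim(elements=words, onset=10.0)

    # __iter__ shifts each element by the parent onset, so .elements reports
    # absolute times rather than the stored relative ones.
    print([w.onset for w in stim.elements])               # -> [10.0, 10.5]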
6 changes: 5 additions & 1 deletion pliers/tests/filters/test_text_filters.py
@@ -54,11 +54,15 @@ def test_word_stemming_filter():


def test_tokenizing_filter():
stim = TextStim(join(TEXT_DIR, 'scandal.txt'))
stim = TextStim(join(TEXT_DIR, 'scandal.txt'), onset=4.2)
filt = TokenizingFilter()
words = filt.transform(stim)
assert len(words) == 231
assert words[0].text == 'To'
assert words[0].onset == 4.2
assert words[0].order == 0
assert words[1].onset == 4.2
assert words[1].order == 1

custom_tokenizer = PunktSentenceTokenizer()
filt = TokenizingFilter(tokenizer=custom_tokenizer)
14 changes: 14 additions & 0 deletions pliers/transformers/base.py
@@ -129,6 +129,7 @@ def transform(self, stims, validation='strict', *args, **kwargs):
result = _log_transformation(validated_stim, result, self)
if isgenerator(result):
result = list(result)
self._propagate_context(validated_stim, result)
return result

def _validate(self, stim):
@@ -186,6 +187,18 @@ def _transform(s):
return (t for t in (self.transform(s, *args, **kwargs)
for s in stims) if t)

def _propagate_context(self, stim, result):
if isiterable(result):
for r in result:
self._propagate_context(stim, r)
else:
if result.onset is None:
result.onset = stim.onset
if result.duration is None:
result.duration = stim.duration
if result.order is None:
result.order = stim.order

@abstractmethod
def _transform(self, stim):
pass
@@ -223,6 +236,7 @@ def _iterate(self, stims, validation='strict', *args, **kwargs):
res = self._transform(batch, *args, **kwargs)
for i, stim in enumerate(batch):
res[i] = _log_transformation(stim, res[i], self)
self._propagate_context(stim, res[i])
results.extend(res)
return results

