-
Notifications
You must be signed in to change notification settings - Fork 1
/
training.py
175 lines (126 loc) · 6.02 KB
/
training.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# for creating training examples by overlaying the different audio files on top of background noise.
import numpy as np
from utility import *
def get_random_time_segment(segment_ms):
"""
Gets a random time segment of duration segment_ms in a 10,000 ms audio clip.
Arguments:
segment_ms -- the duration of the audio clip in ms ("ms" stands for "milliseconds")
Returns:
segment_time -- a tuple of (segment_start, segment_end) in ms
"""
# Make sure segment doesn't run past the 10sec background
segment_start = np.random.randint(low = 0, high = 10000 - segment_ms)
segment_end = segment_start + segment_ms - 1
return (segment_start, segment_end)
def is_overlapping(segment_time, previous_segments):
"""
Checks if the time of a segment overlaps with the times of existing segments.
Arguments:
segment_time -- a tuple of (segment_start, segment_end) for the new segment
previous_segments -- a list of tuples of (segment_start, segment_end) for the existing segments
Returns:
True if the time segment overlaps with any of the existing segments, False otherwise
"""
segment_start, segment_end = segment_time
# initially there is no overlap is assumed
overlap = False
# loop over the previous segments already overlayed for finding any overlap with current segment
for previous_start, previous_end in previous_segments:
if segment_start <= previous_end and segment_start >= previous_start:
overlap = True
return overlap
def insert_audio_clip(background, audio_clip, previous_segments):
"""
Insert a new audio segment over the background noise at a random time step, making sure that the
audio segment does not overlap with existing segments.
Arguments:
background -- a 10 second background audio recording.
audio_clip -- the audio clip to be inserted/overlaid.
previous_segments -- times where audio segments have already been placed
Returns:
new_background -- the updated background audio
"""
# Get the duration of the audio clip in ms
segment_ms = len(audio_clip)
# Step 1: get a ranodm time segment
segment_time = get_random_time_segment(segment_ms)
# Step 2: Check for overlap with previous time segments
while is_overlapping(segment_time, previous_segments):
segment_time = get_random_time_segment(segment_ms)
# Step 3: Add the new segment_time to the list of previous_segments
previous_segments.append(segment_time)
# Step 4: Superpose audio segment and background
new_background = background.overlay(audio_clip, position=segment_time[0])
return new_background, segment_time
def insert_ones(y, segment_end_ms):
"""
for updating the label vector y. The labels of the 50 output steps strictly after the end of the segment
should be set to 1.
Arguments:
y -- numpy array of shape (1, Ty), the labels of the training example
segment_end_ms -- the end time of the segment in ms
Returns:
y -- updated labels
"""
# duration of the background (in terms of spectrogram time-steps)
segment_end_y = int(segment_end_ms * Ty / 10000.0)
# last timestep for inserting 1
end_index = y.shape[1] - 1
# check if end_index goes out of output timesteps or not
if segment_end_y + 51 < y.shape[1]:
end_index = segment_end_y + 51
# make the required steps as 1
y[0, segment_end_y + 1:end_index] = 1
return y
def create_training_example(background, activates, negatives):
"""
Creates a training example with a given background, activates, and negatives.
Arguments:
background -- a 10 second background audio recording
activates -- a list of audio segments of the word "activate"
negatives -- a list of audio segments of random words that are not "activate"
Returns:
x -- the spectrogram of the training example
y -- the label at each time step of the spectrogram
"""
# Set the random seed
np.random.seed(18)
# Make background quieter
background = background - 20
# Step 1: Initialize y (label vector) of zeros
y = np.zeros((1, Ty))
# Step 2: Initialize segment times as empty list
previous_segments = list()
# Select 0-4 random "activate" audio clips from the entire list of "activates" recordings
number_of_activates = np.random.randint(0, 5)
random_indices = np.random.randint(
len(activates), size=number_of_activates)
random_activates = [activates[i] for i in random_indices]
# Step 3: Loop over randomly selected "activate" clips and insert in background
for random_activate in random_activates:
# Insert the audio clip on the background
background, segment_time = insert_audio_clip(
background, random_activate, previous_segments)
# Retrieve segment_start and segment_end from segment_time
segment_start, segment_end = segment_time
# Insert labels in "y"
y = insert_ones(y, segment_end)
# Select 0-2 random negatives audio recordings from the entire list of "negatives" recordings
number_of_negatives = np.random.randint(0, 3)
random_indices = np.random.randint(
len(negatives), size=number_of_negatives)
random_negatives = [negatives[i] for i in random_indices]
# Step 4: Loop over randomly selected negative clips and insert in background
for random_negative in random_negatives:
# Insert the audio clip on the background
background, _ = insert_audio_clip(
background, random_negative, previous_segments)
# Standardize the volume of the audio clip
background = match_target_amplitude(background, -20.0)
# Export new training example
file_handle = background.export("train" + ".wav", format="wav")
print("File (train.wav) was saved in your directory.")
# Get and plot spectrogram of the new recording (background with superposition of positive and negatives)
x = graph_spectrogram("train.wav")
return x, y