In [None]:
import os
import re
import shutil
import numpy as np
import math
import matplotlib
import matplotlib.pyplot as plt
import librosa
import soundfile
import json
import textgrid
import pyroomacoustics as pra
from pyroomacoustics.directivities \
    import (CardioidFamily, DirectionVector, DirectivityPattern)
from pprint import pprint

In [None]:
"""Script parameters"""
# Number of samples to generate (handed to VoiceLineGeneratorKEC as `count`).
target_amount_samples = 250
# Start value of the sample counter in generate(); the first exported sample is skipSamples + 1.
skipSamples = 0
# Output root; exportSample() creates one sub-folder per sample number below it.
target_dir = '/workspace/training/KEC'
# When True, generate() plots the speaker tracks and the room layout of every sample.
visualize = False

"Persons"
# Diameter of the listener's head; get_pos_mics() places each ear head_size / 2
# away from the head centre. Presumably metres -- TODO confirm.
head_size = 0.2

# Voice lines per sample; generate() places this many speakers plus one listener.
speakers_in_room = 2
# Number of parallel tracks makeTimeOffsets() distributes the voice lines over.
maxSpeakerAtOnce = 2
# NOTE(review): the following two settings are not read by any code visible in this notebook.
speaker_in_room_ranges = [1, 2]
randomize_speaker_count = False

# Gap added after the end of the track being filled before the next line may start.
minimum_time_offset = 0.1 # sec
# Gap added after the end of the latest-ending track; upper bound for the next start.
maxTimeDistanceBetweenSpeakers = 1  # sec

"ROOM"


# When True, generate_room_characteristics() draws random room parameters from the
# *_range(s) settings below; otherwise the fixed normal_* values are used.
randomize_room = False
# Fixed room dimensions, and the intervals random dimensions are taken from:
# [width, length, height]
normal_room_dim = [15, 20, 4]
room_dim_ranges = [[3, 30], [3, 30], [2.5, 5]]

# The amount of sound the walls absorb (handed to createRoom() as `absorption`).
normal_absorption = 1.2
absorption_range = [1, 2]

# RT60: the time it takes until the signal has dropped by 60 dB.
normal_rt60 = 0.4
rt60_range = [0.25, 0.75]


"AUdio"
# Sample rate used when loading, simulating and writing audio.
sampleRate = 16000
# split_sentence() only keeps sentences with at least this many words.
min_words_per_sentence = 4
# NOTE(review): this module-level value is not read by any code visible in this notebook.
padding = 1  # sec

# Provenance of the speech data; stored in the 'source' field of every sample description.
source_dataset = 'KEC - https://clarin.phonetik.uni-muenchen.de/BASRepository/index.php?target=Public/Corpora/KEC/KEC.1.php'


In [None]:
"""Classes"""


class Timestamp:
    """A time interval described by its start, its end and the resulting duration."""

    def __init__(self, start: float, end: float):
        self.startTime, self.endTime = start, end
        self.duration = end - start

    def toTimestap(self):
        """Return the interval as the list ``[startTime, endTime, duration]``."""
        return [self.startTime, self.endTime, self.duration]

    def setOffset(self, offset: float):
        """Shift the interval so that it starts at ``offset``; the duration is kept."""
        self.startTime = offset
        self.endTime = offset + self.duration

    def __str__(self):
        return f'Start: {self.startTime}, End:{self.endTime}, Duration:{self.duration}'

    def __repr__(self):
        # The debug representation is the same text as the string form.
        return str(self)


class SentenceWithTimestamp(Timestamp):
    """A Timestamp that additionally carries the sentence belonging to the interval."""

    def __init__(self, sentence, start: float, end: float):
        super().__init__(start, end)
        self.sentence = sentence

    def __str__(self):
        prefix = f'Sentence: {str(self.sentence)}, '
        return prefix + super().__str__()

    def __repr__(self):
        # The debug representation is the same text as the string form.
        return str(self)


In [None]:
"""Visualization"""

def scatterplot3d(points, dims):
    """Scatter ``points`` into a new 3D figure whose axes span 0..dims[k].

    The colours come from listenerSpeakerColors(), i.e. the first point is
    drawn in a different colour than the remaining ones.
    """
    axes = plt.figure().add_subplot(projection='3d')
    axes.set_xlim(0, dims[0])
    axes.set_ylim(0, dims[1])
    axes.set_zlim(0, dims[2])

    palette = listenerSpeakerColors(len(points))
    for point, colour in zip(points, palette):
        axes.scatter(point[0], point[1], point[2], color=colour)


def scatterplot2d(points, middle, directivities, dims):
    """Scatter ``points`` and the ``middle`` point into the current 2D plot.

    Args:
        points: sequence of positions; only the first two coordinates are drawn.
        middle: position drawn as an extra marker.
        directivities: accepted for interface compatibility, not used.
        dims: optional room dimensions; when given, the axes are limited to
            0..dims[0] and 0..dims[1].
    """
    plot2dPoints(points)
    plt.scatter(middle[0], middle[1], c='#5f59')
    # Identity check instead of `!= None`: the comparison is element-wise for
    # numpy arrays, which would make the `if` raise a ValueError.
    if dims is not None:
        plt.xlim(0, dims[0])
        plt.ylim(0, dims[1])


def plotDirectivities(base, middle, baseRefDirs, pos, posDirs, roomCorners):
    """Draw a top view of the room outline, the listener, and the directions to the speakers.

    Args:
        base: listener position (arrow origin).
        middle: reference point drawn as a marker.
        baseRefDirs: listener direction; element 0 is the azimuth.
        pos: speaker positions.
        posDirs: speaker directions; element 0 of each is the azimuth.
        roomCorners: four corners; the outline is drawn 0 -> 1 -> 3 -> 2 -> 0.
    """
    arrow_length = 5

    def draw_arrow(start, delta, colour):
        plt.arrow(start[0], start[1], delta[0], delta[1], color=colour)

    plt.figure(figsize=(10, 10))
    x_extent = room_dim_ranges[0][1]
    y_extent = room_dim_ranges[1][1]
    plt.xlim(-x_extent, x_extent)
    plt.ylim(-y_extent, y_extent)

    # Room outline: one arrow per wall, from each corner to the next one.
    outline = [roomCorners[0], roomCorners[1], roomCorners[3], roomCorners[2]]
    for k, start in enumerate(outline):
        end = outline[(k + 1) % len(outline)]
        draw_arrow(start, [end[0] - start[0], end[1] - start[1]], '#000')
    plt.gca().set_aspect('equal', adjustable='box')

    plt.scatter(base[0], base[1], c='#55f9')
    plt.scatter(middle[0], middle[1], c='#5f59')

    # Direction the listener is looking in.
    draw_arrow(base, toVektor(baseRefDirs[0], arrow_length), '#5f59')

    # One arrow (starting at the listener) and one marker per speaker.
    for i, speaker_dir in enumerate(posDirs):
        draw_arrow(base, toVektor(speaker_dir[0], arrow_length), '#f559')
        plt.scatter(pos[i][0], pos[i][1], c='#f559')


def plot2dPoints(points):
    """Scatter the first two coordinates of every point, coloured by listenerSpeakerColors()."""
    xs, ys = [], []
    for point in points:
        xs.append(point[0])
        ys.append(point[1])
    plt.scatter(xs, ys, c=listenerSpeakerColors(len(points)))


def listenerSpeakerColors(count: int):
    """Return ``count`` colour codes: one '#55F9' followed by '#F559' entries.

    An empty list is returned for counts below 1.
    """
    if count < 1:
        return []
    return ['#55F9'] + ['#F559'] * (count - 1)



def customPlot(positions, middle, dirs, baseAngle, roomDims):
    """Show three views of a generated scene.

    1. a 3D scatter plot of all positions,
    2. a top view with the room outline and the directions as generated,
    3. the same top view rotated by ``-baseAngle`` around the listener.

    Args:
        positions: positions of all persons; index 0 is the listener.
        middle: reference point the listener is oriented towards.
        dirs: ``[azimuth, colatitude]`` per person, same order as ``positions``.
        baseAngle: azimuth of the listener; removed again in the third view.
        roomDims: room dimensions; the first two entries span the outline.
    """
    # scatterplot3d() opens its own figure, so no extra (empty) figure is created here.
    scatterplot3d(positions, roomDims)
    plt.show()

    roomCorners = [
        [0, 0, 0],
        [roomDims[0], 0, 0],
        [0, roomDims[1], 0],
        [roomDims[0], roomDims[1], 0],
    ]
    plotDirectivities(positions[0], middle, dirs[0],
                      positions[1:], dirs[1:], roomCorners)
    plt.show()

    # Everything is rotated around the (unrotated) listener position; the inputs
    # are never modified in place, so no defensive copies are needed.
    pivot = positions[0]
    rotatedPositions = [rotateAroundPoint(v, pivot, -baseAngle)
                        for v in positions]
    rotatedCorners = [rotateAroundPoint(v, pivot, -baseAngle)
                      for v in roomCorners]
    rotatedDirs = [[d[0] - baseAngle, d[1]] for d in dirs]
    rotatedMiddle = rotateAroundPoint(middle, pivot, -baseAngle)
    plotDirectivities(rotatedPositions[0], rotatedMiddle, rotatedDirs[0],
                      rotatedPositions[1:], rotatedDirs[1:], rotatedCorners)
    plt.show()


def plotTracks(tracks):
    """Plot every track as one row of horizontal bars, one bar per time interval.

    The bar of an interval starts at its ``startTime`` and is ``duration`` long;
    rows cycle through three colours.
    """
    plt.clf()
    palette = ['#F55', '#5F5', '#55F']
    for row, track in enumerate(tracks):
        colour = palette[row % len(palette)]
        for segment in track:
            plt.arrow(segment.startTime, row + 0.4, segment.duration, 0,
                      color=colour, width=0.8, head_width=0)
    plt.title('Speaker Tracks')
    plt.show()

In [None]:
"""Util"""
def rec_filter(x):
    """Return True when ``x`` contains 'rec'."""
    return 'rec' in x


def wav_filter(x):
    """Return True when ``x`` contains '.wav' (anywhere, not only as suffix)."""
    return '.wav' in x


def txt_filter(x):
    """Return True when ``x`` contains '.TextGrid'."""
    return '.TextGrid' in x


def as_filter(x):
    """Return True when ``x`` contains 'AS' (case-sensitive)."""
    return 'AS' in x


def idx_filter_txt(x, y):
    return True if x[len(x) - 10] == y else False


def rec_filter(x):
    """Return True when ``x`` contains 'rec'.

    NOTE(review): a function with this name is already defined earlier in the
    same cell; this later definition is the one that stays bound.
    """
    contains_rec = 'rec' in x
    return contains_rec


def norm2(x):
    """Return the sum of the squared components of ``x`` (no square root is taken)."""
    return np.sum([component ** 2 for component in x])


def distance(x, y):
    """Return the Euclidean distance between the points ``x`` and ``y``.

    The previous implementation returned the *squared* distance. A comparison
    against the threshold 1 gives the same answer for both, so such checks are
    unaffected.

    Args:
        x: first point (sequence of numbers).
        y: second point; must have at least ``len(x)`` components.

    Returns:
        The distance as a float.
    """
    return math.sqrt(sum((x[i] - y[i]) ** 2 for i in range(len(x))))


def avg(x):
    """Return the arithmetic mean of ``x`` (raises ZeroDivisionError for empty input)."""
    total = sum(x)
    return total / len(x)

def toVektor(angle, len):
    """Return the 2D vector ``[x, y]`` of length ``len`` pointing in direction ``angle``.

    ``angle`` is passed to math.cos / math.sin (radians); both components are
    rounded to 15 decimals.
    """
    x_part = len * math.cos(angle)
    y_part = len * math.sin(angle)
    return [np.round(x_part, 15), np.round(y_part, 15)]

"""Anlges"""
# angle of the direction from a to b, in radians (as returned by math.atan2)
def toAngle(a, b):
    """Return the angle of the direction from ``a`` to ``b`` in the x/y plane.

    Only the first two coordinates are used; the result is the math.atan2
    value, i.e. radians.
    """
    return math.atan2(b[1] - a[1], b[0] - a[0])

In [None]:
"Dataset specific Wav Generation"


def FileGeneratorsKEC(src='/workspace/data/KEC'):
    """Collect the wav and TextGrid files of the KEC corpus.

    Every sub-folder of ``src`` accepted by rec_filter() is scanned; files
    accepted by wav_filter() / txt_filter() are collected with their path.

    Args:
        src: root folder of the corpus (defaults to the previous hard-coded path).

    Returns:
        ``(wavGen, textGen)``: two generator functions yielding the wav paths
        and the TextGrid paths.
    """
    allWavfiles = []
    alltxtFiles = []
    # os.listdir() returns entries in arbitrary order. The paths are sorted so
    # that the result is deterministic and -- assuming a wav and its TextGrid
    # share the same file stem -- the n-th wav matches the n-th TextGrid, which
    # is what VoiceLineGeneratorKEC relies on when it pairs them by index.
    for rec in sorted(filter(rec_filter, os.listdir(src))):
        currentDir = src + "/" + rec
        if not os.path.isdir(currentDir):
            # a plain file whose name happens to match; nothing to scan
            continue
        entries = sorted(os.listdir(currentDir))
        allWavfiles.extend(currentDir + "/" + f for f in entries if wav_filter(f))
        alltxtFiles.extend(currentDir + "/" + f for f in entries if txt_filter(f))

    def wavGen():
        yield from allWavfiles

    def textGen():
        yield from alltxtFiles

    return wavGen, textGen


def split_sentence(textGrid: textgrid.textgrid.TextGrid):
    """Split the 'words' tier of a TextGrid into sentences.

    Intervals whose mark matches ``<.*>`` act as sentence separators. A run of
    words between two separators becomes a sentence when it has at least
    ``min_words_per_sentence`` words. Words after the last separator are kept
    as a final sentence under the same condition (they used to be dropped).

    Args:
        textGrid: grid with a tier named 'words'.

    Returns:
        List of SentenceWithTimestamp; each spans from the start of its first
        word to the start of the closing separator (or to the end of the last
        word for the final sentence).
    """
    specialWord = re.compile("<.*>")
    sentences = []
    sentence = []
    startTime = 0
    endTime = 0
    for word in textGrid.getList('words')[0]:
        if specialWord.search(word.mark) is not None:
            if len(sentence) >= min_words_per_sentence:
                sentences.append(SentenceWithTimestamp(
                    sentence, startTime, word.minTime))
            sentence = []
            continue

        if len(sentence) == 0:
            startTime = word.minTime
        sentence.append(word.mark)
        endTime = word.maxTime

    # Flush the words that follow the last separator.
    if len(sentence) >= min_words_per_sentence:
        sentences.append(SentenceWithTimestamp(sentence, startTime, endTime))

    return sentences


def VoiceLineGeneratorKEC(count, voicesPerSample):
    """Yield ``count`` random sets of voice lines from the KEC corpus.

    For every set, ``voicesPerSample`` random sentences are picked (random
    file, then random sentence of that file) and the matching audio is loaded.
    Picks that fail are reported and retried.

    Args:
        count: number of sets to yield.
        voicesPerSample: number of voice lines per set.

    Yields:
        ``(wavs, timestamps)``: two lists of equal length; ``timestamps[k]`` is
        the SentenceWithTimestamp belonging to ``wavs[k]``.

    Raises:
        FileNotFoundError: when no wav or no TextGrid file was found, because
            the retry loop could never succeed in that case.
    """
    wavGen, textGen = FileGeneratorsKEC()
    allWavPaths = list(wavGen())
    allTxtPaths = list(textGen())
    if len(allWavPaths) == 0 or len(allTxtPaths) == 0:
        raise FileNotFoundError('no wav/TextGrid files found for the KEC dataset')

    for _ in range(count):
        w = []
        t = []
        while len(w) < voicesPerSample:
            try:
                random_file = np.random.randint(0, len(allWavPaths))
                sentences = split_sentence(
                    textgrid.TextGrid.fromFile(allTxtPaths[random_file]))

                random_phrase = np.random.randint(0, len(sentences))
                ts = sentences[random_phrase]
                wav = loadWavFile(
                    allWavPaths[random_file], ts.startTime, ts.duration)
            except Exception as e:
                print(f'could not load Data: {e}')
                continue
            # Append both only after the audio was loaded, so that a failed
            # load can never leave the two lists misaligned.
            w.append(wav)
            t.append(ts)
        yield w, t


In [None]:
"""Room Functions"""


def generate_room_characteristics():
    """Return the room parameters ``(dims, rt60, absorption)`` for one sample.

    With ``randomize_room`` unset the fixed ``normal_*`` settings are returned.
    Otherwise every value is drawn uniformly from its configured range
    (``room_dim_ranges``, ``rt60_range``, ``absorption_range``).

    Returns:
        dims: room dimensions, one value per entry of ``room_dim_ranges``.
        rt60: reverberation time.
        absorption: absorption value.
    """
    if not randomize_room:
        return normal_room_dim, normal_rt60, normal_absorption

    # room_dim_ranges is a plain list of [low, high] pairs, so it has to be
    # indexed pair by pair (the former `ranges[i, 0]` raised a TypeError).
    # uniform() is used because the height bounds are not integers.
    dims = [np.random.uniform(low, high) for low, high in room_dim_ranges]

    rt60 = rt60_range[0] + \
        (rt60_range[1] - rt60_range[0]) * np.random.random()

    absorption = absorption_range[0] + \
        (absorption_range[1] - absorption_range[0]) * np.random.random()

    return dims, rt60, absorption


def random_position_in_room(roomDims):
    """Return a random position ``[x, y, z]`` inside the room.

    x and y are drawn so that they keep 0.5 distance to the walls of their
    axis; z is always 1.73.
    """
    horizontal = [np.random.random() * (roomDims[axis] - 1) + 0.5
                  for axis in (0, 1)]
    return horizontal + [1.73]


def positions_too_close(positions):
    """Return True when two different positions are closer than allowed.

    Only ``positions[1:]`` is examined -- the first entry is left out of the
    check. Two entries that compare equal are skipped; every other pair counts
    as too close when ``distance()`` of the pair is below 1.
    """
    candidates = positions[1:]
    return any(a != b and distance(a, b) < 1
               for a in candidates
               for b in candidates)


def transform_to_directivities(positions):
    """Derive a direction ``[azimuth, colatitude]`` for every position.

    ``positions[0]`` is the listener. The listener looks towards the mean of
    all positions; every other entry gets the angle from the listener to that
    position. The colatitude is always ``math.pi / 2``.

    Returns:
        ``(dirs, baseAngle, middle)``: the directions (listener first), the
        listener's azimuth, and the mean position.
    """
    listenerPos = positions[0]
    columns = np.transpose(positions)
    middle = [avg(columns[axis]) for axis in range(len(positions[0]))]

    # top-view angle the listener is looking in
    baseAngle = toAngle(listenerPos[:2], middle[:2])

    dirs = [[baseAngle, math.pi / 2]]
    dirs.extend([toAngle(listenerPos, pos), math.pi / 2]
                for pos in positions[1:])

    return dirs, baseAngle, middle


def random_persons_in_room(roomDims, count):
    """Place ``count`` persons at random positions and derive their directions.

    New position sets are drawn until positions_too_close() accepts one.

    Returns:
        ``(positions, dirs, middle, baseAngle)`` -- the positions plus the
        values computed by transform_to_directivities().
    """
    def draw_positions():
        return [random_position_in_room(roomDims) for _ in range(count)]

    # draw at least once, then repeat until the result is valid
    positions = draw_positions()
    while positions_too_close(positions):
        positions = draw_positions()

    dirs, baseAngle, middle = transform_to_directivities(positions)
    return positions, dirs, middle, baseAngle

# Calculates both microphone (ear) positions relative to the head centre point.


def get_pos_mics(position, dir):
    """Return the two ear microphones of a head at ``position`` looking along ``dir``.

    The ear directions are the head azimuth plus / minus ``math.pi / 2`` with
    unchanged colatitude; each ear position is obtained from point_pos() with
    half of ``head_size`` as distance.

    Returns:
        ``([posLeft, posRight], [dirLeft, dirRight])``
    """
    azimuth, colatitude = dir[0], dir[1]
    radius = head_size / 2
    earDirs = [[azimuth + math.pi / 2, colatitude],
               [azimuth - math.pi / 2, colatitude]]
    earPositions = [point_pos(position, radius, earDir[0])
                    for earDir in earDirs]
    return earPositions, earDirs

# Calculates a single microphone position relative to the head centre point.


def point_pos(x, d, theta):
    """Return the point at distance ``d`` from ``x`` in direction ``theta``.

    ``theta`` is a mathematical angle in radians, measured counter-clockwise
    from the positive x axis -- the convention of toAngle() and toVektor().
    The former ``pi/2 - theta`` conversion treated it as a compass bearing and
    therefore placed the point in a mirrored direction.

    Args:
        x: start point ``[x, y, z]``.
        d: distance to move in the x/y plane.
        theta: direction in radians.

    Returns:
        ``[x, y, z]`` of the new point; z is taken over unchanged.
    """
    return [x[0] + d * math.cos(theta), x[1] + d * math.sin(theta), x[2]]

def rotationMatrix(angle):
    """Return the 2x2 matrix rotating counter-clockwise by ``angle`` (radians)."""
    return np.array([[math.cos(angle), -math.sin(angle)],
                     [math.sin(angle), math.cos(angle)]])


def rotateAroundPoint(v, base, angle):
    """Rotate the point ``v`` around ``base`` in the x/y plane.

    Only x and y are rotated. The z coordinate of ``v`` is returned unchanged;
    previously ``base[2]`` was added to it, which e.g. doubled the height of a
    point rotated around itself.

    Args:
        v: point ``[x, y, z]`` to rotate.
        base: pivot ``[x, y, z]``; only its x and y are used.
        angle: rotation angle in radians (counter-clockwise).

    Returns:
        The rotated point as a list of three floats.
    """
    offset_xy = np.subtract(v[:2], base[:2])
    rotated_xy = np.dot(rotationMatrix(angle), offset_xy)
    return [float(rotated_xy[0] + base[0]),
            float(rotated_xy[1] + base[1]),
            float(v[2])]


In [None]:
"""WAV and audio Mixing"""


def loadWavFile(path, offset=0, duration=None):
    """Load a section of an audio file as a normalized mono signal.

    The file is read with librosa at ``sampleRate``, starting at ``offset``
    for ``duration`` (both forwarded to librosa.load), and passed through
    librosa.util.normalize.
    """
    samples, _ = librosa.load(path, sr=sampleRate, mono=True,
                              offset=offset, duration=duration)
    return librosa.util.normalize(samples)


def makeCardioid(direction):
    """Create a cardioid directivity oriented along ``direction``.

    Args:
        direction: ``[azimuth, colatitude]`` in radians. All directions in this
            file are built from math.atan2 and math.pi, so the vector must not
            be interpreted as degrees (it previously was).

    Returns:
        A pyroomacoustics CardioidFamily with the CARDIOID pattern.
    """
    return CardioidFamily(
        orientation=DirectionVector(
            azimuth=direction[0], colatitude=direction[1], degrees=False),
        pattern_enum=DirectivityPattern.CARDIOID,
    )


def createRoom(room_dim, rt60, absorption):
    """Create a shoebox room whose wall material is derived from ``rt60``.

    pra.inverse_sabine() provides the energy absorption and the maximum
    reflection order for the requested reverberation time and room size.

    NOTE(review): both ``materials`` and ``absorption`` are passed to
    pra.ShoeBox -- confirm in the pyroomacoustics documentation which of the
    two takes effect.
    """
    energy_absorption, max_order = pra.inverse_sabine(rt60, room_dim)
    wall_material = pra.Material(energy_absorption)
    return pra.ShoeBox(room_dim, fs=sampleRate, materials=wall_material,
                       absorption=absorption, max_order=max_order)


def trackEndingSoon(tracks):
    """Return the index of the track that is free first.

    The first empty track wins; otherwise the track whose last entry has the
    smallest ``endTime`` is chosen (the lowest index on ties).

    Args:
        tracks: list of tracks; each track is a list of objects with ``endTime``.

    Returns:
        Index into ``tracks``.

    Raises:
        ValueError: if ``tracks`` is empty. (The former ``raise '<text>'`` is
            itself a TypeError in Python 3.)
    """
    if len(tracks) < 1:
        raise ValueError('Tracks length must be greater than 0')

    earliest_end = None
    pos = 0
    for i, track in enumerate(tracks):
        # An empty track is immediately available. Checking this before any
        # track[-1] access also covers an empty first track.
        if len(track) == 0:
            return i

        end = track[-1].endTime
        if earliest_end is None or end < earliest_end:
            earliest_end = end
            pos = i
    return pos


def trackEndingLatest(tracks):
    """Return the index of the track whose last entry ends latest.

    Empty tracks are ignored; if all tracks are empty, 0 is returned. The
    lowest index wins on ties.

    Args:
        tracks: list of tracks; each track is a list of objects with ``endTime``.

    Returns:
        Index into ``tracks``.

    Raises:
        ValueError: if ``tracks`` is empty. (The former ``raise '<text>'`` is
            itself a TypeError in Python 3.)
    """
    if len(tracks) < 1:
        raise ValueError('Tracks length must be greater than 0')

    latest_end = None
    pos = 0
    for i, track in enumerate(tracks):
        # Skipping empty tracks before any track[-1] access also covers an
        # empty first track.
        if len(track) == 0:
            continue

        end = track[-1].endTime
        if latest_end is None or end > latest_end:
            latest_end = end
            pos = i
    return pos


def makeTimeOffsets(timeStamps):
    """Distribute the timestamps over ``maxSpeakerAtOnce`` tracks with random starts.

    All timestamps are first moved to offset 0 and the first one is put on
    track 0. Every further timestamp goes onto the track returned by
    trackEndingSoon(); its start is drawn between

    * the end of that track plus ``minimum_time_offset`` and
    * the end of the latest-ending track plus ``maxTimeDistanceBetweenSpeakers``.

    The timestamps are modified in place via setOffset().

    Returns:
        The list of tracks (lists of timestamps).
    """
    tracks = [[] for _ in range(maxSpeakerAtOnce)]
    for stamp in timeStamps:
        stamp.setOffset(0)
    tracks[0].append(timeStamps[0])

    def end_of(track):
        # an empty track counts as ending at time 0
        return 0 if len(track) == 0 else track[-1].endTime

    for stamp in timeStamps[1:]:
        target = trackEndingSoon(tracks)
        earliest = end_of(tracks[target]) + minimum_time_offset

        latest_track = trackEndingLatest(tracks)
        latest = end_of(tracks[latest_track]) + maxTimeDistanceBetweenSpeakers

        stamp.setOffset(np.random.rand() * (latest - earliest) + earliest)
        tracks[target].append(stamp)

    return tracks


def mixRoom(room: pra.ShoeBox, listenerEarPositions, listenerEarDirs, speakerPositions, speakerDirs, wavs, timeStamp):
    """Add the speakers as sources and the listener's ears as microphones to ``room``.

    Every direction is turned into a cardioid via makeCardioid(). The speaker
    azimuths are shifted by ``math.pi`` first (the speaker is turned around).
    Source ``i`` plays ``wavs[i]`` delayed by ``timeStamp[i].startTime``.

    Returns:
        The same ``room`` object.
    """
    earPatterns = [makeCardioid(d) for d in listenerEarDirs]
    # turn speaker around
    speakerPatterns = [makeCardioid([d[0] + math.pi, d[1]])
                       for d in speakerDirs]

    earColumns = np.c_[listenerEarPositions[0], listenerEarPositions[1]]
    mic_array = pra.MicrophoneArray(
        earColumns, directivity=earPatterns, fs=sampleRate)

    for i, position in enumerate(speakerPositions):
        room.add_source(
            position=position,
            directivity=speakerPatterns[i],
            signal=wavs[i],
            delay=timeStamp[i].startTime)
    room.add_microphone_array(mic_array)

    return room


def exportRoom(room: pra.ShoeBox, filepath):
    """Write the signals of the room's microphone array to ``filepath`` (normalized, float32)."""
    microphones = room.mic_array
    microphones.to_wav(filepath, norm=True, bitdepth=np.float32)


In [None]:
"""Exporting training data"""
def createJsonData(sampleNr: int, speakerIdsList, listenerPos, listenerDir,
                   speakrePositionList, speakerDirections, timestamps, words=None):
    """Build the description dictionary of one sample.

    Args:
        sampleNr: id of the sample.
        speakerIdsList: one id per speaker.
        listenerPos: ``[centre, left, right]`` positions of the listener.
        listenerDir: ``[centre, left, right]`` directions of the listener.
        speakrePositionList: position per speaker.
        speakerDirections: ``[azimuth, colatitude]`` per speaker.
        timestamps: one timestamp per speaker; its ``sentence`` attribute, if
            present, is exported as the speaker's words.
        words: not used.

    Returns:
        ``{'sample': {...}}`` holding the speakers, the listener and the source.
    """
    def as_direction(direction):
        return {
            'azimuth': direction[0],
            'colatitude': direction[1],
        }

    speakers = []
    for index, speakerId in enumerate(speakerIdsList):
        stamp = timestamps[index]
        speakers.append({
            'id': speakerId,
            'position': speakrePositionList[index],
            'direction': as_direction(speakerDirections[index]),
            'startTime': stamp.startTime,
            'endTime': stamp.endTime,
            'duration': stamp.duration,
            'words': stamp.sentence if hasattr(stamp, 'sentence') else [],
        })

    listener = {
        'position': listenerPos[0],
        'direction': as_direction(listenerDir[0]),
        'positionLeft': listenerPos[1],
        'directionLeft': as_direction(listenerDir[1]),
        'positionRight': listenerPos[2],
        'directionRight': as_direction(listenerDir[2]),
    }

    return {
        'sample': {
            'id': sampleNr,
            'speakers': speakers,
            'listener': listener,
            'source': source_dataset,
        }
    }


def createFolder(targetFolder):
    """Create ``targetFolder`` as an empty directory.

    An existing directory is removed together with its content first. Missing
    parent directories are created as well. (Previously any mkdir failure,
    e.g. a missing parent, triggered an rmtree of a folder that did not
    exist.)

    Args:
        targetFolder: path of the directory to (re)create.
    """
    if os.path.isdir(targetFolder):
        shutil.rmtree(targetFolder)
    os.makedirs(targetFolder)


def exportSample(sampleNr:int, room, wavs, json_data:any):
    """Write one sample into the folder ``<target_dir>/<sampleNr>``.

    The folder is (re)created via createFolder() and then receives the
    simulated microphone signals ('room.wav'), every dry speaker signal
    ('speaker<i>.wav') and the description ('description.json').
    """
    folder = f'{target_dir}/{sampleNr}'
    createFolder(folder)
    exportRoom(room, folder+'/room.wav')
    for i, wav in enumerate(wavs):
        soundfile.write(folder+f'/speaker{i}.wav', wav, sampleRate)
    with open(folder+'/description.json', 'w') as file:
        json.dump(json_data, file, indent=4)


In [None]:
"""MAIN"""


def generate():
    """Generate and export the training samples.

    For every set of voice lines a room is created, one listener and
    ``speakers_in_room`` speakers are placed, the voice lines get random start
    offsets, and the room is simulated. Before the export all positions and
    azimuths are rotated by ``-baseAngle`` around the listener, so that the
    listener's azimuth becomes 0 in the exported description.

    A failing sample is reported with the reason and its number is reused for
    the next one; generation continues.
    """
    sampleNr = skipSamples
    gen = VoiceLineGeneratorKEC(target_amount_samples, speakers_in_room)
    for (wavs, timestamps) in gen:
        sampleNr += 1
        try:
            # creating parameters
            dims, rt60, absorption = generate_room_characteristics()
            room = createRoom(dims, rt60, absorption)

            pos, dirs, middle, baseAngle = random_persons_in_room(
                dims, speakers_in_room + 1)
            listener_pos = pos[0]
            listener_dir = dirs[0]
            speakerPos = pos[1:]
            speakerDir = dirs[1:]

            earPos, earDirs = get_pos_mics(listener_pos, listener_dir)

            # creating data
            tracks = makeTimeOffsets(timestamps)
            if visualize:
                plotTracks(tracks)
                customPlot(pos, middle, dirs, baseAngle, dims)
            room = mixRoom(room, earPos, earDirs, speakerPos,
                           speakerDir, wavs, timestamps)
            room.simulate()

            allListenerPos = [listener_pos]
            allListenerPos.extend(earPos)

            allListenerDirs = [listener_dir]
            allListenerDirs.extend(earDirs)

            # remove the listener's base angle from all exported geometry
            allListenerPos = [rotateAroundPoint(v, listener_pos, -baseAngle)
                              for v in allListenerPos]
            speakerPos = [rotateAroundPoint(v, listener_pos, -baseAngle)
                          for v in speakerPos]
            allListenerDirs = [[d[0] - baseAngle, d[1]]
                               for d in allListenerDirs]
            speakerDir = [[d[0] - baseAngle, d[1]] for d in speakerDir]

            json_data = createJsonData(sampleNr, range(
                len(wavs)), allListenerPos, allListenerDirs, speakerPos, speakerDir, timestamps)
            exportSample(sampleNr, room, wavs, json_data)

            msg = f'Generated Room Nr.{sampleNr}.'
            print(msg)
        except Exception as e:
            # Best effort: keep generating, but say what went wrong instead of
            # hiding the cause behind a bare 'error'.
            print(f'error while generating sample {sampleNr}: '
                  f'{type(e).__name__}: {e}')
            sampleNr -= 1


# Start the generation only when this code runs as the main module, not on import.
if __name__ == '__main__':
    generate()


In [None]:
"""Tests"""

# Manual round-trip check of toAngle() / toVektor(); disabled by default.
if False:
    # expected angles:
    #       0,      pi,     pi/2,   -pi/2 ,  pi/4,   3/4 pi,  -pi/4,   -3/4 pi
    vs = [[1, 0], [-1, 0], [0, 1], [0, -1], [1, 1], [-1, 1], [1, -1], [-1, -1]]

    # vector -> angle -> unit vector
    angs = [toAngle([0, 0], vector) for vector in vs]
    vs2 = [toVektor(angle, 1) for angle in angs]

    # one row per input: [vector, angle, vector rebuilt from the angle]
    res = [[vector, angle, rebuilt]
           for vector, angle, rebuilt in zip(vs, angs, vs2)]

    pprint(res)
