<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
!pip install pydub
!pip install SpeechRecognition
!pip install praat-parselmouth

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Collecting SpeechRecognition
  Downloading SpeechRecognition-3.8.1-py2.py3-none-any.whl (32.8 MB)
[K     |████████████████████████████████| 32.8 MB 82 kB/s 
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.8.1
Collecting praat-parselmouth
  Downloading praat_parselmouth-0.4.0-cp37-cp37m-manylinux2010_x86_64.whl (10.7 MB)
[K     |████████████████████████████████| 10.7 MB 10.4 MB/s 
Installing collected packages: praat-parselmouth
Successfully installed praat-parselmouth-0.4.0


In [2]:
from pydub import AudioSegment
from pydub.silence import detect_silence
from pydub.silence import detect_nonsilent
from keras.models import load_model

import matplotlib.pyplot as plt
import numpy as np
import os
import librosa
import sklearn
import speech_recognition as sr
import json

import parselmouth
import glob
import seaborn as sns

from IPython.display import Audio

In [3]:
# Mount Google Drive at /content/drive; the model files and the test recording
# used further down are read from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 비추임새판별 / 추임새분류 Model 불러오기

In [4]:
# Load the two saved Keras models from Drive:
#   filler_determine_model  - used by predict_filler (filler vs. non-filler)
#   filler_classifier_model - used by predict_filler_type (which filler)
filler_determine_model = load_model('/content/drive/MyDrive/Colab Notebooks/posco_ai_project/SpeakUP_ML/isfiller_classifier_0928.h5')
filler_classifier_model = load_model('/content/drive/MyDrive/Colab Notebooks/posco_ai_project/SpeakUP_ML/filler_classifier_0928.h5')

## 몇몇 기능 정의
- mfcc padding
- scaler
- 음성파일 amplitude 변환

In [5]:
def pad1d(a, i):
    """Return 1-D array `a` truncated or right-padded with zeros to length `i`."""
    current = a.shape[0]
    if current > i:
        return a[0:i]
    return np.hstack((a, np.zeros(i - current)))


def pad2d(a, i):
    """Return 2-D array `a` truncated or zero-padded along axis 1 to `i` columns."""
    rows, cols = a.shape[0], a.shape[1]
    if cols > i:
        return a[:, 0:i]
    return np.hstack((a, np.zeros((rows, i - cols))))

from sklearn import preprocessing
# Shared StandardScaler. predict_filler and predict_filler_type call
# scaler.fit_transform on every clip, so it is refitted on each clip's own
# MFCC matrix rather than reusing statistics from an earlier fit.
scaler = preprocessing.StandardScaler()

#adjust target amplitude
def match_target_amplitude(sound, target_dBFS):
    """Return `sound` with the gain applied that brings its dBFS to `target_dBFS`."""
    return sound.apply_gain(target_dBFS - sound.dBFS)

## 비추임새판별 함수 / 추임새 분류 함수

In [6]:
def predict_filler(audio_file):
    """Decide whether an audio clip is a filler word.

    The clip is exported to a temporary WAV file, reloaded at 16 kHz with
    librosa, converted to a 100-coefficient MFCC matrix, standardised with the
    shared ``scaler``, cut or zero-padded to 40 frames and passed to
    ``filler_determine_model``.

    Args:
        audio_file: object exposing ``export(path, format=...)`` (the callers
            in this notebook pass slices of a pydub ``AudioSegment``).

    Returns:
        0 when the model's first output is >= its second output (the callers
        treat 0 as "filler"), otherwise 1.
    """
    temp_path = "temp.wav"
    try:
        # Round-trip through a WAV file so librosa does the 16 kHz loading.
        audio_file.export(temp_path, format="wav")
        wav, _ = librosa.load(temp_path, sr=16000)
    finally:
        # Remove the temporary file even when export/load fails.
        if os.path.exists(temp_path):
            os.remove(temp_path)

    # The signal is passed by keyword: newer librosa releases no longer accept
    # it positionally. n_fft / hop_length must stay in sync with the model input.
    mfcc = librosa.feature.mfcc(y=wav, sr=16000, n_mfcc=100, n_fft=400, hop_length=160)
    mfcc = scaler.fit_transform(mfcc)

    padded_mfcc = pad2d(mfcc, 40)
    # (100, 40) -> (1, 100, 40, 1): add the batch and channel axes.
    padded_mfcc = np.expand_dims(padded_mfcc, [0, 3])

    result = filler_determine_model.predict(padded_mfcc)

    return 0 if result[0][0] >= result[0][1] else 1

In [7]:
def predict_filler_type(audio_file):
    """Classify which filler word an audio clip contains.

    Uses the same feature pipeline as ``predict_filler`` (temporary WAV file,
    16 kHz load, 100-coefficient MFCC, standardisation, cut/pad to 40 frames)
    and feeds the result to ``filler_classifier_model``.

    Args:
        audio_file: object exposing ``export(path, format=...)`` (the callers
            in this notebook pass slices of a pydub ``AudioSegment``).

    Returns:
        Index of the largest model output (``np.argmax``). The transcript code
        in this notebook maps 0 to '어', 1 to '음' and anything else to '그'.
    """
    temp_path = "temp.wav"
    try:
        audio_file.export(temp_path, format="wav")
        wav, _ = librosa.load(temp_path, sr=16000)
    finally:
        # Remove the temporary file even when export/load fails.
        if os.path.exists(temp_path):
            os.remove(temp_path)

    # n_fft and hop_length must be given explicitly, otherwise the features no
    # longer match the model input. The signal is passed by keyword because
    # newer librosa releases no longer accept it positionally.
    mfcc = librosa.feature.mfcc(y=wav, sr=16000, n_mfcc=100, n_fft=400, hop_length=160)
    mfcc = scaler.fit_transform(mfcc)
    padded_mfcc = pad2d(mfcc, 40)
    # (100, 40) -> (1, 100, 40, 1): add the batch and channel axes.
    padded_mfcc = np.expand_dims(padded_mfcc, [0, 3])

    result = filler_classifier_model.predict(padded_mfcc)

    return np.argmax(result)

## 목소리 떨림 분석 함수

In [8]:
def cut_off(file):
    """Compute interquartile-range based pitch bounds for a recording.

    Args:
        file: path of the sound file handed to ``parselmouth.Sound``.

    Returns:
        ``(lower, upper)`` where ``lower = Q1 - 0.5 * IQR`` and
        ``upper = Q3 + 0.5 * IQR`` of the per-frame pitch frequencies.
        Note the order: ``cut_mean`` returns ``(upper, lower)`` instead.
    """
    pitch = parselmouth.Sound(file).to_pitch()
    frequencies = pitch.selected_array['frequency']
    # NOTE(review): every pitch frame is used as-is. If unvoiced frames are
    # reported as 0 Hz they pull both quartiles down - confirm whether they
    # should be excluded before taking the quantiles.
    q25, q75 = np.quantile(frequencies, [0.25, 0.75])
    margin = (q75 - q25) * 0.5
    return q25 - margin, q75 + margin

def cut_mean(file):
    """Compute pitch bounds around the mean pitch of a recording.

    Args:
        file: path of the sound file handed to ``parselmouth.Sound``.

    Returns:
        ``(upper, lower)`` where ``upper = mean + 400`` and
        ``lower = mean - 100`` of the per-frame pitch frequencies.
        Note the order: ``cut_off`` returns ``(lower, upper)`` instead.
    """
    pitch = parselmouth.Sound(file).to_pitch()
    mean_frequency = np.mean(pitch.selected_array['frequency'])
    return mean_frequency + 400, mean_frequency - 100

def brr_ok(file, up, down, audio_len):
    """Report whether a recording's pitch leaves the [down, up] band too often.

    Counts the pitch frames whose frequency is above ``up`` or below ``down``
    and divides the count by ``audio_len``.

    Args:
        file: path of the sound file handed to ``parselmouth.Sound``.
        up: upper pitch bound.
        down: lower pitch bound.
        audio_len: length of the recording (the caller in this notebook passes
            seconds).

    Returns:
        True when more than 3 out-of-band frames occur per unit of
        ``audio_len``, otherwise False. A non-positive ``audio_len`` yields
        False instead of dividing by zero.
    """
    if audio_len <= 0:
        return False

    frequencies = parselmouth.Sound(file).to_pitch().selected_array['frequency']
    out_of_band = int(np.count_nonzero((frequencies > up) | (frequencies < down)))
    return bool(out_of_band / audio_len > 3)

## 비침묵/침묵 구간 자르기

In [24]:
def create_json(audio_file):
    """Split a recording into tagged silence / non-silence / filler intervals.

    Non-silent intervals are detected with ``detect_nonsilent``. Each interval
    is checked with ``predict_filler``; intervals longer than 460 ms that look
    like fillers are split further by ``shorter_filler``.

    Args:
        audio_file: recording to analyse; it is sliced by millisecond position
            and ``len(audio_file)`` is used as its total length.

    Returns:
        List of dicts ``{'start', 'end', 'tag'}`` in the order they were
        produced. Tags: '0000' silence, '1000' non-silence, '1111' filler.
        A recording without any non-silent interval yields a single '0000'
        entry covering the whole recording (or an empty list when its length
        is 0).
    """
    intervals_jsons = []

    min_silence_length = 70
    intervals = detect_nonsilent(audio_file,
                                min_silence_len=min_silence_length,
                                silence_thresh=-32.64
                                )

    # Nothing rises above the silence threshold: intervals[0] below would raise
    # IndexError, so report the whole recording as silence instead.
    if not intervals:
        if len(audio_file) > 0:
            intervals_jsons.append({'start':0,'end':len(audio_file),'tag':'0000'})
        return intervals_jsons

    if intervals[0][0] != 0:
        intervals_jsons.append({'start':0,'end':intervals[0][0],'tag':'0000'}) # tag: 0000 means silence

    non_silence_start = intervals[0][0]
    before_silence_start = intervals[0][1]

    for interval in intervals:
        interval_audio = audio_file[interval[0]:interval[1]]

        # A gap of 800 ms or more since the previous interval: close the
        # running non-silence segment (extended by 200 ms), restart the next
        # one 200 ms before this interval, and record the gap as silence.
        if (interval[0]-before_silence_start) >= 800:
            intervals_jsons.append({'start':non_silence_start,'end':before_silence_start+200,'tag':'1000'}) # tag: 1000 means non-silence
            non_silence_start = interval[0]-200
            intervals_jsons.append({'start':before_silence_start,'end':interval[0],'tag':'0000'}) # tag: 0000 means silence

        if predict_filler(interval_audio) == 0 : # classified as a filler
            if len(interval_audio) <= 460: # interval is at most 460 ms long
                intervals_jsons.append({'start':non_silence_start,'end':interval[0],'tag':'1000'}) # tag: 1000 means non-silence
                non_silence_start = interval[0]
                intervals_jsons.append({'start':interval[0],'end':interval[1],'tag':'1111'}) # tag: 1111 means filler word
            else: # interval is longer than 460 ms: split it further
                non_silence_start = shorter_filler(intervals_jsons, interval_audio, min_silence_length, interval[0], non_silence_start)

        before_silence_start = interval[1]

    # Close the trailing non-silence segment.
    if non_silence_start != len(audio_file):
        intervals_jsons.append({'start':non_silence_start,'end':len(audio_file),'tag':'1000'})

    return intervals_jsons

In [10]:
def shorter_filler(json_result, audio_file, min_silence_len, start_time, non_silence_start):
    """Re-split a long filler candidate with a shorter silence window.

    The clip is split again with ``min_silence_len / 1.2`` (truncated to int).
    Pieces of 460 ms or more are split recursively; shorter pieces are checked
    with ``predict_filler`` and, when classified as fillers, appended to
    ``json_result`` together with the non-silence segment that precedes them.

    Args:
        json_result: list that receives the new interval dicts (mutated).
        audio_file: clip to split.
        min_silence_len: silence window used by the caller.
        start_time: offset of ``audio_file`` inside the full recording; added
            to every position written to ``json_result``.
        non_silence_start: start of the currently open non-silence segment.

    Returns:
        The updated start of the open non-silence segment.

    NOTE(review): the silence window shrinks on every recursion level and no
    lower bound is enforced here - confirm that very small windows are intended.
    """
    tighter_silence_len = int(min_silence_len / 1.2)

    pieces = detect_nonsilent(audio_file,
                              min_silence_len=tighter_silence_len,
                              silence_thresh=-32.64)

    for rel_start, rel_end in pieces:
        piece = audio_file[rel_start:rel_end]
        abs_start = start_time + rel_start

        # Still as long as the 40-frame padding window: split again.
        if rel_end - rel_start >= 460:
            non_silence_start = shorter_filler(json_result, piece, tighter_silence_len,
                                               abs_start, non_silence_start)
            continue

        # Short enough to classify; anything that is not a filler is skipped.
        if predict_filler(piece) != 0:
            continue

        # Close the non-silence segment in front of the filler, then tag the filler.
        json_result.append({'start': non_silence_start, 'end': abs_start, 'tag': '1000'})
        non_silence_start = abs_start
        json_result.append({'start': abs_start, 'end': start_time + rel_end, 'tag': '1111'})

    return non_silence_start

## 구간별 태깅(침묵/추임새) 및 STT(비침묵) 호출

In [11]:
import json
def STT_with_json_google(audio_file, jsons, brrupper=None, brrlower=None):
    """Build the transcript and the filler / silence statistics from tagged intervals.

    NOTE: a later cell of this notebook defines ``STT_with_json_google`` again
    with required ``brrupper`` / ``brrlower`` parameters; when the notebook is
    run top to bottom that later definition replaces this one.

    Each entry of ``jsons`` is handled according to its tag:
      * '0000' (silence): before anything has been recognised it is added to
        the initial delay; afterwards it is added to the silence total and
        written to the transcript as "(침묵 N초)..".
      * '1111' (filler): classified with ``predict_filler_type`` and counted.
      * '1000' (speech): sent to Google speech recognition. When recognition
        fails, the start of the segment is remembered and the audio is retried
        together with the next '1000' segment.

    Args:
        audio_file: recording; sliced by millisecond position and read for
            ``duration_seconds``.
        jsons: list of ``{'start', 'end', 'tag'}`` dicts.
        brrupper: optional upper pitch bound forwarded to ``brr_ok``.
        brrlower: optional lower pitch bound forwarded to ``brr_ok``.
            The tremor check ("(떨림)" suffix) only runs when both bounds are
            given; previously this function referenced these names without
            defining them, so every speech segment failed silently.

    Returns:
        ``(transcript_json, statistics_filler_json, statistics_silence_json)``:
        the transcript entries, a one-element list with the filler counts, and
        a one-element list with the initial-delay / silence / speech shares in
        percent of the total duration (all 0.0 for a zero-length recording).
    """
    recognizer = sr.Recognizer()
    temp_path = "temp.wav"

    transcript_json = []
    filler_1 = 0  # number of '어'
    filler_2 = 0  # number of '음'
    filler_3 = 0  # number of '그'
    audio_total_length = audio_file.duration_seconds
    first_silence = 0       # seconds of silence before the first recognised item
    silence_interval = 0    # seconds of silence after the first recognised item
    num = 0                 # recognised fillers + speech segments so far
    unrecognizable_start = None  # start (ms) of speech still waiting for a successful STT

    def transcribe(start_ms, end_ms):
        """Return (text, trembling) for audio_file[start_ms:end_ms], or None when STT fails."""
        try:
            audio_file[start_ms:end_ms].export(temp_path, format="wav")
            with sr.AudioFile(temp_path) as source:
                audio = recognizer.record(source)
            try:
                text = recognizer.recognize_google(audio_data=audio, language="ko-KR")
            except Exception:
                # Best effort: unintelligible audio or an API failure means the
                # caller retries this audio merged with the next speech segment.
                return None
            trembling = False
            if brrupper is not None and brrlower is not None:
                try:
                    trembling = brr_ok(temp_path, brrupper, brrlower, (end_ms - start_ms) / 1000)
                except Exception:
                    # A failed pitch analysis must not discard recognised text.
                    trembling = False
            return text, trembling
        finally:
            if os.path.exists(temp_path):
                os.remove(temp_path)

    for js in jsons:
        tag = js['tag']

        if tag == '0000':
            duration = (js['end'] - js['start']) / 1000
            if num == 0:
                # Still waiting for the first recognised item: initial delay.
                first_silence = first_silence + duration
            else:
                silence_interval = silence_interval + duration
                silence = "(침묵 " + str(round(duration)) + "초).."
                transcript_json.append({'start':js['start'],'end':js['end'],'tag':'0000','result':silence})

        elif tag == '1111':
            if num == 0:
                # First recognised item: emit the initial delay in front of it.
                silence = "(침묵 " + str(round(first_silence)) + "초).."
                transcript_json.append({'start':0,'end':js['start'],'tag':'0000','result':silence})
            # Tell the fillers (어, 음, 그) apart.
            filler_type = predict_filler_type(audio_file[js['start']:js['end']])
            if filler_type == 0:
                transcript_json.append({'start':js['start'],'end':js['end'],'tag':'1001','result':'어(추임새)'})
                filler_1 = filler_1 + 1
            elif filler_type == 1:
                transcript_json.append({'start':js['start'],'end':js['end'],'tag':'1010','result':'음(추임새)'})
                filler_2 = filler_2 + 1
            else:
                transcript_json.append({'start':js['start'],'end':js['end'],'tag':'1100','result':'그(추임새)'})
                filler_3 = filler_3 + 1
            num = num + 1

        elif tag == '1000':  # the only tag that is sent to speech recognition
            segment_start = js['start'] if unrecognizable_start is None else unrecognizable_start
            recognised = transcribe(segment_start, js['end'])

            if recognised is None:
                if unrecognizable_start is None:
                    unrecognizable_start = js['start']
                continue

            stt, trembling = recognised
            if num == 0:
                silence = "(침묵 " + str(round(first_silence)) + "초).."
                transcript_json.append({'start':0,'end':js['start'],'tag':'0000','result':silence})
            result_text = stt + "(떨림)" if trembling else stt
            transcript_json.append({'start':segment_start,'end':js['end'],'tag':'1000','result':result_text})
            unrecognizable_start = None
            num = num + 1

    def as_percent(seconds):
        # Guard the empty-recording case instead of dividing by zero.
        return 100 * seconds / audio_total_length if audio_total_length else 0.0

    statistics_filler_json = [{'어':filler_1, '음':filler_2, '그':filler_3}]
    # first_silence stops growing once something was recognised, so it is the
    # initial delay even when nothing was recognised at all.
    statistics_silence_json = [{'통역개시지연시간':as_percent(first_silence), '침묵시간':as_percent(silence_interval), '발화시간':as_percent(audio_total_length - first_silence - silence_interval)}]
    return transcript_json, statistics_filler_json, statistics_silence_json

In [19]:
import json
def STT_with_json_google(audio_file, jsons, brrupper, brrlower):
    """Build the transcript and the filler / silence statistics from tagged intervals.

    Each entry of ``jsons`` is handled according to its tag:
      * '0000' (silence): before anything has been recognised it is added to
        the initial delay; afterwards it is added to the silence total and
        written to the transcript as "(침묵 N초)..".
      * '1111' (filler): classified with ``predict_filler_type`` and counted.
      * '1000' (speech): sent to Google speech recognition; ``brr_ok`` decides
        whether "(떨림)" is appended to the text. When recognition fails, the
        start of the segment is remembered and the audio is retried together
        with the next '1000' segment.

    Args:
        audio_file: recording; sliced by millisecond position and read for
            ``duration_seconds``.
        jsons: list of ``{'start', 'end', 'tag'}`` dicts.
        brrupper: upper pitch bound forwarded to ``brr_ok``.
        brrlower: lower pitch bound forwarded to ``brr_ok``.

    Returns:
        ``(transcript_json, statistics_filler_json, statistics_silence_json)``:
        the transcript entries, a one-element list with the filler counts, and
        a one-element list with the initial-delay / silence / speech shares in
        percent of the total duration (all 0.0 for a zero-length recording).
    """
    recognizer = sr.Recognizer()
    temp_path = "temp.wav"

    transcript_json = []
    filler_1 = 0  # number of '어'
    filler_2 = 0  # number of '음'
    filler_3 = 0  # number of '그'
    audio_total_length = audio_file.duration_seconds
    first_silence = 0       # seconds of silence before the first recognised item
    silence_interval = 0    # seconds of silence after the first recognised item
    num = 0                 # recognised fillers + speech segments so far
    # None instead of 0 as "nothing pending", so a segment starting at 0 ms
    # can be remembered as well.
    unrecognizable_start = None

    def transcribe(start_ms, end_ms):
        """Return (text, trembling) for audio_file[start_ms:end_ms], or None when STT fails."""
        try:
            audio_file[start_ms:end_ms].export(temp_path, format="wav")
            with sr.AudioFile(temp_path) as source:
                audio = recognizer.record(source)
            try:
                text = recognizer.recognize_google(audio_data=audio, language="ko-KR")
            except Exception:
                # Best effort: unintelligible audio or an API failure means the
                # caller retries this audio merged with the next speech segment.
                return None
            try:
                trembling = brr_ok(temp_path, brrupper, brrlower, (end_ms - start_ms) / 1000)
            except Exception:
                # A failed pitch analysis must not discard recognised text.
                trembling = False
            return text, trembling
        finally:
            # The temporary file is only needed for this one segment.
            if os.path.exists(temp_path):
                os.remove(temp_path)

    for js in jsons:
        tag = js['tag']

        if tag == '0000':
            duration = (js['end'] - js['start']) / 1000
            if num == 0:
                # Still waiting for the first recognised item: initial delay.
                first_silence = first_silence + duration
            else:
                silence_interval = silence_interval + duration
                silence = "(침묵 " + str(round(duration)) + "초).."
                transcript_json.append({'start':js['start'],'end':js['end'],'tag':'0000','result':silence})

        elif tag == '1111':
            if num == 0:
                # First recognised item: emit the initial delay in front of it.
                silence = "(침묵 " + str(round(first_silence)) + "초).."
                transcript_json.append({'start':0,'end':js['start'],'tag':'0000','result':silence})
            # Tell the fillers (어, 음, 그) apart.
            filler_type = predict_filler_type(audio_file[js['start']:js['end']])
            if filler_type == 0:
                transcript_json.append({'start':js['start'],'end':js['end'],'tag':'1001','result':'어(추임새)'})
                filler_1 = filler_1 + 1
            elif filler_type == 1:
                transcript_json.append({'start':js['start'],'end':js['end'],'tag':'1010','result':'음(추임새)'})
                filler_2 = filler_2 + 1
            else:
                transcript_json.append({'start':js['start'],'end':js['end'],'tag':'1100','result':'그(추임새)'})
                filler_3 = filler_3 + 1
            num = num + 1

        elif tag == '1000':  # the only tag that is sent to speech recognition
            segment_start = js['start'] if unrecognizable_start is None else unrecognizable_start
            recognised = transcribe(segment_start, js['end'])

            if recognised is None:
                if unrecognizable_start is None:
                    unrecognizable_start = js['start']
                continue

            stt, trembling = recognised
            if num == 0:
                silence = "(침묵 " + str(round(first_silence)) + "초).."
                transcript_json.append({'start':0,'end':js['start'],'tag':'0000','result':silence})
            result_text = stt + "(떨림)" if trembling else stt
            transcript_json.append({'start':segment_start,'end':js['end'],'tag':'1000','result':result_text})
            unrecognizable_start = None
            num = num + 1

    def as_percent(seconds):
        # Guard the empty-recording case instead of dividing by zero.
        return 100 * seconds / audio_total_length if audio_total_length else 0.0

    statistics_filler_json = [{'어':filler_1, '음':filler_2, '그':filler_3}]
    # first_silence stops growing once something was recognised, so it is the
    # initial delay even when nothing was recognised at all (the separate
    # first_silence_interval variable was unbound in that case).
    statistics_silence_json = [{'통역개시지연시간':as_percent(first_silence), '침묵시간':as_percent(silence_interval), '발화시간':as_percent(audio_total_length - first_silence - silence_interval)}]
    return transcript_json, statistics_filler_json, statistics_silence_json


## 최종 함수

In [28]:
def make_transcript_google(audio_file_path, brrupper, brrlower):
    """Run the whole pipeline on one recording.

    The recording is loaded, normalised to -20 dBFS, split into tagged
    intervals with ``create_json`` and transcribed with
    ``STT_with_json_google``.

    Args:
        audio_file_path: path of the recording. The container format is left
            to ``AudioSegment.from_file`` instead of being fixed to mp3, since
            this notebook calls the function with a .wav file.
        brrupper: upper pitch bound for the tremor check.
        brrlower: lower pitch bound for the tremor check.

    Returns:
        ``(transcript_json, statistics_filler_json, statistics_silence_json)``
        exactly as returned by ``STT_with_json_google``.
    """
    audio = AudioSegment.from_file(audio_file_path)
    normalized_audio = match_target_amplitude(audio, -20.0)
    intervals_jsons = create_json(normalized_audio)
    return STT_with_json_google(normalized_audio, intervals_jsons, brrupper, brrlower)

In [29]:
import pandas as pd
def json_to_text(transcript):
    """Join the 'result' texts of a transcript into one space-separated string.

    Consecutive entries with an identical 'result' are collapsed into the
    first one (this is what merges runs of the same filler).

    Args:
        transcript: list of dicts that each carry a 'result' string.

    Returns:
        The joined text without trailing whitespace; '' for an empty
        transcript (which previously raised KeyError).
    """
    if len(transcript) == 0:
        return ''

    frame = pd.DataFrame(transcript)
    # Keep a row only when its text differs from the row directly above it.
    is_new_text = frame['result'] != frame['result'].shift()
    return ' '.join(frame.loc[is_new_text, 'result']).rstrip()

In [36]:
# Pitch bounds for the tremor analysis; cut_off returns them as (lower, upper).
lower,upper = cut_off("/content/drive/MyDrive/test/custom5.wav")

In [37]:
# make_transcript_google takes (path, brrupper, brrlower), so the upper bound is passed first.
transcript_json, statistics_filler_json, statistics_silence_json = make_transcript_google("/content/drive/MyDrive/test/custom5.wav", upper, lower)

  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]


In [38]:
transcript_json
# 0.4 s
# short_filler

[{'end': 2221, 'result': '(침묵 2초)..', 'start': 0, 'tag': '0000'},
 {'end': 2227, 'result': '그(추임새)', 'start': 2221, 'tag': '1100'},
 {'end': 2234, 'result': '그(추임새)', 'start': 2228, 'tag': '1100'},
 {'end': 2245, 'result': '그(추임새)', 'start': 2235, 'tag': '1100'},
 {'end': 2252, 'result': '그(추임새)', 'start': 2246, 'tag': '1100'},
 {'end': 2259, 'result': '그(추임새)', 'start': 2253, 'tag': '1100'},
 {'end': 2277, 'result': '그(추임새)', 'start': 2260, 'tag': '1100'},
 {'end': 2534, 'result': '어(추임새)', 'start': 2278, 'tag': '1001'},
 {'end': 2556, 'result': '그(추임새)', 'start': 2535, 'tag': '1100'},
 {'end': 2578, 'result': '그(추임새)', 'start': 2557, 'tag': '1100'},
 {'end': 2600, 'result': '그(추임새)', 'start': 2579, 'tag': '1100'},
 {'end': 2676, 'result': '그(추임새)', 'start': 2670, 'tag': '1100'},
 {'end': 2683, 'result': '그(추임새)', 'start': 2677, 'tag': '1100'},
 {'end': 2687, 'result': '그(추임새)', 'start': 2684, 'tag': '1100'},
 {'end': 5511, 'result': '실제로 추임새 모델들을(떨림)', 'start': 2073, 'tag': '1000'},


In [39]:
# Flatten the transcript entries into one line of text (consecutive identical entries are collapsed).
json_to_text(transcript_json)

'(침묵 2초).. 그(추임새) 어(추임새) 그(추임새) 실제로 추임새 모델들을(떨림) (침묵 1초).. 사용해서 (침묵 1초).. 음(추임새) 그 전사 알고리즘 API를 돌려보니 (침묵 1초).. 음(추임새) 그(추임새) 별도로'

## Reference
https://github.com/EwhaSpeakUP/SpeakUP_ML <br>
https://youdaeng-com.tistory.com/5