In [None]:
#@title Click to install the necessary packages. It may take a while. {display-mode: "form"}
%%capture
from google.colab import drive
drive.mount('/content/drive/')
!pip install ffmpeg-python
!pip install IPython
!pip install pydub
!pip install -U openai-whisper
!pip install praat-parselmouth

In [None]:
#@title Click to record the audio. Try saying "I like this product" in a certain or uncertain tone. {display-mode: "form"}
from IPython.display import HTML, Audio
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
from scipy.io.wavfile import read as wav_read
import io
import ffmpeg
# No %%capture here: a cell magic is only valid on the first line of a cell, and
# capturing this cell's output would also hide the recorder widget it displays.
# Reference: https://www.youtube.com/watch?v=4DGkgUffWxs
# HTML/JavaScript recorder widget rendered into the cell output.
# It exposes a global promise named `data` that settles with the recording as a
# base64 data URL; Python reads it with eval_js("data").
AUDIO_HTML = """
<script>
// Single button that reports the recorder state and stops the recording.
var my_div = document.createElement("DIV");
var my_btn = document.createElement("BUTTON");
var t = document.createTextNode("Waiting for microphone access...");

my_btn.appendChild(t);
my_div.appendChild(my_btn);
document.body.appendChild(my_div);

var base64data = 0;
var reader;
var recorder, gumStream;
var recordButton = my_btn;

// The promise is settled by the FileReader callbacks below, i.e. exactly when
// the encoded audio is available, instead of after a fixed delay.
var resolveData, rejectData;
var data = new Promise(function(resolve, reject) {
  resolveData = resolve;
  rejectData = reject;
});

var handleSuccess = function(stream) {
  gumStream = stream;
  recorder = new MediaRecorder(stream);
  recorder.ondataavailable = function(e) {
    // Offer an in-page preview of what was recorded.
    var url = URL.createObjectURL(e.data);
    var preview = document.createElement('audio');
    preview.controls = true;
    preview.src = url;
    document.body.appendChild(preview);

    reader = new FileReader();
    reader.onload = function() {
      base64data = reader.result;
      resolveData(base64data.toString());
    };
    reader.onerror = function() {
      rejectData(new Error("Could not read the recorded audio: " + reader.error));
    };
    reader.readAsDataURL(e.data);
  };
  recorder.start();
  // Only claim to be recording once the recorder has actually started.
  recordButton.innerText = "Recording... press to stop";
};

function toggleRecording() {
  if (recorder && recorder.state == "recording") {
      recorder.stop();
      gumStream.getAudioTracks()[0].stop();
      recordButton.innerText = "You can play the recording!"
  }
}

recordButton.onclick = toggleRecording;

navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess).catch(function(err) {
  // Surface the failure to Python instead of leaving eval_js waiting forever.
  recordButton.innerText = "Microphone unavailable";
  rejectData(new Error("Microphone unavailable: " + err));
});

</script>
"""

def get_audio():
  """Record audio in the browser and return it as ``(samples, sample_rate)``.

  Displays the recorder widget, waits for the JavaScript ``data`` promise
  (a base64 data URL), converts the recording to WAV by piping it through
  ffmpeg, and parses the result with ``scipy.io.wavfile.read``.

  Returns:
    A tuple ``(audio, sr)``: the samples as returned by ``wav_read`` and the
    sample rate.

  Raises:
    RuntimeError: if the browser returned no audio payload, or if ffmpeg
      produced too little output to contain a WAV header.
  """
  display(HTML(AUDIO_HTML))
  data = eval_js("data")

  # A data URL has the form "<header>,<base64 payload>".
  _header, separator, payload = str(data).partition(',')
  if not separator or not payload:
    raise RuntimeError("No audio was captured; run the cell again and record something.")
  binary = b64decode(payload)

  process = (ffmpeg.input('pipe:0')
    .output('pipe:1', format='wav')
    .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True))
  output, err = process.communicate(input=binary)
  if len(output) < 44:
    # Fewer bytes than the RIFF, fmt and data chunk headers need: conversion failed.
    details = err.decode(errors="replace") if err else ""
    raise RuntimeError("ffmpeg could not convert the recording to WAV. " + details)

  # Overwrite bytes 4..7 (the RIFF chunk size) with the real payload length,
  # little-endian, truncated to 32 bits, before handing the bytes to wav_read.
  riff_chunk_size = (len(output) - 8) & 0xFFFFFFFF
  riff = output[:4] + riff_chunk_size.to_bytes(4, 'little') + output[8:]
  sr, audio = wav_read(io.BytesIO(riff))
  return audio, sr
# Record from the microphone; `audio` receives the samples and `sr` the sample rate.
audio, sr = get_audio()

In [None]:
#@title Or click to upload the audio {display-mode: "form"}
from google.colab import files
import io
from scipy.io.wavfile import read as wav_read
from pydub import AudioSegment
import numpy as np
def upload_and_read_single_audio():
    """Prompt for one .wav or .mp3 upload and decode it to mono samples.

    Multi-channel audio is reduced to mono by averaging the channels. WAV
    samples keep the dtype reported by ``scipy.io.wavfile.read`` (also after
    down-mixing); MP3 samples are returned as ``int16``.

    Returns:
        ``(sample_rate, samples)`` on success, or ``None`` (after printing a
        message) when the number of uploaded files is not exactly one, the
        extension is unsupported, or decoding fails.
    """
    print("Please upload a single audio file (.wav or .mp3):")
    uploaded = files.upload()  # User uploads a file

    # Exactly one file is required.
    if len(uploaded) != 1:
        print("Please upload only one audio file at a time.")
        return None
    file_name, file_data = next(iter(uploaded.items()))
    lowered_name = file_name.lower()
    try:
        if lowered_name.endswith('.wav'):
            sr, audio = wav_read(io.BytesIO(file_data))
            if audio.ndim == 2:
                # Average the channels, then restore the original dtype so the
                # result matches mono WAV input and the MP3 branch below.
                audio = audio.mean(axis=1).astype(audio.dtype)
        elif lowered_name.endswith('.mp3'):
            audio_segment = AudioSegment.from_file(io.BytesIO(file_data), format='mp3')
            sr = audio_segment.frame_rate
            samples = np.array(audio_segment.get_array_of_samples())
            channels = audio_segment.channels
            if channels > 1:
                # Samples are interleaved per frame: one column per channel.
                samples = samples.reshape((-1, channels)).mean(axis=1)
            audio = samples.astype(np.int16)  # Ensure dtype is int16 for consistency with scipy.io.wavfile
        else:
            print(f"Unsupported file type uploaded: {file_name}. Please upload only .wav or .mp3 files.")
            return None
    except Exception as e:
        print(f"Failed to load {file_name}: {e}")
        return None

    print(f"Loaded {file_name} successfully with sample rate {sr} Hz.")
    return sr, audio

# upload_and_read_single_audio() returns None when nothing usable was uploaded;
# only overwrite `sr` / `audio` on success so that a failed upload neither
# raises on tuple unpacking nor discards audio obtained earlier.
upload_result = upload_and_read_single_audio()
if upload_result is not None:
    sr, audio = upload_result


In [None]:
# @title Click to predict certainty degree from the recorded or uploaded audio { display-mode: "form" }
%%capture
import whisper
from scipy.io.wavfile import write as wav_write
import tempfile
from scipy.interpolate import CubicSpline
import parselmouth
from parselmouth.praat import call
import joblib
import torch

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def _write_temp_wav(audio, sr):
    """Write ``audio`` to a temporary mono WAV file and return the file path.

    Args:
        audio: sample array, 1-D or ``(frames, channels)``.
        sr: sample rate in Hz.

    Returns:
        Path of the WAV file. The file is created with ``delete=False`` so it
        stays available to later steps of the notebook.
    """
    samples = np.asarray(audio)
    if samples.ndim == 2:
        # Average the channels to obtain a single track.
        samples = samples.mean(axis=1)
    if np.issubdtype(samples.dtype, np.floating):
        # Float samples larger than 1 in magnitude (e.g. averaged integer PCM)
        # are rescaled into [-1, 1] before being stored as 32-bit float.
        peak = float(np.max(np.abs(samples))) if samples.size else 0.0
        if peak > 1.0:
            samples = samples / peak
        samples = samples.astype(np.float32)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
        wav_path = tmpfile.name
    wav_write(wav_path, int(sr), samples)
    return wav_path


def predict_text_from_audio(filename):
    """Transcribe the audio file at ``filename`` with the Whisper "base" model.

    Args:
        filename: path of the audio file to transcribe.

    Returns:
        The value of the ``"text"`` entry of Whisper's transcription result.
    """
    model = whisper.load_model("base", device=DEVICE)
    result = model.transcribe(filename)
    return result["text"]


# `audio` and `sr` come from the recording or upload cell run before this one.
# `filename` and `predicted_text` are read by the code that follows, so they
# are defined here.
filename = _write_temp_wav(audio, sr)
predicted_text = predict_text_from_audio(filename)
# Preprocess the audio for uncertainty prediction
def F0_contour(file_name):
    """Extract the F0 track of an audio file, keeping only frames with F0 > 0.

    Args:
        file_name: path of the audio file handed to ``parselmouth.Sound``.

    Returns:
        ``[times, f0_values]``: two equal-length 1-D numpy arrays with the frame
        times and the F0 values of every frame whose F0 is positive.

    Raises:
        ValueError: if no frame has a positive F0 (e.g. silence or noise only).
    """
    sound = parselmouth.Sound(file_name)
    # Positional arguments of Praat's "To Pitch (ac)" command, unchanged.
    # NOTE(review): presumably 0.01 is the time step and 80.0 / 300 the pitch
    # floor / ceiling; confirm the remaining values against the Praat manual.
    pitch = call(sound, "To Pitch (ac)", 0.01, 80.0, 5, 1, 0.03, 0.6, 0.03, 0.6, 0.14, 300)
    f0_values = np.asarray(pitch.selected_array['frequency'], dtype=float)
    f0_times = np.asarray(pitch.ts(), dtype=float)

    # NaN compares as False, so one comparison drops both NaN and non-positive frames.
    with np.errstate(invalid="ignore"):
        voiced = f0_values > 0.0
    if not voiced.any():
        raise ValueError(f"No frame with a positive F0 was found in {file_name}.")
    return [f0_times[voiced], f0_values[voiced]]
# Extract the [times, F0 values] pair for the audio file at `filename`.
# `filename` must already be defined (path of the audio file) when this line runs.
time_F0s=F0_contour(filename)

def compute_cspline_DFT(time_F0s, n_points=400, n_coeffs=10):
    """Turn an F0 contour into a fixed-length vector of DFT features.

    The time axis is rescaled to [0, 1], the contour is interpolated with a
    natural cubic spline and resampled at ``n_points`` evenly spaced
    positions, and the first ``n_coeffs`` FFT coefficients are normalised by
    the magnitude of the zero-frequency coefficient.

    Args:
        time_F0s: pair ``[times, f0_values]`` of equal-length sequences.
        n_points: number of resampling positions (default 400).
        n_coeffs: number of leading FFT coefficients to keep (default 10).

    Returns:
        A flat list of ``2 * n_coeffs`` values: for each kept coefficient its
        magnitude followed by magnitude multiplied by phase angle.

    Raises:
        ValueError: if fewer than two samples are given or the first and last
            time stamps coincide, so the time axis cannot be normalised.
    """
    f0_times = np.asarray(time_F0s[0], dtype=float)
    f0_values = np.asarray(time_F0s[1], dtype=float)
    if f0_times.size < 2 or f0_times[-1] == f0_times[0]:
        raise ValueError("At least two F0 samples at distinct times are required.")

    duration = f0_times[-1] - f0_times[0]
    normalized_times = (f0_times - f0_times[0]) / duration
    spline = CubicSpline(normalized_times, f0_values, bc_type="natural")
    resampled = spline(np.linspace(0, 1, n_points))

    spectrum = np.fft.fft(resampled)[:n_coeffs]
    spectrum = spectrum / np.abs(spectrum[0])  # Normalize by the magnitude of the zero frequency

    features = []
    for coefficient in spectrum:
        magnitude = np.abs(coefficient)
        features.extend([magnitude, magnitude * np.angle(coefficient)])
    return features
# Turn the F0 contour into the feature vector expected by the classifier.
model_input = compute_cspline_DFT(time_F0s)

# Load the saved classifier from the mounted Google Drive.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files from a trusted source.
uncertainty_classifier = joblib.load(
    '/content/drive/My Drive/JM_1st_R_R/New Code/random_forest_maxdepth20_short_medium_long_parameter0.62_1.pkl'
)

# predict_proba receives a single sample; keep column 1 of its only row.
class_probabilities = uncertainty_classifier.predict_proba([model_input])
predcted_uncertainty = class_probabilities[:, 1][0]

In [None]:
#@title Click to see the transcription and certainty degree {display-mode: "form"}
# Show the transcription, then the predicted score; each label and its value
# are joined by a single space, exactly as print() does with two arguments.
transcription_line = " ".join(["Transcription: ", str(predicted_text)])
print(transcription_line)
certainty_line = " ".join(
    ["Certainty degree (0-extremely uncertain; 1-extremely certain)", str(predcted_uncertainty)]
)
print(certainty_line)