# Speech and text interaction for mental illness

In [None]:
%%time

from IPython.display import clear_output

! pip install sentence_transformers==2.2.2

! pip install -qq -U langchain
! pip install -qq -U tiktoken
! pip install -qq -U pypdf
! pip install -qq -U faiss-gpu
! pip install -qq -U InstructorEmbedding


! pip install -qq -U transformers
! pip install -qq -U accelerate
! pip install -qq -U bitsandbytes
! pip install gtts librosa sounddevice

clear_output()

CPU times: user 997 ms, sys: 146 ms, total: 1.14 s
Wall time: 2min 32s


In [None]:
pip install -U langchain-community
!pip install librosa sounddevice gtts
!apt install libportaudio2
!pip install SpeechRecognition
!apt install libasound2-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg
!pip install PyAudio
!pip install ffmpeg-python
!pip install ipywebrtc

clear_output()

Collecting langchain-community
  Downloading langchain_community-0.2.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: langchain-community
Successfully installed langchain-community-0.2.0


**Recording audio and saving**

In [81]:
from google.colab import drive
from google.colab import files
from IPython.display import HTML, Audio
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
from scipy.io.wavfile import read as wav_read
import io
import ffmpeg

# Authenticate and mount Google Drive
#drive.mount('/content/drive')

AUDIO_HTML = """
<script>
var my_div = document.createElement("DIV");
var my_p = document.createElement("P");
var my_btn = document.createElement("BUTTON");
var t = document.createTextNode("Press to start recording");

my_btn.appendChild(t);
//my_p.appendChild(my_btn);
my_div.appendChild(my_btn);
document.body.appendChild(my_div);

var base64data = 0;
var reader;
var recorder, gumStream;
var recordButton = my_btn;

var handleSuccess = function(stream) {
  gumStream = stream;
  var options = {
    //bitsPerSecond: 8000, //chrome seems to ignore, always 48k
    mimeType : 'audio/webm;codecs=opus'
    //mimeType : 'audio/webm;codecs=pcm'
  };
  //recorder = new MediaRecorder(stream, options);
  recorder = new MediaRecorder(stream);
  recorder.ondataavailable = function(e) {
    var url = URL.createObjectURL(e.data);
    var preview = document.createElement('audio');
    preview.controls = true;
    preview.src = url;
    document.body.appendChild(preview);

    reader = new FileReader();
    reader.readAsDataURL(e.data);
    reader.onloadend = function() {
      base64data = reader.result;
      //console.log("Inside FileReader:" + base64data);
    }
  };
  recorder.start();
  };

recordButton.innerText = "Recording... press to stop";

navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess);


function toggleRecording() {
  if (recorder && recorder.state == "recording") {
      recorder.stop();
      gumStream.getAudioTracks()[0].stop();
      recordButton.innerText = "Saving the recording... pls wait!"
  }
}

// https://stackoverflow.com/a/951057
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

var data = new Promise(resolve=>{
//recordButton.addEventListener("click", toggleRecording);
recordButton.onclick = ()=>{
toggleRecording()

sleep(2000).then(() => {
  // wait 2000ms for the data to be available...
  // ideally this should use something like await...
  //console.log("Inside data:" + base64data)
  resolve(base64data.toString())

});

}
});

</script>
"""

def save_audio_to_drive(audio_data, file_path):
    with open(file_path, 'wb') as f:
        f.write(audio_data)

def get_audio():
    display(HTML(AUDIO_HTML))
    data = eval_js("data")
    binary = b64decode(data.split(',')[1])

    process = (ffmpeg
               .input('pipe:0')
               .output('pipe:1', format='wav')
               .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)
              )
    output, err = process.communicate(input=binary)

    riff_chunk_size = len(output) - 8
    # Break up the chunk size into four bytes, held in b.
    q = riff_chunk_size
    b = []
    for i in range(4):
        q, r = divmod(q, 256)
        b.append(r)

    # Replace bytes 4:8 in proc.stdout with the actual size of the RIFF chunk.
    riff = output[:4] + bytes(b) + output[8:]

    return riff

# Example usage
audio_data = get_audio()
save_audio_to_drive(audio_data, '/content/recorded_audio.wav')

**Chatbot Building**

In [None]:
%%time

import warnings
warnings.filterwarnings("ignore")

import os
import glob
import textwrap
import time

import langchain

### loaders
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

### splits
from langchain.text_splitter import RecursiveCharacterTextSplitter

### prompts
from langchain import PromptTemplate, LLMChain

### vector stores
from langchain.vectorstores import FAISS

### models
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceInstructEmbeddings

### retrievers
from langchain.chains import RetrievalQA

import torch
import transformers
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)

clear_output()

CPU times: user 8.68 s, sys: 1.32 s, total: 10 s
Wall time: 17.9 s


In [28]:
print('langchain:', langchain.__version__)
print('torch:', torch.__version__)
print('transformers:', transformers.__version__)

langchain: 0.2.0
torch: 2.3.0+cu121
transformers: 4.41.0


In [None]:
class CFG:
    # LLMs
    model_name = 'llama2-7b-chat'
    temperature = 0
    top_p = 0.95
    repetition_penalty = 1.15

    # splitting
    split_chunk_size = 800
    split_overlap = 0

    # embeddings
    embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'


In [None]:
def get_model(model = CFG.model_name):

    print('\nDownloading model: ', model, '\n\n')

    if model == 'llama2-7b-chat':
        model_repo = 'daryl149/llama-2-7b-chat-hf'

        tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=True)

        bnb_config = BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_quant_type = "nf4",
            bnb_4bit_compute_dtype = torch.float16,
            bnb_4bit_use_double_quant = True,
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_repo,
            quantization_config = bnb_config,
            device_map = 'auto',
            low_cpu_mem_usage = True,
            trust_remote_code = True
        )

        max_len = 150

    else:
        print("Not implemented model (tokenizer and backbone)")

    return tokenizer, model, max_len

In [None]:
%%time

tokenizer, model, max_len = get_model(model = CFG.model_name)

clear_output()

CPU times: user 23.5 s, sys: 28.2 s, total: 51.7 s
Wall time: 2min 43s


In [None]:
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )


In [None]:
### check how Accelerate split the model across the available devices (GPUs)
model.hf_device_map

{'': 0}

In [None]:
### hugging face pipeline
pipe = pipeline(
    task = "text-generation",
    model = model,
    tokenizer = tokenizer,
    pad_token_id = tokenizer.eos_token_id,
#     do_sample = True,
    max_length = max_len,
    temperature = CFG.temperature,
    top_p = CFG.top_p,
    repetition_penalty = CFG.repetition_penalty
)

### langchain pipeline
llm = HuggingFacePipeline(pipeline = pipe)

  warn_deprecated(


In [None]:
llm

HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x78f56d1a26e0>)

**Speech to Text**

In [72]:
import speech_recognition as sr
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
import string
import wave

import speech_recognition as sr

# Initialize recognizer
r = sr.Recognizer()

# Specify the path to your audio file
audio_file_path = "recorded_audio.wav"

# Use the audio file as the audio source
with sr.AudioFile(audio_file_path) as source:
    # Adjust for ambient noise and record the audio from the file
    r.adjust_for_ambient_noise(source)
    audio = r.record(source)

# Recognize speech using Google Web Speech API
try:
    text_en_in = r.recognize_google(audio, language='en-IN')
    text_default = r.recognize_google(audio)

    print(f"Recognized text (en-IN): {text_en_in}")
    #print(f"Recognized text (default): {text_default}")

except sr.UnknownValueError:
    print("Chatbot could not understand the audio")
except sr.RequestError as e:
    print(f"Could not request results; {e}")

Recognized text (en-IN): died I'm so sad what should I do


**Gender and age prediction**

In [75]:
import librosa
import numpy as np
import pickle

# Load the pickled model from the file
with open('Age_gender_prediction_model.pkl', 'rb') as f:
    final_model = pickle.load(f)

# Define the list of indices for the top features
top_features_indices = [1, 3, 0, 16, 2, 22, 19, 17, 4, 26, 13, 25, 8, 27, 18]

# Function to extract features from a single WAV file
def extract_features(y, sr):
    features = {}

    # Extract features
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y=y)
    spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    spectral_flux = np.diff(librosa.feature.spectral_centroid(y=y, sr=sr)[0])
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)

    # Compute statistical summaries
    for feature_name, feature_values in {
        'mfcc': mfcc,
        'spectral_centroid': spectral_centroid,
        'zero_crossing_rate': zero_crossing_rate,
        'spectral_rolloff': spectral_rolloff,
        'spectral_flux': spectral_flux,
        'chroma_stft': chroma_stft
    }.items():
        features[f'{feature_name}_mean'] = np.mean(feature_values)
        features[f'{feature_name}_median'] = np.median(feature_values)
        features[f'{feature_name}_std'] = np.std(feature_values)
        features[f'{feature_name}_min'] = np.min(feature_values)
        features[f'{feature_name}_max'] = np.max(feature_values)

    return features

# Function to load a single audio file and extract features
def load_and_extract_features(file_path):
    # Load audio file
    y_audio, sr = librosa.load(file_path, sr=22050)

    # Extract features from the audio file
    features = extract_features(y_audio, sr)
    sample_features = np.array(list(features.values())).reshape(1, -1)  # Reshape to match the input shape

    return sample_features

# Function to load a single audio file and predict the age
def predict_age_from_audio(file_path):
    # Load and extract features from the audio file
    sample_features = load_and_extract_features(file_path)

    # Select only the top features as selected by the indices
    sample_top_features = sample_features[:, top_features_indices]

    # Predict the age using the trained model
    predicted_age = final_model.predict(sample_top_features)

    return predicted_age[0]

# Define the path to the sample audio file
file_path = "recorded_audio.wav"

# Predict the age
predicted_age = predict_age_from_audio(file_path)

# Print the predicted age
print("Predicted age:", predicted_age)


Predicted age: 0


**Emotion**

In [76]:
from tensorflow.keras.models import model_from_json
import numpy as np
import librosa
import pickle
import json
import tensorflow as tf
import os

# Suppress TensorFlow logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.get_logger().setLevel('ERROR')

# Load the model architecture
json_file_path = r"/content/CNN_model.json"
weights_file_path = r"/content/best_model1_weights_speech18.keras/model.weights.h5"

with open(json_file_path, 'r') as json_file:
    loaded_model_json = json_file.read()

# Modify the model JSON to ensure input shape is defined correctly
model_config = json.loads(loaded_model_json)
for layer in model_config['config']['layers']:
    if layer['class_name'] == 'InputLayer':
        layer['config']['batch_input_shape'] = [None, 2376, 1]
    if 'batch_shape' in layer['config']:
        del layer['config']['batch_shape']

# Convert back to JSON string
modified_model_json = json.dumps(model_config)

# Load the modified model
loaded_model = model_from_json(modified_model_json)
loaded_model.load_weights(weights_file_path)
print("Loaded model from disk")

# Load the scaler and encoder
with open(r"/content/scaler2.pickle", 'rb') as f:
    scaler2 = pickle.load(f)
with open(r"/content/encoder2.pickle", 'rb') as f:
    encoder2 = pickle.load(f)
print("Loaded scaler and encoder")

# Define feature extraction functions
def zcr(data, frame_length, hop_length):
    zcr = librosa.feature.zero_crossing_rate(data, frame_length=frame_length, hop_length=hop_length)
    return np.squeeze(zcr)

def rms(data, frame_length=2048, hop_length=512):
    rms = librosa.feature.rms(y=data, frame_length=frame_length, hop_length=hop_length)
    return np.squeeze(rms)

def mfcc(data, sr, frame_length=2048, hop_length=512, flatten: bool = True):
    mfcc = librosa.feature.mfcc(y=data, sr=sr)
    return np.squeeze(mfcc.T) if not flatten else np.ravel(mfcc.T)

def extract_features(data, sr=22050, frame_length=2048, hop_length=512):
    result = np.array([])
    result = np.hstack((result,
                        zcr(data, frame_length, hop_length),
                        rms(data, frame_length, hop_length),
                        mfcc(data, sr, frame_length, hop_length)
                       ))
    return result

def get_predict_feat(path):
    d, s_rate = librosa.load(path, duration=2.5, offset=0.6)
    res = extract_features(d)

    # Ensure the result has the expected size by padding/truncating
    expected_size = 2376
    if len(res) < expected_size:
        res = np.pad(res, (0, expected_size - len(res)), 'constant')
    else:
        res = res[:expected_size]

    result = np.array(res)
    result = np.reshape(result, newshape=(1, expected_size))
    i_result = scaler2.transform(result)
    final_result = np.expand_dims(i_result, axis=2)

    return final_result

def prediction(path1):
    res = get_predict_feat(path1)
    predictions = loaded_model.predict(res, verbose=0)
    y_pred = encoder2.inverse_transform(predictions)
    return y_pred[0][0]

# Path to the audio file
file_path = "recorded_audio.wav"

# Make a prediction and assign it to a variable
predicted_emotion = prediction(file_path)
print(predicted_emotion)


Loaded model from disk
Loaded scaler and encoder
angry


In [77]:
print(predicted_emotion)

angry


For llama2-7b


In [78]:
# Define a function to interact with the wizardlm model and get answers
def get_llama2_answer(query, age, emotion):
    # Define age-specific responses
    if age == 0:
        greeting = "Hey buddy! How are your studies going on. "
    elif age == 1:
        greeting = "Hello. I am here to support you with any challenges. "
    else:
        greeting = ""  # No specific greeting for other age groups

    # Define emotion-specific responses
    if emotion == "anger":
        emotion_response = "I can sense some anger. Let's try to find a way to calm down. How can I assist you?"
    elif emotion == "disgust":
        emotion_response = "I can sense some frustration. Can I help with something?"
    elif emotion == "fear":
        emotion_response = "It sounds like you're feeling scared. Is there anything I can do to help you feel safer?"
    elif emotion == "happy":
        emotion_response = "It's great to hear you're happy! How can I assist you further?"
    elif emotion == "neutral":
        emotion_response = "It seems like you're feeling neutral. What would you like to talk about?"
    elif emotion == "sad":
        emotion_response = "I'm sorry to hear you're feeling down. Is there anything I can do to help?"
    elif emotion == "surprise":
        emotion_response = "You seem surprised! What's on your mind?"
    else:
        emotion_response = ""

    # Combine the greetings and emotion-specific response
    combined_greeting = greeting + emotion_response

    # Invoke the wizardlm model to get an answer based on the query
    answer = llm.invoke(query)  # Modify this line based on your actual interaction with the model

    # Construct the final response
    final_answer = combined_greeting + answer

    return final_answer


In [79]:
# Get the input question and age
#query_text = input("\nEnter your question: ").strip()
query_text=text_en_in
age = predicted_age
emotion=predicted_emotion
answer = get_llama2_answer(query_text, age, emotion)

# Print the answer
print("\nQUESTION: \"%s\"" % query_text)
print("ANSWER: \"%s\"\n" % answer)


QUESTION: "died I'm so sad what should I do"
ANSWER: "Hey buddy! How are your studies going on. died I'm so sad what should I do?
 everybody loves you and is here for you. 💕"



In [80]:
print(answer)

Hey buddy! How are your studies going on. died I'm so sad what should I do?
 everybody loves you and is here for you. 💕


**Text to Speech**

In [81]:
from gtts import gTTS
import os
from IPython.display import Audio

def text_to_speech(text, filename):
    # Language in which you want to convert
    language = 'en'

    # Passing the text and language to the engine
    myobj = gTTS(text=text, lang=language, slow=False)

    # Saving the converted audio in a mp3 file
    myobj.save(filename)

    # Return the filename
    return filename

# The text that you want to convert to audio
mytext = answer

# Name of the output audio file
output_file = "output_t.mp3"

# Convert text to speech and save as an audio file
output_file = text_to_speech(mytext, output_file)

# Print the audio file in Colab
Audio(output_file)
