In [1]:
# Install Required Packages if you don't have them:
# !pip install python-dotenv joblib anthropic hume-api

# Import Statements
import os
from dotenv import load_dotenv
from joblib import load
from pprint import pprint
import time
import traceback

# Custom Modules (Assuming these are in your project structure)
from src.utils.anthropicwrapper import ClaudeChat, ClaudeChatAssess
from src.utils.humewrapper import HumeSentimentAnalyzer
from src.modules import ConversationVerifier

import json
import os

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:

# --- Load Environment Variables ---
# The .env file is looked up in the PARENT of the current working directory,
# while the data paths below are resolved relative to the working directory itself.
# NOTE(review): confirm the .env file really lives one level above the directory
# the notebook is run from.
load_dotenv(os.path.join(os.path.dirname(os.getcwd()), ".env"))

# --- Timestamp Input ---
# Selects which folder under data/interviews/ is read by the cells below.
timestamp = 1724530742  # Example timestamp, replace with your input

# --- Load Data ---
directory = f'data/interviews/{timestamp}/'
# `load` is joblib.load, which unpickles the file: only open conversation dumps
# that come from a trusted source.
chatlog = load(os.path.join(directory, "joblib/conversation.joblib"))



In [3]:
# --- Model Setup --- 
try:
    # Audio sentiment client; the API key is read from the HUME_API_KEY environment variable
    # (os.getenv returns None when the variable is not set).
    sentiment_analyser = HumeSentimentAnalyzer(api_key=os.getenv("HUME_API_KEY"))

    # System prompt for the model that condenses the raw Hume scores of one answer
    # into a single sentence.
    # NOTE(review): the quoted example on the last prompt line has no closing quote,
    # because the trailing triple quote terminates the literal.
    sentiment_system_prompt = """You are a skilled emotions analyst provided with a detailed breakdown of sentiment analysis scores from Hume.ai, for a single response in an interview to a question from the interviewer. The scores are split into 3 sections. All numbers are from 0 to 1, linearly scaling, with 1 being a very strong representation of the indicator in question.

    First, Emotions. This contains several human emotions with a numerical value indicating the strength of the corresponding emotion.
    Second, Sentiments. This contains a scale from 1 to 9, each containing a numerical value indicating the magnitude of the sentiment of the topic of the conversation. A negative topic such as murder will have a high value lower in the scale, such as 1 or 2, and a positive topic will have a high value from 0 to 1 higher in the scale such as 8 or 9.
    Third, Toxicity. This contains several toxic representations such as hate, insult, etc, with a value from 0 to 1 for each representation identified in the audio.

    Your job is to provide a concise detailed one sentence breakdown of how the individual was feeling for the particular scores provided. You must be highly objective as your job is to discern whether or not a candidate was exhibiting traits which would or would not be fitting for a successful interview. 
    Model your answer beginning with something along the lines of "For this particular response, the candidate..."""
    
    # Chat wrapper that turns a stringified Hume result into the one-sentence summary.
    sentiment_summariser = ClaudeChat("claude-3-5-sonnet-20240620", sentiment_system_prompt)

    # System prompt for the final pass/fail evaluation of the whole interview.
    evaluation_system_prompt = """You are a highly skilled interviewer currently tasked with reviewing a phone screening interview candidate to decide whether they are to pass on to the next stage of the interview process. There is a high volume of candidates and as such only 47% of candidates should be allowed to pass.
    You will be provided with the following information to aid your decision:
    1. A copy of the job description.
    2. Transcript of the phone interview between the interviewer and candidate.
    3. Sentiment analysis summary to provide you with insight into the tone of the candidate in the call
    4. A summary of any factual inaccuracies the candidate may have made during discussions in the call
    5. A copy of the candidate's CV, which will be provided next.

    You are to evaluate the candidate, primarily on the transcript, and use the additional information provided to identify any potential red-flags. Your response should include a detailed breakdown of why the candidate is chosen to continue onwards to further interviewing. You must end the breakdown with a simple one word response on a new line, "pass" or "fail"."""

    # The third argument is the path to the candidate's CV (PDF).
    # NOTE(review): the recorded output of this cell shows
    # "Error processing PDF: no such file: 'data/cvs/cv-deb.pdf'" without the cell
    # raising, so the evaluator was presumably created without the CV - verify the path.
    candidate_evaluator = ClaudeChatAssess("claude-3-5-sonnet-20240620", evaluation_system_prompt, "data/cvs/cv-deb.pdf")

except Exception as e:
    # Print a short message, then re-raise so the failure is not silently ignored.
    print(f"An error occurred during model setup: {e}")
    raise



Error processing PDF: no such file: 'data/cvs/cv-deb.pdf'


In [4]:
# --- Reformat Chatlog ---
def reformat_chatlog(chatlog, context_messages=3):
    """Pair up the messages of a raw chatlog into interviewer/candidate turns.

    The first ``context_messages`` entries are dropped. The remaining messages
    are consumed two at a time: the first of each pair is stored under
    ``'interviewer'`` and the second under ``'candidate'``. A trailing message
    without a partner is ignored.

    Args:
        chatlog: Sliceable sequence of mappings that each expose a ``'content'`` key.
        context_messages: Number of leading messages to skip before pairing.
            Defaults to 3, the value that used to be hard-coded.

    Returns:
        A list of ``{'interviewer': ..., 'candidate': ...}`` dicts, one per
        complete pair, in the original order.

    Raises:
        ValueError: If ``context_messages`` is negative (a negative slice start
            would silently keep only the tail of the chatlog).
    """
    if context_messages < 0:
        raise ValueError("context_messages must be non-negative")

    dialogue = chatlog[context_messages:]

    # Roles are assigned purely by position; the messages are not inspected to
    # check who actually spoke.
    # zip() stops at the shorter slice, which drops an unpaired trailing message.
    return [
        {'interviewer': question['content'], 'candidate': answer['content']}
        for question, answer in zip(dialogue[0::2], dialogue[1::2])
    ]

# --- Process Sentiments ---


In [5]:
def process_sentiments(chatlog_chat, timestamp):
    """Attach a sentiment summary to each interview turn from its audio file.

    Audio files are read from ``data/interviews/<timestamp>/audio`` (relative to
    the current working directory) and matched to turns by position: the n-th
    file belongs to the n-th entry of ``chatlog_chat``. Each file is analysed
    with the module-level ``sentiment_analyser`` and the stringified result is
    summarised with the module-level ``sentiment_summariser``; the summary is
    stored under the turn's ``'sentiment'`` key.

    Args:
        chatlog_chat: List of turn dicts; it is modified in place.
        timestamp: Interview identifier used to build the audio directory path.

    Returns:
        The same ``chatlog_chat`` list. Turns whose file vanished or whose
        processing raised keep no ``'sentiment'`` key; the error is printed
        together with its traceback and processing continues.

    Raises:
        OSError: If the audio directory cannot be listed.
    """
    import re  # local import: only needed for the natural-sort key below

    def _natural_key(name):
        # re.split with a capturing group alternates text / digit runs, so odd
        # positions are always numbers. Comparing them as ints makes
        # "audio_10_..." sort after "audio_2_...".
        return [
            int(token) if position % 2 else token.lower()
            for position, token in enumerate(re.split(r'([0-9]+)', name))
        ]

    filepath = os.path.join('data', 'interviews', str(timestamp), 'audio')
    print(f"Current working directory: {os.getcwd()}")
    print(f"Full audio directory path: {os.path.abspath(filepath)}")

    # os.listdir() returns entries in arbitrary order, but files are matched to
    # turns by position, so the order has to be made explicit.
    files = sorted(
        (f for f in os.listdir(filepath) if os.path.isfile(os.path.join(filepath, f))),
        key=_natural_key,
    )

    # Never process more files than there are turns to attach them to.
    files = files[:len(chatlog_chat)]

    for f in files:
        print(f"File found: {f}")

    for turn_index, file in enumerate(files):
        full_file_path = os.path.join(filepath, file)
        print(f"Processing file: {full_file_path}")

        # Add a small delay and re-check file existence
        time.sleep(0.1)
        if not os.path.exists(full_file_path):
            print(f"File not found (after delay): {full_file_path}")
            continue

        try:
            result = sentiment_analyser.analyze_audio(full_file_path)
            sentiment_summary = sentiment_summariser.chat(str(result))
            chatlog_chat[turn_index]['sentiment'] = sentiment_summary
        except Exception as e:
            # Best effort: one failing file must not abort the remaining turns.
            print(f"Error processing file {full_file_path}: {str(e)}")
            traceback.print_exc()

    return chatlog_chat



In [6]:
# --- Evaluate Candidate --- 
def evaluate_candidate(chatlog_chat):
    """Run the accuracy verifier over the chatlog and request the final evaluation.

    Args:
        chatlog_chat: The chatlog handed to ``ConversationVerifier.process_qa_pair``
            and, stringified, to the module-level ``candidate_evaluator``.

    Returns:
        Whatever ``candidate_evaluator.chat`` returns for the stringified chatlog.
    """
    # The verifier's return value is discarded.
    # NOTE(review): presumably it annotates chatlog_chat in place - confirm.
    ConversationVerifier.process_qa_pair(chatlog_chat)

    header = "The Feedback JSON from the sentiment analyser and accuracy verifier: \n"
    print(header)
    pprint(chatlog_chat)

    serialised_chatlog = str(chatlog_chat)
    return candidate_evaluator.chat(serialised_chatlog)



In [7]:
    # --- Main Execution Flow ---
# Runs the complete pipeline nine times for the same interview; every run writes
# its results to its own "<timestamp>-<run>" outcome folder.
# NOTE(review): presumably repeated to compare verdicts across runs - confirm.
# Also check whether candidate_evaluator keeps chat history between calls, in
# which case earlier runs could influence later ones.
for i in range(9):
    print(f'---------------------------{timestamp}-{i}------------------------')
    chatlog_chat = reformat_chatlog(chatlog)  # Process the loaded chatlog
    chatlog_chat = process_sentiments(chatlog_chat, timestamp)  # Analyze sentiments
    evaluation = evaluate_candidate(chatlog_chat)  # Get the final evaluation

    # Both artefacts of a run are stored in the same outcome folder.
    chatlog_file_path = f"data/interviews/{timestamp}-{i}/outcome/"
    evaluation_file_path = chatlog_file_path

    # exist_ok=True replaces the two duplicated exists()/makedirs() pairs (the
    # second of which tested the wrong variable) and has no check-then-create gap.
    os.makedirs(chatlog_file_path, exist_ok=True)

    # An explicit encoding keeps the write from failing on non-ASCII model
    # output when the platform default is a legacy code page.
    with open(chatlog_file_path + "chatlog.json", "w", encoding="utf-8") as file:
        json.dump(chatlog_chat, file, indent=4)

    with open(evaluation_file_path + "evaluation.txt", "w", encoding="utf-8") as file:
        # Turn literal backslash-n sequences into real line breaks.
        file.write(str(evaluation).replace("\\n", "\n"))
    print(f'---------------------------{timestamp}-{i}------------------------')


---------------------------1724530742-0------------------------
Current working directory: d:\Kent\University Of Kent UK\Projects\Disso\Screening-LLM
Full audio directory path: d:\Kent\University Of Kent UK\Projects\Disso\Screening-LLM\data\interviews\1724530742\audio
File found: audio_1_1724530742.wav
File found: audio_2_1724530742.wav
File found: audio_3_1724530742.wav
File found: audio_4_1724530742.wav
Processing file: data\interviews\1724530742\audio\audio_1_1724530742.wav
Analyzing audio file: data\interviews\1724530742\audio\audio_1_1724530742.wav
File exists: True
Job submitted successfully
Job completed
[{'source': {'type': 'file', 'filename': 'audio_1_1724530742.wav', 'content_type': 'audio/wav', 'md5sum': '31b16fd26394f971a482304cc02e5e3a'}, 'results': {'predictions': [{'file': 'audio_1_1724530742.wav', 'file_type': 'audio', 'models': {'language': {'metadata': {'confidence': 0.9243164, 'detected_language': 'nl'}, 'grouped_predictions': [{'id': 'unknown', 'predictions': [{'tex

In [12]:
# Hard-coded chatlog: one dict per interview turn (four turns) with the keys
# 'interviewer', 'candidate', 'sentiment' and 'feedback'.
# Assigning it here replaces any chatlog_chat left in the kernel by earlier cells.
chatlog_chat = [
  {
    "interviewer": "Hello! Thank you for taking the time to speak with me today about the Entry-Level RAG AI Engineer role. I'd like to start by asking you a few questions about your experience and skills. Could you tell me about any projects you've worked on involving retrieval-augmented generation (RAG) pipelines?",
    "candidate": " I'm sure so I'm studying artificial intelligence at the University of Kent currently and for my final dissertation. I'm working on making a automated screening Interview agent and to implement this I have used a rag pipeline mainly as the accuracy verifier so what happens is when the Candidate answers their questions it goes through two pipelines one is the sentiment analysis and one is the accuracy verifier For the accuracy verifier I have implemented a retrieval augmented generation, which would basically break down the answer into separate Searchable strings which will then be searched on Google and The first two articles it will retrieve the contents of the first two articles and input that in the context of the LLM So the LM has more up-to-date information to verify with the whether the answer from the candidate is accurate or not and to give an accuracy percentage",
    "sentiment": "For this particular response, the candidate exhibited a high level of interest (0.51) and confusion (0.33), along with notable levels of awkwardness (0.16) and doubt (0.14), suggesting they were engaged but struggling to fully comprehend or articulate their thoughts, which could indicate a lack of preparation or nervousness during the interview.",
    "feedback": "**Accuracy Score:** 10%\n\n**Feedback:**\n\n1. The candidate mentioned using a RAG pipeline for an automated screening interview agent, which is relevant and demonstrates practical experience. However, the explanation of how the RAG pipeline is implemented could be clearer. Specifically, the candidate stated that the answer is broken down into \"searchable strings\" and then searched on Google. While this approach is valid, it would be beneficial to clarify how the retrieved articles are integrated into the LLM's context for verification. \n\n2. The candidate described using sentiment analysis alongside the RAG pipeline, which is an interesting addition. However, the connection between sentiment analysis and the accuracy verification process was not clearly articulated. It would enhance the response to explain how sentiment analysis contributes to the overall evaluation of the candidate's answers.\n\n3. The candidate's explanation of retrieving the contents of the first two articles from Google is somewhat vague. It would be helpful to specify how the information from these articles is processed and utilized in the context of the LLM to assess the accuracy of the candidate's responses.\n\nOverall, the candidate demonstrates a solid understanding of RAG pipelines and their application in a real-world project, but further clarity and detail in the explanation would strengthen the response."
  },
  {
    "interviewer": "That's an interesting project. Can you elaborate on the specific challenges you faced while implementing the RAG pipeline for your accuracy verifier? How did you address issues like retrieval quality or context relevance?",
    "candidate": " Yes, so to improve the context or the retrieval quality of the rag pipeline, I had to break down the answer from the candidate into searchable strings with the help of an LLM. So let's say an answer can be broken down into six query strings. Each of these six query strings would then be used to search in Google and we would draw the context from the first two web pages. So in a total we would get the information from a total of 12 web pages for one answer. So this I think is plenty of information to feed the LLM. This answer, this document would then be stored in a vector store and when the LLM would be queried on a specific topic or like when the LLM wanted to verify the accuracy of a certain answer it would then use a cosine similarity to find out the relevant portions of the vector store that are relevant to the answer. And doing this, it would vastly improve the quality of the answers fetched. I got this from a paper written by Google called Query Decomposition. This was the technique they used and this overcame the shortcomings of just searching for two or three websites instead of getting a more holistic picture of the entire topics being discussed in the answer.",
    "sentiment": "For this particular response, the candidate exhibited a high level of concentration (0.838) and contemplation (0.361), coupled with moderate levels of determination (0.254), interest (0.288), and satisfaction (0.226), suggesting they were deeply engaged in providing a thoughtful and focused answer while maintaining a generally positive demeanor during the interview.",
    "feedback": "**Accuracy Score:** 10%\n\n**Feedback:**\n\n1. The candidate's explanation of breaking down the answer into \"searchable strings\" is a valid approach, but it lacks clarity on how these strings are generated and how they relate to the original answer. It would be beneficial to elaborate on the process of creating these strings and their significance in improving retrieval quality.\n\n2. The candidate mentions retrieving content from the first two articles found on Google, but does not specify how the information from these articles is processed or integrated into the LLM's context. Providing details on this integration process would enhance the understanding of how the RAG pipeline functions in practice.\n\n3. While the candidate references a paper by Google on \"Query Decomposition,\" it would strengthen the response to briefly explain how this technique specifically addresses the challenges faced in their implementation. This would provide a clearer connection between the theoretical background and the practical application.\n\n4. The mention of using cosine similarity to find relevant portions of the vector store is a good point, but the candidate could clarify how this similarity measure is applied in the context of verifying the accuracy of the candidate's answers. More detail on this process would improve the overall explanation.\n\nOverall, the candidate demonstrates a solid understanding of RAG pipelines and their application, but further elaboration on specific processes and techniques would enhance the clarity and depth of the response."
  },
  {
    "interviewer": "That's a sophisticated approach. How did you handle the integration of this RAG pipeline with the large language model? Were there any specific challenges in terms of prompt engineering or model selection?",
    "candidate": " For model selection, we chose JATGBD 4.0 mini mainly because we used Langchain to implement the rank pipeline and GPT 4.0 mini had the perfect balance of intelligence and cost effectiveness and also speed that we had to manage. And this was just to verify the answer. So we did not go for a more sophisticated model such as Claude SONET 3.5 which is considered the most intelligent LLM till now. We did not need such a high powered LLM, we just needed a cost effective LLM to just verify the answer and make searchable strings and JATGBD 4.0 mini was perfect for the job. Apart from this, for prompt engineering, yes, I had to write several prompts to give the last rank pipeline to verify the answer. So what would happen is when we converted speech to text from the interview, some of the text had grammatical errors or typographical errors which is common for most text translation apps. So to overcome this, I had to prompt the LLM to specifically overlook grammatical errors or to make sense of words that were not properly converted but were close to the actual word that the candidate was trying to explain. So these were some of the challenges that I faced.",
    "sentiment": "For this particular response, the candidate exhibited a complex emotional state characterized by high levels of annoyance (0.2), disapproval (0.15), and realization (0.15), balanced with moderate levels of concentration (0.1), contemplation (0.1), and interest (0.1), suggesting a critical and analytical approach to the topic while maintaining professional engagement despite some negative feelings.",
    "feedback": "**Accuracy Score:** 10%\n\n**Feedback:**\n\n1. The candidate mentions selecting \"JATGBD 4.0 mini\" for the RAG pipeline, which appears to be a transcription error. The correct model name is likely \"GPT-4o mini.\" This should be clarified, as it may lead to confusion regarding the model's capabilities and features.\n\n2. The candidate states that they chose GPT-4o mini for its \"balance of intelligence and cost-effectiveness.\" While this is a valid point, it would be beneficial to provide more specific details about how the model's performance metrics (e.g., accuracy, speed) influenced their decision. This would strengthen the rationale behind the model selection.\n\n3. The explanation of prompt engineering is somewhat vague. The candidate mentions writing several prompts to help the LLM overlook grammatical errors, but it would be helpful to provide examples of these prompts or describe the process in more detail. This would enhance the understanding of how prompt engineering was applied in practice.\n\n4. The candidate discusses the challenges faced with grammatical and typographical errors in the transcribed text. While they mention prompting the LLM to overlook these issues, it would be useful to elaborate on how this impacted the overall accuracy of the responses and the effectiveness of the RAG pipeline.\n\n5. The candidate's reference to the need for a cost-effective LLM is valid, but they could further explain how the choice of model directly impacted the performance of the RAG pipeline in terms of retrieval quality and context relevance. This would provide a clearer connection between model selection and the challenges faced during implementation."
  },
  {
    "interviewer": "Thank you for sharing those details. Can you discuss any experience you have with optimizing model performance, particularly in terms of speed and cost efficiency?",
    "candidate": " So far I have not optimized any model. By optimizing I am thinking you mean fine tuning model. So for the specific project fine tuning was not necessary. However, we had to determine which model best suited the specific area of our project. So for example, for the real time conversation where the LLM had to generate questions and interact with the candidate, we went with Claude 3.5 Sonnet which is the most intelligent LLM till date as preferred by most developers. And again for the accuracy verifier we went with ChatGPD 4.0 Mini which is a cut down version of ChatGPD 4.0 which itself is a very powerful LLM. However, 4.0 Mini has the right balance of intelligence and cost effectiveness and also speed. Then for the sentiment analysis we went with Hume AI which is an external service that does the sentiment analysis directly from audio and video feed. So the service, we don't know the specific implementation of the service because we are paying to use the service. And after that getting the sentiment and accuracy verifier score we then feed it into Claude Sonnet 3.5 again to make sense of the answers that the candidate made from both the accuracy verifier and from the sentiment analysis and to give the final verdict of the candidate. So these are the main considerations we made when choosing an LLM.",
    "sentiment": "For this particular response, the candidate exhibited a high level of concentration (0.436) and determination (0.292), coupled with moderate levels of contemplation (0.264) and confusion (0.174), suggesting they were deeply engaged in processing the question while experiencing some uncertainty, which could be viewed as a thoughtful approach to a challenging interview question.",
    "feedback": "**Accuracy Score:** 10%\n\n**Feedback:**\n\n1. The candidate states that they have not optimized any model, which directly answers the question but lacks depth. It would be beneficial to elaborate on any considerations or evaluations made regarding model performance, even if no formal optimization was conducted.\n\n2. The candidate mentions selecting Claude 3.5 Sonnet for real-time conversation and ChatGPT 4.0 Mini for accuracy verification. While they provide reasoning for these choices, it would enhance the response to include specific metrics or performance indicators that influenced their decision-making process regarding speed and cost efficiency.\n\n3. The explanation of using Hume AI for sentiment analysis is somewhat vague. The candidate could clarify how the integration of this external service impacts the overall model performance, particularly in terms of speed and cost efficiency.\n\n4. The candidate discusses feeding the results from the accuracy verifier and sentiment analysis back into Claude Sonnet 3.5 for final evaluation. However, they do not address how this process affects the overall efficiency and effectiveness of the model. More detail on this integration would strengthen the response.\n\n5. The candidate's mention of not knowing the specific implementation details of the Hume AI service is valid, but it would be helpful to discuss any considerations taken into account when selecting this service, particularly regarding its performance and cost.\n\nOverall, while the candidate provides relevant information about model selection and application, further elaboration on the optimization process, performance metrics, and integration details would improve the clarity and depth of the response."
  }
]



In [13]:
# Evaluate the chatlog_chat currently held by the kernel and store the result
# under a different interview id.
timestamp = 1724629705

# Only the evaluator is called here; evaluate_candidate() is not used, so the
# accuracy verifier is not run again on this chatlog.
evaluation = candidate_evaluator.chat(str(chatlog_chat))

# Both artefacts are stored in the same outcome folder.
chatlog_file_path = f"data/interviews/{timestamp}/outcome/"
evaluation_file_path = chatlog_file_path

# exist_ok=True replaces the two duplicated exists()/makedirs() pairs (the second
# of which tested the wrong variable) and has no check-then-create gap.
os.makedirs(chatlog_file_path, exist_ok=True)

# An explicit encoding keeps the write from failing on non-ASCII model output
# when the platform default is a legacy code page.
with open(chatlog_file_path + "chatlog.json", "w", encoding="utf-8") as file:
    json.dump(chatlog_chat, file, indent=4)

with open(evaluation_file_path + "evaluation.txt", "w", encoding="utf-8") as file:
    # Turn literal backslash-n sequences into real line breaks.
    file.write(str(evaluation).replace("\\n", "\n"))
print(f'---------------------------{timestamp}------------------------')


---------------------------1724629705------------------------
