In [1]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Hot words (phrases) whose presence should be detected in the transcriptions
hot_words = [
    "be careful",
    "destroy",
    "stranger",
]

In [3]:
# Load the pre-trained text embedding model (using sentence-transformers).
# The checkpoint name is kept in its own variable so it can be swapped in one place.
# NOTE(review): INSTRUCTOR checkpoints are presumably meant to be used with an
# instruction prompt — confirm that encoding raw text without one is intended.
model_name = "hkunlp/instructor-large"
model = SentenceTransformer(model_name)

In [8]:
# Encode a single piece of text with the loaded embedding model
def compute_embedding(text):
    """Return the embedding of ``text`` as a tensor.

    Args:
        text: The string to encode with the module-level ``model``.

    Returns:
        The result of ``model.encode(text, convert_to_tensor=True)``.

    Raises:
        ValueError: If ``text`` is not a ``str``.
    """
    # Guard clause: reject non-string input before touching the model
    if not isinstance(text, str):
        raise ValueError(f"Expected a string, got {type(text)}")

    return model.encode(text, convert_to_tensor=True)

In [9]:
# Function to check similarity between the transcription and hot words
def check_similarity(transcription, hot_words, threshold=0.9):
    """Report whether ``transcription`` is similar to any of ``hot_words``.

    Args:
        transcription: Text to compare; forwarded to ``compute_embedding``,
            which raises ``ValueError`` for non-string input.
        hot_words: Iterable of phrases to compare against.
        threshold: Cosine similarity that must be strictly exceeded for a
            hot word to count as a match.

    Returns:
        True as soon as one hot word scores above ``threshold``; otherwise
        False (including when ``hot_words`` is empty).
    """
    # The transcription is the same for every hot word, so embed it and convert
    # it to the (1, n) NumPy layout that cosine_similarity expects only once,
    # instead of redoing the conversion on each loop iteration.
    transcription_vector = compute_embedding(transcription).cpu().numpy().reshape(1, -1)

    for hot_word in hot_words:
        # Each hot-word embedding is 1D as well; reshape it into a single row
        hot_word_vector = compute_embedding(hot_word).cpu().numpy().reshape(1, -1)

        # Two single-row inputs give a 1x1 matrix; [0][0] extracts the scalar
        similarity = cosine_similarity(transcription_vector, hot_word_vector)[0][0]

        # Stop at the first hot word that exceeds the threshold
        if similarity > threshold:
            return True

    # No hot word exceeded the threshold
    return False

In [10]:
# Path to the cv-valid-dev.csv file containing the transcriptions.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact location exists.
cv_valid_dev_path = "C:/Users/lingy/OneDrive/Desktop/HTX/HTX_techtest/common_voice/cv-valid-dev.csv"

# Load the cv-valid-dev.csv file into a df
df = pd.read_csv(cv_valid_dev_path)

In [12]:
# Collect one similarity flag (True or False) per row of the df
similarity_results = []

# Iterate through the rows of the df to check for similarity
for index, row in df.iterrows():
    # Extract the transcription text from the 'generated_text_fine-tuned' column
    transcription_text = row.get("generated_text_fine-tuned", "")

    if isinstance(transcription_text, str):
        # Check if the transcription is similar to any of the hot words
        is_similar = check_similarity(transcription_text, hot_words)
    else:
        # Non-text values (e.g. NaN for a missing transcription) cannot be
        # embedded. Record an explicit False so this row neither reuses the
        # previous row's result nor raises NameError when it is the first row.
        print(f"Invalid transcription text: {transcription_text}")
        is_similar = False

    # Append the result to the list (exactly one entry per row)
    similarity_results.append(is_similar)

Invalid transcription text: nan


In [13]:
# Destination for the augmented table (the cv-valid-dev.csv file is rewritten)
output_file_path = "C:/Users/lingy/OneDrive/Desktop/HTX/HTX_techtest/common_voice/cv-valid-dev.csv"

# Attach the per-row flags as a new 'similarity' column, then write the df
# out without the index column
df['similarity'] = similarity_results
df.to_csv(output_file_path, index=False)

print(f"cv-valid-dev.csv updated with similarity and saved to {output_file_path}")

cv-valid-dev.csv updated with similarity and saved to C:/Users/lingy/OneDrive/Desktop/HTX/HTX_techtest/common_voice/cv-valid-dev.csv


In [16]:
# Sum the 'similarity' column; with True/False flags this counts the rows flagged True
df['similarity'].sum()

6