In [None]:
pip install langchain_google_genai

In [None]:
pip install faiss-cpu

In [None]:
pip install sentence-transformers

**Person 2**

In [None]:
import pandas as pd
import ast
from langchain_google_genai import ChatGoogleGenerativeAI
import os
import time
from typing import List
import textwrap

def chunk_text(text: str, chunk_size: int = 500) -> List[str]:
    """Wrap ``text`` into pieces of at most ``chunk_size`` characters.

    Breaks only happen at whitespace; a single word longer than
    ``chunk_size`` is kept whole rather than being cut.
    Returns an empty list for empty input.
    """
    wrapper = textwrap.TextWrapper(width=chunk_size, break_long_words=False)
    return wrapper.wrap(text)

def get_translation(text: str, max_retries: int = 3, delay: int = 2) -> str:
    """Translate Kannada ``text`` to English through the global ``llm``.

    Up to ``max_retries`` attempts are made. An exception whose message
    contains "429" is retried after ``delay * attempt_number`` seconds unless
    it happened on the final attempt. Any other failure is not raised:
    the function returns a string starting with "Error: " instead.
    """
    final_attempt = max_retries - 1
    attempt = 0
    while attempt < max_retries:
        try:
            prompt = f"You are a Kannada to English translator. Translate the following text to English, maintaining the context and meaning: {text}"
            response = llm.invoke(prompt)
            if response:
                return str(response.content)
            return "No response generated."
        except Exception as e:
            can_retry = "429" in str(e) and attempt < final_attempt
            if not can_retry:
                print(f"Translation error: {str(e)}")
                return f"Error: {str(e)}"
            # Linear back-off: wait longer after each rate-limited attempt.
            wait_time = delay * (attempt + 1)
            print(f"Rate limit hit, waiting {wait_time} seconds...")
            time.sleep(wait_time)
        attempt += 1
    # Only reached when the loop body never ran (max_retries <= 0).
    return "Failed after maximum retries"

def translate_large_text(text: str, chunk_size: int = 500, pause_seconds: float = 1.0) -> str:
    """Translate a large text by breaking it into chunks.

    Args:
        text: Text to translate. Anything that is not a non-blank string
            (``None``, ``""``, whitespace, or the float NaN that pandas uses
            for a missing CSV cell) yields ``""`` without calling the model.
        chunk_size: Maximum number of characters per chunk passed to
            ``chunk_text``.
        pause_seconds: Seconds to sleep after each chunk request.

    Returns:
        The translated chunks joined with single spaces.
    """
    # A missing transcript read from CSV is a float NaN, which has no
    # ``isspace``; check the type first so such rows do not raise.
    if not isinstance(text, str) or not text.strip():
        return ""

    chunks = chunk_text(text, chunk_size)
    total = len(chunks)
    translated_chunks = []

    for position, chunk in enumerate(chunks, start=1):
        print(f"Translating chunk {position}/{total} ({len(chunk)} characters)")
        translated_chunks.append(get_translation(chunk))
        # Pause after every request, including the last one, so the caller's
        # next request is spaced out as well.
        time.sleep(pause_seconds)

    return " ".join(translated_chunks)

def translate_time_aligned_transcripts(time_aligned_dict: dict) -> dict:
    """Translate every value of a time-range -> text mapping.

    ``time_aligned_dict`` may be a dict or the string repr of one (parsed
    with ``ast.literal_eval``). Keys are kept; each value is replaced by the
    result of ``get_translation``. Any exception while parsing or iterating
    is printed and an empty dict is returned.
    """
    results = {}

    try:
        segments = time_aligned_dict
        if isinstance(segments, str):
            segments = ast.literal_eval(segments)

        for time_key, text in segments.items():
            print(f"Translating chunk for {time_key}")
            results[time_key] = get_translation(text)
            # One-second pause between consecutive requests.
            time.sleep(1)

    except Exception as e:
        print(f"Error processing time-aligned transcripts: {str(e)}")
        return {}

    return results


In [None]:
# Read the API key from the environment instead of committing it to the notebook.
# SECURITY: a real key used to be hard-coded on this line; treat that key as
# leaked and revoke it.
if not os.environ.get("GOOGLE_API_KEY"):
    from getpass import getpass  # local import: only needed when the key is not preset
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")

# Initialize the model
llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.7)

# Read the original CSV
df = pd.read_csv('kannada_english_transcriptions.csv')

# Add new columns if they don't exist
if 'full_transcript_translation' not in df.columns:
    df['full_transcript_translation'] = None
if 'time_aligned_translations' not in df.columns:
    df['time_aligned_translations'] = None

# Process files 24-47
for idx in range(24, 48):
    if idx >= len(df):
        break

    row = df.iloc[idx]
    print(f"\nProcessing file {idx - 23}/24: {row['audio_file']}")

    try:
        is_english = row['audio_file'] == "SandalWoodNewsStories_53.mp3"

        if is_english:
            # This recording is already in English: copy the transcript over
            # unchanged instead of sending it to the translator.
            print("Skipping English audio file...")
            df.at[idx, 'full_transcript_translation'] = row['full_transcript']
            df.at[idx, 'time_aligned_translations'] = row['time_aligned_transcripts']
        else:
            # Only translate if not already translated
            if pd.isna(df.at[idx, 'full_transcript_translation']):
                print("Translating full transcript...")
                full_translation = translate_large_text(row['full_transcript'])
                df.at[idx, 'full_transcript_translation'] = full_translation

            if pd.isna(df.at[idx, 'time_aligned_translations']):
                print("Translating time-aligned transcripts...")
                time_aligned_translations = translate_time_aligned_transcripts(row['time_aligned_transcripts'])
                df.at[idx, 'time_aligned_translations'] = str(time_aligned_translations)

        # Save progress after each file. This now also runs for the copied
        # English file, which previously skipped the save via `continue`.
        df.iloc[24:48].to_csv('translations_person2.csv', index=False, encoding='utf-8')
        if not is_english:
            print(f"Completed translation for {row['audio_file']}")

    except Exception as e:
        # Best effort: report the failure and move on to the next file.
        print(f"Error processing row {idx}: {str(e)}")
        continue

print("\nPerson 2's translations completed!")


**Person 3**

In [None]:
import pandas as pd
import ast
from langchain_google_genai import ChatGoogleGenerativeAI
import os
import time
from typing import List
import textwrap

def chunk_text(text: str, chunk_size: int = 500) -> List[str]:
    """Wrap ``text`` into pieces of at most ``chunk_size`` characters.

    Breaks only happen at whitespace; a single word longer than
    ``chunk_size`` is kept whole rather than being cut.
    Returns an empty list for empty input.
    """
    wrapper = textwrap.TextWrapper(width=chunk_size, break_long_words=False)
    return wrapper.wrap(text)

def get_translation(text: str, max_retries: int = 3, delay: int = 2) -> str:
    """Translate Kannada ``text`` to English through the global ``llm``.

    Up to ``max_retries`` attempts are made. An exception whose message
    contains "429" is retried after ``delay * attempt_number`` seconds unless
    it happened on the final attempt. Any other failure is not raised:
    the function returns a string starting with "Error: " instead.
    """
    final_attempt = max_retries - 1
    attempt = 0
    while attempt < max_retries:
        try:
            prompt = f"You are a Kannada to English translator. Translate the following text to English, maintaining the context and meaning: {text}"
            response = llm.invoke(prompt)
            if response:
                return str(response.content)
            return "No response generated."
        except Exception as e:
            can_retry = "429" in str(e) and attempt < final_attempt
            if not can_retry:
                print(f"Translation error: {str(e)}")
                return f"Error: {str(e)}"
            # Linear back-off: wait longer after each rate-limited attempt.
            wait_time = delay * (attempt + 1)
            print(f"Rate limit hit, waiting {wait_time} seconds...")
            time.sleep(wait_time)
        attempt += 1
    # Only reached when the loop body never ran (max_retries <= 0).
    return "Failed after maximum retries"

def translate_large_text(text: str, chunk_size: int = 500, pause_seconds: float = 1.0) -> str:
    """Translate a large text by breaking it into chunks.

    Args:
        text: Text to translate. Anything that is not a non-blank string
            (``None``, ``""``, whitespace, or the float NaN that pandas uses
            for a missing CSV cell) yields ``""`` without calling the model.
        chunk_size: Maximum number of characters per chunk passed to
            ``chunk_text``.
        pause_seconds: Seconds to sleep after each chunk request.

    Returns:
        The translated chunks joined with single spaces.
    """
    # A missing transcript read from CSV is a float NaN, which has no
    # ``isspace``; check the type first so such rows do not raise.
    if not isinstance(text, str) or not text.strip():
        return ""

    chunks = chunk_text(text, chunk_size)
    total = len(chunks)
    translated_chunks = []

    for position, chunk in enumerate(chunks, start=1):
        print(f"Translating chunk {position}/{total} ({len(chunk)} characters)")
        translated_chunks.append(get_translation(chunk))
        # Pause after every request, including the last one, so the caller's
        # next request is spaced out as well.
        time.sleep(pause_seconds)

    return " ".join(translated_chunks)

def translate_time_aligned_transcripts(time_aligned_dict: dict) -> dict:
    """Translate every value of a time-range -> text mapping.

    ``time_aligned_dict`` may be a dict or the string repr of one (parsed
    with ``ast.literal_eval``). Keys are kept; each value is replaced by the
    result of ``get_translation``. Any exception while parsing or iterating
    is printed and an empty dict is returned.
    """
    results = {}

    try:
        segments = time_aligned_dict
        if isinstance(segments, str):
            segments = ast.literal_eval(segments)

        for time_key, text in segments.items():
            print(f"Translating chunk for {time_key}")
            results[time_key] = get_translation(text)
            # One-second pause between consecutive requests.
            time.sleep(1)

    except Exception as e:
        print(f"Error processing time-aligned transcripts: {str(e)}")
        return {}

    return results


In [None]:
# Read the API key from the environment instead of committing it to the notebook.
# SECURITY: a real key used to be hard-coded on this line; treat that key as
# leaked and revoke it.
if not os.environ.get("GOOGLE_API_KEY"):
    from getpass import getpass  # local import: only needed when the key is not preset
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")

llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.7)

df = pd.read_csv('/kaggle/input/kannada-transcript/kannada_english_transcriptions.csv')

if 'full_transcript_translation' not in df.columns:
    df['full_transcript_translation'] = None
if 'time_aligned_translations' not in df.columns:
    df['time_aligned_translations'] = None

# Process files 48-end
for idx in range(48, len(df)):
    row = df.iloc[idx]
    print(f"\nProcessing file {idx - 47}/{len(df) - 48}: {row['audio_file']}")

    try:
        is_english = row['audio_file'] == "SandalWoodNewsStories_53.mp3"

        if is_english:
            # This recording is already in English: copy the transcript over
            # unchanged instead of sending it to the translator.
            print("Skipping English audio file...")
            df.at[idx, 'full_transcript_translation'] = row['full_transcript']
            df.at[idx, 'time_aligned_translations'] = row['time_aligned_transcripts']
        else:
            # Only translate if not already translated
            if pd.isna(df.at[idx, 'full_transcript_translation']):
                print("Translating full transcript...")
                full_translation = translate_large_text(row['full_transcript'])
                df.at[idx, 'full_transcript_translation'] = full_translation

            if pd.isna(df.at[idx, 'time_aligned_translations']):
                print("Translating time-aligned transcripts...")
                time_aligned_translations = translate_time_aligned_transcripts(row['time_aligned_transcripts'])
                df.at[idx, 'time_aligned_translations'] = str(time_aligned_translations)

        # Save progress after each file. This now also runs for the copied
        # English file, which previously skipped the save via `continue`.
        df.iloc[48:].to_csv('translations_person3.csv', index=False, encoding='utf-8')
        if not is_english:
            print(f"Completed translation for {row['audio_file']}")

    except Exception as e:
        # Best effort: report the failure and move on to the next file.
        print(f"Error processing row {idx}: {str(e)}")
        continue

print("\nPerson 3's translations completed!")


# Combining all three CSV files into one

In [None]:
import pandas as pd

# Load the three per-person translation CSV files
part_paths = (
    '/kaggle/input/first2people/translations_person1.csv',
    '/kaggle/input/first2people/translations_person2.csv',
    '/kaggle/working/translations_person3.csv',
)
csv1, csv2, csv3 = (pd.read_csv(path) for path in part_paths)

# Stack the three parts row-wise and renumber the index from 0
combined_csv = pd.concat([csv1, csv2, csv3], ignore_index=True)

# Save the combined data to a new CSV file
combined_csv.to_csv('allfilescombined.csv', index=False)

print("The three files have been combined and saved to 'allfilescombined.csv'")


In [None]:
import pandas as pd
# Load the combined translations file and preview its first rows.
combined_path = "/kaggle/input/final-translated-all/allfilescombined.csv"
df = pd.read_csv(combined_path)
df.head()

Unnamed: 0,audio_file,full_transcript,time_aligned_transcripts,full_transcript_translation,time_aligned_translations
0,SandalWoodNewsStories_179.mp3,([' ಒಂದು ಕಿಲೋ ಎಂಟು ಸಾವಿರರಿಂದ ಹನ್ನೆರಡು ಸಾವಿರ ರೂ...,"{'0.00s - 10.00s': ""([' ಒಂದು ಕಿಲೋ ಎಂಟು ಸಾವಿರರಿ...",([' One kg is eight thousand to twelve thousan...,{'0.00s - 10.00s': '([‘One kilogram is between...
1,SandalWoodNewsStories_168.mp3,([' ಗಂಧದ ಗುಡಿಯಲ್ಲಿ ಹುಟ್ಟಿರುವ ಕೃಷಿಕರೇ ನೀವು ಕೃಷಿ...,"{'0.00s - 10.00s': ""([' ಗಂಧದ ಗುಡಿಯಲ್ಲಿ ಹುಟ್ಟಿರ...",([' You are farmers born in the sandalwood for...,"{'0.00s - 10.00s': ""( ['You are a farmer who w..."
2,SandalWoodNewsStories_43.mp3,([' ಹಾಯ ಇಂಡಿಯಾ ಸೋ ಹೀರೋಸ್ ಆಕ್ಚುಲಿ ನನ್ನ ಬ್ಯಾಂಗಗಳ...,"{'0.00s - 10.00s': ""([' ಹಾಯ ಇಂಡಿಯಾ ಸೋ ಹೀರೋಸ್ ಆ...",([' Hey India so heroes actually my Bangalurul...,"{'0.00s - 10.00s': ""[' Hi India so heroes actu..."
3,SandalWoodNewsStories_176.mp3,([' ನೋಡಿ ಅರಣ್ಯ ನಮ್ ಸರ್ಕಾರ ಸುಮಾರು ಸ್ಕೆಮ್ ಗಳು ಕೊ...,"{'0.00s - 10.00s': ""([' ನೋಡಿ ಅರಣ್ಯ ನಮ್ ಸರ್ಕಾರ ...","([' Look, our government is giving around sche...","{'0.00s - 10.00s': ""([' Look, our government i..."
4,SandalWoodNewsStories_284.mp3,([' ಗೆಳೆಯ ರೇ ಚಂದನವನದಿಂದ ಮತ್ತೀಗ ನಾನು ತಾವರೆ ಸೋಟಕ...,"{'0.00s - 10.00s': ""([' ಗೆಳೆಯ ರೇ ಚಂದನವನದಿಂದ ಮತ...",[(' Oh friend I am coming now to the lotus pon...,"{'0.00s - 10.00s': ""([' Friend I'm coming to t..."


# Retrieval Part via RAG

In [None]:
import pandas as pd
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
import ast
import re

def load_and_prepare_data(csv_path):
    """Read the translations CSV and parse its two dict-valued columns.

    ``time_aligned_transcripts`` and ``time_aligned_translations`` are stored
    in the CSV as the string repr of a dict. Each cell is parsed with
    ``ast.literal_eval``; a cell that is missing (NaN), malformed, or does not
    evaluate to a dict becomes ``{}`` so one bad row cannot abort loading.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        The DataFrame with both columns converted to dicts.
    """
    def parse_mapping(cell):
        # Missing cells arrive as float NaN; only strings can be parsed.
        if isinstance(cell, dict):
            return cell
        if not isinstance(cell, str):
            return {}
        try:
            parsed = ast.literal_eval(cell)
        except (ValueError, SyntaxError):
            return {}
        return parsed if isinstance(parsed, dict) else {}

    df = pd.read_csv(csv_path)

    for column in ('time_aligned_transcripts', 'time_aligned_translations'):
        df[column] = df[column].apply(parse_mapping)

    return df

def create_documents(df):
    """Build one ``Document`` per time-aligned translation segment.

    The translated text is the page content. The metadata carries the audio
    file name, the segment's time range, the original text for the same time
    range ('' when absent) and the row's full translation.
    """
    return [
        Document(
            page_content=translation,
            metadata={
                'audio_file': row['audio_file'],
                'time_range': time_range,
                'original_text': row['time_aligned_transcripts'].get(time_range, ''),
                'full_translation': row['full_transcript_translation'],
            },
        )
        for _, row in df.iterrows()
        for time_range, translation in row['time_aligned_translations'].items()
    ]

# Initialize vector store
def initialize_vector_store(documents):
    """Embed ``documents`` with the multilingual MPNet model and index them in FAISS."""
    model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    # The embedding model is built first, then handed to FAISS with the documents.
    return FAISS.from_documents(documents, HuggingFaceEmbeddings(model_name=model_name))

def answer_question(question, vector_store, k=3):
    """Print and return the ``k`` passages most similar to ``question``.

    The passages come from ``vector_store.similarity_search``; for each one
    the audio file, time range, English translation and original text are
    printed. The list of retrieved documents is returned unchanged.
    """
    matches = vector_store.similarity_search(question, k=k)

    print("\nRelevant passages found:")
    print("-" * 80)

    rank = 0
    for match in matches:
        rank += 1
        print(f"\nPassage {rank}:")
        print(f"Audio File: {match.metadata['audio_file']}")
        print(f"Time Range: {match.metadata['time_range']}")
        print(f"English Translation: {match.page_content}")
        print(f"Original Kannada: {match.metadata['original_text']}")
        print("-" * 40)

    return matches

def setup_qa_system(csv_path):
    """Run the load -> documents -> vector store pipeline and return the store.

    A progress line is printed before each stage.
    """
    # Each stage consumes the previous stage's result.
    stages = (
        ("Loading data...", load_and_prepare_data),
        ("Creating documents...", create_documents),
        ("Initializing vector store...", initialize_vector_store),
    )

    artifact = csv_path
    for banner, stage in stages:
        print(banner)
        artifact = stage(artifact)

    return artifact

if __name__ == "__main__":
    # Build the index once from the combined translations file.
    csv_path = "/kaggle/input/final-translated-all/allfilescombined.csv"
    vector_store = setup_qa_system(csv_path)

    # Sample queries used to exercise retrieval.
    test_questions = [
        "What is mentioned about farmers?",
        "Is there any mention about government schemes?",
        "What is discussed about heroes?",
    ]

    print("\nTesting the system with sample questions:")
    for question in test_questions:
        question_banner = f"\nQuestion: {question}"
        print(question_banner)
        relevant_docs = answer_question(question, vector_store)

Loading data...
Creating documents...
Initializing vector store...





Testing the system with sample questions:

Question: What is mentioned about farmers?

Relevant passages found:
--------------------------------------------------------------------------------

Passage 1:
Audio File: SandalWoodNewsStories_156.mp3
Time Range: 290.00s - 300.00s
English Translation: ([Farmers are tilling the soil and doing all the ploughing and sowing in the agricultural land.], [Farmers are tilling the soil and doing all the ploughing and sowing in the agricultural land.])
Original Kannada: ([' ಫಾರ್ಮರ್ಸ್ ಆಗ್ರಿಕಲ್ಚರ ಲಂಡ್ ಅಲ್ಲಿ ಫೂಲ ಉಳುಮೆ ಮಾಡ್ಕೊಂಡು ಪ್ಲೋಇಂಗ್ ಎಲ್ಲ ಮಾಡಿ ನೆಟ್ಬೋದು ಆ'], [' ಫಾರ್ಮರ್ಸ್ ಆಗ್ರಿಕಲ್ಚರ ಲಂಡ್ ಅಲ್ಲಿ ಫೂಲ ಉಳುಮೆ ಮಾಡ್ಕೊಂಡು ಪ್ಲೋಇಂಗ್ ಎಲ್ಲ ಮಾಡಿ ನೆಟ್ಬೋದು ಆ'])
----------------------------------------

Passage 2:
Audio File: SandalWoodNewsStories_42.mp3
Time Range: 900.00s - 910.00s
English Translation: ([' Farmers' Own '], [' Farmers' Own '])
Original Kannada: ([' ರೈತಾ್ ಯ ಅ ಆದ ಬ'], [' ರೈತಾ್ ಯ ಅ ಆದ ಬ'])
----------------------------------------

Passage 3:
Audio File:

In [None]:
pip install -U langchain-community

In [None]:
pip install langchain

In [None]:
import pandas as pd
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from pydub import AudioSegment
import ast
import re
import os
from typing import List, Dict, Tuple
import torch.nn.functional as F
import torch

class AudioRAGSystem:
    """
    Retrieval system over translated, time-aligned audio transcripts.

    On construction it loads the CSV, merges consecutive time-aligned segments
    into overlapping windows (one window starting at every segment), and
    indexes the English text of every window in a FAISS vector store.
    ``answer_question`` then returns the best-matching windows, re-ranked,
    together with the path of an audio clip cut from the source recording.
    """

    # Embedding model shared by the vector store and the re-ranking step.
    EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    # Longest span, in seconds, that one merged window may cover.
    MAX_WINDOW_SECONDS = 30.0
    # Characters removed from both ends of a token before keyword matching.
    _PUNCTUATION = ".,;:!?\"'()[]{}"

    def __init__(self, csv_path: str, audio_dir: str, output_dir: str = "./extracted_segments"):
        """
        Initialize the RAG system with paths for data and audio files.

        Args:
            csv_path: Path to the CSV file containing transcriptions and translations
            audio_dir: Directory containing the audio files
            output_dir: Directory to save extracted audio segments
        """
        self.csv_path = csv_path
        self.audio_dir = audio_dir
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

        # Embedding model, created on first use and then reused.
        self._embeddings = None
        # Cache for loaded audio files
        self.audio_cache = {}

        # Load data and initialize system
        self.df = self.load_and_prepare_data()
        self.documents = self.create_documents()
        self.vector_store = self.initialize_vector_store()

    def parse_time(self, time_str: str) -> float:
        """
        Parse time string to float, handling various formats.

        Args:
            time_str: Time string (e.g., "0.00s", "0.00s ", " 0.00s", etc.)

        Returns:
            float: Time in seconds

        Raises:
            ValueError: If the remaining text is not a number.
        """
        cleaned = time_str.strip().rstrip('s').strip()
        return float(cleaned)

    def load_and_prepare_data(self) -> pd.DataFrame:
        """Load the CSV and parse the two dict-valued columns.

        Cells that are missing, malformed, or do not evaluate to a dict become
        ``{}`` so that such rows simply contribute no documents.
        """
        df = pd.read_csv(self.csv_path)

        def safe_eval(x):
            try:
                parsed = ast.literal_eval(x) if isinstance(x, str) else x
            except (ValueError, SyntaxError):
                return {}
            # NaN cells and non-dict literals would later fail on ``.keys()``.
            return parsed if isinstance(parsed, dict) else {}

        df['time_aligned_transcripts'] = df['time_aligned_transcripts'].apply(safe_eval)
        df['time_aligned_translations'] = df['time_aligned_translations'].apply(safe_eval)
        return df

    def _merge_segments(self, translations: Dict, transcripts: Dict,
                        max_window_seconds: float = MAX_WINDOW_SECONDS) -> List[Dict]:
        """Merge consecutive segments into windows of at most ``max_window_seconds``.

        Args:
            translations: Mapping "<start>s - <end>s" -> translated text.
            transcripts: Mapping with the same keys -> original text.
            max_window_seconds: Longest allowed ``end - start`` of a window.

        Returns:
            One dict per window with keys ``start_time``, ``end_time``,
            ``text`` and ``original``. Windows without translated text or
            without original text are omitted.

        Raises:
            ValueError / IndexError: If a key is not a parsable time range.
        """
        # Parse every key once: (start, end, key), ordered by start time.
        bounds = []
        for key in translations.keys():
            parts = key.split('-')
            bounds.append((self.parse_time(parts[0]), self.parse_time(parts[1]), key))
        bounds.sort(key=lambda item: item[0])

        windows = []
        for i in range(len(bounds)):
            start_time = bounds[i][0]
            end_time = None
            texts, originals = [], []

            for j in range(i, len(bounds)):
                segment_end, key = bounds[j][1], bounds[j][2]
                if segment_end - start_time > max_window_seconds:
                    break
                # Record the end of the last segment actually included, not
                # of the segment that overflowed the window.
                end_time = segment_end

                translation = translations.get(key, "")
                transcript = transcripts.get(key, "")
                if isinstance(translation, str) and translation:
                    texts.append(translation)
                if isinstance(transcript, str) and transcript:
                    originals.append(transcript)

            text = " ".join(texts).strip()
            original = " ".join(originals).strip()
            if not text or not original:
                continue

            windows.append({
                'start_time': start_time,
                'end_time': end_time,
                'text': text,
                'original': original,
            })

        return windows

    def create_documents(self, max_window_seconds: float = MAX_WINDOW_SECONDS) -> "List[Document]":
        """Create documents for vector store with merged time segments.

        Args:
            max_window_seconds: Longest span a merged window may cover.

        Returns:
            One Document per window; a row that fails to process is reported
            with ``print`` and skipped.
        """
        documents = []

        for idx, row in self.df.iterrows():
            try:
                windows = self._merge_segments(
                    row['time_aligned_translations'],
                    row['time_aligned_transcripts'],
                    max_window_seconds,
                )
                for window in windows:
                    metadata = {
                        'audio_file': row['audio_file'],
                        'start_time': window['start_time'],
                        'end_time': window['end_time'],
                        'time_range': f"{window['start_time']:.2f}s - {window['end_time']:.2f}s",
                        'original_text': window['original'],
                        'full_translation': row.get('full_transcript_translation', '')
                    }
                    documents.append(Document(page_content=window['text'], metadata=metadata))
            except Exception as e:
                print(f"Error processing row {idx}: {str(e)}")
                continue

        return documents

    def _get_embeddings(self):
        """Return the embedding model, creating it once and reusing it afterwards."""
        embeddings = getattr(self, "_embeddings", None)
        if embeddings is None:
            embeddings = HuggingFaceEmbeddings(model_name=self.EMBEDDING_MODEL_NAME)
            self._embeddings = embeddings
        return embeddings

    def initialize_vector_store(self) -> "FAISS":
        """Initialize the FAISS vector store."""
        return FAISS.from_documents(self.documents, self._get_embeddings())

    @staticmethod
    def _keyword_overlap(question: str, text: str) -> float:
        """Return the fraction of distinct question words that occur in ``text``.

        Words are lower-cased and stripped of surrounding punctuation, so
        "farmers?" matches "Farmers". A question without words scores 0.0.
        """
        def words(value: str) -> set:
            tokens = {token.strip(AudioRAGSystem._PUNCTUATION) for token in value.lower().split()}
            tokens.discard("")
            return tokens

        question_words = words(question)
        if not question_words:
            return 0.0
        return len(question_words & words(text)) / len(question_words)

    def calculate_relevance_score(self, question: str, doc: "Document") -> float:
        """
        Calculate a relevance score for a document relative to the question.
        Uses a combination of semantic similarity and keyword matching
        (0.7 * cosine similarity + 0.3 * keyword overlap).
        Returns 0.0 if scoring fails.
        """
        try:
            # Reuse the cached model instead of loading it for every document.
            embeddings = self._get_embeddings()
            question_emb = embeddings.embed_query(question)
            doc_emb = embeddings.embed_query(doc.page_content)

            semantic_score = F.cosine_similarity(
                torch.tensor(question_emb).unsqueeze(0),
                torch.tensor(doc_emb).unsqueeze(0)
            ).item()

            keyword_score = self._keyword_overlap(question, doc.page_content)

            return 0.7 * semantic_score + 0.3 * keyword_score
        except Exception as e:
            print(f"Error calculating relevance score: {str(e)}")
            return 0.0

    def extract_audio_segment(self, audio_file: str, start_time: float, end_time: float) -> str:
        """Extract and save an audio segment.

        Args:
            audio_file: File name inside ``audio_dir``.
            start_time: Segment start in seconds.
            end_time: Segment end in seconds.

        Returns:
            Path of the exported mp3, or "" if loading or exporting failed.
        """
        try:
            # Decode each source file once; later requests slice the cached audio.
            if audio_file not in self.audio_cache:
                audio_path = os.path.join(self.audio_dir, audio_file)
                self.audio_cache[audio_file] = AudioSegment.from_mp3(audio_path)

            audio = self.audio_cache[audio_file]

            # The cached audio is sliced by millisecond offsets.
            start_ms = int(start_time * 1000)
            end_ms = int(end_time * 1000)

            segment = audio[start_ms:end_ms]

            output_filename = f"segment_{audio_file}_{start_time:.2f}_{end_time:.2f}.mp3"
            output_path = os.path.join(self.output_dir, output_filename)
            segment.export(output_path, format='mp3')

            return output_path
        except Exception as e:
            print(f"Error extracting audio segment: {str(e)}")
            return ""

    def answer_question(self, question: str, k: int = 5) -> List[Dict]:
        """
        Answer a question by retrieving and ranking relevant passages.
        Also extracts corresponding audio segments.

        Args:
            question: The query text.
            k: Number of passages to retrieve from the vector store.

        Returns:
            Result dicts sorted by descending relevance score; [] on failure.
        """
        try:
            docs = self.vector_store.similarity_search(question, k=k)

            scored_docs = [
                (doc, self.calculate_relevance_score(question, doc))
                for doc in docs
            ]
            scored_docs.sort(key=lambda x: x[1], reverse=True)

            results = []
            for doc, score in scored_docs:
                audio_path = self.extract_audio_segment(
                    doc.metadata['audio_file'],
                    doc.metadata['start_time'],
                    doc.metadata['end_time']
                )

                result = {
                    'relevance_score': score,
                    'audio_file': doc.metadata['audio_file'],
                    'time_range': doc.metadata['time_range'],
                    'english_translation': doc.page_content,
                    'original_kannada': doc.metadata['original_text'],
                    'extracted_audio_path': audio_path
                }
                results.append(result)

            return results
        except Exception as e:
            print(f"Error answering question: {str(e)}")
            return []

    def print_results(self, results: List[Dict]):
        """Print results in a formatted way."""
        if not results:
            print("\nNo relevant passages found.")
            return

        print("\nRelevant passages found (ranked by relevance):")
        print("-" * 80)

        for i, result in enumerate(results, 1):
            print(f"\nPassage {i} (Relevance Score: {result['relevance_score']:.3f}):")
            print(f"Audio File: {result['audio_file']}")
            print(f"Time Range: {result['time_range']}")
            print(f"English Translation: {result['english_translation']}")
            print(f"Original Kannada: {result['original_kannada']}")
            print(f"Extracted Audio: {result['extracted_audio_path']}")
            print("-" * 40)

In [None]:

# Kaggle locations of the combined translations, the source recordings and
# the folder that receives the extracted clips.
csv_path = "/kaggle/input/final-translated-all/allfilescombined.csv"
audio_dir = "/kaggle/input/audio-kannada/audiocorpus"
output_dir = "/kaggle/working/extracted_segments2"

# Constructing the system loads the CSV and builds the vector index.
rag_system = AudioRAGSystem(
    csv_path=csv_path,
    audio_dir=audio_dir,
    output_dir=output_dir,
)

def interactive_qa():
    """Prompt for questions in a loop and print the answers from ``rag_system``.

    The loop ends when the user enters 'quit' (any case, surrounding
    whitespace ignored) or when input is exhausted. Blank input is ignored
    instead of being sent to the retriever.
    """
    while True:
        try:
            question = input("\nEnter your question (or 'quit' to exit): ").strip()
        except EOFError:
            # No more input available (e.g. stdin closed): leave the loop.
            break

        if question.lower() == 'quit':
            break
        if not question:
            continue

        print("\nSearching for answer...")
        results = rag_system.answer_question(question)
        rag_system.print_results(results)

if __name__ == "__main__":
    # Run one fixed query first so the cell produces output before any
    # user input is requested.
    sample_question = "What is mentioned about farmers?"
    results = rag_system.answer_question(sample_question)
    rag_system.print_results(results)

    # Then hand control to the prompt loop.
    interactive_banner = "\nEntering interactive mode..."
    print(interactive_banner)
    interactive_qa()




Relevant passages found (ranked by relevance):
--------------------------------------------------------------------------------

Passage 1 (Relevance Score: 0.661):
Audio File: SandalWoodNewsStories_9.mp3
Time Range: 650.00s - 690.00s
English Translation: ([My contact with farmers is continuous, different organizations come and hundreds of farmers come under one roof, they themselves come and gather there], [My contact with farmers is continuous, different organizations come and hundreds of farmers come under one roof, they themselves come and gather there]) ([' Every third Sunday of the month, we have a group discussion where farmers can discuss any confusions they have about agroforestry, what to do and not to do'], [' Every third Sunday of the month, we have a group discussion where farmers can discuss any confusions they have about agroforestry, what to do and not to do']) ([' And so what I do is every third Sunday of the month at 9 am, I have a session on agro forestry models on 


Enter your question (or 'quit' to exit):  quit
