In [None]:
from google.colab import drive
import pandas as pd
# Mount Google Drive at /content/drive so files stored on it are reachable from this notebook.
# No CSV is opened here; force_remount=True asks Colab to remount even if Drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Gemini 1.5 Flash

In [None]:
# Note cell: a bare string literal holding a redacted API-key placeholder.
# Nothing is assigned or configured by this cell.
''' Gemini API Key
*********
'''

In [None]:
import json
import os
import subprocess  # send API requests via `curl`
import time

import google.generativeai as genai
import pandas as pd
from google.colab import drive
from tqdm import tqdm  # progress bar


# API key
# Prefer the GEMINI_API_KEY environment variable so the real key never has to be
# typed into (and saved with) the notebook. The original placeholder is kept as
# the fallback, so pasting a key here still works exactly as before.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "Insert your key ")
genai.configure(api_key=GEMINI_API_KEY)

# Function to split long interview transcripts into smaller chunks for processing
# NOTE(review): an earlier run reportedly produced 34 "Please provide summary"
# responses and chunking was suspected as the source — unconfirmed. This version
# at least guarantees that no empty chunk is ever emitted.
def chunk_text(text, max_length=3000):
    """Split ``text`` on whitespace into chunks of at most ``max_length`` characters.

    Words are never broken. Each chunk is the words joined by single spaces, so
    the original whitespace (newlines, repeated spaces) is normalised.

    Args:
        text: The transcript to split.
        max_length: Maximum number of characters per chunk, counting the
            single-space separators between words.

    Returns:
        A list of non-empty chunk strings, in order. Empty or whitespace-only
        input yields an empty list. A single word longer than ``max_length``
        is emitted as its own (over-long) chunk rather than being dropped.
    """
    chunks = []
    current_chunk = []
    current_length = 0  # always equals len(" ".join(current_chunk))

    for word in text.split():
        # A separator is only needed when the chunk already holds a word.
        added_length = len(word) + (1 if current_chunk else 0)

        # Flush only when there is something to flush; otherwise an over-long
        # first word would produce an empty chunk.
        if current_chunk and current_length + added_length > max_length:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length += added_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Function to summarize a single chunk using Gemini API
def summarize_chunk(text, retries=3):
    """Summarize one transcript chunk by POSTing it to Gemini 1.5 Flash with `curl`.

    Args:
        text: The transcript chunk to summarize.
        retries: Maximum number of attempts before giving up.

    Returns:
        The stripped text of the first candidate in the response, or ``None``
        when no attempt yields non-empty text. Failures are printed, not raised.
    """
    retry_delay_seconds = 5
    request_timeout_seconds = 120

    for attempt in range(retries):  # Retry mechanism in case of failures
        try:
            # Prompt sent to Gemini for this chunk
            prompt = (
                "Summarize the following two-way interview while preserving key proper names. "
                "Focus on main themes, important takeaways, and discussion points:\n\n" + text
            )

            # Request payload in the generateContent JSON shape
            payload = json.dumps({
                "contents": [ { "parts": [{ "text": prompt }] } ]
            })

            # --silent/--show-error keep stderr limited to real errors;
            # --max-time stops a stalled transfer from blocking the notebook.
            curl_command = [
                "curl", f"https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?key={GEMINI_API_KEY}",
                "--silent", "--show-error",
                "--max-time", str(request_timeout_seconds),
                "-H", "Content-Type: application/json",
                "-X", "POST",
                "-d", payload
            ]

            # The subprocess timeout is a backstop in case curl itself hangs.
            result = subprocess.run(
                curl_command,
                capture_output=True,
                text=True,
                timeout=request_timeout_seconds + 10,
            )
            if result.returncode != 0:
                raise RuntimeError(
                    f"curl exited with code {result.returncode}: {result.stderr.strip()}"
                )

            response = json.loads(result.stdout)

            # Report an API-level error (e.g. quota, bad key) instead of
            # treating it as an empty summary.
            api_error = response.get("error")
            if api_error:
                raise RuntimeError(f"Gemini API error: {api_error}")

            # `or [{}]` also covers keys that are present but hold an empty list.
            candidates = response.get("candidates") or [{}]
            parts = candidates[0].get("content", {}).get("parts") or [{}]
            summary = parts[0].get("text", "").strip()
            if summary:
                return summary

            print(f"⚠️ Attempt {attempt + 1}: Response contained no summary text.")

        except Exception as e:
            print(f"⚠️ Attempt {attempt + 1}: Error during summarization: {e}")

        # Only wait when another attempt will actually follow.
        if attempt < retries - 1:
            time.sleep(retry_delay_seconds)

    print("❌ Failed to summarize after retries.")
    return None  # Return None if all retries fail

# Build one summary for a whole transcript from its per-chunk summaries
def generate_summary(text):
    """Summarize a full interview transcript.

    The transcript is split with ``chunk_text`` (3000-character chunks), every
    chunk is passed to ``summarize_chunk``, and the non-empty results are joined
    with single spaces in chunk order. Chunks whose summary is missing are
    left out of the result.

    Returns:
        The combined summary string, or ``None`` if no chunk produced any text.
    """
    pieces = []
    for segment in chunk_text(text, max_length=3000):
        partial = summarize_chunk(segment)
        if partial:  # drop None / empty results
            pieces.append(partial)

    combined = " ".join(pieces)
    if not combined:
        return None
    return combined

# Load interview transcripts from CSV
# NOTE(review): this path has no file name and looks like a folder — confirm it
# points at the actual CSV before running.
csv_path = '/content/drive/MyDrive/Colab Notebooks'
df = pd.read_csv(csv_path)

# Ensure 'Summary' column exists in the DataFrame
if 'Summary' not in df.columns:
    df['Summary'] = None
# An existing but completely empty column is read as float64; store it as
# object so string summaries can be written and compared safely.
df['Summary'] = df['Summary'].astype(object)

# Tracking API request limits and batch processing
batch_count = 0  # API requests since the last pause
total_requests = 0  # API requests made in this run
BATCH_LIMIT = 15  # Gemini 1.5 Flash allows 15 requests per minute
TOTAL_LIMIT = 1500  # Stop processing after 1500 requests
BATCH_PAUSE_SECONDS = 60  # Pause length once BATCH_LIMIT is reached

# Filter rows that still need summarization (missing or whitespace-only summary).
# Filling and casting first avoids the `.str` accessor failing on a column that
# holds no strings at all.
existing_summaries = df['Summary'].fillna("").astype(str).str.strip()
remaining_rows = df[existing_summaries == ""]

print(f"🔍 Found {len(remaining_rows)} interviews that still need summaries.")

# Iterate over remaining rows and generate summaries. The try/finally makes sure
# everything summarized so far is written out even if the run is interrupted.
try:
    for index, row in tqdm(remaining_rows.iterrows(), total=len(remaining_rows), desc="Summarizing Interviews"):
        text = row.get('RecordingTranscription_1', None)

        if isinstance(text, str) and text.strip():  # Ensure text is valid
            summary = generate_summary(text)
            if summary:
                df.at[index, 'Summary'] = summary  # Save summary in DataFrame
            else:
                print(f"⚠️ Warning: Failed to generate summary for row {index}, skipping.")

            # generate_summary issues one request per 3000-character chunk
            # (chunk_text's default), so count chunks rather than rows.
            # Retries are not counted, so this is a lower bound.
            requests_made = len(chunk_text(text))
            batch_count += requests_made
            total_requests += requests_made

        else:
            print(f"Skipping row {index} (Invalid text)")

        # Save progress periodically to prevent data loss
        if batch_count >= BATCH_LIMIT:
            print("💾 Saving progress...")
            df.to_csv(csv_path, index=False, encoding='utf-8-sig')

            print(f"⏳ Waiting {BATCH_PAUSE_SECONDS} seconds before processing the next batch...")
            time.sleep(BATCH_PAUSE_SECONDS)  # Wait before processing next batch
            batch_count = 0

        # Stop after reaching API request limit
        if total_requests >= TOTAL_LIMIT:
            print(f"🚀 Reached {TOTAL_LIMIT} requests. Please resume processing on the next day.")
            break
finally:
    # Final save: runs on normal completion, on the request-limit break, and on
    # interruption (e.g. KeyboardInterrupt).
    df.to_csv(csv_path, index=False, encoding='utf-8-sig')

    print(f"✅ Summarized data saved to {csv_path}")


🔍 Found 2157 interviews that still need summaries.


Summarizing Interviews:   0%|          | 5/2157 [00:11<1:20:10,  2.24s/it]


KeyboardInterrupt: 