In [13]:
from google.colab import drive
import pandas as pd
# Mount Google Drive at /content/drive so that files stored there can be read.
# force_remount=True requests a fresh mount even when the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Gemini 1.5 Flash

In [None]:
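''' Gemini API Key
[REDACTED] Do not store the key in the notebook. Revoke the key that was pasted
here and supply a fresh one at runtime (environment variable or secret store).
'''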

In [14]:
import getpass
import json
import os
import subprocess  # Send API requests via `curl`
import time

import google.generativeai as genai
import pandas as pd
from google.colab import drive
from tqdm import tqdm  # Progress bar

# API Key
# Credentials must never be stored in the notebook itself: read the key from the
# GEMINI_API_KEY environment variable and fall back to a hidden interactive
# prompt when it is not set. Any key that was previously pasted into this
# notebook should be treated as leaked and revoked.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY") or getpass.getpass("Gemini API key: ")
genai.configure(api_key=GEMINI_API_KEY)

# Mount Google Drive
drive.mount('/content/drive')

# Function to split long transcripts into smaller chunks
def chunk_text(text, max_length=3000):
    """Split ``text`` into whitespace-delimited chunks of bounded size.

    Words are never split. Every word is budgeted as ``len(word) + 1`` (the
    word plus one separator) and a chunk is closed as soon as adding the next
    word would push that budget past ``max_length``, so a joined chunk is at
    most ``max_length - 1`` characters long. A single word that is larger than
    the budget is emitted as a chunk of its own (and therefore exceeds the
    limit) rather than being dropped or cut.

    Args:
        text: String to split. ``str.split()`` is used, so every run of
            whitespace collapses to a single space in the output.
        max_length: Size budget per chunk, in characters.

    Returns:
        A list of non-empty chunk strings in the original word order; an empty
        list when ``text`` contains no words.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for word in text.split():
        cost = len(word) + 1  # the word plus the space that joins it
        # Only flush when there is something to flush: an oversized first word
        # must not produce an empty leading chunk.
        if current_chunk and current_length + cost > max_length:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += cost

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Function to summarize a single chunk using Gemini API
def summarize_chunk(text, retries=3):
    """Summarize one transcript chunk via the Gemini ``generateContent`` endpoint.

    The request is sent by running ``curl`` in a subprocess and the JSON
    response is parsed for the text of the first part of the first candidate.
    Failed attempts (curl error, timeout, API error payload, unparsable or
    empty response) are reported on stdout and retried after a 5 second pause.

    Reads the module-level ``GEMINI_API_KEY``.

    Args:
        text: The chunk to summarize. Blank or non-string input is rejected
            without spending an API request.
        retries: Maximum number of attempts.

    Returns:
        The stripped summary text, or ``None`` when no attempt produced one.
    """
    # Nothing to summarize: do not waste a request (and quota) on it.
    if not isinstance(text, str) or not text.strip():
        return None

    # Gemini API Prompt
    prompt = (
        "Summarize the following two-way interview while preserving key proper names. "
        "Focus on main themes, important takeaways, and discussion points:\n\n" + text
    )

    # Request payload for API (identical for every attempt, so built once)
    payload = json.dumps({
        "contents": [ { "parts": [{ "text": prompt }] } ]
    })

    # NOTE(review): the key travels on the curl command line and is therefore
    # visible in the process list while the request is running.
    curl_command = [
        "curl", f"https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?key={GEMINI_API_KEY}",
        "-H", "Content-Type: application/json",
        "-X", "POST",
        "-d", payload
    ]

    for attempt in range(retries):
        try:
            # The timeout keeps one stalled connection from blocking the whole run.
            result = subprocess.run(curl_command, capture_output=True, text=True, timeout=120)
            if result.returncode != 0:
                raise RuntimeError(f"curl exited with code {result.returncode}: {result.stderr.strip()}")

            response = json.loads(result.stdout)

            # Surface the API's own error message instead of an opaque lookup failure.
            api_error = response.get("error")
            if api_error:
                message = api_error.get("message", api_error) if isinstance(api_error, dict) else api_error
                raise RuntimeError(f"API error: {message}")

            # Extract summarized text; tolerate missing or empty candidates/parts.
            candidates = response.get("candidates") or [{}]
            parts = candidates[0].get("content", {}).get("parts") or [{}]
            summary = parts[0].get("text", "").strip()

            if summary:
                return summary

            print(f"⚠️ Attempt {attempt + 1}: API returned no summary text.")

        except Exception as e:
            print(f"⚠️ Attempt {attempt + 1}: Error during summarization: {e}")

        # Pause only when another attempt will follow.
        if attempt < retries - 1:
            time.sleep(5)  # Retry delay

    print("❌ Failed to summarize after retries.")
    return None

# Function to generate a summary for a full transcript
def generate_summary(text):
    """Summarize a complete transcript chunk by chunk.

    The transcript is split with ``chunk_text`` (3000-character budget), each
    chunk is passed to ``summarize_chunk`` in order, and the chunk summaries
    that came back non-empty are joined with single spaces.

    Returns:
        The combined summary, or ``None`` when no chunk produced a summary.
    """
    partial_summaries = []
    for piece in chunk_text(text, max_length=3000):
        partial_summaries.append(summarize_chunk(piece))

    # Failed chunks (None / empty string) are left out of the combined text.
    combined = " ".join(filter(None, partial_summaries))
    return combined or None

# Load CSV from Google Drive
csv_path = "/content/drive/MyDrive/Colab Notebooks/SUU_AllStories_02_24_2024/Copy of Final_SUU_AllStories_02_24_2024_14_06_41_PM_Includes_SUMMARIES.csv"
df = pd.read_csv(csv_path)

# Ensure 'Summary' column exists
if 'Summary' not in df.columns:
    df['Summary'] = None

# read_csv types a completely empty column as float64 (all NaN). Cast to object
# so that string summaries can be stored and the blank check below cannot fail
# on the `.str` accessor.
df['Summary'] = df['Summary'].astype(object)

# Filter rows that still need summarization (missing or whitespace-only summary)
existing_summary = df['Summary'].fillna("").astype(str).str.strip()
remaining_rows = df[existing_summary == ""]

print(f"🔍 Found {len(remaining_rows)} interviews that need summaries.")

# API limits
BATCH_LIMIT = 15  # Gemini 1.5 Flash allows 15 requests per minute
TOTAL_LIMIT = 1500  # Stop after 1500 requests
batch_count = 0
total_requests = 0
updated = False  # Track if we need to save

# Process remaining rows
for index, row in tqdm(remaining_rows.iterrows(), total=len(remaining_rows), desc="Summarizing Interviews"):
    text = row.get('RecordingTranscription_1', None)

    if isinstance(text, str) and text.strip():
        summary = generate_summary(text)
        if summary:
            df.at[index, 'Summary'] = summary
            updated = True  # Mark that we made changes
        else:
            print(f"⚠️ Warning: Failed to generate summary for row {index}, skipping.")

        # generate_summary sends one request per chunk, so charge the quota per
        # chunk rather than per row. Retries inside summarize_chunk are not
        # visible here, which makes this a lower bound on the real request count.
        requests_made = len(chunk_text(text, max_length=3000))
        batch_count += requests_made
        total_requests += requests_made

    else:
        print(f"Skipping row {index} (Invalid text)")

    # Stop after reaching the request limit. Checked before the batch pause so
    # the loop does not sleep for a minute right before stopping; unsaved
    # changes are written by the final save below.
    if total_requests >= TOTAL_LIMIT:
        print(f"🚀 Reached {TOTAL_LIMIT} requests. Please resume processing tomorrow.")
        break

    # Save progress every batch and prevent API limit issues
    if batch_count >= BATCH_LIMIT:
        if updated:
            print("💾 Saving progress...")
            df.to_csv(csv_path, index=False, encoding='utf-8-sig')
            updated = False  # Reset update flag

        print("⏳ Waiting 60 seconds before processing the next batch...")
        time.sleep(60)
        batch_count = 0

# Final save if any updates were made
if updated:
    df.to_csv(csv_path, index=False, encoding='utf-8-sig')

print(f"✅ Summarized data saved to {csv_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
🔍 Found 0 interviews that need summaries.


Summarizing Interviews: 0it [00:00, ?it/s]

✅ Summarized data saved to /content/drive/MyDrive/Colab Notebooks/SUU_AllStories_02_24_2024/Copy of Final_SUU_AllStories_02_24_2024_14_06_41_PM_Includes_SUMMARIES.csv



