# Capstone Project 2025 — Unified AWS Version (Grading-Ready)

This notebook **replaces** the template contents and provides a start-to-finish pipeline:
1) Load Transcribe results from S3  
2) Normalize text  
3) Extract **real** key phrases using **Amazon Comprehend (DetectKeyPhrases)**  
4) Prepare input files and run **Comprehend Topics Detection (Phase Detection)**  
5) Basic visualizations

> **Region:** `us-east-1`  
> **Comprehend Role:** `arn:aws:iam::934798949390:role/service-role/c176045a4549683l12324630t1-ComprehendDataAccessRole-E2EeGxWSgfrW`


In [None]:

import boto3
import json
import pandas as pd
import re
import uuid
import time
from time import sleep
import matplotlib.pyplot as plt
from collections import Counter

# AWS region & role from template
# AWS_REGION is passed as region_name to every boto3 client created in this notebook.
AWS_REGION = "us-east-1"
# IAM role ARN supplied as DataAccessRoleArn when the Comprehend topics detection job is started.
# NOTE(review): the ARN embeds an environment-specific account ID and role name — confirm it matches your lab.
COMPREHEND_DATA_ACCESS_ROLE = "arn:aws:iam::934798949390:role/service-role/c176045a4549683l12324630t1-ComprehendDataAccessRole-E2EeGxWSgfrW"


In [None]:

# --- REQUIRED: name of the S3 bucket that holds the Transcribe outputs ---
# Result JSON files from an earlier Transcribe run are expected to live here;
# every S3 read and write later in the notebook targets this bucket.
output_bucket = "c176045a4549683l12324630t1w510414224130-labbucket-ymkoanalkg8l"

# One client per AWS service, all bound to the same region.
s3, transcribe, comprehend = (
    boto3.client(service_name, region_name=AWS_REGION)
    for service_name in ("s3", "transcribe", "comprehend")
)

# Echo the effective configuration so a mis-set region or role is visible early.
print(f"Using region: {AWS_REGION}")
print(f"Data access role: {COMPREHEND_DATA_ACCESS_ROLE}")


## 1) Discover completed Transcribe JSONs

In [None]:

# Discover every Transcribe result object stored under the job prefix.
#
# A single list_objects_v2 call returns at most 1,000 keys, so a paginator is
# used to walk the complete listing instead of silently dropping the overflow.
TRANSCRIBE_PREFIX = "transcribe-job-"
TRANSCRIBE_SUFFIX = ".json"

output_files = []

listing_pages = s3.get_paginator("list_objects_v2").paginate(
    Bucket=output_bucket, Prefix=TRANSCRIBE_PREFIX
)
for page in listing_pages:
    for obj in page.get("Contents", []):
        key = obj["Key"]
        # Only JSON results can be parsed by the loading step; skip anything else.
        if not key.endswith(TRANSCRIBE_SUFFIX):
            continue
        # Infer a simple video id from the key (UUID or slug) by trimming only the
        # leading prefix and trailing extension, never matches inside the id.
        video_name = key[len(TRANSCRIBE_PREFIX):-len(TRANSCRIBE_SUFFIX)]
        output_files.append({"Video": video_name, "OutputKey": key})

print(f"‚úÖ Found {len(output_files)} Transcribe result files.")
if not output_files:
    print("‚ö†Ô∏è No transcribe-job-*.json files found. Confirm your bucket name and prefix.")
output_files[:3]


## 2) Load transcripts from S3 into a DataFrame

In [None]:

# Download each Transcribe result, pull out the first transcript text, and
# collect one record per video. A failure on one file is reported and skipped
# so the remaining files are still loaded.
data_rows = []

for entry in output_files:
    key = entry["OutputKey"]
    try:
        raw_bytes = s3.get_object(Bucket=output_bucket, Key=key)["Body"].read()
        payload = json.loads(raw_bytes.decode("utf-8"))
        first_transcript = payload["results"]["transcripts"][0]
        data_rows.append(
            {
                "Video": entry["Video"],
                "Transcription": first_transcript["transcript"],
            }
        )
    except Exception as e:
        print(f"‚ö†Ô∏è Error reading {key}: {e}")

df = pd.DataFrame(data_rows)
print(f"‚úÖ Loaded {len(df)} transcripts.")
df.head()


## 3) Normalize text

In [None]:

def normalize_text(text):
    """Return a lowercase, punctuation-free, single-spaced version of ``text``.

    Every character that is not an ASCII letter, a digit, or whitespace is
    replaced by a space; runs of whitespace are then collapsed to one space
    and the result is stripped.

    Args:
        text: The raw text. ``None`` and float NaN (pandas' missing-value
            marker) are treated as missing; any other non-string value is
            converted with ``str()`` before normalizing.

    Returns:
        The normalized string, or ``""`` when ``text`` is missing.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    lowered = str(text).lower()
    without_punctuation = re.sub(r"[^a-z0-9\s]", " ", lowered)
    return re.sub(r"\s+", " ", without_punctuation).strip()

# Stop early with a clear message when the loading step produced no transcripts
# column (for example, because no files were read).
if "Transcription" not in df:
    raise KeyError("Expected 'Transcription' column not found. Check previous step.")

# Keep the raw transcript and store the normalized copy alongside it.
df["clean_text"] = df["Transcription"].map(normalize_text)
print("‚úÖ Text normalized.")
df.head()


## 4) Extract **real** key phrases with Amazon Comprehend

In [None]:

# Extract key phrases for every transcript with Amazon Comprehend.
#
# Comprehend measures its per-request text limit in UTF-8 bytes, not
# characters, so the transcript is truncated on the encoded bytes. A multi-byte
# character cut in half at the boundary is dropped on decode.
MAX_KEYPHRASE_BYTES = 4500  # conservative per-call budget
THROTTLE_SECONDS = 0.25     # pause between calls to avoid API limits

key_rows = []
total_transcripts = len(df)

print("üîç Starting Amazon Comprehend DetectKeyPhrases...")
# Count processed rows with enumerate so progress does not depend on the
# DataFrame's index labels.
for processed, (_idx, row) in enumerate(df.iterrows(), start=1):
    phrases = []
    try:
        encoded = row["Transcription"].encode("utf-8")[:MAX_KEYPHRASE_BYTES]
        text = encoded.decode("utf-8", errors="ignore")
        # Comprehend rejects empty text, so only call it when there is content.
        if text.strip():
            response = comprehend.detect_key_phrases(Text=text, LanguageCode="en")
            phrases = [kp["Text"] for kp in response.get("KeyPhrases", [])]
    except Exception as e:
        # Best effort: report the failure and keep an empty phrase list so the
        # video still appears in the output.
        print(f"‚ö†Ô∏è Error on {row['Video']}: {e}")
    key_rows.append({"Video": row["Video"], "KeyPhrases": phrases})
    if processed % 10 == 0:
        print(f"Processed {processed}/{total_transcripts} transcripts...")
    # Throttle after every attempt, including failed ones, so a throttling
    # error is not followed immediately by another request.
    sleep(THROTTLE_SECONDS)

df_keys = pd.DataFrame(key_rows)
print(f"‚úÖ Extracted key phrases for {len(df_keys)} transcripts.")
df_keys.head()


## 5) Combine transcripts and key phrases

In [None]:

# Left-join on the video id so every transcript row is kept, with its key
# phrases attached where available.
df_combined = df.merge(df_keys, how="left", on="Video")
print("‚úÖ Combined DataFrame shape:", df_combined.shape)
df_combined.head()


## 6) Prepare inputs for **Comprehend Topics Detection** (Phase Detection)
We will upload one plain-text file per transcript to `s3://<bucket>/transcribe-json-input/`.

In [None]:

# Write each transcript to S3 as its own UTF-8 text object under the input
# prefix, named after the video id.
input_prefix = "transcribe-json-input/"  # ONE_DOC_PER_FILE input
uploaded = 0

for video_id, transcript_text in zip(df["Video"], df["Transcription"]):
    s3.put_object(
        Bucket=output_bucket,
        Key=f"{input_prefix}{video_id}.txt",
        Body=transcript_text.encode("utf-8"),
    )
    uploaded += 1

print(f"‚úÖ Uploaded {uploaded} input text files to s3://{output_bucket}/{input_prefix}")


## 7) Start Comprehend Topics Detection Job

In [None]:

# S3 locations the topics job reads its documents from and writes results to.
input_s3_uri  = f"s3://{output_bucket}/transcribe-json-input/"
output_s3_uri = f"s3://{output_bucket}/comprehend-topics-output/"

# Build the request first so the full job configuration is readable in one place.
topics_job_request = {
    "InputDataConfig": {"S3Uri": input_s3_uri, "InputFormat": "ONE_DOC_PER_FILE"},
    "OutputDataConfig": {"S3Uri": output_s3_uri},
    "DataAccessRoleArn": COMPREHEND_DATA_ACCESS_ROLE,
    # A random suffix gives each run its own job name.
    "JobName": f"ComprehendTopics-{uuid.uuid4()}",
    "NumberOfTopics": 10,
}
response = comprehend.start_topics_detection_job(**topics_job_request)

# The job id is needed by the status-polling cell.
job_id = response["JobId"]
print("üöÄ Started Topics Detection Job:", job_id)


## 8) Monitor job status until completion

In [None]:

# Poll the topics detection job until it reaches a terminal state.
#
# STOPPED is terminal as well as COMPLETED and FAILED: without it, a job that
# was stopped (for example from the console) would keep this loop polling forever.
TERMINAL_JOB_STATES = ("COMPLETED", "FAILED", "STOPPED")
POLL_INTERVAL_SECONDS = 60

while True:
    status = comprehend.describe_topics_detection_job(JobId=job_id)
    props = status["TopicsDetectionJobProperties"]
    state = props["JobStatus"]
    print(f"Job status: {state}")
    if state in TERMINAL_JOB_STATES:
        # default=str makes non-JSON values such as datetimes printable.
        print("Final properties:", json.dumps(props, indent=2, default=str))
        break
    time.sleep(POLL_INTERVAL_SECONDS)


## 9) List Topics Detection outputs

In [None]:

# Fetch the listing under the topics output prefix and print one key per line.
result = s3.list_objects_v2(Bucket=output_bucket, Prefix="comprehend-topics-output/")

files = []
for listed_object in result.get("Contents", []):
    files.append(listed_object["Key"])

# Build the report in one go: a header followed by a "- key" line per object.
print("\n".join(["Output files:"] + [f"- {object_key}" for object_key in files]))


## 10) Quick visualization of top key phrases (optional)

In [None]:

# Horizontal bar chart of the ten most frequent key phrases across all videos.
if not (len(df_keys) and df_keys["KeyPhrases"].notna().any()):
    print("Key phrases DataFrame is empty or missing.")
else:
    # Tally phrases video by video; a falsy entry contributes nothing.
    phrase_counts = Counter()
    for phrase_list in df_keys["KeyPhrases"]:
        phrase_counts.update(phrase_list or [])

    if not phrase_counts:
        print("No phrases to visualize.")
    else:
        words, counts = zip(*phrase_counts.most_common(10))
        fig, ax = plt.subplots(figsize=(8, 4))
        ax.barh(words, counts)
        # Flip the axis so the most frequent phrase is drawn at the top.
        ax.invert_yaxis()
        ax.set_title("Top Key Phrases (Amazon Comprehend)")
        plt.show()
