# Capstone Project 2025 — Unified AWS Version
This notebook combines working Transcribe jobs, real Amazon Comprehend key phrase extraction, and topic detection (topic modeling). Ready for AWS Academy grading.

In [None]:

import boto3
import json
import pandas as pd
import re
import uuid
import time
from time import sleep
import matplotlib.pyplot as plt
from collections import Counter


In [None]:

# Bucket that holds the Transcribe result files and receives the Comprehend
# topic-detection output written by the later cells.
output_bucket = "c176045a4549683l12324630t1w510414224130-labbucket-ymkoanalkg8l"

# Service clients. The S3 client uses the session's default region, while the
# Transcribe and Comprehend clients are pinned to us-east-1.
s3 = boto3.client("s3")
transcribe, comprehend = (
    boto3.client(service_name, region_name="us-east-1")
    for service_name in ("transcribe", "comprehend")
)


In [None]:

# Collect every object whose key starts with "transcribe-job-".
# A single list_objects_v2 call returns at most 1,000 keys, so walk all result
# pages with a paginator instead of silently dropping files in larger buckets.
paginator = s3.get_paginator("list_objects_v2")
output_files = []

for page in paginator.paginate(Bucket=output_bucket, Prefix="transcribe-job-"):
    # "Contents" is absent from a page when nothing matches the prefix.
    for obj in page.get("Contents", []):
        key = obj["Key"]
        # Display name = object key without the job prefix and ".json" suffix.
        video_name = key.replace("transcribe-job-", "").replace(".json", "")
        output_files.append({"Video": video_name, "OutputKey": key})

print(f"‚úÖ Found {len(output_files)} Transcribe result files.")


In [None]:

# Download each Transcribe result file and pull out its transcript text.
data_rows = []

for entry in output_files:
    key = entry["OutputKey"]
    try:
        obj = s3.get_object(Bucket=output_bucket, Key=key)
        data = json.loads(obj["Body"].read().decode("utf-8"))
        # The transcript text is read from the first entry of results.transcripts.
        transcript = data["results"]["transcripts"][0]["transcript"]
        data_rows.append({"Video": entry["Video"], "Transcription": transcript})
    except Exception as e:
        # Best effort: report the unreadable file and continue with the rest.
        print(f"‚ö†Ô∏è Error reading {key}: {e}")

# Name the columns explicitly so the frame still has "Video" and
# "Transcription" when no file could be loaded; otherwise the later
# df["Transcription"] lookup fails with a KeyError on an empty frame.
df = pd.DataFrame(data_rows, columns=["Video", "Transcription"])
print(f"‚úÖ Loaded {len(df)} transcripts.")
df.head()


In [None]:

def normalize_text(text):
    """Return a lowercase, punctuation-free, single-spaced copy of ``text``.

    Every character other than ``a-z``, ``0-9`` or whitespace is replaced by a
    space, runs of whitespace are collapsed to one space, and leading/trailing
    whitespace is removed.
    """
    lowered = text.lower()
    # Anything that is not a lowercase ASCII letter, digit or whitespace
    # becomes a space (so neighbouring words stay separated).
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
    # Collapse whitespace runs, then trim the ends.
    return re.sub(r"\s+", " ", alnum_only).strip()

# Add a normalized copy of every transcript next to the raw text.
df["clean_text"] = df["Transcription"].map(normalize_text)
print("‚úÖ Text normalized.")
df.head()


In [None]:

# Ask Amazon Comprehend for the key phrases of each transcript, one
# detect_key_phrases call per row. A row whose call fails is still recorded,
# with an empty phrase list.
key_rows = []
total = len(df)

print("üîç Starting real key-phrase extraction using Amazon Comprehend...")
for idx, record in df.iterrows():
    # Only the first 4500 characters of the raw transcript are sent.
    snippet = record["Transcription"][:4500]
    try:
        detection = comprehend.detect_key_phrases(Text=snippet, LanguageCode="en")
        phrases = [item["Text"] for item in detection["KeyPhrases"]]
        key_rows.append({"Video": record["Video"], "KeyPhrases": phrases})
        # Progress line whenever the row's index label is a multiple of 10.
        if idx % 10 == 0:
            print(f"Processed {idx+1}/{total} transcripts...")
        # Short pause after each successful call.
        sleep(0.25)
    except Exception as e:
        print(f"‚ö†Ô∏è Error on {record['Video']}: {e}")
        key_rows.append({"Video": record["Video"], "KeyPhrases": []})

df_keys = pd.DataFrame(key_rows)
print(f"‚úÖ Extracted key phrases for {len(df_keys)} transcripts.")
df_keys.head()


In [None]:

# Left join: keep every transcript row and attach its key phrases by "Video".
df_combined = df.merge(df_keys, on="Video", how="left")
df_combined.head()


In [None]:

# Flatten the per-video phrase lists and count how often each phrase occurs.
# DataFrame.get avoids a KeyError when df_keys is empty and has no columns.
all_phrases = [p for sub in df_keys.get("KeyPhrases", []) for p in sub]
top = Counter(all_phrases).most_common(10)

if top:
    # zip(*top) cannot be unpacked into two names when `top` is empty, so the
    # chart is only drawn when at least one phrase was extracted.
    phrases, counts = zip(*top)

    plt.figure(figsize=(8,4))
    plt.barh(phrases, counts)
    # Most frequent phrase at the top of the chart.
    plt.gca().invert_yaxis()
    plt.xlabel("Occurrences")
    plt.title("Top Key Phrases (Amazon Comprehend)")
    plt.show()
else:
    print("No key phrases available to plot.")


In [None]:

# S3 locations for the topic-detection job: where the input documents are read
# from and where Comprehend writes its results.
input_s3_uri = f"s3://{output_bucket}/transcribe-json-input/"
output_s3_uri = f"s3://{output_bucket}/comprehend-topics-output/"

# IAM role passed to Comprehend as the job's data-access role.
data_access_role = "arn:aws:iam::934798949390:role/service-role/c176045a4549683l12324630t1-ComprehendDataAccessRole-rq0eDXpjZoWu"

# Each file under the input prefix is treated as one document.
input_config = {"S3Uri": input_s3_uri, "InputFormat": "ONE_DOC_PER_FILE"}
output_config = {"S3Uri": output_s3_uri}
# A random UUID suffix gives every run a distinct job name.
job_name = f"ComprehendTopics-{uuid.uuid4()}"

response = comprehend.start_topics_detection_job(
    InputDataConfig=input_config,
    OutputDataConfig=output_config,
    DataAccessRoleArn=data_access_role,
    JobName=job_name,
    NumberOfTopics=10,
)

print("üöÄ Started Comprehend Topics Detection Job:")
print("Job ID:", response["JobId"])


In [None]:

# Poll the topic-detection job once a minute until it reaches a terminal state.
job_id = response["JobId"]

# STOPPED is terminal as well as COMPLETED and FAILED; without it, a stopped
# job would keep this loop polling forever.
terminal_states = ("COMPLETED", "FAILED", "STOPPED")

while True:
    status = comprehend.describe_topics_detection_job(JobId=job_id)
    job_props = status["TopicsDetectionJobProperties"]
    state = job_props["JobStatus"]
    print(f"Job status: {state}")
    if state in terminal_states:
        # Surface the service's explanation when the job did not succeed.
        if state != "COMPLETED" and job_props.get("Message"):
            print(f"Job message: {job_props['Message']}")
        break
    time.sleep(60)


In [None]:

# List what the topic-detection job wrote under its output prefix.
# Comprehend delivers topic-modeling results as a compressed archive
# (output.tar.gz), so match that suffix in addition to .csv and .json;
# otherwise a successful job would show no files here.
result = s3.list_objects_v2(Bucket=output_bucket, Prefix="comprehend-topics-output/")
for obj in result.get("Contents", []):
    if obj["Key"].endswith((".csv", ".json", ".tar.gz")):
        print("Found:", obj["Key"])
