Version: 11.08.2025

# Capstone Project: Bringing It All Together (2025 Update)

This modernized version ensures compatibility with **Python 3.10+**, **boto3 ≥ 1.34**, and **Amazon OpenSearch 2.x** APIs.

You'll transcribe ML-course videos using **Amazon Transcribe**, analyze topics with **Amazon Comprehend**, and visualize them in **OpenSearch Dashboards**.

In [None]:
import boto3, json, uuid, time, os, io, re, requests, tarfile
import pandas as pd
from io import StringIO
import nltk
# Download the NLTK resources 'punkt', 'stopwords' and 'wordnet'.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Lab-specific settings: working bucket, the IAM role handed to Comprehend
# jobs, and the AWS region used by every client below.
region = 'us-east-1'
bucket = 'c125984a3128017l8216419t1w730335316855-labbucket-jekz3oeugfkv'
job_data_access_role = 'arn:aws:iam::730335316855:role/service-role/c125984a3128017l8216419t1w-ComprehendDataAccessRole-DbY1v1fex5lo'

# One low-level client per AWS service, all bound to the same region.
s3, transcribe, comprehend, opensearch = (
    boto3.client(service_name, region_name=region)
    for service_name in ('s3', 'transcribe', 'comprehend', 'opensearch')
)

## 1️⃣ View available videos

In [None]:
!aws s3 ls s3://aws-tc-largeobjects/CUR-TF-200-ACMNLP-1/video/

Copy them into your bucket for processing.

In [None]:
!aws s3 cp s3://aws-tc-largeobjects/CUR-TF-200-ACMNLP-1/video/ s3://{bucket}/input/ --recursive

## 2️⃣ Transcribe the videos

In [None]:
# List every object under input/. A single list_objects_v2 call returns at
# most 1000 keys, so walk all result pages with a paginator; pages without a
# 'Contents' entry (empty prefix) contribute nothing.
objects = []
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket, Prefix='input/'):
    objects.extend(obj['Key'] for obj in page.get('Contents', []))

for key in objects:
    print(key)

In [None]:
# Start a transcription job for each input object and wait for it to finish.
# Jobs are processed one at a time; every completed job contributes a
# {'Video', 'TranscriptUri'} entry to output_files.
output_files = []
for obj_key in objects:
    # Skip keys containing 'temp' and zero-byte "folder" placeholder keys
    # (ending in '/'), which are not media files.
    if 'temp' in obj_key or obj_key.endswith('/'):
        continue

    media_uri = f's3://{bucket}/{obj_key}'
    # Job names must be unique per account, hence the random suffix.
    job_name = f'transcribe-job-{uuid.uuid4()}'

    print(f'🎙 Starting transcription for {media_uri}')
    transcribe.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={'MediaFileUri': media_uri},
        MediaFormat='mp4',
        LanguageCode='en-US',
        OutputBucketName=bucket,
        Settings={'ShowSpeakerLabels': False, 'ChannelIdentification': False}
    )

    # Poll until the job reaches a terminal state.
    while True:
        job = transcribe.get_transcription_job(TranscriptionJobName=job_name)
        status = job['TranscriptionJob']['TranscriptionJobStatus']
        if status in ('COMPLETED', 'FAILED'):
            print(f'Job {job_name}: {status}')
            break
        print('.', end='', flush=True)
        time.sleep(15)

    if status == 'COMPLETED':
        uri = job['TranscriptionJob']['Transcript']['TranscriptFileUri']
        output_files.append({'Video': obj_key, 'TranscriptUri': uri})
    else:
        # Surface why the job failed instead of silently dropping the video.
        reason = job['TranscriptionJob'].get('FailureReason', 'no reason reported')
        print(f'Skipping {obj_key}: {reason}')

print('✅ Transcriptions complete:', len(output_files))

### Download transcripts

In [None]:
from urllib.parse import unquote, urlparse

# The jobs wrote their output to our own bucket (OutputBucketName), so
# TranscriptFileUri is a plain S3 object URL, not a pre-signed link: an
# unauthenticated HTTP GET is rejected. Read each transcript through the
# S3 API with the notebook's credentials instead.
data_rows = []
for entry in output_files:
    uri = entry['TranscriptUri']
    # Path-style URL: /<bucket>/<key>. Drop the bucket segment to get the key;
    # a virtual-hosted URL (/<key>) passes through unchanged.
    uri_path = unquote(urlparse(uri).path).lstrip('/')
    transcript_key = uri_path.removeprefix(f'{bucket}/')
    body = s3.get_object(Bucket=bucket, Key=transcript_key)['Body'].read()
    data = json.loads(body)
    transcript = data['results']['transcripts'][0]['transcript']
    data_rows.append({'Video': entry['Video'], 'Transcription': transcript})

# Explicit columns keep the frame usable downstream even when no job completed.
df = pd.DataFrame(data_rows, columns=['Video', 'Transcription'])
df.head()

## 3️⃣ Normalize text

In [None]:
def normalize_text(content):
    """Return *content* lower-cased with URLs, tags and extra whitespace removed.

    Steps, in order:
      1. drop anything starting with ``http`` up to the next whitespace;
      2. drop ``<...>`` tag-like spans;
      3. collapse every run of whitespace (including newlines) to one space;
      4. strip leading/trailing whitespace and lower-case the result.

    Whitespace is collapsed *after* the removals so that deleting a URL or a
    tag cannot leave double or trailing spaces behind.

    Args:
        content: The raw text to clean (a ``str``).

    Returns:
        The normalized string.
    """
    text = re.sub(r'http\S+', '', content)
    # [^>]* rather than .*? so a tag spanning a line break is still matched.
    text = re.sub(r'<[^>]*>', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

# Store a cleaned copy of every transcript in a new 'Normalized' column,
# then preview the first rows.
df['Normalized'] = df['Transcription'].map(normalize_text)
df.head()

## 4️⃣ Extract key phrases and entities with Comprehend

In [None]:
s3_resource = boto3.resource('s3', region_name=region)

def upload_comprehend_s3_csv(filename, folder, dataframe):
    """Serialize *dataframe* as header-less CSV and upload it to the lab bucket.

    The data is written without header and without index, then stored at
    ``<folder>/<filename>`` in the module-level ``bucket``.

    Args:
        filename: Object name to create inside *folder*.
        folder: Key prefix (without trailing slash) to upload under.
        dataframe: Any object exposing pandas' ``to_csv`` (DataFrame or Series).

    Returns:
        The ``s3://`` URI of the uploaded object.
    """
    csv_buffer = StringIO()
    # NOTE(review): to_csv quotes fields that contain commas, so those quote
    # characters end up in the uploaded text — confirm that is acceptable for
    # the consumer of this file.
    dataframe.to_csv(csv_buffer, header=False, index=False)
    key = f"{folder}/{filename}"
    s3_resource.Object(bucket, key).put(Body=csv_buffer.getvalue())
    s3_uri = f's3://{bucket}/{key}'
    print(f'✅ Uploaded to {s3_uri}')
    return s3_uri

# Stage the normalized transcripts in S3 as the job's input file.
input_path = upload_comprehend_s3_csv(
    filename='comprehend_input.csv',
    folder='capstone/comprehend',
    dataframe=df['Normalized'],
)

# Kick off a key phrase detection job over the staged file; each line of the
# input is treated as one document and results go to the bucket root.
kpe_request = {
    'JobName': f'kpe-job-{uuid.uuid4()}',
    'LanguageCode': 'en',
    'DataAccessRoleArn': job_data_access_role,
    'InputDataConfig': {'S3Uri': input_path, 'InputFormat': 'ONE_DOC_PER_LINE'},
    'OutputDataConfig': {'S3Uri': f's3://{bucket}/'},
}
kpe_job = comprehend.start_key_phrases_detection_job(**kpe_request)
print('Key phrase job started:', kpe_job['JobId'])

### Wait for job completion

In [None]:
# Poll the key phrase job every 20 seconds until it reaches a terminal state.
# 'STOPPED' is terminal too; without it a stopped job would be polled forever.
job_id = kpe_job['JobId']
terminal_states = {'COMPLETED', 'FAILED', 'STOPPED'}
while True:
    status = comprehend.describe_key_phrases_detection_job(JobId=job_id)
    job_properties = status['KeyPhrasesDetectionJobProperties']
    state = job_properties['JobStatus']
    if state in terminal_states:
        print('Job status:', state)
        # Show the service's explanation when the job did not succeed.
        if state != 'COMPLETED' and job_properties.get('Message'):
            print('Details:', job_properties['Message'])
        break
    print('.', end='', flush=True)
    time.sleep(20)

## 5️⃣ Create OpenSearch Dashboard

In [None]:
import ipaddress

my_ip = 'YOUR.IP.ADDRESS/32'  # e.g., '203.0.113.24/32'

# Fail fast with a clear message if the placeholder above was not replaced,
# rather than sending a malformed access policy to the service.
try:
    ipaddress.ip_network(my_ip, strict=False)
except ValueError as err:
    raise ValueError(
        f'Set my_ip to your public IP address in CIDR form, got {my_ip!r}'
    ) from err

# Resource-based policy: any principal may call any es:* action, but only
# from the source IP range given in my_ip.
access_policy = {
    'Version': '2012-10-17',
    'Statement': [{
        'Effect': 'Allow',
        'Principal': '*',
        'Action': 'es:*',
        'Resource': '*',
        'Condition': {'IpAddress': {'aws:SourceIp': my_ip}}
    }]
}

resp = opensearch.create_domain(
    DomainName='nlp-lab',
    EngineVersion='OpenSearch_2.11',
    ClusterConfig={'InstanceType': 't3.small.search', 'InstanceCount': 1},
    # T3 search instances have no instance store, so attach an EBS volume
    # (10 GiB is the smallest size the service accepts).
    EBSOptions={'EBSEnabled': True, 'VolumeType': 'gp3', 'VolumeSize': 10},
    AccessPolicies=json.dumps(access_policy)
)
print('Domain creation started...')

In [None]:
# Wait until the domain has stopped processing AND exposes an endpoint.
# Checking for the endpoint explicitly avoids a KeyError if 'Endpoint' is not
# present yet; the deadline keeps the cell from polling forever.
max_wait_seconds = 60 * 60
deadline = time.monotonic() + max_wait_seconds
while True:
    status = opensearch.describe_domain(DomainName='nlp-lab')
    domain_status = status['DomainStatus']
    endpoint = domain_status.get('Endpoint')
    if endpoint and not domain_status['Processing']:
        break
    if time.monotonic() > deadline:
        raise TimeoutError(
            f"OpenSearch domain 'nlp-lab' was not ready after {max_wait_seconds} seconds"
        )
    print('.', end='', flush=True)
    time.sleep(30)

print(f'✅ OpenSearch ready: https://{endpoint}/_dashboards')

### (Optional) Index a few docs into OpenSearch with `opensearch-py`

In [None]:
!pip install opensearch-py requests-aws4auth --quiet

In [None]:
from opensearchpy import OpenSearch, RequestsHttpConnection, helpers
from requests_aws4auth import AWS4Auth

# Build a request signer from the credentials resolved by the default boto3
# session; 'es' is the service name passed to AWS4Auth.
session = boto3.Session()
creds = session.get_credentials().get_frozen_credentials()
awsauth = AWS4Auth(
    creds.access_key,
    creds.secret_key,
    region,
    'es',
    session_token=creds.token,
)

# HTTPS connection to the domain endpoint with certificate verification on.
connection_settings = {
    'hosts': [{'host': endpoint, 'port': 443}],
    'http_auth': awsauth,
    'use_ssl': True,
    'verify_certs': True,
    'connection_class': RequestsHttpConnection,
}
client = OpenSearch(**connection_settings)

# Make sure the target index exists before loading documents into it.
index_name = 'videos'
index_present = client.indices.exists(index=index_name)
if not index_present:
    client.indices.create(index=index_name)

# Generator of bulk-index actions, one per DataFrame row.
def gendocs(df, index=None, max_chars=32000):
    """Yield one bulk-index action per row of *df*.

    Each action uses the row's index label as ``_id`` and carries the
    ``Video``, ``Transcription`` and ``Normalized`` columns as its source,
    with both text fields truncated to *max_chars* characters.

    Args:
        df: DataFrame with 'Video', 'Transcription' and 'Normalized' columns.
        index: Target index name. Defaults to the module-level ``index_name``.
        max_chars: Maximum length kept for each text field.
            NOTE(review): the 32000 default presumably keeps fields under an
            OpenSearch size limit — confirm.

    Yields:
        dict: An action with '_index', '_id' and '_source' keys.
    """
    # Resolved on first iteration, so the global is only needed when no
    # explicit index is passed.
    target_index = index_name if index is None else index
    for i, row in df.iterrows():
        yield {
            '_index': target_index,
            '_id': i,
            '_source': {
                'video': row['Video'],
                'transcription': row['Transcription'][:max_chars],
                'normalized': row['Normalized'][:max_chars],
            },
        }

# helpers.bulk returns (number of successfully indexed actions, errors);
# report that count rather than assuming every row of df was accepted.
indexed_count, _ = helpers.bulk(client, gendocs(df))
print('✅ Indexed', indexed_count, 'documents')
print('Open the dashboard at: ', f'https://{endpoint}/_dashboards')

# 🧹 Cleanup

In [None]:
# Request deletion of the OpenSearch domain created earlier in this notebook.
domain_to_delete = 'nlp-lab'
opensearch.delete_domain(DomainName=domain_to_delete)
print('Deleting OpenSearch domain...')

# ✅ Congratulations!

You have completed the Capstone Project (2025 version). Your notebook now uses all modern AWS SDK methods and APIs safely.