## Create ECS cluster
- Persistent and doesn't need to be created again

In [2]:
import boto3

In [12]:
ecs = boto3.client('ecs')
ecs.create_cluster(clusterName='book-digitization-cluster')

{'cluster': {'clusterArn': 'arn:aws:ecs:us-east-1:322793536920:cluster/book-digitization-cluster',
  'clusterName': 'book-digitization-cluster',
  'status': 'ACTIVE',
  'registeredContainerInstancesCount': 0,
  'runningTasksCount': 0,
  'pendingTasksCount': 0,
  'activeServicesCount': 0,
  'statistics': [],
  'tags': [],
  'settings': [{'name': 'containerInsights', 'value': 'disabled'}],
  'capacityProviders': [],
  'defaultCapacityProviderStrategy': []},
 'ResponseMetadata': {'RequestId': '9bbde54a-1ec4-4a6c-8404-b32d7cf17460',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '9bbde54a-1ec4-4a6c-8404-b32d7cf17460',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '427',
   'date': 'Sun, 01 Jun 2025 05:50:02 GMT'},
  'RetryAttempts': 0}}

## Create ECR repository
- This is where my code is stored virtually

In [2]:
ecr = boto3.client('ecr')

response = ecr.create_repository(
    repositoryName='book-digitization',
    imageScanningConfiguration={'scanOnPush': True},
    tags=[{'Key': 'project', 'Value': 'book-digitization'}]
)

print("ECR repository created:")
print(response['repository']['repositoryUri'])


ECR repository created:
322793536920.dkr.ecr.us-east-1.amazonaws.com/book-digitization


## Create ECS Task Definition 
- Tells ECS how to run my container (Docker image, CPU/memory, IAM role, env vars).

In [7]:
ACCOUNT_ID = 322793536920
REGION = 'us-east-1'
ECR_IMAGE_URI = '322793536920.dkr.ecr.us-east-1.amazonaws.com/book-digitization'
CLUSTER_NAME = 'book-digitization-cluster'
subnet = 'subnet-01a44828a3a4e64cd' # Subnet: this is where I have a private network with all your computers, servers, and devices connected together in their own isolated environment

In [14]:
import boto3

ecs = boto3.client('ecs')

response = ecs.register_task_definition(
    family='book-digitization-task',  # your family name
    executionRoleArn='arn:aws:iam::322793536920:role/ecsTaskExecutionRole',
    taskRoleArn='arn:aws:iam::322793536920:role/LabRole',
    networkMode='awsvpc',
    requiresCompatibilities=['FARGATE'],
    cpu='1024',
    memory='2048',
    containerDefinitions=[
        {
            'name': 'book-digitization-container',
            'image': '322793536920.dkr.ecr.us-east-1.amazonaws.com/book-digitization:latest',
            'essential': True,
            'logConfiguration': {
                'logDriver': 'awslogs',
                'options': {
                    'awslogs-group': '/ecs/book-digitization',
                    'awslogs-region': 'us-east-1',
                    'awslogs-stream-prefix': 'ecs'
                }
            }
        }
    ]
)

print("Registered task definition revision:", response['taskDefinition']['revision'])


Registered task definition revision: 19


## Create security group in my VPC

In [8]:
ec2 = boto3.client('ec2', region_name='us-east-1')

response = ec2.create_security_group(
    GroupName='book-digitization-sg',
    Description='Security group for ECS tasks',
    VpcId='vpc-08bfbd1db0fdbb931'
)

sg_id = response['GroupId']
print(f"Created security group with ID: {sg_id}")

# Optionally, add inbound rules (e.g., allow all outbound, allow SSH, HTTP, etc.)
ec2.authorize_security_group_ingress(
    GroupId=sg_id,
    IpPermissions=[
        {
            'IpProtocol': 'tcp',
            'FromPort': 22,
            'ToPort': 22,
            'IpRanges': [{'CidrIp': '0.0.0.0/0'}]  # SSH access from anywhere (be cautious!)
        },
        {
            'IpProtocol': 'tcp',
            'FromPort': 80,
            'ToPort': 80,
            'IpRanges': [{'CidrIp': '0.0.0.0/0'}]  # HTTP access if needed
        }
    ]
)


Created security group with ID: sg-0f278fd15aed7b093


{'Return': True,
 'SecurityGroupRules': [{'SecurityGroupRuleId': 'sgr-03581b59a9a8b79cf',
   'GroupId': 'sg-0f278fd15aed7b093',
   'GroupOwnerId': '322793536920',
   'IsEgress': False,
   'IpProtocol': 'tcp',
   'FromPort': 22,
   'ToPort': 22,
   'CidrIpv4': '0.0.0.0/0',
   'SecurityGroupRuleArn': 'arn:aws:ec2:us-east-1:322793536920:security-group-rule/sgr-03581b59a9a8b79cf'},
  {'SecurityGroupRuleId': 'sgr-090e145bfd6382cae',
   'GroupId': 'sg-0f278fd15aed7b093',
   'GroupOwnerId': '322793536920',
   'IsEgress': False,
   'IpProtocol': 'tcp',
   'FromPort': 80,
   'ToPort': 80,
   'CidrIpv4': '0.0.0.0/0',
   'SecurityGroupRuleArn': 'arn:aws:ec2:us-east-1:322793536920:security-group-rule/sgr-090e145bfd6382cae'}],
 'ResponseMetadata': {'RequestId': '921f3b3f-bbe1-4250-b47d-f6881d187527',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '921f3b3f-bbe1-4250-b47d-f6881d187527',
   'cache-control': 'no-cache, no-store',
   'strict-transport-security': 'max-age=31536000; inclu

## Run a Fargate task
- Fargate is AWS's serverless container platform. It runs your containerized applications without you managing the underlying servers
- This is being run for testing purposes

In [19]:
response = ecs.run_task(
    cluster='book-digitization-cluster',
    launchType='FARGATE',
    taskDefinition='book-digitization-task',
    count=1,
    networkConfiguration={
        'awsvpcConfiguration': {
            'subnets': ['subnet-01a44828a3a4e64cd'],  # your subnet
            'assignPublicIp': 'ENABLED',
            'securityGroups': ['sg-0f278fd15aed7b093']  
        }
    },
    overrides={
        'containerOverrides': [
            {
                'name': 'book-digitization-container',
                'environment': [
                    {
                        'name': 'PDF_BATCH_PATH',
                        'value': 'mission-to-ashantee-longer/input/mission-from-cape-coast-castle-longer-batch-1.pdf'
                    }
                ]
            }
        ]
    }
)

print("🚀 Task started:", response['tasks'][0]['taskArn'])


🚀 Task started: arn:aws:ecs:us-east-1:322793536920:task/book-digitization-cluster/e051f32cc20947a49769828ae1c87c04


## Create lambda package zip file 
- This is needed before I can create the function
- One time use

In [5]:
import shutil
from pathlib import Path
import subprocess

# Paths
build_dir = Path('lambda_build')
src_files = ['lambda_function.py', 'book_digitizer.py']  # Add any other .py files your lambda needs
requirements_file = 'requirements.txt'
zip_filename = 'lambda_package.zip'

# Clean up old build dir and zip if exist
if build_dir.exists():
    shutil.rmtree(build_dir)
if Path(zip_filename).exists():
    Path(zip_filename).unlink()

# Create build directory
build_dir.mkdir()

# Copy source files to build dir
for f in src_files:
    shutil.copy(f, build_dir / f)

# Install dependencies to build dir
subprocess.check_call([
    'pip', 'install', '-r', requirements_file, '--target', str(build_dir)
])

# Zip build directory contents (all files in build_dir, no top folder)
shutil.make_archive(zip_filename.replace('.zip', ''), 'zip', root_dir=build_dir)

print(f"✅ Built {zip_filename}")


Collecting boto3 (from -r requirements.txt (line 1))
  Using cached boto3-1.38.26-py3-none-any.whl.metadata (6.6 kB)
Collecting requests (from -r requirements.txt (line 2))
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting pathlib (from -r requirements.txt (line 3))
  Using cached pathlib-1.0.1-py3-none-any.whl.metadata (5.1 kB)
Collecting PyPDF2 (from -r requirements.txt (line 4))
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting mistralai (from -r requirements.txt (line 5))
  Using cached mistralai-1.8.1-py3-none-any.whl.metadata (33 kB)
Collecting botocore<1.39.0,>=1.38.26 (from boto3->-r requirements.txt (line 1))
  Using cached botocore-1.38.26-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3->-r requirements.txt (line 1))
  Using cached jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.14.0,>=0.13.0 (from boto3->-r requirements.txt (line 1))
  Using cached s3transfer-0.13.0-

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 1.2.2 requires botocore<1.19.53,>=1.19.52, but you have botocore 1.38.26 which is incompatible.
sparkmagic 0.21.0 requires pandas<2.0.0,>=0.17.1, but you have pandas 2.2.3 which is incompatible.
streamlit 1.37.1 requires tenacity<9,>=8.1.0, but you have tenacity 9.1.2 which is incompatible.[0m[31m
[0m

✅ Built lambda_package.zip


## Create Lambda function
- Only needs to be run once

In [None]:
# lambda_client = boto3.client('lambda', region_name='us-east-1')

# with open('lambda_package.zip', 'rb') as f:
#     zipped_code = f.read()

# response = lambda_client.create_function(
#     FunctionName='bookDigitizationLambda',
#     Runtime='python3.9',
#     Role=f'arn:aws:iam::{ACCOUNT_ID}:role/LabRole', # TODO Might need to change that when I switch out of my root user
#     Handler='lambda_function.lambda_handler',  
#     Code={'ZipFile': zipped_code},
#     Timeout=300,
#     MemorySize=2048,
#     Publish=True,
#     PackageType='Zip',
# )

# print("Lambda created:", response)


ResourceConflictException: An error occurred (ResourceConflictException) when calling the CreateFunction operation: Function already exist: bookDigitizationLambda

## Create bucket for lambda function to be stored 

In [10]:
s3 = boto3.client('s3', region_name='us-east-1')

bucket_name = 'book-digitization-lambda'

s3.create_bucket(
    Bucket=bucket_name
)
print(f"Bucket '{bucket_name}' created.")

Bucket 'book-digitization-lambda' created.
