-
Notifications
You must be signed in to change notification settings - Fork 1
/
glue-template.yaml.j2
102 lines (102 loc) · 3.09 KB
/
glue-template.yaml.j2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# CloudFormation template header. This file is a Jinja2 template: the
# "prefix" variable is substituted at render time, before CloudFormation
# parses the result.
AWSTemplateFormatVersion: '2010-09-09'
Description: >-
  AWS Glue Job, Crawlers, S3 Bucket Creation, and Athena Querying
# Every resource of the stack is declared under this key.
Resources:
# S3 bucket referenced by the Glue jobs (scripts, temp dir, notebooks) and by
# the crawlers (source/ and landing/ prefixes) elsewhere in this template.
DataBucket:
  Type: AWS::S3::Bucket
  Properties:
    # Bucket name is derived from the Jinja prefix variable at render time.
    BucketName: "{{ prefix }}-bucket"
# Glue Spark ETL job ("glueetl") that runs scripts/job1/script.py from the
# data bucket under the shared Glue role.
GlueJob1:
  Type: 'AWS::Glue::Job'
  Properties:
    Role: !Ref GlueRole
    Command:
      Name: 'glueetl'
      ScriptLocation: !Sub 's3://${DataBucket}/scripts/job1/script.py'
    DefaultArguments:
      "--TempDir": !Sub 's3://${DataBucket}/temp-dir/'
      "--enable-notebook": "true"
      "--notebook-dir": !Sub 's3://${DataBucket}/notebooks/'
      # NOTE(review): AWS Glue documents --JOB_NAME as an internal argument
      # that should not be set by users; kept because the script may rely on
      # this value - confirm before removing.
      "--JOB_NAME": "job1"
      # Resolved from the resources in this stack instead of being hard-coded
      # for one specific prefix, so the arguments follow the rendered names.
      "--SOURCE_CRAWLER": !Ref SourceCrawler
      "--BUCKET_NAME": !Ref DataBucket
    GlueVersion: '2.0'
    MaxRetries: 3
    Name: "{{ prefix }}GlueJob1"
    ExecutionProperty:
      # Only one run of this job may be active at a time.
      MaxConcurrentRuns: 1
# Glue Spark ETL job ("glueetl") that runs scripts/job2/script.py from the
# data bucket under the shared Glue role.
GlueJob2:
  Type: 'AWS::Glue::Job'
  Properties:
    Role: !Ref GlueRole
    Command:
      Name: 'glueetl'
      ScriptLocation: !Sub 's3://${DataBucket}/scripts/job2/script.py'
    DefaultArguments:
      "--TempDir": !Sub 's3://${DataBucket}/temp-dir/'
      "--enable-notebook": "true"
      "--notebook-dir": !Sub 's3://${DataBucket}/notebooks/'
      # NOTE(review): AWS Glue documents --JOB_NAME as an internal argument
      # that should not be set by users; kept because the script may rely on
      # this value - confirm before removing.
      "--JOB_NAME": "job2"
      # Resolved from the resources in this stack instead of being hard-coded
      # for one specific prefix, so the arguments follow the rendered names.
      "--SOURCE_CRAWLER": !Ref SourceCrawler
      "--BUCKET_NAME": !Ref DataBucket
    GlueVersion: '2.0'
    MaxRetries: 3
    Name: "{{ prefix }}GlueJob2"
    ExecutionProperty:
      # Only one run of this job may be active at a time.
      MaxConcurrentRuns: 1
# Crawler that scans the source/ prefix of the data bucket and writes the
# discovered tables into the source catalog database.
SourceCrawler:
  Type: 'AWS::Glue::Crawler'
  Properties:
    Role: !Ref GlueRole
    # Ref on an AWS::Glue::Database yields the database name. Referencing the
    # resource (instead of repeating the name as a literal) keeps the two in
    # sync and makes CloudFormation create the database before the crawler.
    DatabaseName: !Ref sourcedatabase
    Schedule:
      # Runs once a day at 12:00 (Glue evaluates cron schedules in UTC).
      ScheduleExpression: 'cron(0 12 * * ? *)'
    Targets:
      S3Targets:
        - Path: !Sub 's3://${DataBucket}/source'
    Name: "{{ prefix }}SourceCrawler"
# Crawler that scans the landing/ prefix of the data bucket and writes the
# discovered tables into the target catalog database.
TargetCrawler:
  Type: 'AWS::Glue::Crawler'
  Properties:
    Role: !Ref GlueRole
    # Ref on an AWS::Glue::Database yields the database name. Referencing the
    # resource (instead of repeating the name as a literal) keeps the two in
    # sync and makes CloudFormation create the database before the crawler.
    DatabaseName: !Ref targetdatabase
    Schedule:
      # Runs once a day at 12:00 (Glue evaluates cron schedules in UTC).
      ScheduleExpression: 'cron(0 12 * * ? *)'
    Targets:
      S3Targets:
        - Path: !Sub 's3://${DataBucket}/landing'
    Name: "{{ prefix }}TargetCrawler"
# Glue Data Catalog database for the source data, created in the catalog of
# the account that deploys the stack.
sourcedatabase:
  Type: AWS::Glue::Database
  Properties:
    CatalogId:
      Ref: 'AWS::AccountId'
    DatabaseInput:
      Description: '{{ prefix }} Source Catalog'
      Name: "{{ prefix }}sourcedatabase"
# Glue Data Catalog database for the target data, created in the catalog of
# the account that deploys the stack.
targetdatabase:
  Type: AWS::Glue::Database
  Properties:
    CatalogId:
      Ref: 'AWS::AccountId'
    DatabaseInput:
      Description: '{{ prefix }} Target Catalog'
      Name: "{{ prefix }}targetdatabase"
# IAM role assumed by the Glue service; shared by both jobs and both crawlers
# in this template.
GlueRole:
  Type: 'AWS::IAM::Role'
  Properties:
    AssumeRolePolicyDocument:
      Version: '2012-10-17'
      Statement:
        # Trust policy: only the Glue service may assume this role.
        - Effect: 'Allow'
          Principal:
            Service: 'glue.amazonaws.com'
          Action: 'sts:AssumeRole'
    Path: '/'
    Policies:
      - PolicyName: "{{ prefix }}GluePolicy"
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            # NOTE(review): full S3 and Glue access on every resource is far
            # broader than this stack appears to need. Kept unchanged so that
            # existing scripts keep working; consider narrowing the S3 grant
            # to the data bucket once the scripts' access needs are confirmed.
            - Effect: 'Allow'
              Action:
                - 's3:*'
                - 'glue:*'
              Resource: '*'
            # Glue jobs and crawlers write their logs to CloudWatch Logs under
            # /aws-glue/; without these permissions the logs cannot be written.
            - Effect: 'Allow'
              Action:
                - 'logs:CreateLogGroup'
                - 'logs:CreateLogStream'
                - 'logs:PutLogEvents'
              Resource: !Sub 'arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws-glue/*'