Skip to content

Commit 4d4e029

Browse files
authored
Update error and usage alarms (#377)
* update alarms * update template * update high concurrent executions alarm * update alarm * fix function name * remove old alarms * change links * fix casing
1 parent 1a0befa commit 4d4e029

File tree

1 file changed

+36
-164
lines changed

1 file changed

+36
-164
lines changed

cicd/3-app/javabuilder/template.yml.erb

Lines changed: 36 additions & 164 deletions
Original file line numberDiff line numberDiff line change
@@ -541,134 +541,6 @@ Resources:
541541
ForwardedValues: {QueryString: true}
542542
ViewerProtocolPolicy: redirect-to-https
543543

544-
HighConcurrentExecutionsAlarm:
545-
Type: AWS::CloudWatch::Alarm
546-
Properties:
547-
AlarmName: !Sub "${SubDomainName}_high_concurrent_executions"
548-
AlarmDescription: !Sub |
549-
This will page the DOTD if javabuilder usage exceeds 50 concurrent
550-
executions for 10 minutes. Occasional spikes are expected, but
551-
long-running high usage is an indication of an attack. Go to the
552-
following URLs and set reserved concurrency to 10 immediately
553-
<%JAVALAB_APP_TYPES.each do | name | -%>
554-
https://console.aws.amazon.com/lambda/home?region=${AWS::Region}#/functions/${BuildAndRunJava<%=name%>ProjectFunction}/edit/concurrency?tab=configure
555-
<%end -%>
556-
Then post in #ap-csa-dev.
557-
ActionsEnabled: true
558-
AlarmActions:
559-
- !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:CDO-Urgent"]
560-
EvaluationPeriods: 10
561-
DatapointsToAlarm: 10
562-
Threshold: 50
563-
ComparisonOperator: GreaterThanThreshold
564-
TreatMissingData: notBreaching
565-
Metrics:
566-
- Id: e1
567-
Label: Concurrent Executions Across All Lambdas
568-
ReturnData: true
569-
Expression: SUM(METRICS())
570-
<%{Theater: "m2", Neighborhood: "m3", Console: "m4"}.each do |name, id| -%>
571-
- Id: <%=id%>
572-
ReturnData: false
573-
MetricStat:
574-
Metric:
575-
Namespace: AWS/Lambda
576-
MetricName: ConcurrentExecutions
577-
Dimensions:
578-
- Name: FunctionName
579-
Value: !Ref BuildAndRunJava<%=name%>ProjectFunction
580-
Period: 60
581-
Stat: Maximum
582-
<%end -%>
583-
584-
HighWebsocketConnectionsAlarm:
585-
Type: AWS::CloudWatch::Alarm
586-
Properties:
587-
AlarmName: !Sub "${SubDomainName}_high_websocket_connections"
588-
AlarmDescription: Significantly higher websocket connections than normal detected. Investigate if there is a DDOS.
589-
ActionsEnabled: false
590-
EvaluationPeriods: 20
591-
DatapointsToAlarm: 20
592-
ComparisonOperator: GreaterThanUpperThreshold
593-
TreatMissingData: notBreaching
594-
Metrics:
595-
- Id: m1
596-
ReturnData: true
597-
MetricStat:
598-
Metric:
599-
Namespace: AWS/ApiGateway
600-
MetricName: ConnectCount
601-
Dimensions:
602-
- Name: Stage
603-
Value: !Sub "${StageName}"
604-
- Name: ApiId
605-
Value: !Ref WebSocketApi
606-
Period: 60
607-
Stat: Sum
608-
- Id: ad1
609-
Label: ConnectCount (expected)
610-
ReturnData: true
611-
Expression: ANOMALY_DETECTION_BAND(m1, 8)
612-
ThresholdMetricId: ad1
613-
614-
HighHttpRequestsAlarm:
615-
Type: AWS::CloudWatch::Alarm
616-
Properties:
617-
AlarmName: !Sub "${SubDomainName}_high_http_requests"
618-
AlarmDescription: Significantly higher HTTP requests than normal detected.
619-
Investigate if there is a DDOS.
620-
ActionsEnabled: true
621-
OKActions: []
622-
AlarmActions: []
623-
InsufficientDataActions: []
624-
EvaluationPeriods: 20
625-
DatapointsToAlarm: 20
626-
ComparisonOperator: GreaterThanUpperThreshold
627-
TreatMissingData: notBreaching
628-
Metrics:
629-
- Id: m1
630-
ReturnData: true
631-
MetricStat:
632-
Metric:
633-
Namespace: AWS/ApiGateway
634-
MetricName: Count
635-
Dimensions:
636-
- Name: ApiId
637-
Value: !Ref HttpApi
638-
Period: 60
639-
Stat: Sum
640-
- Id: ad1
641-
Label: Count (expected)
642-
ReturnData: true
643-
Expression: ANOMALY_DETECTION_BAND(m1, 8)
644-
ThresholdMetricId: ad1
645-
646-
HighUsageCompositeAlarm:
647-
Type: AWS::CloudWatch::CompositeAlarm
648-
DependsOn:
649-
- ConsoleHighInvocationsAlarm
650-
- HighHttpRequestsAlarm
651-
- HighWebsocketConnectionsAlarm
652-
- NeighborhoodHighInvocationsAlarm
653-
- TheaterHighInvocationsAlarm
654-
Properties:
655-
ActionsEnabled: true
656-
AlarmActions:
657-
# TODO: after we have run at high usage for a while, consider re-enabling this alarm. Right now it is too noisy
658-
# - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:javabuilder-high-usage"]
659-
- !Ref AWS::NoValue
660-
AlarmDescription: Send message if abnormally high Javabuilder usage detected.
661-
Monitors usage across the HTTP API, WebSocket API, and all Build and Run
662-
Lambdas.
663-
AlarmName: !Sub "${SubDomainName}_high_usage_composite"
664-
AlarmRule: !Sub "ALARM(${SubDomainName}_console_high_invocations) OR
665-
ALARM(${SubDomainName}_high_http_requests) OR
666-
ALARM(${SubDomainName}_high_websocket_connections) OR
667-
ALARM(${SubDomainName}_neighborhood_high_invocations) OR
668-
ALARM(${SubDomainName}_theater_high_invocations)"
669-
InsufficientDataActions: []
670-
OKActions: []
671-
672544
<%JAVALAB_APP_TYPES.each do | name | -%>
673545
<%{
674546
TenPercentSevereErrorRateAlarm: {Threshold: 10, AlarmName: 'ten_percent_severe_error_rate'},
@@ -871,35 +743,6 @@ Resources:
871743
Threshold: 2500
872744
Period: 60
873745
874-
<%=name%>HighInvocationsAlarm:
875-
Type: AWS::CloudWatch::Alarm
876-
Properties:
877-
AlarmName: !Sub "${SubDomainName}_<%=name.downcase%>_high_invocations"
878-
AlarmDescription: Significantly higher <%=name%> build and run invocations than
879-
normal detected. Investigate if there is a DDOS.
880-
ActionsEnabled: false
881-
EvaluationPeriods: 20
882-
DatapointsToAlarm: 20
883-
ComparisonOperator: GreaterThanUpperThreshold
884-
TreatMissingData: notBreaching
885-
Metrics:
886-
- Id: m1
887-
ReturnData: true
888-
MetricStat:
889-
Metric:
890-
Namespace: AWS/Lambda
891-
MetricName: Invocations
892-
Dimensions:
893-
- Name: FunctionName
894-
Value: !Ref BuildAndRunJava<%=name%>ProjectFunction
895-
Period: 60
896-
Stat: Sum
897-
- Id: ad1
898-
Label: Invocations (expected)
899-
ReturnData: true
900-
Expression: ANOMALY_DETECTION_BAND(m1, 8)
901-
ThresholdMetricId: ad1
902-
903746
<%=name%>MinimumUsageAlarm:
904747
Type: AWS::CloudWatch::Alarm
905748
Properties:
@@ -932,7 +775,8 @@ Resources:
932775
AlarmDescription: Alarm if Javabuilder severe error rate exceeds 10% every 5 minutes for 20
933776
minutes and there are at least 100 requests every 5 minutes.
934777
Occasional spikes are expected, but a sustained elevated severe error rate is an indication of an issue.
935-
Severe errors are generated and emitted by our code.
778+
Severe errors are generated and emitted by our code. Please follow the instructions in this document to mitigate:
779+
https://docs.google.com/document/d/1bHvV6pvUcwxgZpw0YWBmxFggQL5KqYx9zwolwkZhjU8/edit#bookmark=id.2gh4dxmz643n
936780
ActionsEnabled: true
937781
AlarmActions:
938782
- !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:Javabuilder-high-error-rate"]
@@ -954,10 +798,11 @@ Resources:
954798
AlarmDescription: Alarm if Javabuilder severe error rate exceeds 90% every 5 minutes for 20
955799
minutes and there are at least 100 requests every 5 minutes.
956800
Occasional spikes are expected, but a sustained high severe error rate is an indication of an outage.
957-
Severe errors are generated and emitted by our code.
801+
Severe errors are generated and emitted by our code. Please follow the instructions in this document to mitigate:
802+
https://docs.google.com/document/d/1bHvV6pvUcwxgZpw0YWBmxFggQL5KqYx9zwolwkZhjU8/edit#bookmark=id.2gh4dxmz643n
958803
ActionsEnabled: true
959804
AlarmActions:
960-
- !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:Javabuilder-high-error-rate"]
805+
- !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:CDO-Urgent"]
961806
AlarmRule: !Sub "ALARM(${SubDomainName}_<%=name.downcase%>_ninety_percent_severe_error_rate) AND
962807
ALARM(${SubDomainName}_<%=name.downcase%>_minimum_usage)"
963808
InsufficientDataActions: []
@@ -974,7 +819,8 @@ Resources:
974819
AlarmDescription: Alarm if Javabuilder error rate exceeds 25% every 5 minutes for 20
975820
minutes and there are at least 100 requests every 5 minutes.
976821
Occasional spikes are expected, but a sustained elevated error rate is an indication of an issue.
977-
Errors are generated by the Lambda system.
822+
Errors are generated by the Lambda system. Please follow the instructions in this document to mitigate:
823+
https://docs.google.com/document/d/1bHvV6pvUcwxgZpw0YWBmxFggQL5KqYx9zwolwkZhjU8/edit#bookmark=id.2gh4dxmz643n
978824
ActionsEnabled: true
979825
AlarmActions:
980826
- !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:Javabuilder-high-error-rate"]
@@ -996,15 +842,41 @@ Resources:
996842
AlarmDescription: Alarm if Javabuilder error rate exceeds 90% every 5 minutes for 20
997843
minutes and there are at least 100 requests every 5 minutes.
998844
Occasional spikes are expected, but a sustained high error rate is an indication of an outage.
999-
Errors are generated by the Lambda system.
845+
Errors are generated by the Lambda system. Please follow the instructions in this document to mitigate:
846+
https://docs.google.com/document/d/1bHvV6pvUcwxgZpw0YWBmxFggQL5KqYx9zwolwkZhjU8/edit#bookmark=id.2gh4dxmz643n
1000847
ActionsEnabled: true
1001848
AlarmActions:
1002-
- !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:Javabuilder-high-error-rate"]
849+
- !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:CDO-Urgent"]
1003850
AlarmRule: !Sub "ALARM(${SubDomainName}_<%=name.downcase%>_ninety_percent_error_rate) AND
1004851
ALARM(${SubDomainName}_<%=name.downcase%>_minimum_usage)"
1005852
InsufficientDataActions: []
1006853
OKActions: []
1007-
854+
855+
<%=name%>HighConcurrentExecutionsAlarm:
856+
Type: AWS::CloudWatch::Alarm
857+
Properties:
858+
AlarmName: !Sub "${SubDomainName}_<%=name.downcase%>_high_concurrent_executions"
859+
AlarmDescription: !Sub |
860+
Alarm if Javabuilder usage exceeds 400 concurrent
861+
executions for 10 minutes. Occasional spikes are expected, but
862+
long-running high usage is an indication of an attack. Page the student learning
863+
team for further investigation. See this doc for investigation steps:
864+
https://docs.google.com/document/d/1bHvV6pvUcwxgZpw0YWBmxFggQL5KqYx9zwolwkZhjU8/edit#bookmark=id.xs1gcuxrw6ze
865+
ActionsEnabled: true
866+
AlarmActions:
867+
- !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:CDO-Urgent"]
868+
EvaluationPeriods: 10
869+
DatapointsToAlarm: 10
870+
Period: 60
871+
Threshold: 400
872+
ComparisonOperator: GreaterThanThreshold
873+
TreatMissingData: notBreaching
874+
MetricName: ConcurrentExecutions
875+
Namespace: AWS/Lambda
876+
Statistic: Maximum
877+
Dimensions:
878+
- Name: FunctionName
879+
Value: !Ref BuildAndRunJava<%=name%>ProjectFunction
1008880
<%end -%>
1009881

1010882
# We use shortened versions of names for partition keys (eg, user_id),

0 commit comments

Comments
 (0)