@@ -541,134 +541,6 @@ Resources:
541
541
ForwardedValues: {QueryString: true}
542
542
ViewerProtocolPolicy: redirect-to-https
543
543
544
- HighConcurrentExecutionsAlarm:
545
- Type: AWS::CloudWatch::Alarm
546
- Properties:
547
- AlarmName: !Sub "${SubDomainName}_high_concurrent_executions"
548
- AlarmDescription: !Sub |
549
- This will page the DOTD if javabuilder usage exceeds 50 concurrent
550
- executions for 10 minutes. Occasional spikes are expected, but
551
- long-running high usage is an indication of an attack. Go to the
552
- following URLs and set reserved concurrency to 10 immediately
553
- <% JAVALAB_APP_TYPES . each do | name | -%>
554
- https://console.aws.amazon.com/lambda/home?region=${AWS::Region}#/functions/${BuildAndRunJava<%= name%> ProjectFunction}/edit/concurrency?tab=configure
555
- <% end -%>
556
- Then post in #ap-csa-dev.
557
- ActionsEnabled: true
558
- AlarmActions:
559
- - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:CDO-Urgent"]
560
- EvaluationPeriods: 10
561
- DatapointsToAlarm: 10
562
- Threshold: 50
563
- ComparisonOperator: GreaterThanThreshold
564
- TreatMissingData: notBreaching
565
- Metrics:
566
- - Id: e1
567
- Label: Concurrent Executions Across All Lambdas
568
- ReturnData: true
569
- Expression: SUM(METRICS())
570
- <% { Theater : "m2" , Neighborhood : "m3" , Console : "m4" } . each do |name , id | -%>
571
- - Id: <%= id%>
572
- ReturnData: false
573
- MetricStat:
574
- Metric:
575
- Namespace: AWS/Lambda
576
- MetricName: ConcurrentExecutions
577
- Dimensions:
578
- - Name: FunctionName
579
- Value: !Ref BuildAndRunJava<%= name%> ProjectFunction
580
- Period: 60
581
- Stat: Maximum
582
- <% end -%>
583
-
584
- HighWebsocketConnectionsAlarm:
585
- Type: AWS::CloudWatch::Alarm
586
- Properties:
587
- AlarmName: !Sub "${SubDomainName}_high_websocket_connections"
588
- AlarmDescription: Significantly higher websocket connections than normal detected. Investigate if there is a DDOS.
589
- ActionsEnabled: false
590
- EvaluationPeriods: 20
591
- DatapointsToAlarm: 20
592
- ComparisonOperator: GreaterThanUpperThreshold
593
- TreatMissingData: notBreaching
594
- Metrics:
595
- - Id: m1
596
- ReturnData: true
597
- MetricStat:
598
- Metric:
599
- Namespace: AWS/ApiGateway
600
- MetricName: ConnectCount
601
- Dimensions:
602
- - Name: Stage
603
- Value: !Sub "${StageName}"
604
- - Name: ApiId
605
- Value: !Ref WebSocketApi
606
- Period: 60
607
- Stat: Sum
608
- - Id: ad1
609
- Label: ConnectCount (expected)
610
- ReturnData: true
611
- Expression: ANOMALY_DETECTION_BAND(m1, 8)
612
- ThresholdMetricId: ad1
613
-
614
- HighHttpRequestsAlarm:
615
- Type: AWS::CloudWatch::Alarm
616
- Properties:
617
- AlarmName: !Sub "${SubDomainName}_high_http_requests"
618
- AlarmDescription: Significantly higher HTTP requests than normal detected.
619
- Investigate if there is a DDOS.
620
- ActionsEnabled: true
621
- OKActions: []
622
- AlarmActions: []
623
- InsufficientDataActions: []
624
- EvaluationPeriods: 20
625
- DatapointsToAlarm: 20
626
- ComparisonOperator: GreaterThanUpperThreshold
627
- TreatMissingData: notBreaching
628
- Metrics:
629
- - Id: m1
630
- ReturnData: true
631
- MetricStat:
632
- Metric:
633
- Namespace: AWS/ApiGateway
634
- MetricName: Count
635
- Dimensions:
636
- - Name: ApiId
637
- Value: !Ref HttpApi
638
- Period: 60
639
- Stat: Sum
640
- - Id: ad1
641
- Label: Count (expected)
642
- ReturnData: true
643
- Expression: ANOMALY_DETECTION_BAND(m1, 8)
644
- ThresholdMetricId: ad1
645
-
646
- HighUsageCompositeAlarm:
647
- Type: AWS::CloudWatch::CompositeAlarm
648
- DependsOn:
649
- - ConsoleHighInvocationsAlarm
650
- - HighHttpRequestsAlarm
651
- - HighWebsocketConnectionsAlarm
652
- - NeighborhoodHighInvocationsAlarm
653
- - TheaterHighInvocationsAlarm
654
- Properties:
655
- ActionsEnabled: true
656
- AlarmActions:
657
- # TODO: after we have run at high usage for a while, consider re-enabling this alarm. Right now it is too noisy
658
- # - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:javabuilder-high-usage"]
659
- - !Ref AWS::NoValue
660
- AlarmDescription: Send message if abnormally high Javabuilder usage detected.
661
- Monitors usage across the HTTP API, WebSocket API, and all Build and Run
662
- Lambdas.
663
- AlarmName: !Sub "${SubDomainName}_high_usage_composite"
664
- AlarmRule: !Sub "ALARM(${SubDomainName}_console_high_invocations) OR
665
- ALARM(${SubDomainName}_high_http_requests) OR
666
- ALARM(${SubDomainName}_high_websocket_connections) OR
667
- ALARM(${SubDomainName}_neighborhood_high_invocations) OR
668
- ALARM(${SubDomainName}_theater_high_invocations)"
669
- InsufficientDataActions: []
670
- OKActions: []
671
-
672
544
<% JAVALAB_APP_TYPES . each do | name | -%>
673
545
<% {
674
546
TenPercentSevereErrorRateAlarm : { Threshold : 10 , AlarmName : 'ten_percent_severe_error_rate' } ,
@@ -871,35 +743,6 @@ Resources:
871
743
Threshold: 2500
872
744
Period: 60
873
745
874
- <%= name%> HighInvocationsAlarm:
875
- Type: AWS::CloudWatch::Alarm
876
- Properties:
877
- AlarmName: !Sub "${SubDomainName}_<%= name . downcase%> _high_invocations"
878
- AlarmDescription: Significantly higher <%= name%> build and run invocations than
879
- normal detected. Investigate if there is a DDOS.
880
- ActionsEnabled: false
881
- EvaluationPeriods: 20
882
- DatapointsToAlarm: 20
883
- ComparisonOperator: GreaterThanUpperThreshold
884
- TreatMissingData: notBreaching
885
- Metrics:
886
- - Id: m1
887
- ReturnData: true
888
- MetricStat:
889
- Metric:
890
- Namespace: AWS/Lambda
891
- MetricName: Invocations
892
- Dimensions:
893
- - Name: FunctionName
894
- Value: !Ref BuildAndRunJava<%= name%> ProjectFunction
895
- Period: 60
896
- Stat: Sum
897
- - Id: ad1
898
- Label: Invocations (expected)
899
- ReturnData: true
900
- Expression: ANOMALY_DETECTION_BAND(m1, 8)
901
- ThresholdMetricId: ad1
902
-
903
746
<%= name%> MinimumUsageAlarm:
904
747
Type: AWS::CloudWatch::Alarm
905
748
Properties:
@@ -932,7 +775,8 @@ Resources:
932
775
AlarmDescription: Alarm if Javabuilder severe error rate exceeds 10% every 5 minutes for 20
933
776
minutes and there are at least 100 requests every 5 minutes.
934
777
Occasional spikes are expected, but a sustained elevated severe error rate is an indication of an issue.
935
- Severe errors are generated and emitted by our code.
778
+ Severe errors are generated and emitted by our code. Please follow the instructions in this document to mitigate
779
+ https://docs.google.com/document/d/1bHvV6pvUcwxgZpw0YWBmxFggQL5KqYx9zwolwkZhjU8/edit#bookmark=id.2gh4dxmz643n
936
780
ActionsEnabled: true
937
781
AlarmActions:
938
782
- !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:Javabuilder-high-error-rate"]
@@ -954,10 +798,11 @@ Resources:
954
798
AlarmDescription: Alarm if Javabuilder severe error rate exceeds 90% every 5 minutes for 20
955
799
minutes and there are at least 100 requests every 5 minutes.
956
800
Occasional spikes are expected, but a sustained high severe error rate is an indication of an outage.
957
- Severe errors are generated and emitted by our code.
801
+ Severe errors are generated and emitted by our code. Please follow the instructions in this document to mitigate
802
+ https://docs.google.com/document/d/1bHvV6pvUcwxgZpw0YWBmxFggQL5KqYx9zwolwkZhjU8/edit#bookmark=id.2gh4dxmz643n
958
803
ActionsEnabled: true
959
804
AlarmActions:
960
- - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:Javabuilder-high-error-rate "]
805
+ - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:CDO-Urgent "]
961
806
AlarmRule: !Sub "ALARM(${SubDomainName}_<%= name . downcase%> _ninety_percent_severe_error_rate) AND
962
807
ALARM(${SubDomainName}_<%= name . downcase%> _minimum_usage)"
963
808
InsufficientDataActions: []
@@ -974,7 +819,8 @@ Resources:
974
819
AlarmDescription: Alarm if Javabuilder error rate exceeds 25% every 5 minutes for 20
975
820
minutes and there are at least 100 requests every 5 minutes.
976
821
Occasional spikes are expected, but a sustained elevated error rate is an indication of an issue.
977
- Errors are generated by the Lambda system.
822
+ Errors are generated by the Lambda system. Please follow the instructions in this document to mitigate
823
+ https://docs.google.com/document/d/1bHvV6pvUcwxgZpw0YWBmxFggQL5KqYx9zwolwkZhjU8/edit#bookmark=id.2gh4dxmz643n
978
824
ActionsEnabled: true
979
825
AlarmActions:
980
826
- !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:Javabuilder-high-error-rate"]
@@ -996,15 +842,41 @@ Resources:
996
842
AlarmDescription: Alarm if Javabuilder error rate exceeds 90% every 5 minutes for 20
997
843
minutes and there are at least 100 requests every 5 minutes.
998
844
Occasional spikes are expected, but a sustained high error rate is an indication of an outage.
999
- Errors are generated by the Lambda system.
845
+ Errors are generated by the Lambda system. Please follow the instructions in this document to mitigate
846
+ https://docs.google.com/document/d/1bHvV6pvUcwxgZpw0YWBmxFggQL5KqYx9zwolwkZhjU8/edit#bookmark=id.2gh4dxmz643n
1000
847
ActionsEnabled: true
1001
848
AlarmActions:
1002
- - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:Javabuilder-high-error-rate "]
849
+ - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:CDO-Urgent "]
1003
850
AlarmRule: !Sub "ALARM(${SubDomainName}_<%= name . downcase%> _ninety_percent_error_rate) AND
1004
851
ALARM(${SubDomainName}_<%= name . downcase%> _minimum_usage)"
1005
852
InsufficientDataActions: []
1006
853
OKActions: []
1007
-
854
+
855
+ <%= name%> HighConcurrentExecutionsAlarm:  # Per-function concurrency alarm; `name` is the ERB loop variable, so one alarm is rendered per app type.
856
+ Type: AWS::CloudWatch::Alarm
857
+ Properties:
858
+ AlarmName: !Sub "${SubDomainName}_<%= name . downcase%> _high_concurrent_executions"
859
+ AlarmDescription: |  # Plain block scalar: the text has no ${...} placeholders, so wrapping it in !Sub is unnecessary.
860
+ Alarm if javabuilder usage exceeds 400 concurrent
861
+ executions for 10 minutes. Occasional spikes are expected, but
862
+ long-running high usage is an indication of an attack. Page the student learning
863
+ team for further investigation. See this doc for investigation steps
864
+ https://docs.google.com/document/d/1bHvV6pvUcwxgZpw0YWBmxFggQL5KqYx9zwolwkZhjU8/edit#bookmark=id.xs1gcuxrw6ze
865
+ ActionsEnabled: true
866
+ AlarmActions:  # Resolves to no action when SilenceAlertsCondition holds; otherwise notifies the CDO-Urgent SNS topic.
867
+ - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:CDO-Urgent"]
868
+ EvaluationPeriods: 10  # Together with DatapointsToAlarm and Period below: 10 of 10 one-minute datapoints must breach (the "10 minutes" in the description).
869
+ DatapointsToAlarm: 10
870
+ Period: 60
871
+ Threshold: 400  # Keep in sync with the "400 concurrent executions" wording in AlarmDescription.
872
+ ComparisonOperator: GreaterThanThreshold
873
+ TreatMissingData: notBreaching  # Missing datapoints are treated as within the threshold.
874
+ MetricName: ConcurrentExecutions
875
+ Namespace: AWS/Lambda
876
+ Statistic: Maximum  # Evaluate the peak concurrency observed in each period.
877
+ Dimensions:  # Scope the metric to this app type's build-and-run Lambda function only.
878
+ - Name: FunctionName
879
+ Value: !Ref BuildAndRunJava<%= name%> ProjectFunction
1008
880
<% end -%>
1009
881
1010
882
# We use shortened versions of names for partition keys (eg, user_id),
0 commit comments