In [1]:
def cap(line: str) -> str:
    '''
    Upper-case the first character of every sentence in ``line``.

    Sentences are the pieces obtained by splitting ``line`` on the
    two-character separator '. '. Only the first character of each piece is
    changed; the remaining characters keep their case, so acronyms such as
    "AWS" or "S3" are preserved (``str.capitalize`` would lower-case them).

    :param line: text whose sentences are separated by '. '
    :return: the pieces, each prefixed with one space, joined with '.';
        the result therefore always starts with a single space
    '''
    sentences = line.split('. ')
    # Slicing with [:1] keeps empty pieces safe (no IndexError on '').
    fixed = [' ' + sentence[:1].upper() + sentence[1:] for sentence in sentences]
    return '.'.join(fixed)

In [2]:
import numpy as np
import random
from termcolor import colored

def _question_text(line: str) -> str:
    '''
    Return the question text of a line that starts with 'Q'.

    A 'Q<digits>.' prefix of any length is removed, so 'Q7. text' and
    'Q123. text' both give 'text'. Lines that do not follow that shape fall
    back to dropping the first four characters.
    '''
    rest = line[1:]
    number_len = len(rest) - len(rest.lstrip('0123456789'))
    if number_len and rest[number_len:number_len + 1] == '.':
        return rest[number_len + 1:].strip()
    # Unknown prefix shape: keep the fixed-width behaviour.
    return line[4:].strip()


def _store_question(doc: dict, question: str, choices: list) -> None:
    '''
    Shuffle ``choices`` in place and save them in ``doc`` under ``question``.
    '''
    np.random.shuffle(choices)
    # np.array() on mixed [text, bool] pairs produces a string array, so the
    # flag ends up as the text 'True' / 'False'. The other functions in this
    # file compare the flag against the string 'True', so this is kept as is.
    doc[question] = np.array(choices)


def read_a_file(file_path: str) -> dict:
    '''
    Parse a Q&A text file into a dict of shuffled multiple-choice questions.

    Recognised lines:
      * a line starting with 'Q' opens a new question,
      * a line starting with 'x ' adds a correct choice,
      * a line starting with '- ' adds a wrong choice,
      * a blank (whitespace-only) line stores the current question.
    Any other line is ignored. A question that has choices is also stored
    when the next question starts or the file ends, so a missing blank line
    no longer loses it. Blank or choice lines that appear before the first
    question are skipped instead of raising NameError.

    :param file_path: path of the text file to read
    :return: dict mapping question text to a NumPy array of
        [choice text, flag] rows in random order
    '''
    doc = dict()
    q = None          # text of the question currently being collected
    choices = []      # [text, is_correct] pairs of that question
    unsaved = False   # True when q / choices changed since the last store

    with open(file_path, 'r') as f:
        for l in f:
            if l.startswith('Q'):
                # Previous question was not closed by a blank line: keep it.
                if unsaved and q is not None and choices:
                    _store_question(doc, q, choices)
                choices = []
                # Add to be a key
                q = _question_text(l)
                unsaved = True
            elif l.startswith('x '):
                # Add a correct choice to the list of choices
                choices.append([l[1:].strip(), True])
                unsaved = True
            elif l.startswith('- '):
                # Add a wrong choice to the list of choices
                choices.append([l[1:].strip(), False])
                unsaved = True
            elif not l.strip():
                # Blank line: shuffle the choices and store the question
                if q is not None:
                    _store_question(doc, q, choices)
                unsaved = False

    # The file may end without a trailing blank line.
    if unsaved and q is not None and choices:
        _store_question(doc, q, choices)
    return doc

In [3]:
def create_a_nicer_file(filename: str, doc: dict):
    '''
    Write ``doc`` to ``filename`` as a numbered Q&A text file.

    Every question is written as 'Q<n>. <question>' (n starts at 1 and
    follows the iteration order of ``doc``), followed by one line per answer
    ('x <text>' for a correct answer, '- <text>' otherwise) and a blank line.

    :param filename: path of the file to create; it is opened in 'w' mode,
        so an existing file is overwritten
    :param doc: mapping of question text to a sequence of [text, flag] pairs.
        The flag may be the string 'True' or a real boolean.
    '''
    with open(str(filename), 'w') as f:
        for num, (question, answers) in enumerate(doc.items(), 1):
            f.write(f'Q{num}. {question}\n')
            for answer in answers:
                # str() lets boolean flags match as well as the string
                # 'True'; comparing a bool to 'True' is always False and
                # would mark every correct answer as wrong.
                marker = 'x' if str(answer[1]) == 'True' else '-'
                f.write(f'{marker} {answer[0]}\n')
            f.write('\n')

In [4]:
def _is_correct(choice) -> bool:
    '''
    Tell whether a [text, flag] choice is flagged as correct.

    The flag is compared through str() so that both the string 'True' and a
    real boolean True are accepted.
    '''
    return str(choice[1]) == 'True'


def _ask_choice_number(prompt: str, n_choices: int, already_picked=()) -> int:
    '''
    Prompt until the user types a usable choice number and return it.

    A usable number is a whole number between 1 and ``n_choices`` that is not
    in ``already_picked``. Anything else silently repeats the prompt.
    '''
    while True:
        try:
            number = int(input(prompt))
        except ValueError:
            # Not a whole number: ask again instead of aborting the quiz.
            continue
        # Rejecting numbers below 1 matters: 0 or a negative value would
        # otherwise index the choices from the end.
        if 1 <= number <= n_choices and number not in already_picked:
            return number


def make_flash_cards(doc):
    '''
    Run an interactive multiple-choice quiz on the console.

    Questions are asked in random order. For a question with exactly one
    correct choice the user types one number; otherwise the user is asked for
    as many distinct numbers as there are correct choices and must pick all
    of them. Invalid entries (non-numeric text, numbers outside the listed
    range, or a number already given for the same question) repeat the prompt.
    The percentage of correctly answered questions is printed at the end
    (0.00% when ``doc`` is empty).

    :param doc: mapping of question text to a sequence of [text, flag]
        choices, where the flag is 'True' / True for a correct choice
    '''
    # Shuffle questions
    cards = list(doc.items())
    random.shuffle(cards)

    correct = 0
    for index_q, (q, c) in enumerate(cards, 1):
        print(colored(f'Q{index_q}. {q}', 'blue'))
        correct_answer = []
        for index, choice in enumerate(c, 1):
            print(colored(f'{index}. {choice[0]}', 'magenta'))
            if _is_correct(choice):
                correct_answer.append(choice[0])

        if len(correct_answer) == 1:  # When there is only one correct answer
            user_choice = _ask_choice_number('Enter the answer number: ', len(c))

            if _is_correct(c[user_choice - 1]):
                print('Correct')
                correct += 1
            else:
                print(colored('WRONG! The answer should be: ', 'yellow'), end='')
                print(colored(correct_answer[0], 'red'))
            print()

        else:  # When the number of correct answers is not exactly one
            picked = []
            while len(picked) < len(correct_answer):
                prompt = f'Enter the answer number {len(picked) + 1} / {len(correct_answer)}:'
                # Passing `picked` forces distinct numbers, so repeating one
                # correct choice can no longer be scored as a full answer.
                picked.append(_ask_choice_number(prompt, len(c), picked))

            # A question with no correct choice asks nothing and is scored
            # as correct because all() of an empty sequence is True.
            if all(_is_correct(c[number - 1]) for number in picked):
                print('Correct')
                correct += 1
            else:
                print(colored('WRONG! The answer should be: ', 'yellow'), end='')
                print(colored(correct_answer, 'red'))
            print()

    # Guard against an empty question set (division by zero).
    score = correct * 100 / len(cards) if cards else 0.0
    print(f'You passed with: {score:.2f}%')

In [5]:
# Load the questions from 'ML.txt' (a relative path, so it is resolved
# against the current working directory) into a {question: choices} dict.
doc = read_a_file('ML.txt')
# Run the interactive quiz: it prompts for answers and prints the final score.
make_flash_cards(doc)

[34mQ1. You are a ML specialist within a large organization who needs to run SQL queries and analytics on thousands of Apache logs files stored in S3. Which set of tools can help you achieve this with the LEAST amount of effort?[0m
[35m1. Data Pipeline and Athena[0m
[35m2. AWS Glue Data Catalog and Athena[0m
[35m3. Data Pipeline and RDS[0m
[35m4. Redshift and Redshift Spectrum[0m


Enter the answer number:  2


Correct

[34mQ2. You are a ML specialist who is working within SageMaker analyzing a dataset in a Jupyter notebook. On your local machine you have several open-source Python libraries that you have downloaded from the internet using a typical package manager. You want to download and use these same libraries on your dataset in SageMaker within your Jupyter notebook. What options allow you to use these libraries?[0m
[35m1. Use the integrated terminals in SageMaker to install libraries. This is typically done using conda install or pip install.[0m
[35m2. Upload the library in .zip format into S3 and use the Jupyter notebook in SageMaker to reference S3 bucket with Python libraries.[0m
[35m3. SageMaker offers a wide variety of built-in libraries. If the library you need is not included, contact AWS support with details on libraries needed for distribution.[0m
[35m4. SSH into the Jupyter notebook instance and install needed libraries. This is typically done using conda install or 

Enter the answer number:  1


Correct

[34mQ3. Your organization has a standalone Javascript (Node.js) application that streams data into AWS using Kinesis Data Streams. You notice that they are using the Kinesis API (AWS SDK) over the Kinesis Producer Library (KPL). What might be the reasoning behind this?[0m
[35m1. The Kinesis API (AWS SDK) provides greater functionality over the Kinesis Producer Library.[0m
[35m2. The Kinesis API (AWS SDK) runs faster in Javascript applications over the Kinesis Producer Library.[0m
[35m3. The Kinesis Producer Library cannot be integrated with a Javascript application because of its asynchronous architecture.[0m
[35m4. The Kinesis Producer Library must be installed as a Java application to use with Kinesis Data Streams.[0m


Enter the answer number:  4


Correct

[34mQ4. We are running a training job over and over again using slightly different, very large datasets as an experiment. Training is taking a very long time with your I/O-bound training algorithm and you want to improve training performance. What might you consider?[0m
[35m1. Make use of pipe mode to stream data directly from S3.[0m
[35m2. Convert the data format to an Integer32 tensor.[0m
[35m3. Make use of file mode to stream data directly from S3.[0m
[35m4. Use the SageMaker console to change your training job instance type from an ml.c5.xlarge to a r5.xlarge.[0m
[35m5. Convert the data format to protobuf recordIO format.[0m


Enter the answer number 1 / 2: 1
Enter the answer number 2 / 2: 5


Correct

[34mQ5. After training and validation sessions, we notice that the error rate is higher than we want for both sessions. Visualization of the data indicates that we don't seem to have any outliers. What else might we do?[0m
[35m1. Add more variables to the dataset.[0m
[35m2. Encode the data using Laminar Flow Step-up.[0m
[35m3. Run a random cut forest algorithm on the data.[0m
[35m4. Gather more data for our training process.[0m
[35m5. Run training for a longer period of time.[0m
[35m6. Reduce the dimensions of the data.[0m


Enter the answer number 1 / 3: 1
Enter the answer number 2 / 3: 4
Enter the answer number 3 / 3: 5


Correct

[34mQ6. You are a ML specialist working with data that is stored in a distributed EMR cluster on AWS. Currently, your machine learning applications are compatible with the Apache Hive Metastore tables on EMR. You have been tasked with configuring Hive to use the AWS Glue Data Catalog as its metastore. Before you can do this you need to transfer the Apache Hive metastore tables into an AWS Glue Data Catalog. What are the steps you'll need to take to achieve this with the LEAST amount of effort?[0m
[35m1. Create DMS endpoints for both the input Apache Hive Metastore and the output data store S3 bucket, run a DMS migration to transfer the data, then create a crawler that creates an AWS Glue Data Catalog.[0m
[35m2. Setup your Apache Hive application with JDBC driver connections, then create a crawler that crawlers the Apache Hive Metastore using the JDBC connection and creates an AWS Glue Data Catalog.[0m
[35m3. Create a Data Pipeline job that reads from your Apache Hive Me

Enter the answer number 1 / 2: 2
Enter the answer number 2 / 2: 5


Correct

[34mQ7. Which service in the Kinesis family can continuously capture gigabytes of data per second and make the collected data available in milliseconds to enable real-time analytics use cases?[0m
[35m1. Kinesis Data Streams[0m
[35m2. Kinesis Data Firehose[0m
[35m3. Kinesis Data Analytics[0m
[35m4. Kinesis Video Streams[0m


Enter the answer number:  1


Correct

[34mQ8. You are consulting with a retailer that wants to evaluate the sentiment of social media posts to determine if they are positive or negative. Which approach would be the most direct to this problem?[0m
[35m1. Use BlazingText in Text Classification mode.[0m
[35m2. Use BlazingText in Word2Vec mode for skip-gram.[0m
[35m3. Use Amazon Comprehend.[0m
[35m4. Use Amazon Macie.[0m
[35m5. Use Object2Vec in sentiment detection mode.[0m


Enter the answer number:  3


Correct

[34mQ9. Which of these examples would be considered as introducing bias into a problem space?[0m
[35m1. Failing to randomize a dataset even though you were told it was already random.[0m
[35m2. Omitting records before a certain date in a forecasting problem.[0m
[35m3. Removing records from a set of customer reviews that were not fully complete.[0m
[35m4. Filtering out outliers in a dataset which are greater than 4 standard deviations outside the mean.[0m
[35m5. Deciding to use a supervised learning method to estimate missing values in a dataset.[0m


Enter the answer number 1 / 2: 2
Enter the answer number 2 / 2: 3


[33mWRONG! The answer should be: [0m[31m['Failing to randomize a dataset even though you were told it was already random.', 'Removing records from a set of customer reviews that were not fully complete.'][0m

[34mQ10. Which service in the Kinesis family allows you to securely stream video from connected devices to AWS for analytics, machine learning (ML), and other processing?[0m
[35m1. Kinesis Streams[0m
[35m2. Kinesis Firehose[0m
[35m3. Kinesis Video Streams[0m
[35m4. Kinesis Data Analytics[0m


Enter the answer number:  3


Correct

[34mQ11. You work for a farming company that has dozens of tractors with build-in IoT devices. These devices stream data into AWS using Kinesis Data Streams. The features associated with the data is tractor Id, latitude, longitude, inside temp, outside temp, and fuel level. As a ML specialist you need to transform the data and store it in a data store. Which combination of services can you use to achieve this?[0m
[35m1. Use Kinesis Data Analytics to run real-time SQL queries to transform the data and immediately write the transformed data into S3.[0m
[35m2. Set up Kinesis Firehose to ingest data from Kinesis Data Streams, then send data to Lambda. Transform the data in Lambda and write the transformed data into S3.[0m
[35m3. Immediately send the data to Lambda from Kinesis Data Streams. Transform the data in Lambda and write the transformed data into S3.[0m
[35m4. Set up Kinesis Data Analytics to ingest the data from Kinesis Data Stream, then run real-time SQL queries

Enter the answer number 1 / 3: 2
Enter the answer number 2 / 3: 3
Enter the answer number 3 / 3: 4


Correct

[34mQ12. You are collecting clickstream data from an e-commerce website to make near-real time product suggestions for users actively using the site. Which combination of tools can be used to achieve the quickest recommendations and meets all of the requirements?[0m
[35m1. Use Kinesis Data Streams to ingest clickstream data, then use Lambda to process that data and write it to S3. Once the data is on S3, use Athena to query based on conditions that data and make real time recommendations to users.[0m
[35m2. Use Kinesis Data Streams to ingest clickstream data, then use Kinesis Data Analytics to run real time SQL queries to gain actionable insights and trigger real-time recommendations with AWS Lambda functions based on conditions.[0m
[35m3. Use Kinesis Data Firehose to ingest click stream data, then use Kinesis Data Analytics to run real time SQL queries to gain actionable insights and trigger real-time recommendations with AWS Lambda functions based on conditions, then 

Enter the answer number:  2


Correct

[34mQ13. You are designing a testing plan for an update release of your company's mission critical loan approval model. Due to regulatory compliance, it is critical that the updates are not used in production until regression testing has shown that the updates perform as good as the existing model. Which validation strategy would you choose?[0m
[35m1. Use a canary deployment to collect data on whether the model is ready for production.[0m
[35m2. Use a K-Fold validation method.[0m
[35m3. Use a rolling upgrade to determine if the model is ready for production.[0m
[35m4. Make use of backtesting with historic data.[0m
[35m5. Use an A/B test to expose the updates to real-world traffic.[0m


Enter the answer number 1 / 2: 2
Enter the answer number 2 / 2: 4


Correct

[34mQ14. Which service in the Kinesis family allows you to easily load streaming data into data stores and analytics tools?[0m
[35m1. Kinesis Video Streams[0m
[35m2. Kinesis Data Analytics[0m
[35m3. Kinesis Firehose[0m
[35m4. Kinesis Streams[0m


Enter the answer number:  4


[33mWRONG! The answer should be: [0m[31mKinesis Firehose[0m

[34mQ15. You are collecting clickstream data from an e-commerce website using Kinesis Data Firehose. You are using the PutRecord API from the AWS SDK to send the data to the stream. What are the required parameters when sending data to Kinesis Data Firehose using the API PutRecord call?[0m
[35m1. DataStreamName, PartitionKey, and Record (containing the data)[0m
[35m2. Data, PartitionKey, StreamName, ShardId[0m
[35m3. DeliveryStreamName and Record (containing the data)[0m
[35m4. Data, PartitionKey, StreamName[0m


Enter the answer number:  3


Correct

[34mQ16. You are trying to set up a crawler within AWS Glue that crawls your input data in S3. For some reason after the crawler finishes executing, it cannot determine the schema from your data and no tables are created within your AWS Glue Data Catalog. What is the reason for these results?[0m
[35m1. The checkbox for 'Do not create tables' was checked when setting up the crawler in AWS Glue.[0m
[35m2. The crawler does not have correct IAM permissions to access the input data in the S3 bucket.[0m
[35m3. The bucket path for the input data store in S3 is specified incorrectly.[0m
[35m4. AWS Glue built-in classifiers could not find the input data format. You need to create a custom classifier.[0m


Enter the answer number:  4


Correct

[34mQ17. You have been asked to help develop a vision system for a manufacturing line that will reorient parts to a specific position using a robotic arm. What algorithm might you choose for the vision part of this problem?[0m
[35m1. Object Detection[0m
[35m2. Object2Vec[0m
[35m3. Semantic Segmentation[0m
[35m4. Image Analysis[0m
[35m5. Seq2Seq[0m
[35m6. AWS Comprehend[0m


Enter the answer number:  3


Correct

[34mQ18. You have been tasked with converting multiple JSON files within a S3 bucket to Apache Parquet format. Which AWS service can you use to achieve this with the LEAST amount of effort?[0m
[35m1. Create an EMR cluster to run an Apache Spark job to process the data the Apache Parquet and output newly formatted files into S3.[0m
[35m2. Create a Data Pipeline job that reads from your S3 bucket and sends the data the EMR. Create an Apache Spark job to process the data the Apache Parquet and output newly formatted files into S3.[0m
[35m3. Create a Lambda function that reads all of the objects in the S3 bucket. Loop through each of the objects and convert from JSON to Apache Parquet. Once the conversion is complete, output newly formatted files into S3.[0m
[35m4. Create an AWS Glue Job to convert the S3 objects from JSON to Apache Parquet, then output newly formatted files into S3.[0m


Enter the answer number:  4


Correct

[34mQ19. An organization needs to store a mass amount of data in AWS. The data has a key-value access pattern, developers need to run complex SQL queries and transactions, and the data has a fixed schema. Which type of data store meets all of their needs?[0m
[35m1. Athena[0m
[35m2. DynamoDB[0m
[35m3. RDS[0m
[35m4. S3[0m


Enter the answer number:  3


Correct

[34mQ20. When you issue a CreateModel API call using a built-in algorithm, which of the following actions would be next?[0m
[35m1. SageMaker launches an appropriate training container from the algorithm selected from the regional container repository.[0m
[35m2. Sagemaker provisions an EC2 instances using the appropriate AMI for the algorithm selected from the global container registry.[0m
[35m3. SageMaker launches an appropriate inference container for the algorithm selected from the global container repository.[0m
[35m4. SageMaker launches an appropriate inference container for the algorithm selected from the regional container repository.[0m
[35m5. SageMaker provisions an EMR cluster and prepares a Spark script for the training job.[0m
[35m6. Sagemaker provisions an EC2 instances using the appropriate AMI for the algorithm selected from the regional container registry.[0m


Enter the answer number:  4


Correct

[34mQ21. A colleague is preparing for their very first training job using the XGBoost algorithm. They ask you how they can ensure that training metrics are captured during the training job. How do you direct them?[0m
[35m1. Enable CloudTrail logging for the SageMaker API service.[0m
[35m2. Do nothing. Use SageMaker's built-in logging feature and view the logs using Quicksight.[0m
[35m3. Do nothing. Use SageMaker's built-in logging to DynamoDB Streams.[0m
[35m4. Enable CloudWatch logging for Jupyter Notebook and the IAM user.[0m
[35m5. Do nothing. Sagemaker's built-in algorithms are already configured to send training metrics to CloudTrail.[0m
[35m6. Do nothing. Sagemaker's built-in algorithms are already configured to send training metrics to CloudWatch.[0m


Enter the answer number:  1


[33mWRONG! The answer should be: [0m[31mDo nothing. Sagemaker's built-in algorithms are already configured to send training metrics to CloudWatch.[0m

[34mQ22. You are preparing for a first training run using a custom algorithm that you have prepared in a docker container. What should you do to ensure that the training metrics are visible to CloudWatch?[0m
[35m1. Create a Lambda function to scrape the logs in the custom algorithm container and deposit them into CloudWatch via API.[0m
[35m2. When defining the training job, ensure that the metric_definitions section is populated with relevant metrics from the stdout and stderr streams in the container.[0m
[35m3. Enable CloudTrail for the respective container to capture the relevant training metrics from the custom algorithm.[0m
[35m4. Enable Kinesis Streams to capture the log stream emitting from the custom algorithm containers.[0m
[35m5. Do nothing. SageMaker will automatically parse training logs for custom algorithms an

Enter the answer number:  3


[33mWRONG! The answer should be: [0m[31mWhen defining the training job, ensure that the metric_definitions section is populated with relevant metrics from the stdout and stderr streams in the container.[0m

[34mQ23. You are a ML specialist needing to collect data from Twitter tweets. Your goal is to collect tweets that include only the name of your company and the tweet body, and store it off into a data store in AWS. What set of tools can you use to stream, transform, and load the data into AWS with the LEAST amount of effort?[0m
[35m1. Setup a Kinesis Data Firehose for data ingestion and immediately write that data to S3. Next, setup a Lambda function to trigger when data lands in S3 to transform it and finally write it to DynamoDB.[0m
[35m2. Setup Kinesis Data Streams for data ingestion. Next, setup Kinesis Data Firehouse to load that data into RedShift. Next, setup a Lambda function to query data using RedShift spectrum and store the results onto DynamoDB.[0m
[35m3. Setu

Enter the answer number:  3


[33mWRONG! The answer should be: [0m[31mSetup a Kinesis Data Firehose for data ingestion and immediately write that data to S3. Next, setup a Lambda function to trigger when data lands in S3 to transform it and finally write it to DynamoDB.[0m

[34mQ24. What are your options for storing data into S3?[0m
[35m1. PutRecords API call[0m
[35m2. UPLOAD command[0m
[35m3. The AWS console[0m
[35m4. AWS SDK[0m
[35m5. UNLOAD command[0m
[35m6. AWS CLI[0m


Enter the answer number 1 / 3: 3
Enter the answer number 2 / 3: 4
Enter the answer number 3 / 3: 6


Correct

[34mQ25. You have launched a training job but it fails after a few minutes. What is the first thing you should do for troubleshooting?[0m
[35m1. Go to CloudTrail logs and try to identify the error in the logs for your job.[0m
[35m2. Submit the job with AWS X-Ray enabled for additional debug information.[0m
[35m3. Check to see that your Notebook instance has the proper permissions to access the input files on S3.[0m
[35m4. Ensure that your instance type is large enough and resubmit the job in a different region.[0m
[35m5. Go to CloudWatch logs and try to identify the error in the logs for your job.[0m


Enter the answer number:  1


[33mWRONG! The answer should be: [0m[31mGo to CloudWatch logs and try to identify the error in the logs for your job.[0m

[34mQ26. If you have mission critical data that must be processed with as minimal delay as possible, you should use the Kinesis API (AWS SDK) over the Kinesis Producer Library.[0m
[35m1. True[0m
[35m2. False[0m


Enter the answer number:  1


Correct

[34mQ27. Your organization has given you several different sets of key-value pair JSON files that need to be used for a machine learning project within AWS. What type of data is this classified as and where is the best place to load this data into?[0m
[35m1. Structured data, stored in RDS.[0m
[35m2. Unstructured data, stored in S3.[0m
[35m3. Semi-structured data, stored in S3.[0m
[35m4. Semi-structured data, stored in DynamoDB.[0m


Enter the answer number:  3


Correct

You passed with: 77.78%
