In [1]:
from dataclasses import dataclass
import re


@dataclass  # Generates __init__, __repr__ and __eq__ from the annotated fields
class Flash_card:
    """One multiple-choice question.

    Attributes:
        question: Text of the question.
        choices: Every answer option of the question.
        correct_choices: The options that count as correct answers.
        demo: Optional demonstration text for the question (None when absent).
    """

    question: str
    choices: list
    correct_choices: list
    demo: str = None

    def shuffle_choices(self):
        """Randomly reorder ``self.choices`` in place."""
        # Imported here so the method does not rely on a module-level ``np``
        # that only exists once a later notebook cell has been executed.
        import numpy as np

        np.random.shuffle(self.choices)

In [2]:
from termcolor import colored
import numpy as np
import roman


class Folder:
    """A deck of flash cards plus the score of the latest quiz run.

    Attributes:
        my_flash: The flash cards of this folder (objects with ``question``,
            ``choices``, ``correct_choices``, ``demo`` and ``shuffle_choices()``).
        correct: Number of cards answered correctly in the latest ``learn()`` run.
    """

    my_flash: list
    correct: int = 0

    def __init__(self):
        # Every folder owns its list. A class-level ``my_flash = []`` is a single
        # list shared by all instances, so cards would leak between folders.
        self.my_flash = []
        self.correct = 0

    def total_score(self):
        """Print the percentage of correctly answered cards."""
        if not self.my_flash:
            # Explicit check instead of catching the division by zero.
            print("You have to have at least one question")
            return
        print(f"You get {self.correct * 100 / len(self.my_flash):.2f}%")

    @staticmethod
    def _parse_selection(raw, n_choices, multiple):
        """Turn the text typed by the user into zero-based choice indices.

        Args:
            raw: The text returned by ``input()``.
            n_choices: How many choices were displayed.
            multiple: When True every non-blank character is one choice number
                (so only choices 1-9 can be addressed this way); when False the
                whole text is a single choice number.

        Returns:
            A list of zero-based indices, or None when the text is empty,
            not made of digits, or names a choice outside ``1..n_choices``.
        """
        if multiple:
            tokens = [ch for ch in raw if not ch.isspace()]
        else:
            tokens = [raw.strip()]
        if not tokens:
            return None
        picked = []
        for token in tokens:
            # isascii() rules out characters such as superscript digits, which
            # pass isdigit() but make int() raise.
            if not (token.isascii() and token.isdigit()):
                return None
            number = int(token)
            # Reject 0 as well: index -1 would silently pick the last choice.
            if not 1 <= number <= n_choices:
                return None
            picked.append(number - 1)
        return picked

    def learn(self):
        """Run an interactive quiz over all cards and print the final score.

        Cards and their choices are shown in random order. The user answers
        with the number(s) printed in front of the choices (e.g. ``3`` or, for
        questions with several correct choices, ``134``) and presses enter.
        Invalid input is asked for again instead of raising.
        """
        # Start from zero so that a repeated run cannot score above 100%.
        self.correct = 0
        # Shuffle order of questions
        np.random.shuffle(self.my_flash)
        for i, flash in enumerate(self.my_flash, 1):
            # Slicing with [:1] capitalises the first letter without failing on empty text.
            title = flash.question[:1].upper() + flash.question[1:]
            print(colored(f"{roman.toRoman(i)}. {title}", "blue"))  # Print question
            if flash.demo:
                print(flash.demo.strip())  # Print question demonstration
            if not flash.choices:
                # Nothing can be selected; without this guard the prompt below
                # would reject every input forever.
                print("This question has no choices, skipping it")
                print()
                continue
            # Shuffle choices
            flash.shuffle_choices()
            for index, c in enumerate(flash.choices, 1):
                print(colored(f"{index}. {c[:1].upper() + c[1:]}", "magenta"))  # Print possible choices
            multiple = len(flash.correct_choices) > 1
            prompt = "Choose multiple answers" if multiple else "Choose one correct answer"
            picked = self._parse_selection(input(prompt), len(flash.choices), multiple)
            while picked is None:
                picked = self._parse_selection(
                    input('You have an invalid answer, please choose again'),
                    len(flash.choices),
                    multiple,
                )
            answers = [flash.choices[position] for position in picked]
            if sorted(answers) == sorted(flash.correct_choices):  # Correct answer(s)
                print("Correct!")
                self.correct += 1
            else:  # Incorrect answer(s)
                print(colored("Wrong! Correct choice should be: ", "yellow"), end="")
                print(colored("\n".join(flash.correct_choices), "red"))
            print()
        self.total_score()

In [3]:
def _make_flash_card(question_lines, choices, correct_choices):
    """Build a Flash_card from one parsed block, or return None for an empty block."""
    if not question_lines and not choices:
        return None
    # Join with a space so the words of a multi-line question stay separated.
    question = " ".join(question_lines)
    # The first space-delimited token is treated as the question label
    # (e.g. "####Q1.") and dropped; text without any space is kept whole.
    question = question.split(' ', 1)[-1].strip()
    return Flash_card(
        question=question,
        choices=choices,
        correct_choices=correct_choices,
    )


def read_a_file(file_path: str, general_choice_pattern: str, correct_choice_pattern: str) -> "Folder":
    """Parse a Q&A text file (UTF-8) into a Folder of flash cards.

    The file is read as blocks separated by blank (or whitespace-only) lines;
    each non-empty block becomes one card. Inside a block, a line whose start
    matches ``general_choice_pattern`` is a choice, and it is also a correct
    choice when its start matches ``correct_choice_pattern``. The choice text
    is whatever follows the matched prefix. All other lines form the question;
    they are joined with single spaces and the first space-delimited token
    (the question label) is removed.

    Args:
        file_path: Path of the text file to read.
        general_choice_pattern: Regex matching the prefix of every choice line.
        correct_choice_pattern: Regex matching the prefix of correct choice lines.

    Returns:
        A Folder whose ``my_flash`` holds the cards in file order, without
        duplicates. The last block is included even when the file does not
        end with a blank line.

    Raises:
        OSError: If the file cannot be opened.
    """
    doc = Folder()
    # Give this folder a list of its own so that cards from an earlier call
    # are never carried over through a list shared between Folder instances.
    doc.my_flash = []

    blocks = []
    current = ([], [], [])  # (question lines, choices, correct choices)
    with open(file_path, "r", encoding='utf8') as f:
        for line in f:
            if not line.strip():
                # A blank line closes the current block.
                blocks.append(current)
                current = ([], [], [])
                continue
            question_lines, choices, correct_choices = current
            regular = re.match(general_choice_pattern, line)
            if not regular:
                # Anything that is not a choice belongs to the question text.
                question_lines.append(line.strip())
                continue
            correct = re.match(correct_choice_pattern, line)
            if correct:
                choice = line[correct.end():].strip()
                correct_choices.append(choice)
            else:
                choice = line[regular.end():].strip()
            # A correct choice is listed among the regular choices as well.
            choices.append(choice)
    # The file may end without a trailing blank line; keep that last block too.
    blocks.append(current)

    for block in blocks:
        a_flash_card = _make_flash_card(*block)
        if a_flash_card is not None and a_flash_card not in doc.my_flash:
            doc.my_flash.append(a_flash_card)
    return doc

In [4]:
def create_a_nicer_file(doc: Folder, filename: str):
    """
    Write all flash cards of ``doc`` to ``filename`` in a uniform Q&A layout.

    For every card, in the order of ``doc.my_flash``, the file receives:
    a ``####Q<n>. <question>`` line (n counts from 1), the demo text exactly
    as stored when the card has one, one line per choice (``- [x]`` for a
    choice listed in ``correct_choices``, ``- [ ]`` otherwise) and finally an
    empty line. An existing file is overwritten; the encoding is UTF-8.
    """
    target = str(filename)
    with open(target, "w", encoding='utf8') as out:
        number = 0
        for card in doc.my_flash:
            number += 1
            out.write(f"####Q{number}. {card.question}\n")  # Question line
            if card.demo:
                # Demo text is written as is, no newline is appended
                out.write(card.demo)
            for option in card.choices:
                is_correct = option in card.correct_choices
                entry = f"- [x] {option}\n" if is_correct else f"- [ ] {option}\n"
                out.write(entry)
            out.write("\n")  # Blank line ends the card

In [5]:
# Source file with the questions.
file_name = "5215.txt"
# A choice line starts with "- [...]"; a single word character inside the
# brackets (e.g. "- [x]") marks the choice as correct.
doc = read_a_file(
    file_name,
    general_choice_pattern=r'^\s*-\s*\[\s*\w*\s*\]\s*',
    correct_choice_pattern=r'^\s*-\s*\[\s*\w\s*\]\s*',
)
# NOTE(review): for "5215.txt" the target name below is "5215.txt" again, so
# the source file itself is overwritten with the regenerated content - confirm
# that this is intended.
create_a_nicer_file(doc, file_name.partition(".")[0] + ".txt")
doc.learn()

[34mI. Reward prediction error is[0m
[35m1. The expected reward value - the received reward value[0m
[35m2. The total future expected reward minus the total future actual reward (with temporal discounting)[0m
[35m3. The received reward value - the expected reward value[0m
[35m4. The total future actual reward minus the total future expected reward (with temporal discounting)[0m


Choose one correct answer 3


Correct!

[34mII. If you are teams #2 and #3 in a competition and you want to merge your teams to beat #1, you will more likely be using bagging rather than boosting to win.[0m
[35m1. False[0m
[35m2. True[0m


Choose one correct answer 1


Correct!

[34mIII. Backpropagation is the ability of deep learning neural networks, like google's Deep Dream, to propagate signals from higher level neurons down to low-level neurons - allowing us in some cases to see the "hallucinated" images.[0m
[35m1. False[0m
[35m2. True[0m


Choose one correct answer 1


Correct!

[34mIV. Check all of the following that are associated with "model-free" reinforcement learning as opposed to model-based learning?[0m
[35m1. It takes advantage of direct knowledge of probabilities between states to optimize learning[0m
[35m2. This technique uses prediction error as the primary means of updating policy decisions[0m
[35m3. It is more associated with valuation of repetitive events than new, novel environments (e.g. habit learning)[0m
[35m4. Inferring state and action value functions iteratively based on repeated rewards and punishments[0m


Choose multiple answers 341


[33mWrong! Correct choice should be: [0m[31mIt is more associated with valuation of repetitive events than new, novel environments (e.g. habit learning)
Inferring state and action value functions iteratively based on repeated rewards and punishments
This technique uses prediction error as the primary means of updating policy decisions[0m

[34mV. It is important to not remove features that are statistically independent of target values because they might be correlated to the target, and thus useful for prediction.[0m
[35m1. True[0m
[35m2. False[0m


Choose one correct answer 2


Correct!

[34mVI. When you want to know how well a product will work on a new person without any individual-specific training, it is better to use K-fold cross-validation than subject-wise cross-validation, because K-fold cross-validation may have an individual's data in both the training and test sets, which is what you want in that case.[0m
[35m1. False[0m
[35m2. True[0m


Choose one correct answer 1


Correct!

[34mVII. Check the scenario where density-based clustering algorithms like DBSCAN are expected to outperform K-means[0m
[35m1. When the clusters would share the same center-point, such as two concentric circles[0m
[35m2. When we know exactly how many clusters to expect[0m
[35m3. When the clusters are well represented by spheres[0m


Choose one correct answer 1


Correct!

[34mVIII. Feature engineering differs from feature selection by the fact that feature engineering uses systematic tools while feature selection is made on an ad-hoc arbitrary basis.[0m
[35m1. False[0m
[35m2. True[0m


Choose one correct answer 1


Correct!

[34mIX. Here is an analogy:"Rose" is to "Flower" as "Porsche" is to "Automobile", because the first word is a type of the second word."North" is to "South" as "Black" is to "White" because second word is the opposite of the first word.and so on...The following is analogy can be said for four important concepts in machine learning. Fill in the blank. Classification is to regression in supervised learning as _____________ is to dimensionality reduction in unsupervised learning. Or more succinctly, classification is to regression as ___________________ is to dimensionality reduction[0m
[35m1. Reinforcement learning[0m
[35m2. Clustering[0m
[35m3. PCA[0m
[35m4. Factor Analysis[0m


Choose one correct answer 2


Correct!

[34mX. Given the same data set, K-means always converges to the same solution, regardless of the starting point[0m
[35m1. True[0m
[35m2. False[0m


Choose one correct answer 2


Correct!

[34mXI. In Q learning, you are updating the action value function, but there are two parameters which control the manner in which this updating occurs[0m
[35m1. Regularization strength (lamda)[0m
[35m2. Learning rate (alpha)[0m
[35m3. Maximum estimation error (epsilon)[0m
[35m4. Temporal discounting (gamma)[0m


Choose multiple answers 12


[33mWrong! Correct choice should be: [0m[31mTemporal discounting (gamma)
Learning rate (alpha)[0m

[34mXII. Although metrics are available to measure the quality of clustering when we know the true classes, there are no metrics to measure the quality of unsupervised clustering when true classes are not known[0m
[35m1. False[0m
[35m2. True[0m


Choose one correct answer 1


Correct!

[34mXIII. In a given binary classification problem, Out of all the positive samples in the test set, the proportion of those which are correctly identified as positive by the classifier is called...[0m
[35m1. F1 Score[0m
[35m2. Precision[0m
[35m3. Specificity[0m
[35m4. Recall[0m


Choose one correct answer 4


Correct!

[34mXIV. After determining the best k value for a k nearest neighbors prediction, how might the best fitting k value change if we changed the training set by incorrectly labeling 10% of all examples?[0m
[35m1. Best k value would on average be lower[0m
[35m2. Best k value would on average be higher[0m
[35m3. Mathematically, the best fitting k value would stay the same regardless of adding noise[0m


Choose one correct answer 2


Correct!

[34mXV. K-fold cross-validation will lead to lower accuracies than expected with the full training set because only (K-1)/K % of the data is being used for training (e.g. 4/5ths for K=5). The way to improve this is by increasing K.But what is a problem with increasing K?[0m
[35m1. K models have to be trained which takes more time as K increases[0m
[35m2. The number of samples in the data set may not be perfectly divisible by K[0m
[35m3. The separated test set is getting small and may bias results of the cross-validation[0m


Choose one correct answer 1


Correct!

[34mXVI. Two child nodes of the same parent are independent until the parent node is observed, which then introduces a dependency.[0m
[35m1. True[0m
[35m2. False[0m


Choose one correct answer 2


Correct!

[34mXVII. The Q in Q-learning for reinforcement learning is best described as[0m
[35m1. The reward signal from the environment[0m
[35m2. The reward prediction error quotient[0m
[35m3. The discount factor[0m
[35m4. The sum of future expected rewards[0m


Choose one correct answer 4


Correct!

[34mXVIII. K-means is a supervised learning technique that can be used for classification while K-nearest neighbors is an unsupervised learning technique which returns classes of the samples based solely on similarity between samples[0m
[35m1. False[0m
[35m2. True[0m


Choose one correct answer 1


Correct!

[34mXIX. Two nodes that share the same child node are independent until the child node is observed, which then introduces a dependency.[0m
[35m1. False[0m
[35m2. True[0m


Choose one correct answer 2


Correct!

[34mXX. A random forest classifier uses boosting on a set of decision trees to increase performance[0m
[35m1. False[0m
[35m2. True[0m


Choose one correct answer 1


Correct!

[34mXXI. Specificity is...[0m
[35m1. Precision for the negative case[0m
[35m2. Recall for the negative case[0m
[35m3. Recall for the positive case[0m
[35m4. Precision for the positive case[0m


Choose one correct answer 2


Correct!

[34mXXII. In a classification problem using high dimensional data (e.g. greater than 10 features) a PCA dimensionality reduction to two PCA components was performed to visually observe how separable two classes are on a scatter plot with X as PCA component 1 and Y as PCA component 2 for each data point. If the classes are not visibly separate in the 2D plot, what does that mean for a classifier trained on all the features?[0m
[35m1. They cannot be distinguished by a classifier[0m
[35m2. They may be separable with more features, it is inconclusive[0m
[35m3. Overlaps in the PCA plot indicate the classes are separable when all features are used[0m


Choose one correct answer 2


Correct!

[34mXXIII. What is the purpose of regularization in linear regression?[0m
[35m1. To diminish the contribution of irrelevant features to the resulting model, effectively performing automated feature selection during learning[0m
[35m2. To lasso the ridge with an elastic net[0m
[35m3. To improve prediction accuracy on a future test set better than ordinary linear regression[0m
[35m4. To decrease the coefficient values for irrelevant terms in the regression model[0m


Choose multiple answers 134


Correct!

[34mXXIV. Accuracy is[0m
[35m1. The arithmetic mean of precision and recall[0m
[35m2. The average recall across classes if the number of items in each class is the same[0m
[35m3. The geometric mean of precision and recall[0m


Choose one correct answer 2


Correct!

[34mXXV. An important aspect in formulating a problem as a Markov process is that the future is conditionally independent of the past giving the current state[0m
[35m1. True[0m
[35m2. False[0m


Choose one correct answer 1


Correct!

[34mXXVI. When you are not sure if your state-action value function is correct, you should always pick the state-action pair of maximum value[0m
[35m1. True[0m
[35m2. False[0m


Choose one correct answer 2


Correct!

[34mXXVII. Pick the two data formats for use in clustering[0m
[35m1. A 'samples x features' matrix, standard in machine learning but without a chosen predictor[0m
[35m2. A 'features x features' sized correlation matrix[0m
[35m3. A similarity matrix of size 'samples x samples'[0m


Choose multiple answers 13


Correct!

[34mXXVIII. Select all the transformations of features that have the potential to improve prediction accuracy for most ML algorithms (note: simple linear combinations don't often help since most models do that automatically)[0m
[35m1. Cross products of features[0m
[35m2. Differences of features[0m
[35m3. Logarithm transformation[0m
[35m4. Sums of features[0m
[35m5. Absolute value[0m
[35m6. Squares (or cubes...)[0m


Choose multiple answers 136


Correct!

[34mXXIX. Check all the components of a basic reinforcement learning model[0m
[35m1. Rules that determine the immediate reward of certain transitions[0m
[35m2. Rules of transitions between states[0m
[35m3. A set of states of the environment[0m
[35m4. A set of actions the organism can take[0m
[35m5. Rules that describe what the organism can observe[0m


Choose multiple answers 12345


Correct!

[34mXXX. When cross-validation is performed in the validation set, the score of the best fitted model hyperparameters in that set is on average lower than the the score of that best fitted model on a separate test set.[0m
[35m1. False[0m
[35m2. True[0m


Choose one correct answer 1


Correct!

[34mXXXI. Select all examples of semi-supervised learning (as opposed to pure supervised or unsupervised learning examples)[0m
[35m1. Collecting constant GPS data, automatically clustering repeated locations, then having a personal label those clusters as “home” or “work” with the goal of having the setup detect whenever the wearer is at home or at work.[0m
[35m2. Your learning of music genres, especially your ability to ask questions about a category of music that you notice as particularly distinct.[0m
[35m3. Making stock predictions for a high-frequency trading company[0m
[35m4. Determining a taxonomy (tree-like classification) for animals based solely on their features[0m


Choose multiple answers 12


Correct!

[34mXXXII. If I want to test my voice recognition software to see how well it will works on a new person it has not yet been trained for, what type of cross-validation would give me the best sense of accuracy?[0m
[35m1. Stratified K-fold cross-validation[0m
[35m2. K fold cross-validation[0m
[35m3. Leave one out cross-validation[0m
[35m4. Subject-wise cross-validation[0m


Choose one correct answer 4


Correct!

[34mXXXIII. Choose correct answers[0m
[35m1. Subject-wise cross-validation: When you use the data from N-1 people to train your classifier, and you test it on the Nth person. Repeat the process by changing who is in the test set appropriately.[0m
[35m2. Leave one out cross-validation: When you use the data from N-1 people to train your classifier, and you test it on the Nth person. Repeat the process by changing who is in the test set appropriately.[0m
[35m3. Subject-wise cross-validation: Same as K-fold cross-validation where K = the size of the data set[0m
[35m4. Leave one out cross-validation: Same as K-fold cross-validation where K = the size of the data set[0m


Choose multiple answers 14


Correct!

[34mXXXIV. In Gaussian Naive Bayes, select all the parameters that have to be learned from the data to create a predictive model[0m
[35m1. The mean and standard deviation for each feature for each class[0m
[35m2. The mean and standard deviation for each feature, combining all classes together[0m
[35m3. The prior probability of each feature value's likelihood[0m
[35m4. The proportion of training data in each class[0m


Choose multiple answers 14


Correct!

[34mXXXV. If a potential feature does not necessarily correlate with a target, it should not necessarily be removed because[0m
[35m1. Correlation does not imply causation[0m
[35m2. It may still have a dependent relationship with the target[0m
[35m3. The best fitting line in a scatter plot with the feature and target may have a non-zero slope for a line in linear regression[0m
[35m4. Lack of correlation does not imply lack of causation[0m


Choose one correct answer 2


Correct!

[34mXXXVI. Select all scenarios that are examples of supervised learning[0m
[35m1. Using the nucleotide sequences on a region of non-coding DNA shared among species to estimate a phylogenetic tree.[0m
[35m2. An infant, unable to speak, but forming concepts of ‘r’ or ‘l’ sounds based on the grouping of similar sounds over time.[0m
[35m3. Netflix using their database of user ratings to predict how you would rate a movie you haven’t seen[0m
[35m4. Predicting a buyer's chance of clicking on an online advertisement based on the previous behavior of similar online shoppers.[0m


Choose multiple answers 34


Correct!

[34mXXXVII. If 5% of your samples have incorrect labels in your available labelled data, which option is likely best to improve model accuracy?[0m
[35m1. Add/remove features[0m
[35m2. Derive/predict new features from current features in your data set[0m
[35m3. Get more samples (even if they are 1% in error)[0m
[35m4. Change your hyperparameter to avoid overfitting[0m


Choose one correct answer 4


Correct!

[34mXXXVIII. Check all components of a fully-specified Bayesian network[0m
[35m1. Notes representing variables[0m
[35m2. Prior probabilities for root nodes[0m
[35m3. Conditional probability tables (or probability functions if continuous) quantifying the dependencies which the links represent[0m
[35m4. Links between the nodes representing dependencies between variables[0m


Choose multiple answers 1234


Correct!

[34mXXXIX. Which of the following is not an explicit part of the standard Q-learning equation?[0m
[35m1. A learning rate[0m
[35m2. Reward prediction error[0m
[35m3. Temporal discounting[0m
[35m4. State-action value function[0m
[35m5. The policy function[0m


Choose one correct answer 5


Correct!

[34mXL. What is the best description of what a link between nodes represents in a Bayesian network?[0m
[35m1. Any variables that are not independent from each other require a direct link[0m
[35m2. Linking direct dependences, not necessarily causal.[0m
[35m3. Only between variables that are directly causal, from cause to effect.[0m


Choose one correct answer 2


Correct!

[34mXLI. PCA...[0m
[35m1. Is used for visualizing high-dimensional data sets[0m
[35m2. Stands for Preferred Components Analysis[0m
[35m3. Is a type of factor analysis which can reduce the dimensionality of data[0m
[35m4. Is performed by finding the linear combination of features that explain most of the variance in the data, then removing that feature's impact, and continuing the process. It's a greedy algorithm with earlier features being the more important ones, and later features essentially just noise in most cases[0m


Choose multiple answers 34


Correct!

[34mXLII. Assuming the value of B is observed ("known"), in which of the graphs would the value of node A now depend upon the value of node C.[0m
[35m1. A --> B <-- C[0m
[35m2. A <-- B --> C[0m
[35m3. A --> B --> C[0m
[35m4. A <-- B <-- C[0m


Choose one correct answer 1


Correct!

[34mXLIII. Check which of the following are associated with Bagging instead of Boosting[0m
[35m1. This is a common strategy to combine multiple learners, even if they are from completely different modeling strategies (e.g. combining logistic regression and naive bayes)[0m
[35m2. This is more likely to be used for models which are weak learners, like decision stumps - decision trees with only one level.[0m
[35m3. This is more likely to be used for models which have the potential to overfit, like decision trees with no restrictions.[0m
[35m4. Random forest classifiers use this technique[0m


Choose multiple answers 34


Correct!

[34mXLIV. The proportion of correctly identified samples from the test samples that were identified as belonging to a particular class by the classifier is called...[0m
[35m1. Recall[0m
[35m2. F1 Score[0m
[35m3. Sensitivity[0m
[35m4. Precision[0m


Choose one correct answer 4


Correct!

[34mXLV. Choose correct answers[0m
[35m1. In Ridge Regression, irrelevant feature coefficients will often be set to zero, effectively removing them from the model[0m
[35m2. In Lasso Regression, irrelevant feature coefficients will often be set to zero, effectively removing them from the model[0m
[35m3. In Lasso Regression, irrelevant feature coefficients will be pushed to zero, but likely not completely removed[0m
[35m4. In Ridge Regression, irrelevant feature coefficients will be pushed to zero, but likely not completely removed[0m


Choose multiple answers 24


Correct!

[34mXLVI. Check which of the following are associated with Bagging instead of Boosting[0m
[35m1. The features (commonly the columns in a data set) and samples/observations (commonly the rows in a data set) may be resampled. And this can be done with or without replacement.[0m
[35m2. All estimators are weighted equally.[0m
[35m3. This technique is one of the reasons that some Kaggle competitions don’t allow teams to merge during competitions (e.g. team #2 and #3 join together)[0m


Choose multiple answers 12


Correct!

[34mXLVII. Assuming none of the variables are observed, in which of the graphs would the value of node A depend upon the value of node C. Multiple possible. Note, node B is NOT observed.[0m
[35m1. A <-- B <-- C[0m
[35m2. A <-- B --> C[0m
[35m3. A --> B --> C[0m
[35m4. A --> B <-- C[0m


Choose multiple answers 123


Correct!

[34mXLVIII. Cross-validation will lead to lower accuracies than expected with the full training set because only (K-1)/K % of the data is being used for training (e.g. 4/5ths for K=5). The way to improve this is by increasing K. But what is a problem with increasing K?[0m
[35m1. The number of samples in the data set may not be perfectly divisible by K[0m
[35m2. The separated test set is getting small and may bias results of the cross-validation[0m
[35m3. K models have to be trained which takes more time as K increases[0m


Choose one correct answer 3


Correct!

[34mXLIX. Check all the ways of initializing K-means[0m
[35m1. Initialize all centroids at the origin - e.g. (0,0)[0m
[35m2. Randomly choose samples as the initial centroids[0m
[35m3. Randomly assign all samples to one of K classes[0m
[35m4. Initialize all centroids at the mean of all the samples[0m


Choose multiple answers 23


Correct!

[34mL. Which is an example of "explaining away"? That is, how a shared child node can indicate a dependency among parent nodes only when observed.[0m
[35m1. When you see someone smoking, you know their odds of having a positive lung X-ray for cancer are higher.[0m
[35m2. Seeing someone with lung cancer is smoking makes you less likely to assume the cancer is from high levels of pollution.[0m
[35m3. A positive result on a lung X-ray indicates a high probability that someone has lung cancer[0m


Choose one correct answer 2


Correct!

You get 96.00%
