In [1]:
from collections import defaultdict, Counter
import numpy as np

<h2>Naive Bayes</h2>

In [139]:
def get_training_texts():
    """Return the toy training corpus as a list of ``(text, label)`` pairs.

    Three documents carry the label ``"c"`` and one carries ``"j"``.
    """
    documents = (
        "Chinese Beijing Chinese",
        "Chinese Chinese Shanghai",
        "Chinese Macao",
        "Tokyo Japan Chinese",
    )
    class_labels = ("c", "c", "c", "j")
    # Pair every document with its label, preserving the original order.
    return list(zip(documents, class_labels))

In [140]:
from pprint import pprint


def predict_class(d: str, training_texts: list):
    """Classify a document with multinomial Naive Bayes and add-one smoothing.

    Both the training texts and the document ``d`` are tokenized by splitting
    on whitespace. Tokens of ``d`` that never occur in the training data are
    ignored when scoring, because no likelihood can be estimated for them.

    Args:
        d: The document to classify.
        training_texts: Non-empty list of ``(text, label)`` pairs.

    Returns:
        A tuple ``(predicted_class, term_frequencies, class_scores)`` where

        * ``predicted_class`` is the label with the highest score (on ties the
          label encountered first wins),
        * ``term_frequencies`` maps label -> term -> number of occurrences of
          the term in training documents of that label; looking up an unseen
          term yields ``0``,
        * ``class_scores`` maps label -> prior times the product of the
          smoothed likelihoods of the document's known tokens (unnormalized).

    Raises:
        ValueError: If ``training_texts`` is empty.
    """
    if not training_texts:
        raise ValueError("training_texts must contain at least one (text, label) pair")

    # Count term occurrences per class and collect the vocabulary.
    s1_freqs = defaultdict(lambda: defaultdict(int))
    vocab = set()
    for text, label in training_texts:
        for word in text.split():
            vocab.add(word)
            s1_freqs[label][word] += 1

    V = len(vocab)

    # Class priors: fraction of training documents carrying each label.
    labels = [label for _, label in training_texts]
    priors = {c: freq / len(labels) for c, freq in Counter(labels).items()}

    # Total number of tokens per class. Indexing s1_freqs[c] also registers
    # classes whose training documents contained no tokens at all.
    class_wise_den = {c: sum(s1_freqs[c].values()) for c in priors}

    # Add-one smoothed likelihoods P(word | class) for every vocabulary word.
    # .get() is used so the returned frequency table is not filled with
    # explicit zero entries as a side effect.
    likelihood = {
        c: {
            word: (s1_freqs[c].get(word, 0) + 1.0) / (class_wise_den[c] + V)
            for word in vocab
        }
        for c in priors
    }

    # Only tokens with an estimated likelihood take part in the product.
    known_tokens = [w for w in d.split() if w in vocab]

    # Scores are kept as plain products (not log-space) and multiplied in
    # document order so the unrounded values stay exactly reproducible.
    final_probs = defaultdict(float)
    for c in s1_freqs:
        p = priors[c]
        for w in known_tokens:
            p = p * likelihood[c][w]
        final_probs[c] = p

    # max() returns the first maximal key, matching a strict ">" scan.
    max_class = max(final_probs, key=final_probs.get)

    return max_class, s1_freqs, final_probs

In [141]:
from nose.tools import assert_equal

# Worked example: classify one test document against the four training texts.
s1_student = predict_class("Chinese Chinese Chinese Tokyo Japan", get_training_texts())
assert_equal(len(s1_student), 3, msg="Your function does not return 3 values. Check the description again for the requirements that your function must satisfy!")

# Unpack the triple: predicted label, per-class term counts, per-class scores.
s1_class, s1_freqs, s1_probs = s1_student
assert_equal(s1_class, "c", msg="Your first return value should be the predicted class. Your function did not classify correctly.")
assert_equal(s1_freqs["c"]["Chinese"], 5, msg="Your second return value should map class labels to dictionaries containing the frequency of terms in documents of that class in the training set. For example, 'Chinese' appears 5 times in documents with the 'c' class. Your return value does not have that information.")
assert_equal(s1_freqs["c"]["Tokyo"], 0, msg="Your second return value should map class labels to dictionaries containing the frequency of terms in documents of that class in the training set. For example, 'Tokyo' appears 0 times in documents with the 'c' class. Your return value does not have that information. Use a defaultdict to return 0 when the term does not appear in the dictionary!")
# The scores are compared with exact float equality, so they must not be rounded.
assert_equal(s1_probs["c"],  0.00030121377997263036, msg="The class probability for the document given class 'c' is not correct. Did you round the values? This test case works only with unrounded values.")
assert_equal(s1_probs["j"],  0.00013548070246744226, msg="The class probability for the document given class 'j' is not correct. Did you round the values? This test case works only with unrounded values.")
# Printed with ANSI escape codes around the text.
print("\x1b[6;30;42mSuccess!\x1b[0m")

[6;30;42mSuccess![0m


In [142]:
from nose.tools import assert_equal
# These tests check that your solution works not only for this one example, but other examples as well

# Second training corpus with three labels: "Fruit", "Animal" and "Color".
texts = [
    ("yellow long long yellow fruit tree", "Fruit"),
    ("yellow yellow mollusk yellow mollusk fruit", "Animal"),
    ("brown fur brown brown forest brown fruit", "Animal"),
    ("red red round fruit fruit forest forest red", "Fruit"),
    ("brown brown fur fur brown fur fur tree", "Fruit"),
    ("red red red yellow yellow yellow red red yellow red", "Animal"),
    ("green green green green green green yellow yellow yellow red red red red red", "Color")
]

# First example: a document consisting only of the term "fruit".
s1_class, s1_freqs, s1_probs = s1_student = predict_class("fruit fruit fruit fruit", texts)
assert_equal(s1_class, "Fruit", msg="Your first return value should be the predicted class. Your function did not classify correctly.")
assert_equal(s1_freqs["Fruit"]["red"], 3, msg="Your second return value should map class labels to dictionaries containing the frequency of terms in documents of that class in the training set.")
# "Cthulhu" never occurs in the corpus; the lookup must yield 0 instead of raising.
assert_equal(s1_freqs["Fruit"]["Cthulhu"], 0, msg="Your second return value should map class labels to dictionaries containing the frequency of terms in documents of that class in the training set.")
assert_equal(s1_freqs["Animal"]["yellow"], 7, msg="Your second return value should map class labels to dictionaries containing the frequency of terms in documents of that class in the training set.")
assert_equal(s1_freqs["Color"]["green"], 6, msg="Your second return value should map class labels to dictionaries containing the frequency of terms in documents of that class in the training set.")
# Exact float comparisons: the scores must be returned unrounded.
assert_equal(s1_probs["Fruit"],  9.251399183780853e-05, msg="The class probability for the document given class 'Fruit' is not correct. Did you round the values? This test case works only with unrounded values.")
assert_equal(s1_probs["Animal"],  2.5977213600685546e-05, msg="The class probability for the document given class 'Animal' is not correct. Did you round the values? This test case works only with unrounded values.")
assert_equal(s1_probs["Color"],  3.657142857142857e-07, msg="The class probability for the document given class 'Color' is not correct. Did you round the values? This test case works only with unrounded values.")

# Second example: only the predicted label is checked here.
s1_class, s1_freqs, s1_probs = s1_student = predict_class("green green round fruit", texts)
assert_equal(s1_class, "Color", msg="Your first return value should be the predicted class. Your function did not classify correctly.")
print("\x1b[6;30;42mSuccess!\x1b[0m")

[6;30;42mSuccess![0m


<h2> KNN </h2>

In [143]:
from sklearn.datasets import load_iris


def get_flower_data():
    """Load the iris dataset as a list of labelled two-feature points.

    Returns:
        A list of ``(sepal_length, sepal_width, target)`` tuples, one per row
        of the dataset, where the first two values come from the
        ``"sepal length (cm)"`` and ``"sepal width (cm)"`` columns and
        ``target`` is the corresponding entry of ``iris.target``.
    """
    iris = load_iris(as_frame=True)
    # Select the two sepal columns once and iterate over them row by row.
    sepal_records = iris["data"][["sepal length (cm)", "sepal width (cm)"]].to_records(index=False)

    nodes = [(length, width, x) for x, (length, width) in zip(iris.target, sepal_records)]
    return nodes

In [144]:
from sklearn.neighbors import KNeighborsClassifier

def k_nearest_neighbors(node, nodes, k=3):
    """Predict the label of ``node`` by majority vote among its nearest neighbours.

    Args:
        node: The query point as a pair of feature values.
        nodes: Labelled reference points, each a ``(feature_1, feature_2, label)`` triple.
        k: Number of neighbours that take part in the vote.

    Returns:
        The label predicted by a uniformly weighted ``KNeighborsClassifier``
        using the Minkowski metric with ``p=2``.
    """
    # Split the triples into two feature columns and one label column.
    first_feature, second_feature, targets = zip(*nodes)
    features = np.column_stack((first_feature, second_feature))

    # The classifier expects a 2-D array of queries, hence the wrapping list.
    query = np.array([node])

    classifier = KNeighborsClassifier(n_neighbors=k, p=2, weights='uniform')
    prediction = classifier.fit(features, targets).predict(query)
    return prediction[0]

In [145]:
# you can experiment with your solution here
nodes = get_flower_data()
# example of how your function will be called
# k_nearest_neighbors((7, 2), nodes, 10)
# nodes
k_nearest_neighbors((7, 2), nodes, 10)

1

In [146]:
from nose.tools import assert_equal

# Load the data inside this cell and use it for every check, so the cell does
# not depend on a variable left over from another cell.
s4_nodes = get_flower_data()
# Disabled check, kept for reference:
# assert_equal(k_nearest_neighbors((7, 2), s4_nodes, 10), 2)  # Exceptionally wrong input
assert_equal(k_nearest_neighbors((7, 2), s4_nodes, 3), 1)
assert_equal(k_nearest_neighbors((7, 2), s4_nodes, 5), 1)
assert_equal(k_nearest_neighbors((7, 2), s4_nodes, 7), 1)
assert_equal(k_nearest_neighbors((7, 2), s4_nodes, 9), 2)
assert_equal(k_nearest_neighbors((7, 2), s4_nodes, 11), 2)
assert_equal(k_nearest_neighbors((6, 2.8), s4_nodes, 9), 1)
assert_equal(k_nearest_neighbors((-6, 2.8), s4_nodes, 9), 0)
assert_equal(k_nearest_neighbors((4.77, 3.99), s4_nodes, 9), 0)
assert_equal(k_nearest_neighbors((4.77, 3.99), s4_nodes, 3), 0)
assert_equal(k_nearest_neighbors((5.77, 2.8), s4_nodes, 121), 1)
print("\x1b[6;30;42mSuccess!\x1b[0m")

[6;30;42mSuccess![0m


<h2>Evaluation</h2>

In [147]:
from sklearn.metrics import classification_report

In [148]:
def evaluate(y_pred, y_true, labels):
    """Compute per-label precision, recall and F1 score.

    Args:
        y_pred: Predicted labels.
        y_true: Ground-truth labels.
        labels: The labels to report on.

    Returns:
        A tuple ``(precision, recall, f1)`` of dictionaries, each mapping every
        entry of ``labels`` to the corresponding metric value.
    """
    # Note the argument order: classification_report takes y_true first.
    report = classification_report(y_true, y_pred, labels=labels, output_dict=True)

    def metric_by_label(metric_name):
        # The report dictionary is keyed by the string form of each label.
        return {label: report[str(label)][metric_name] for label in labels}

    return (
        metric_by_label('precision'),
        metric_by_label('recall'),
        metric_by_label('f1-score'),
    )

In [149]:
# Argument order is (y_pred, y_true, labels).
# NOTE: assert_equal is not imported in this cell; it relies on the import
# executed in an earlier cell.
s3_solution = evaluate([1, 2, 3, 1, 2, 3], [1, 2, 1, 2, 1, 3], [1, 2, 3])  # The input was modified for consistency
assert_equal(len(s3_solution), 3, msg="Your function should return three values.")
# Each returned dictionary is keyed by the label itself.
precision, recall, f1 = s3_solution
assert_equal(precision[1], 0.5, msg="Your precision for class 1 is wrong.")
assert_equal(precision[2], 0.5, msg="Your precision for class 2 is wrong.")
assert_equal(precision[3], 0.5, msg="Your precision for class 3 is wrong.")
assert_equal(recall[1], 0.3333333333333333, msg="Your recall for class 1 is wrong.")
assert_equal(recall[2], 0.5, msg="Your recall for class 2 is wrong.")
assert_equal(recall[3], 1, msg="Your recall for class 3 is wrong.")
assert_equal(f1[1], 0.4, msg="Your f1 for class 1 is wrong.")
assert_equal(f1[2], 0.5, msg="Your f1 for class 2 is wrong.")
assert_equal(f1[3], 0.6666666666666666, msg="Your f1 for class 3 is wrong.")
print("\x1b[6;30;42mSuccess!\x1b[0m")

[6;30;42mSuccess![0m


<h2>Task 2G</h2>

In [150]:
# Submitted answer string for Task 2G.
s2_solution = "A)"  # Could possibly be a 'D)' as well

<h2>Task 4G</h2>

In [128]:
# Submitted answer string for Task 4G.
s5_solution = "D)"