In [1]:
# Will Hollingsworth, Colton Murray, Alexander Shiveley

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import sympy as sp

# Q1

Using the same method as in the notes, the kernel function can be expanded for a given n and d to find m. After expanding and simplifying, m is the number of terms in the resulting polynomial.

c is set to 1.

To avoid tedious calculations, the expansions were done using sympy.

In [19]:
def get_m(n, d):
    """
    Expands and counts the terms in the polynomial kernel function (x . z + c)**d, where c = 1

    The kernel is built symbolically for n-dimensional inputs x and z, expanded with sympy,
    and the number of additive terms in the expansion is returned.

    :param n: (int) the dimension of the input vectors, n >= 0
    :param d: (int) the degree of the polynomial kernel, d >= 0

    :returns: (int) m, the number of terms in the expanded polynomial

    :raises ValueError: if n or d is negative
    """
    if n < 0 or d < 0:
        raise ValueError('n and d must be non-negative')

    # Build the dot product x1*z1 + ... + xn*zn for any n,
    # instead of hard-coding one branch per supported dimension
    dot_product = sum(
        sp.Symbol('x%d' % i) * sp.Symbol('z%d' % i) for i in range(1, n + 1)
    )
    result = sp.expand((dot_product + 1)**d)

    # Count the additive terms structurally rather than counting '+'
    # characters in the printed form; a single-term result (e.g. d = 0) gives 1
    return len(sp.Add.make_args(result))

# Tabulate m over the grid n = 1..4, d = 1..5 (n varies slowest)
for n, d in ((n, d) for n in range(1, 5) for d in range(1, 6)):
    print('n:', n, '  d:', d, '  m:', get_m(n, d))

n: 1   d: 1   m: 2
n: 1   d: 2   m: 3
n: 1   d: 3   m: 4
n: 1   d: 4   m: 5
n: 1   d: 5   m: 6
n: 2   d: 1   m: 3
n: 2   d: 2   m: 6
n: 2   d: 3   m: 10
n: 2   d: 4   m: 15
n: 2   d: 5   m: 21
n: 3   d: 1   m: 4
n: 3   d: 2   m: 10
n: 3   d: 3   m: 20
n: 3   d: 4   m: 35
n: 3   d: 5   m: 56
n: 4   d: 1   m: 5
n: 4   d: 2   m: 15
n: 4   d: 3   m: 35
n: 4   d: 4   m: 70
n: 4   d: 5   m: 126


The most noticeable pattern is that for any $n_1$, $d_1$, $n_2$, $d_2$ where $n_2 = d_1$ and $d_2 = n_1$, we get $m_1 = m_2$.
For example, $n = 2$, $d = 3$ and $n = 3$, $d = 2$ both give $m = 10$.

This suggests there is some term in the relation like $n + d$ or $n * d$ involved for symmetry.

Another pattern is that for larger $n$ and $d$, $m$ grows large quickly. This suggests operations like exponents or factorials may be involved.

Using the above patterns, the final equation below was found to relate $n$, $d$, and $m$:

$$m = \frac{(n + d)!}{n! d!}$$

# Q2 TODO (Copied over csv reading from assignment 5 for now)

# Getting the data into Python

In [3]:
# Read the comma-separated file into a numpy array of strings first
raw_data = np.loadtxt('data_banknote_authentication.txt', delimiter=',', dtype=str)

# Then parse every field into a float
clean_data = raw_data.astype(float)

clean_data

array([[  3.6216 ,   8.6661 ,  -2.8073 ,  -0.44699,   0.     ],
       [  4.5459 ,   8.1674 ,  -2.4586 ,  -1.4621 ,   0.     ],
       [  3.866  ,  -2.6383 ,   1.9242 ,   0.10645,   0.     ],
       ...,
       [ -3.7503 , -13.4586 ,  17.5932 ,  -2.7771 ,   1.     ],
       [ -3.5637 ,  -8.3827 ,  12.393  ,  -1.2823 ,   1.     ],
       [ -2.5419 ,  -0.65804,   2.6842 ,   1.1952 ,   1.     ]])

In [4]:
def get_sets(data, split):
    """
    Convenience function that randomly splits the input data into training, validation, and test sets.

    The rows of a copy of the data are shuffled, so the input array is left untouched.
    The first split[0] share of the rows becomes the training set. The rows that are left
    are divided between validation and test in the ratio split[1] : split[2], so every
    input row ends up in exactly one of the three sets.

    :param data: (ndarray) the data you want to split, with one sample per row
    :param split: (float array) the fractions of the data you want to be TRAINING, VALIDATION, and TESTING data, in that order

    :returns: (tuple) a tuple of (training set, validation set, test set)
    """
    # Randomly shuffle the order from a copy of the data
    shuffled = data.copy()
    np.random.shuffle(shuffled)

    row_count = data.shape[0]

    # calc the number of training samples, assumes the input samples are separated by row
    training_count = int(round(row_count * split[0]))

    training_set = shuffled[:training_count]
    remaining_set = shuffled[training_count:]

    # calc the number of validation samples as a share of the REMAINING rows
    # (not of the full data set, which would starve the test set)
    held_out_share = split[1] + split[2]
    if held_out_share > 0:
        validation_count = int(round(remaining_set.shape[0] * split[1] / held_out_share))
    else:
        # nothing was requested for validation or testing; avoid dividing by zero
        validation_count = 0

    validation_set = remaining_set[:validation_count]
    test_set = remaining_set[validation_count:]

    return training_set, validation_set, test_set

In [5]:
# Split the cleaned data into three equally weighted random subsets
training, validation, test = get_sets(data=clean_data, split=[1/3, 1/3, 1/3])