# Analysis of the dataset

Here I have to check if the data set has null values and those things.

In [169]:
import pandas as pd
import numpy as np

df = pd.read_csv("IRIS_Training.csv", sep=";")
df


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
...,...,...,...,...,...
85,6.7,3.1,4.7,1.5,Iris-versicolor
86,6.3,2.3,4.4,1.3,Iris-versicolor
87,5.6,3.0,4.1,1.3,Iris-versicolor
88,5.5,2.5,4.0,1.3,Iris-versicolor


In [170]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  90 non-null     float64
 1   sepal_width   90 non-null     float64
 2   petal_length  90 non-null     float64
 3   petal_width   90 non-null     float64
 4   species       90 non-null     object 
dtypes: float64(4), object(1)
memory usage: 3.6+ KB


In [171]:
X_train = df[df.columns[:-1]]
print(X_train.head())
X = X_train.to_numpy(dtype="float32")
X.shape

   sepal_length  sepal_width  petal_length  petal_width
0           4.9          3.0           1.4          0.2
1           4.7          3.2           1.3          0.2
2           4.6          3.1           1.5          0.2
3           5.0          3.6           1.4          0.2
4           5.4          3.9           1.7          0.4


(90, 4)

In [172]:
number_features = len(X_train.columns)  
number_samples = len(X_train)
number_samples

90

In [173]:
number_features

4

In [174]:
outcome = df[df.columns[-1]]
outcome

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
           ...       
85    Iris-versicolor
86    Iris-versicolor
87    Iris-versicolor
88    Iris-versicolor
89    Iris-versicolor
Name: species, Length: 90, dtype: object

In [175]:
Y_train = outcome == "Iris-setosa"
Y_train = Y_train.astype("int")

In [176]:
Y_train[Y_train==0] = -1
Y = np.array(Y_train)

In [177]:
number_features = len(df.columns) - 1  # We don't take the species column
number_samples = len(df)
number_samples

90

In [178]:
number_features

4

# Solution
As seen in class the goal of this homework is to classify two types of flowers. To do so we have to find a hyperplane that divide the space in two regions, one for the _Iris-setosa_ and the other for the _Iris-versicolor_. To do so we have to sole the following optimization problem:

$$
\text{minimize} \;\;\; \|\omega\|_2
$$
$$
\text{s.t.}\;\;\; y_i(\omega^\top x_i+b)\geq 1,
$$

where $\omega$ is the normal vector to the hyperplane, $b$ is the coordinate at the origin, $x_i$ is a vector in $\mathbb{R}^4$ containing the characteristics of one flower and $y_i$ is either 1 or -1, depending on the type of flower. The problem is completely equivalent to 

$$
\text{minimize} \;\;\; \frac{1}{2}\|\omega\|_2^2
$$
$$
\text{s.t.}\;\;\; 1-y_i(\omega^\top x_i+b)\leq 0,
$$

because $\|\omega\|_2$ and $\|\omega\|^2_2$ share the point where they achieve the minimum. This is a problem that we know how to solve because using a barrier method we can convert it into an unconstrained problem. We are going to use the logarithmic barrier and the problem becomes

$$
\text{minimize} \;\;\; f(\omega,b) = \frac{1}{2}\|\omega\|_2^2 - \left(\frac{1}{t}\right)\sum_{i=1}^n \log\left[y_i(\omega^\top x_i+b) -1\right],
$$

where $n$ is the number of samples, in our case 90. Now our problem with inequality constraints has become one without any constraint, and we have to minimize this function with respect to the parameters $\omega\in\mathbb{R}^4$ and $b\in\mathbb{R}$. To do so we are going to use the Newton method, so we need to compute the gradient and the Hessian of our function. In our case the gradient is $\nabla\to\left(\nabla_\omega,\partial_b\right)$

$$
\nabla_\omega f(\omega,b) = \omega - \left(\frac{1}{t}\right)\sum_{i=1}^n \frac{y_ix_i}{y_i(\omega^\top x_i+b) -1}
$$
$$
\partial_b f(\omega,b) = - \left(\frac{1}{t}\right)\sum_{i=1}^n \frac{y_i}{y_i(\omega^\top x_i+b) -1}. 
$$
As we can see $\nabla_\omega f(\omega,b)$ is a vector because of the factor $x_i$ in the numerator, and $\partial_b f(\omega,b)$ is a scalar because $y_i$ is a scalar.



# Code

In [192]:
def f(point, mu, y_vector, X, n = 90):
    """The function to minimize

    Args:
        point: is an array in which the first element is b
            and the rest are vector w.
        mu: the parameter that multiplies the logarithmic barrier,
            1/t in the formula.
        y_vector: is the array containing all the values of y_i.
        X: is the matrix with all the samples features
        n: the number of samples.
    """
    assert point.shape == (5,), "Incorrect size of point. The size must be 5"

    b = point[0]
    w = point[1:]

    # We create some matrices to compute the function f without explicit summations
    one = np.ones(shape=(n,))
    w_matrix = np.tile(w, (n, 1)).T  # Is a matrix whose columns are the vector w
    b_vector = np.full_like(y_vector, fill_value=b)

    #                                     this is equivalent to np.diag(W.T.dot(X))
    return 0.5*w.T@w - mu*one.T@np.log(y_vector * ((w_matrix * X).sum(0) - b_vector) - one)

def gradient(point, mu, y_vector, X, n = 90):
    """The function to minimize

    Args:
        point: is an array in which the first element is b
            and the rest are vector w.
        mu: the parameter that multiplies the logarithmic barrier,
            1/t in the formula.
        y_vector: is the array containing all the values of y_i.
        X: is the matrix with all the samples features
        n: the number of samples.
    """
    b = point[0]
    w = point[1:]

    # We create some matrices to compute the function f without explicit summations
    one = np.ones(shape=(n,))
    w_matrix = np.tile(w, (n, 1)).T  # Is a matrix whose columns are the vector w
    b_vector = np.full_like(y_vector, fill_value=b)
    denom = y_vector * ((w_matrix * X).sum(0) - b_vector) - one
    print(denom.shape)

    grad_w = w - mu * np.sum((Y*X) / denom, axis=1)
    grad_b = -mu * np.sum(Y/(y_vector * ((w_matrix * X).sum(0) - b_vector) - one))
    return np.concatenate((grad_w, grad_b), axis=None)



In [193]:
a = gradient(np.array([1,2,3,4,5]), 1, Y, X.T)
a

(90,)


array([-13.26085187,  -6.16919109,  -2.8325775 ,   3.29228367,
        -2.88233624])

In [None]:
def gradient_descend(stop, x):
    """The gradient descend method using line search"""

    grad = gradient(x)
    if np.dot(grad, grad) < stop**2: #check the case that we are already in a minimum
        return x, 0
    
    for ite in range(1,10_000):
        descend_direction = -grad
        t = line_search(descend_direction, x)
        x = x + t * descend_direction
        grad = gradient(x)
        if np.dot(grad, grad) < stop**2: 
            return x, ite
        
    print("The stop criterion wasn't achieve in 10000 iterations.")
    return x, ite

In [181]:
len(np.array([1,2,3,4,5]))
np.array([1,2,3,4,5]).shape == (5,)

True

In [182]:
# W = np.random.randint(0, 10, size=(3,5))
test = np.random.randint(0, 10, size=(3,6))
# print(W.T)
print()
print(test)


[[9 6 7 4 8 8]
 [3 8 1 2 0 0]
 [0 5 0 0 2 2]]


In [183]:
# print((W * X))
# (W * X).sum(0)

In [184]:
# np.diag(W.T.dot(X))

In [185]:
y = np.array([1,-1,1,-1,1,-1])
d = np.full_like(y, fill_value=2)
(y*test)/d

array([[ 4.5, -3. ,  3.5, -2. ,  4. , -4. ],
       [ 1.5, -4. ,  0.5, -1. ,  0. ,  0. ],
       [ 0. , -2.5,  0. ,  0. ,  1. , -1. ]])

In [186]:
# np.tile(y, (5,1)).T