In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Softmax Regression

Softmax regression is a generalization of logistic regression to the case where we want to handle multiple classes ($k$ classes).

**Contents**

- [Softmax Regression](#1.-Softmax-Regression)
- [Example: The Iris Dataset](#2.-Example:-The-Iris-Dataset)
- [Example: The Palmer Archipelago Penguin Dataset](#3.-Example:-The-Palmer-Archipelago-Penguin-Dataset)
- [Example: Oranges, Lemons and Apples dataset](#4.-Example:-Oranges,-Lemons-and-Apples-dataset)

## 1. Softmax Regression

### One-hot encoding

In [None]:
def one_hot_encoding(v):
    """One-hot encode a 1-D collection of class labels.

    Parameters
    ----------
    v : array-like of shape (m,)
        Class label of each data point (strings, integers, ...).

    Returns
    -------
    V : ndarray of shape (m, k)
        One-hot matrix: V[i, j] is 1 when v[i] equals labels[j] and 0 otherwise.
    labels : ndarray of shape (k,)
        Sorted unique labels; column j of V corresponds to labels[j].
    """
    # np.unique returns the sorted labels together with, for every entry of v,
    # the index of its label (the ordinal encoding) in one vectorized call
    labels, ord_labels = np.unique(v, return_inverse=True)
    ord_labels = ord_labels.ravel()
    # one-hot encoding
    V = np.zeros((len(ord_labels), len(labels)))
    V[np.arange(len(ord_labels)), ord_labels] = 1
    # return one-hot-encodings and the class labels
    return V, labels

In [None]:
# check that it works
y = np.array(['Spain','Italy','Italy','USA','Italy'])
Y, labels = one_hot_encoding(y)
Y

In [None]:
labels

### The softmax function

In [None]:
def softmax(X,theta):
    """Row-wise softmax of the linear scores X.dot(theta).

    Parameters
    ----------
    X : ndarray of shape (m, n)
        Feature matrix (one data point per row).
    theta : ndarray of shape (n, k)
        Parameter matrix (one column per class).

    Returns
    -------
    ndarray of shape (m, k)
        Non-negative entries; each row sums to 1.
    """
    scores = X.dot(theta)
    # subtracting each row's maximum leaves the softmax unchanged but keeps
    # np.exp from overflowing to inf (which would turn the row into nan)
    scores = scores - np.max(scores, axis=1, keepdims=True)
    Y = np.exp(scores)
    return Y / np.sum(Y, axis=1, keepdims=True)

In [None]:
# check that it works; notice that the entries of each row add up to 1
X = np.random.randn(5,2) # 5 data points, 2 features
theta = np.random.randn(2,3) # 2 features, 3 classes
softmax(X,theta)

### The softmax cost function

In [None]:
def softmax_cost(X,Y,theta):
    """Cross-entropy cost of the softmax model, summed over all data points.

    Parameters
    ----------
    X : ndarray of shape (m, n)
        Feature matrix.
    Y : ndarray of shape (m, k)
        One-hot encoding of the true classes.
    theta : ndarray of shape (n, k)
        Parameter matrix.

    Returns
    -------
    float
        -sum(Y * log(P)), where P holds the row-wise softmax probabilities
        of X.dot(theta).
    """
    scores = X.dot(theta)
    # log-sum-exp form of log(softmax): shifting by the row maximum prevents
    # overflow in np.exp, and never forming P avoids log(0) when a
    # probability underflows
    scores = scores - np.max(scores, axis=1, keepdims=True)
    log_P = scores - np.log(np.sum(np.exp(scores), axis=1, keepdims=True))
    return -np.sum(Y*log_P)

In [None]:
softmax_cost(X,Y,theta)

### Gradient Descent

In [None]:
def softmaxregression_GD(X,y,learning_rate,n_epochs,random_state=None):
    """Fit a softmax regression model with batch gradient descent.

    The function is self-contained: it does not rely on helpers defined
    elsewhere in the notebook.

    Parameters
    ----------
    X : ndarray of shape (m, n)
        Feature matrix (one data point per row).
    y : array-like of shape (m,)
        Class label of each data point.
    learning_rate : float
        Step size of the gradient descent update.
    n_epochs : int
        Number of gradient descent iterations.
    random_state : int or None, optional
        Seed for the random initialization of theta. With the default
        (None) theta is drawn with np.random.randn, as before.

    Returns
    -------
    theta : ndarray of shape (n, k)
        Fitted parameter matrix (one column per class).
    cost : ndarray of shape (n_epochs,)
        cost[i] is the summed cross-entropy after the i-th update.
    labels : ndarray of shape (k,)
        Sorted unique class labels; column j of theta corresponds to labels[j].
    """

    # numerically stable log of the row-wise softmax: shifting by the row
    # maximum keeps np.exp from overflowing, and working in log space avoids
    # log(0) when a probability underflows
    def log_softmax(scores):
        shifted = scores - np.max(scores, axis=1, keepdims=True)
        return shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))

    m,n = X.shape

    # class labels and one-hot encodings (ord_labels[i] is the index of y[i] in labels)
    labels, ord_labels = np.unique(y, return_inverse=True)
    k = len(labels)
    Y = np.zeros((m,k))
    Y[np.arange(m), ord_labels.ravel()] = 1

    # initialize matrix theta
    if random_state is None:
        theta = np.random.randn(n,k)
    else:
        theta = np.random.default_rng(random_state).standard_normal((n,k))

    # initialize cost vector
    cost = np.zeros(n_epochs)

    # gradient descent iterations; the log-probabilities computed after each
    # update are reused for the next gradient (one forward pass per epoch)
    log_P = log_softmax(X.dot(theta))
    for i in range(n_epochs):
        gradient = X.T.dot(np.exp(log_P)-Y)
        theta = theta - learning_rate * gradient
        log_P = log_softmax(X.dot(theta))
        cost[i] = -np.sum(Y*log_P)

    return theta,cost,labels

## 2. Example: The Iris Dataset

<img src="iris.png" alt="Drawing" style="width: 700px;"/>

The Iris flower data set was introduced by the British statistician and biologist Ronald Fisher in his 1936 paper *The use of multiple measurements in taxonomic problems*, as an example of linear discriminant analysis.

In [None]:
# load the data
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/iris.csv'
data = pd.read_csv(url)
data.head(5) #first 5 rows

This dataset contains the sepal and petal lengths and widths of 3 different types of irises (Setosa, Versicolour, and Virginica).

In [None]:
data['species'].unique()

In [None]:
# feature matrix
X = data[['sepal_length','sepal_width','petal_length','petal_width']].to_numpy()

In [None]:
# target vector
y = data['species'].to_numpy()

In [None]:
y

In [None]:
# one hot encoding (one_hot_encoding returns both the encodings and the class labels)
Y, labels = one_hot_encoding(y)

In [None]:
theta,cost,labels = softmaxregression_GD(X,y,
                                         learning_rate = 0.0001,
                                         n_epochs=1000)

In [None]:
plt.plot(cost)

In [None]:
def predictor_softmax(X,theta,labels):
    """Predict the most probable class of each data point.

    Parameters
    ----------
    X : ndarray of shape (m, n)
        Feature matrix.
    theta : ndarray of shape (n, k)
        Parameter matrix of the softmax model.
    labels : array-like of shape (k,)
        Class labels; labels[j] is the class of column j of theta.

    Returns
    -------
    ndarray of shape (m,)
        Predicted label of each row of X.
    """
    # softmax is increasing in the scores within each row, so the most
    # probable class is the one with the largest score; skipping the
    # exponential avoids overflow for large scores
    scores = X.dot(theta)
    return np.asarray(labels)[np.argmax(scores,axis=1)]

In [None]:
y_pred = predictor_softmax(X,theta,labels)
y_pred

In [None]:
'Confusion matrix'
def confusion_matrix(y,y_pred,labels):
    """Confusion matrix of a classifier.

    Parameters
    ----------
    y : array-like of shape (m,)
        True class labels.
    y_pred : array-like of shape (m,)
        Predicted class labels.
    labels : sequence of length k
        Class labels, in the order used for the rows and columns.

    Returns
    -------
    C : ndarray of shape (k, k)
        C[i, j] is the number of data points whose true class is labels[i]
        and whose predicted class is labels[j].
    """
    # boolean masks require arrays (plain lists would compare as a whole)
    y = np.asarray(y)
    y_pred = np.asarray(y_pred)
    C = np.zeros((len(labels),len(labels)))
    for i,label_i in enumerate(labels):
        # predictions made for the points whose true class is label_i
        pred_for_class_i = y_pred[y==label_i]
        for j,label_j in enumerate(labels):
            C[i,j] = np.count_nonzero(pred_for_class_i==label_j)
    return C
confusion_matrix(y,y_pred,labels)

In [None]:
'percentage of correct classification'
100*np.sum(y_pred==y)/len(y)

## 3. Example: The Palmer Archipelago Penguin Dataset

<img src="cute_penguins.png" alt="Drawing" style="width: 500px;"/>

Art by @allison_horst

Data were collected and made available by Dr. Kristen Gorman and the Palmer Station, Antarctica LTER, a member of the Long Term Ecological Research Network.

In [None]:
# load the data
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Theory/master/Data/penguins_size.csv'
data = pd.read_csv(url)
data.head(5)

This dataset contains data for 344 penguins. There are 3 different species of penguins in this dataset, collected from 3 islands in the Palmer Archipelago, Antarctica.

In [None]:
data.species.unique()

The culmen is the upper ridge of a bird’s bill. For this penguin data, the culmen (bill) length and depth are measured as shown below.

<img src="culmen_depth.png" alt="Drawing" style="width: 500px;"/>

Some rows contain missing values. We will drop them from the pandas dataframe.

In [None]:
# drop the rows with missing values (rebinding instead of mutating in place)
data = data.dropna(axis=0)

In [None]:
# target vector
y = data['species'].to_numpy()

In [None]:
# feature matrix
X = data[['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g']].to_numpy()

In [None]:
# scale the data
X = X/X.max(axis=0)

In [None]:
theta,cost,labels = softmaxregression_GD(X,y,
                                        learning_rate = 0.001,
                                        n_epochs=5000,
                                        )

In [None]:
plt.plot(cost)

In [None]:
y_pred = predictor_softmax(X,theta,labels)

In [None]:
'Confusion matrix'
confusion_matrix(y,y_pred,labels)

In [None]:
'percentage of correct classification'
100*np.sum(y_pred==y)/len(y)

## 4. Example: Oranges, Lemons and Apples dataset

<img src="fruits.png" alt="Drawing" style="width: 500px;"/>

The dataset *fruits.csv* contains measurements of the height (cm), width (cm) and mass (g) of a selection of oranges, lemons and apples.

In [None]:
# load the data
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Theory/master/Data/fruits.csv'
data = pd.read_csv(url)
data

In [None]:
X = data[['height', 'width', 'mass']].to_numpy()

In [None]:
# scale the data
X = X/X.max(axis=0)

In [None]:
y =  data['fruit'].to_numpy()

In [None]:
y

In [None]:
theta,cost,labels = softmaxregression_GD(X,y,
                               learning_rate = 0.01,
                               n_epochs=5000,)

In [None]:
plt.plot(cost)

In [None]:
y_pred = predictor_softmax(X,theta,labels)

In [None]:
'Confusion matrix'
confusion_matrix(y,y_pred,labels)

In [None]:
'percentage of correct classification'
100*np.sum(y_pred==y)/len(y)

## 5. Example: Spirals dataset

In [None]:
# synthetic dataset: `classes` noisy spiral arms with `n_points` points each
classes = 4
n_points = 500
noise = 0.35
radius = 10

X = np.zeros((n_points*classes, 2))
y = np.zeros(n_points*classes, dtype=int)

# distance from the origin grows linearly along every arm (same for all classes)
r = np.linspace(0, 1, n_points)

for class_number in range(classes):
    ix = slice(n_points*class_number, n_points*(class_number+1))
    # each arm sweeps its own angle interval of length `radius`, plus Gaussian noise
    t = np.linspace(class_number*radius, (class_number+1)*radius, n_points) + np.random.randn(n_points)*noise
    X[ix] = np.c_[r*np.sin(t), r*np.cos(t)]
    y[ix] = class_number

# one plot call per class, so the figure stays correct if `classes` is changed
for class_number in range(classes):
    plt.plot(X[y==class_number,0], X[y==class_number,1], 'o', label='class '+str(class_number))
plt.legend();

In [None]:
def build_poly_features(X,degree):
    """Build the matrix of all monomials of the features up to a given degree.

    Parameters
    ----------
    X : array-like of shape (m, n) or (m,)
        Feature matrix; a 1-D input is treated as a single feature.
    degree : int
        Maximum total degree of the monomials.

    Returns
    -------
    X_poly : ndarray of shape (m, n_poly)
        One column per monomial, ordered by increasing degree. The first
        column (degree 0) is all ones.
    """
    from itertools import combinations_with_replacement as comb_w_r
    from itertools import chain

    X = np.asarray(X)
    if X.ndim == 1:
        # a vector holds a single feature: turn it into a column
        X = X.reshape(-1,1)
    # number of datapoints (rows), number of features (columns)
    m,n = X.shape

    # every monomial of degree d is a multiset of d feature indices;
    # the empty multiset (degree 0) is the constant column
    combinations = list(chain.from_iterable(comb_w_r(range(n),i) for i in range(degree+1)))

    # polynomial features matrix
    X_poly = np.ones((m,len(combinations)))
    for column_index, combination in enumerate(combinations):
        if combination:  # the degree-0 column is already all ones
            X_poly[:,column_index] = np.prod(X[:,list(combination)],axis=1)

    return X_poly

In [None]:
X_poly = build_poly_features(X,degree=10)
X_poly.shape

In [None]:
theta,cost,labels = softmaxregression_GD(X_poly,y,
                                        learning_rate = 0.001,
                                        n_epochs=20000,
                                        )

In [None]:
plt.plot(cost)

In [None]:
def plot_softmax_regions(X, y, theta, labels, degree=1):
    """Plot the classification regions of a softmax model with 2 input features.

    Parameters
    ----------
    X : ndarray of shape (m, 2)
        Original (non-polynomial) features of the data points.
    y : ndarray of shape (m,)
        Class label of each data point.
    theta : ndarray of shape (n_poly, k)
        Parameter matrix; its rows must match the columns of
        build_poly_features(X, degree).
    labels : sequence of length k
        Class labels; labels[j] is the class of column j of theta.
    degree : int, optional
        Degree of the polynomial features applied to the plotting grid.
    """
    from matplotlib.colors import ListedColormap

    # create a 500x500 meshgrid
    m_plot = 500
    x1 = np.linspace(X[:,0].min()-0.5, X[:,0].max()+0.5, m_plot)
    x2 = np.linspace(X[:,1].min()-0.5, X[:,1].max()+0.5, m_plot)
    X1, X2 = np.meshgrid(x1, x2)
    X_plot = np.c_[X1.ravel(), X2.ravel()]

    # add polynomial features
    X_plot_poly = build_poly_features(X_plot,degree=degree)

    # predicted class *index* at each point of the mesh grid: the most probable
    # class is the one with the largest score, and contouring indices (not
    # label values) also works when the labels are strings
    class_index = np.argmax(X_plot_poly.dot(theta),axis=1).reshape(X1.shape)

    # custom color map: class j is drawn with matplotlib's default color 'Cj'
    k = len(labels)
    custom_cmap = ListedColormap(['C'+str(i) for i in range(k)])

    # softmax classification regions; one contour band per class index, so
    # that region j gets the same color as the scatter of class j
    plt.figure(figsize=(12,5))
    plt.contourf(X1, X2, class_index, levels=np.arange(k+1)-0.5, alpha=0.3, cmap=custom_cmap)

    # plot data points
    for label in labels:
        plt.scatter(X[y==label,0],X[y==label,1], label=label)

    plt.legend(fontsize=15)

In [None]:
plot_softmax_regions(X, y, theta, labels, degree=10)