In [2]:
import cv
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

In [4]:
dataset = pd.read_csv('../data/iris.csv')
n = dataset.shape[0]
x, y = dataset[dataset.columns[:4]], dataset.species

In [5]:
dataset.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [6]:
y.value_counts()

versicolor    50
virginica     50
setosa        50
Name: species, dtype: int64

In [7]:
y.replace({'virginica': 0, 'setosa': 1, 'versicolor': 2}, inplace=True)

In [8]:
clf = LogisticRegression(multi_class='multinomial', solver='newton-cg')

---
## Cross validation

---
### Leave-one-out cross validation (LOOCV)

In [9]:
predictions, errors, cv_error = cv.loocv(x, y, clf)

In [10]:
cv_error

0.6666666666666666

#### Sanity check 1
This model seems to have a very high test error. As a very rough sanity check, I tried to obtain the same figure using a slightly different workflow based on the one at https://chrisalbon.com/machine_learning/naive_bayes/multinomial_logistic_regression/, and got the same.

In [21]:
from sklearn import datasets

In [22]:
iris = datasets.load_iris()
x = iris.data
y = iris.target
scaler = StandardScaler()
x_std = scaler.fit_transform(x)

In [23]:
clf = LogisticRegression(random_state=0, multi_class='multinomial', solver='newton-cg')

In [24]:
model = clf.fit(x_std, y)

In [25]:
predictions = np.full(n, -1)
errors = np.zeros(n)

In [26]:
for i in range(n):
    clf = LogisticRegression(random_state=i, multi_class='multinomial', solver='newton-cg')
    model = clf.fit(scaler.fit_transform(np.append(x[:(i-1)], x[i:], axis=0)), np.append(y[:(i-1)], y[i:], axis=0))
    predictions[i] = model.predict(np.reshape(x[i], (1, -1)))[0]
    errors[i] = int(predictions[i] != y[i])

In [27]:
print('CV(n) = %f' % errors.mean())

CV(n) = 0.666667


#### Sanity check 2
Turns out I could simply use the score() method from LogisticRegression, which confirms the initial CV(n) obtained.

In [28]:
model.score(x, y)

0.3333333333333333

---
### k-fold cross validation, using k=10

In [10]:
# Reload dataset
dataset = pd.read_csv('data/iris.csv')

In [11]:
dataset_shuffled = shuffle(dataset).reset_index()

In [12]:
n = dataset_shuffled.shape[0]
x, y = dataset_shuffled[dataset_shuffled.columns[:4]], dataset_shuffled.species
y.replace({'virginica': 0, 'setosa': 1, 'versicolor': 2}, inplace=True)

In [13]:
predictions, errors, cv_error = cv.k_fold_cv(x, y, 10,
                                             LogisticRegression(multi_class='multinomial',solver='newton-cg'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data['dependent'] = y


(array([0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0]),
 array([1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 0., 0., 1.,
        0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1.,
        0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 0.,
        1., 1., 1., 1., 1., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
        1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0.,
        1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1., 1.,
        1., 0., 0., 1., 0., 0., 0., 

---
## Bootstrap

In [175]:
def resample(data, n=None):
    n = n or data.shape[0]
    return data.loc[np.floor(np.random.rand(n) * n).astype(int)]

In [197]:
iters = 500
uniq_ys = y.unique().size
bootstrap_coefs = np.zeros((iters, uniq_ys, X.shape[1]))
bootstrap_intercepts = np.zeros((iters, 1, uniq_ys))
for i in range(iters):
    sample_i = resample(dataset)
    X, y = sample_i[dataset.columns[:4]], sample_i.species
    model = clf.fit(StandardScaler().fit_transform(X), y)
    bootstrap_coefs[i] = model.coef_
    bootstrap_intercepts[i] = model.intercept_

In [205]:
print('mean bootstrap coefficient estimates for %s: ' % ', '.join(y.unique()))
np.mean(bootstrap_coefs, axis=0)

mean bootstrap coefficient estimates for versicolor, setosa, virginica: 


array([[-1.0699087 ,  1.17078339, -1.90383244, -1.8030282 ],
       [ 0.56200119, -0.38256723, -0.35957855, -0.82623983],
       [ 0.50790752, -0.78821615,  2.26341099,  2.62926804]])

In [206]:
print('bootstrap coefficient standard errors for %s: ' % ', '.join(y.unique()))
np.std(bootstrap_coefs, axis=0)

bootstrap coefficient standard errors for versicolor, setosa, virginica: 


array([[0.07365553, 0.13163402, 0.07823331, 0.07110691],
       [0.15270834, 0.14394443, 0.18201906, 0.15319158],
       [0.17111136, 0.18121534, 0.18220624, 0.14134537]])

In [207]:
print('mean bootstrap intercept estimates for %s: ' % ', '.join(y.unique()))
np.mean(bootstrap_intercepts, axis=0)

mean bootstrap intercept estimates for versicolor, setosa, virginica: 


array([[-0.19836965,  2.05656474, -1.85819509]])

In [208]:
print('bootstrap intercept standard errors for %s: ' % ', '.join(y.unique()))
np.std(bootstrap_intercepts, axis=0)

bootstrap intercept standard errors for versicolor, setosa, virginica: 


array([[0.41207987, 0.16403797, 0.43781119]])