In [25]:
import numpy as np

from sklearn import linear_model
from tools.data_loading import load_images, load_labels, dummy_code
from tools.feature_learning import pins_generation
from tools.kernels import kernel_matrix
from tools.optimization import find_f
from tools.prediction import pred
from tools.process_images import process_images
from tools.quantization import kmeans
from tools.submission import labels_to_csv
from tools.visualization import imshow, dump_as_png

# Data loading

In [2]:
X_train = load_images(type="train")
n_train = X_train.shape[0]
Y_labels_train = load_labels()
Y_train = dummy_code(Y_labels_train)
n_classes = Y_train.shape[1]

# Data separation

In [3]:
indices = np.random.permutation(X_train.shape[0])
training_idx, test_idx = indices[:int(0.9*n_train)], indices[int(0.9*n_train):]

# Visual features

In [4]:
pins_dict_train = pins_generation(training_idx=training_idx)
pins_train = pins_dict_train["pins"]
train_pins = pins_dict_train["train_pins"]
pin_to_im_train = pins_dict_train["pin_to_im"]
pins_mat = np.vstack(train_pins)

In [5]:
indices_pins = np.random.permutation(pins_mat.shape[0])
training_idx_pins, test_idx_pins = indices_pins[:int(0.10*pins_mat.shape[0])], indices_pins[int(0.10*pins_mat.shape[0]):]
pins_mat_sample = pins_mat[training_idx_pins,:]
print(pins_mat_sample.shape)
visual_features = kmeans(pins_mat_sample, 70)

(9000, 16)
Wrongly clusterized pins: 8865
Wrongly clusterized pins: 3254
Wrongly clusterized pins: 1567
Wrongly clusterized pins: 1013
Wrongly clusterized pins: 747
Wrongly clusterized pins: 591
Wrongly clusterized pins: 481
Wrongly clusterized pins: 437
Wrongly clusterized pins: 352
Wrongly clusterized pins: 290
Wrongly clusterized pins: 282
Wrongly clusterized pins: 272
Wrongly clusterized pins: 213
Wrongly clusterized pins: 169
Wrongly clusterized pins: 166
Wrongly clusterized pins: 153
Wrongly clusterized pins: 148
Wrongly clusterized pins: 146
Wrongly clusterized pins: 124
Wrongly clusterized pins: 119
Wrongly clusterized pins: 104
Wrongly clusterized pins: 95
Wrongly clusterized pins: 82
Wrongly clusterized pins: 72
Wrongly clusterized pins: 74
Wrongly clusterized pins: 70
Wrongly clusterized pins: 58
Wrongly clusterized pins: 59
Wrongly clusterized pins: 47
Wrongly clusterized pins: 33
Wrongly clusterized pins: 36
Wrongly clusterized pins: 40
Wrongly clusterized pins: 37
Wrongly

In [6]:
visual_features.shape

(70, 16)

# Data processing

In [7]:
X_train = process_images(n_train, visual_features, pins_train, pin_to_im_train)
n_train, n_var = X_train.shape

# Data separation (bis)

In [8]:
X_sample = X_train[training_idx, :]
n_sample = X_sample.shape[0]
X_test = X_train[test_idx, :]
n_test = X_test.shape[0]
Y_sample = Y_train[training_idx, :]


# Training

## Kernel choice

In [9]:
kernel_type = "hellinger"
K_sample = kernel_matrix(X_sample, kernel_type=kernel_type)

## Classifier choice

In [39]:
classifier_type = "logistic regression"
alpha = np.zeros((n_classes, n_sample))
for dig in range(n_classes):
    alpha[dig, :] = find_f(K_sample, Y_sample[:, dig],
                           prob_type=classifier_type, lamb=70, n_iter=100)

### SVM test

In [23]:
from sklearn.svm import SVC
SVC_dict = {}
for dig in range(n_classes):
    clf = SVC( kernel='linear')
    clf.fit(X_sample, Y_sample[:,dig]) 
    SVC_dict[dig] = clf

In [24]:
Y_pred = np.zeros((X_test.shape[0], n_classes))
for dig in range(n_classes):
    Y_pred[:, dig] = SVC_dict[dig].predict(X_test)
Y_labels_pred = np.argmax(Y_pred, axis=1)
prec = np.mean(Y_labels_pred == Y_labels_train[test_idx])
print("The precision on the test set is of {}".format(prec))

The precision on the test set is of 0.1


In [22]:
Y_pred.sum()

6.0

# Evaluation

In [40]:
Y_pred_test = np.zeros((X_sample[:500].shape[0], n_classes))
for dig in range(n_classes):
    Y_pred_test[:, dig] = pred(X_sample, X_sample[:500], alpha[dig, :],
                          kernel_type=kernel_type)


Y_labels_pred_test = np.argmax(Y_pred_test, axis=1)
prec = np.mean(Y_labels_pred_test == Y_labels_train[training_idx][:500])
print("The precision on the train set is of {}".format(prec))

The precision on the train set is of 0.246


In [41]:
Y_pred = np.zeros((X_test.shape[0], n_classes))
for dig in range(n_classes):
    Y_pred[:, dig] = pred(X_sample, X_test, alpha[dig, :],
                          kernel_type=kernel_type)


Y_labels_pred = np.argmax(Y_pred, axis=1)
prec = np.mean(Y_labels_pred == Y_labels_train[test_idx])
print("The precision on the test set is of {}".format(prec))

The precision on the test set is of 0.182


# Prediction

In [38]:
X_eval = load_images(type="test")
n_eval = X_eval.shape[0]

# Visual features for submission
pins_dict_eval = pins_generation(data_type="test")
pins_eval = pins_dict_eval["pins"]
pin_to_im_eval = pins_dict_eval["pin_to_im"]


# Data processing
X_eval = process_images(n_eval, visual_features, pins_eval, pin_to_im_eval)
n_eval, n_var = X_eval.shape

Y_eval = np.zeros((n_eval, n_classes))
for dig in range(n_classes):
    Y_eval[:, dig] = pred(X_sample, X_eval, alpha[dig, :],
                          kernel_type=kernel_type)


Y_labels_eval = np.argmax(Y_eval, axis=1)


# Submission
labels_to_csv(Y_labels_eval, kernel=kernel_type, algo=classifier_type)