In [1]:
# data managment
import pandas as pd
import numpy as np
import scipy as sp
# `import scipy` alone does not guarantee that these submodules are loaded
import scipy.linalg
import scipy.sparse.linalg
from lib.RegressorManager import *
try:
    from sklearn.externals import joblib
except ImportError:  # sklearn.externals.joblib was removed in scikit-learn 0.23
    import joblib
# preprocessing 
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# metrics
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import GridSearchCV

# plotting
from mpl_toolkits.mplot3d import Axes3D
from lib.PlotLearningCurve import plot_learning_curve
from matplotlib import pyplot as plt
import seaborn as sns
# learning
from sklearn.decomposition import PCA, FastICA
from sklearn.cluster import KMeans

# Apply seaborn's default theme; color_codes=True remaps matplotlib's
# single-letter colour codes ("b", "g", "r", ...) to the seaborn palette.
sns.set(color_codes=True)

# Render figures with the interactive "notebook" matplotlib backend.
%matplotlib notebook

In [2]:
# Use the joblib copy exposed by scikit-learn when it exists, so that the backend
# is registered with the same joblib scikit-learn itself dispatches through;
# scikit-learn >= 0.23 no longer ships it, so fall back to the standalone package.
try:
    from sklearn.externals.joblib import Parallel, parallel_backend, register_parallel_backend
except ImportError:
    from joblib import Parallel, parallel_backend, register_parallel_backend
import ipyparallel as ipp
from ipyparallel import Client
from ipyparallel.joblib import IPythonParallelBackend

# Connect to the running ipyparallel cluster and list the engines that answered.
c = ipp.Client()
print(c.ids)

# A load-balanced view lets the scheduler hand each task to any free engine.
bview = c.load_balanced_view()

# this is taken from the ipyparallel source code
# Register a factory under the name 'ipyparallel'; joblib calls it to build the
# backend whenever `parallel_backend('ipyparallel')` is entered.
register_parallel_backend('ipyparallel', lambda: IPythonParallelBackend(view=bview))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,

In [3]:
# Wine reviews: read the raw CSV into a DataFrame.
winemag_csv_path = "datasets/wine_reviews/winemag-data-130k-v2.csv"
winemag_df = pd.read_csv(winemag_csv_path)

In [4]:
# Fashion-MNIST: read the CSV export into a DataFrame
# (the file name suggests this is the test split).
fashion_csv_path = "datasets/fashion_mnist/fashion-mnist_test.csv"
fashion_df = pd.read_csv(fashion_csv_path)

data preparation
----------------

First we need to clean the dataset. To do so we will:
 - select the relevant columns
 - drop rows containing NA values (we can afford this because the ratio *number of rows / input space dimension* remains good)
 - remove duplicate rows

In [5]:
# Wine reviews: the free-text description is the input, the "points" column the target.
y_winemag = winemag_df['points']
X_winemag = winemag_df['description']

# Fashion data: column 0 is taken as the label, every remaining column as a feature.
fashion_values = fashion_df.values
n_row, n_col = fashion_values.shape
y_fashion = fashion_values[:, 0]
X_fashion = fashion_values[:, 1:]

In [6]:
# Text featurisation for the wine descriptions: raw token counts first,
# then TF-IDF re-weighting of those counts.
wine_steps = [
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=True)),
]
pipeline_wine = Pipeline(steps=wine_steps)

In [7]:
# Fit the text pipeline and transform the descriptions in a single pass;
# calling fit() and then transform() tokenises the whole corpus twice.
# The target is passed only for API consistency.
X_winemag_trans = pipeline_wine.fit_transform(X_winemag, y_winemag)
X_winemag_trans.shape

(129971, 31275)

Fashion MNIST
=======

In [8]:
# Candidate cluster counts explored by the grid search: 1 through 99 inclusive.
param_grid = dict(n_clusters=range(1, 100))

In [9]:
# Building estimators runs no joblib work, so no parallel_backend context is
# needed here; the backend only matters around the call to fit().
# KMeans keeps its default parallelism: the grid search already runs 14 fits
# concurrently, and nesting a 32-worker KMeans inside each of them
# oversubscribes the engines. (The `n_jobs` argument was also removed from
# KMeans in scikit-learn 1.0.)
kmeans = KMeans()
clf = GridSearchCV(kmeans, param_grid, cv=2, verbose=1, n_jobs=14)

In [10]:
# Run the grid search with joblib dispatching its tasks through the
# 'ipyparallel' backend for the duration of the fit. No target is passed:
# only the feature matrix is given to fit().
with parallel_backend('ipyparallel'):
    clf.fit(X_fashion)

Fitting 2 folds for each of 99 candidates, totalling 198 fits


[Parallel(n_jobs=14)]: Done  99 out of 198 | elapsed:  3.4min remaining:  3.4min
exception calling callback for <AsyncResult: <sklearn.externals.joblib.parallel.BatchedCalls object at 0x7fc38184ac50>:finished>
Traceback (most recent call last):
  File "/usr/lib/python3.4/concurrent/futures/_base.py", line 297, in _invoke_callbacks
    callback(self)
  File "/home/tboissin/nosave/env_parallel/lib/python3.4/site-packages/ipyparallel/client/_joblib.py", line 47, in <lambda>
    future.add_done_callback(lambda f: callback(f.result()))
  File "/usr/lib/python3.4/concurrent/futures/_base.py", line 395, in result
    return self.__get_result()
  File "/usr/lib/python3.4/concurrent/futures/_base.py", line 354, in __get_result
    raise self._exception
  File "/home/tboissin/nosave/env_parallel/lib/python3.4/site-packages/ipyparallel/client/asyncresult.py", line 226, in _resolve_result
    raise r
ipyparallel.error.RemoteError: EngineError(Engine b'cad135df-e805c2d5e898085c627bf736' died while 

RemoteError: EngineError(Engine b'cad135df-e805c2d5e898085c627bf736' died while running task 'b731c6be-9f9a4b5ff2cd883196cd51a0')

In [14]:
# Serialise the grid-search object to disk.
# NOTE(review): the recorded fit output ends with a RemoteError; confirm that
# `clf` was successfully fitted before relying on this dump.
gridsearch_dump_path = 'gridsearch_kmeans_fashion.pkl'
joblib.dump(clf, gridsearch_dump_path)

['gridsearch_kmeans_fashion.pkl']

space study:
--------------

In [15]:
# Singular values only (compute_uv=False skips the U / V factors) of the
# Fashion-MNIST feature matrix.
s = sp.linalg.svd(
    X_fashion,
    compute_uv=False,
    full_matrices=False,
)

In [None]:
# Singular-value spectrum of the Fashion-MNIST matrix.
# The values are plotted against a 1-based rank: with the default 0-based index
# the first (largest) value sits at x = 0, which a logarithmic axis cannot draw.
ranks = np.arange(1, len(s) + 1)
fig, ax = plt.subplots()
ax.plot(ranks, s)
ax.set_xscale('log')
ax.set_xlabel('rank (log scale)')
ax.set_ylabel('singular value')
ax.set_title('Fashion-MNIST: singular value spectrum')
plt.show()

In [17]:
# Spot-check a few entries of the spectrum (0-based indices; -1 is the last entry).
for idx in (1, 10, 100, -1):
    print(s[idx])

# Ratio between the entries at index 1 and index 100.
# NOTE(review): the largest singular value is s[0], so this is not the usual
# condition number s[0] / s[-1]; confirm which ratio is intended.
spread = s[1] / s[100]
print("cond:" + str(spread))

93584.23439143323
21543.19687124636
5361.881650998871
2.2073418106227582
cond:17.45361805477361


Which dimension should we use for the reduction?
 - 3 : to plot a 3D graph
 - 9 : as we have 9 categories for classification
 - other : somewhat inspired by the Haussler theorem

3d reduction
--------------

In [18]:
# Three principal components, computed with the exact ("full") SVD solver.
# fit() returns the estimator itself, so construction and fitting are chained.
pca = PCA(n_components=3, svd_solver='full').fit(X_fashion)
pca

PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='full', tol=0.0, whiten=False)

In [None]:
# 3-D scatter of the PCA projection, coloured by label.
# The axes are requested from the figure (projection='3d'): recent Matplotlib
# releases no longer attach a directly constructed Axes3D(fig) to the figure,
# which leaves the plot empty.
low_dim = pca.transform(X_fashion)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(low_dim[:, 0], low_dim[:, 1], low_dim[:, 2], c=y_fashion, s=0.1)
ax.set_xlabel('component 1')
ax.set_ylabel('component 2')
ax.set_zlabel('component 3')
ax.set_title('Fashion-MNIST: PCA projection (3 components)')
plt.show()

In [20]:
# Three independent components, estimated with the 'parallel' FastICA variant.
# fit() returns the estimator itself, so construction and fitting are chained.
ica = FastICA(n_components=3, algorithm='parallel').fit(X_fashion)
ica

FastICA(algorithm='parallel', fun='logcosh', fun_args=None, max_iter=200,
    n_components=3, random_state=None, tol=0.0001, w_init=None,
    whiten=True)

In [None]:
# 3-D scatter of the ICA projection, coloured by label.
# The axes are requested from the figure (projection='3d'): recent Matplotlib
# releases no longer attach a directly constructed Axes3D(fig) to the figure,
# which leaves the plot empty.
low_dim = ica.transform(X_fashion)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(low_dim[:, 0], low_dim[:, 1], low_dim[:, 2], c=y_fashion, s=0.05)
ax.set_xlabel('component 1')
ax.set_ylabel('component 2')
ax.set_zlabel('component 3')
ax.set_title('Fashion-MNIST: ICA projection (3 components)')
plt.show()

Wine reviews
=======

In [None]:
# 1000 largest singular values of the sparse TF-IDF matrix.
# Only the values are used afterwards, so the singular vectors are not requested.
# svds does not guarantee the order of the values it returns; they are sorted
# ascending here so that later cells can rely on a fixed order.
s2 = np.sort(
    sp.sparse.linalg.svds(X_winemag_trans, k=1000, return_singular_vectors=False)
)

In [None]:
# Share of the squared Frobenius norm ("energy") of the TF-IDF matrix captured by
# the computed singular values: sum(s_i^2) / ||X||_F^2.
# The matrix is not square, so its diagonal carries no spectral meaning and
# cannot serve as the denominator. NumPy is used directly because `sp.sum` is a
# deprecated alias.
total_energy = X_winemag_trans.multiply(X_winemag_trans).sum()
print(np.sum(np.square(s2)) / total_energy)

# Spectrum in decreasing order against a 1-based rank, so that the leading value
# is not lost on the logarithmic x axis (x = 0 cannot be drawn on a log scale).
s2_desc = np.sort(s2)[::-1]
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(s2_desc) + 1), s2_desc)
ax.set_xscale('log')
ax.set_xlabel('rank (log scale)')
ax.set_ylabel('singular value')
ax.set_title('Wine reviews (TF-IDF): leading singular values')
plt.show()