In [4]:
%cd ~/google_drive/code/projects/sparse/python
from sparse.core.sparse_dataframe import SparseDataFrame
from sparse.utilities.utils import *

%cd ~/google_drive/code/projects/texture_classifier/python
# %cd /home/ubuntu/texture_classifier/python
import PIL
from sklearn.cluster import KMeans
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist, squareform
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
import cv2
from pandas.io.pytables import HDFStore
# from plotly import plotly
# from plotly.graph_objs import *

import core.utils
reload(core.utils)
from core.utils import *

import core.image_scanner
reload(core.image_scanner)
from core.image_scanner import ImageScanner

def get_report(y_true, y_pred):
    """Parse the plain-text classification report into a DataFrame.

    Args:
        y_true: Ground-truth labels, forwarded to ``classification_report``.
        y_pred: Predicted labels, forwarded to ``classification_report``.

    Returns:
        DataFrame: One row per report line, indexed by the first token of the
        line (the 'avg / total' row is renamed 'total'), with one column per
        header token. Cells hold the text printed in the report; nothing is
        converted to a number. Labels that contain spaces are not supported
        because each line is split on runs of spaces.
    """
    text = classification_report(y_true, y_pred)
    # Collapse the multi-word summary label so it survives the split below.
    text = re.sub('avg / total', 'total', text)

    # Build real lists: the rows are indexed and sliced below, which lazy
    # map/filter objects do not support.
    rows = [
        [token for token in re.split(' +', line) if token != '']
        for line in text.split('\n')
    ]
    rows = [row for row in rows if row != []]

    # First row is the header; every other row starts with its label.
    report = DataFrame(rows[1:])
    report.set_index(0, inplace=True)
    report.columns = rows[0]
    return report

def _random_scan(item, params):
    """Collect everything yielded by a random scan of ``item``.

    Args:
        item: Object handed to ``ImageScanner`` as its first argument.
        params (dict): Keyword arguments for ``ImageScanner``; its 'patches'
            entry is additionally passed to ``random_scan``.

    Returns:
        list: The items produced by ``random_scan``, in iteration order.
    """
    scanner = ImageScanner(item, **params)
    return list(scanner.random_scan(params['patches']))

def image_split(source, split='train', test_size=0.2):
    """Open an image and return its train part, its test part, or all of it.

    The bounding box reported by ``getbbox()`` is cut horizontally: the top
    ``test_size`` fraction of its rows forms the test region and the rows
    below it form the train region.

    Args:
        source: Path or file object accepted by ``PIL.Image.open``.
        split (str, optional): 'train' or 'test'. Any other value (for
            example None) returns the uncropped image. Default: 'train'.
        test_size (float, optional): Fraction of the bounding-box height
            assigned to the test region. Default: 0.2.

    Returns:
        The cropped image for 'train' or 'test', otherwise the opened image.
    """
    # Import the submodule explicitly: a bare ``import PIL`` does not
    # guarantee that ``PIL.Image`` has been loaded.
    from PIL import Image

    image = Image.open(source)
    if split not in ('train', 'test'):
        return image

    # getbbox() returns None when the image has no non-zero pixels; fall back
    # to the full frame instead of failing on list(None).
    bbox = image.getbbox()
    if bbox is None:
        bbox = (0, 0) + image.size
    left, upper, right, lower = bbox

    # Measure the test band from the top of the box rather than from row 0,
    # so a box that starts below the top edge still gets the right fraction.
    boundary = upper + int((lower - upper) * test_size)

    # A crop box excludes its lower edge, so both regions can share
    # `boundary` without overlapping and without losing the row between them.
    if split == 'test':
        return image.crop((left, upper, right, boundary))
    return image.crop((left, boundary, right, lower))

def _pil_to_opencv(item):
    """Convert ``item`` to a numpy array via an RGB -> BGR -> RGB round trip.

    The channel order is swapped and then swapped back, so the returned array
    is in RGB order despite the function name.

    Args:
        item: Object accepted by ``np.array`` (callers pass image patches).

    Returns:
        The array produced by the second ``cv2.cvtColor`` call.
    """
    pixels = np.array(item)
    swapped = cv2.cvtColor(pixels, cv2.COLOR_RGB2BGR)
    return cv2.cvtColor(swapped, cv2.COLOR_BGR2RGB)

def _get_data(info, train, test_size):
    """Build a shuffled table of per-channel histograms from image patches.

    Args:
        info (DataFrame): One row per image. The 'source', 'common_name' and
            'params' columns are read; 'params' holds a dict of keyword
            arguments used for both patch scanning and histogram extraction.
        train: Split selector forwarded to ``image_split`` ('train', 'test',
            or any other value such as None for the whole image).
        test_size (float): Fraction forwarded to ``image_split``.

    Returns:
        DataFrame: Rows in random order with a fresh 0..n-1 index. The
        'source', 'params', 'rgb', 'hsv' and 'gray' columns are removed;
        what remains is 'y' plus whatever ``SparseDataFrame.flatten``
        produces from the r/g/b/h/s/v histogram lists (presumably one
        column per bin - confirm against SparseDataFrame).
    """
    data = info.copy()

    # Crop each image to the requested split, then sample patches from it.
    data.source = data.source.apply(lambda x: image_split(x, train, test_size))
    temp = data[['source', 'params']].apply(lambda x: (x['source'], x['params']), axis=1)
    data.source = temp.apply(lambda x: _random_scan(x[0], x[1]))

    def func(item):
        # Expand one merged (source, common_name, params) item into a frame
        # with one row per element of item[0].
        output = DataFrame()
        output['source'] = item[0]
        output['y'] = item[1]
        # Assign None first and map afterwards so that each cell ends up
        # holding the params object itself.
        output['params'] = None
        output.params = output.params.apply(lambda x: item[2])
        return output

    sdf = SparseDataFrame(data)
    data = sdf.merge_columns(['source', 'common_name', 'params'], func=func, new_column='new')
    data = pd.concat(data.new.tolist(), ignore_index=True)

    # Pixel arrays per patch in the colour spaces used below.
    data['rgb'] = data.source.apply(_pil_to_opencv)
    data['hsv'] = data.rgb.apply(lambda x: cv2.cvtColor(x, cv2.COLOR_RGB2HSV))
    data['gray'] = data.rgb.apply(lambda x: cv2.cvtColor(x, cv2.COLOR_RGB2GRAY))

    # One histogram column per channel: r, g, b from 'rgb', then h, s, v
    # from 'hsv'. Each lambda runs immediately inside apply, so reading the
    # loop variables from the closure is safe.
    for colorspace in ('rgb', 'hsv'):
        temp = data[[colorspace, 'params']].apply(
            lambda x: (x[colorspace], x['params']), axis=1)
        for channel in colorspace:
            histograms = temp.apply(
                lambda x: get_channel_histogram(x[0], channel, **x[1]))
            data[channel] = histograms.apply(lambda x: x.tolist())

    sdf = SparseDataFrame(data)
    data = sdf.flatten(dtype=list)

    data = data.drop(['source', 'params', 'rgb', 'hsv', 'gray'], axis=1)

    # Shuffle the rows. The shuffled values are index labels, so select with
    # the label-based .loc (the deprecated .ix guessed between labels and
    # positions).
    index = data.index.tolist()
    np.random.shuffle(index)
    return data.loc[index].reset_index(drop=True)

def get_data(info, test_size=0.2):
    """Build the train and test feature tables for ``info``.

    Args:
        info: Image table forwarded unchanged to ``_get_data``.
        test_size (float, optional): Fraction forwarded to ``_get_data``.
            Default: 0.2.

    Returns:
        tuple: ``(train, test)``, built in that order with the 'train' and
        'test' split selectors respectively.
    """
    train_data = _get_data(info, 'train', test_size)
    test_data = _get_data(info, 'test', test_size)
    return train_data, test_data

/Users/alexbraun/google_drive/code/projects/sparse/python
/Users/alexbraun/google_drive/code/projects/texture_classifier/python


In [5]:
# wood_mask = [
#     'african-mahogany',
#     'african-walnut',
#     'afrormosia',
#     'afzelia',
#     'apple',
#     'bamboo',
#     'birdseye-maple',
#     'black-cherry',
#     'black-poplar',
#     'bloodwood',
#     'bog-oak',
#     'bubinga',
#     'cedar-of-lebanon',
#     'ceylon-satinwood',
#     'european-ash',
#     'european-beech',
#     'european-hornbeam',
#     'european-larch',
#     'european-lime',
#     'goncalo-alves',
#     'hemlock',
#     'honey-locust',
#     'iroko',
#     'kingwood',
#     'lati',
#     'louro-preto',
#     'macassar-ebony',
#     'makore',
#     'mansonia',
#     'merbau',
#     'moabi',
#     'okoume',
#     'olive-ash',
#     'olive',
#     'pear',
#     'peruvian-walnut',
#     'red-alder',
#     'sapele',
#     'sweet-chestnut',
#     'teak',
#     'tineo',
#     'wenge',
#     'western-red-cedar'
# ]

# Species used to filter both datasets. Every entry, including the disabled
# ones, ends with a comma so that toggling a line can never fuse two adjacent
# string literals into a single bogus name.
wood_mask = [
    'moabi',
    'sapele',
#     'olive-ash',
    'european-ash',
    'kingwood',
    'european-lime',
    'african-mahogany',
    'olive',
    # -----------------
#     'macassar-ebony',
#     'peruvian-walnut',
#     'bog-oak',
#     'goncalo-alves',
#     'merbau',
]

# Zero-padded tag embedded in the HDF file names. The number is kept under
# its own name so `version` is only ever a string.
version_number = 18
version = str(version_number).zfill(3)

In [106]:
# WOOD-DATABASE
source = '/Users/alexbraun/Documents/data/texture_classifier/data/wood-database/texture'
spec = ['texture', 'image_id', 'common_name', 'sanded', 'sealed', 'saw_type', 'curly', 'full', 'source_extension']
info = get_info(source, spec)

# mask = info.common_name.apply(lambda x: x in a)
mask = info.common_name.apply(lambda x: x in wood_mask)
info = info[mask]

min_res = 10
max_res = 100
params = {
            'min_resolution':   (min_res, min_res),
            'max_resolution':   (max_res, max_res),
            'patches':                         100,
            'patch_resolution': (min_res, min_res),
            'normalize': True,
            'bins': 256
#             'rotation':         'random'
}
info['params'] = None
info.params = info.params.apply(lambda x: params)

# %time test = _get_data(info, None, 0.2)
%time train, test = get_data(info, test_size=0.2)

hdf = HDFStore(
    '/Users/alexbraun/Documents/data/texture_classifier/data/hdf/wood-database.data.' + version + '.hdf')
hdf['train'] = train
hdf['test'] = test
hdf['info'] = info
hdf.close()

CPU times: user 3.42 s, sys: 196 ms, total: 3.62 s
Wall time: 3.62 s


In [107]:
# ARROWAY
source = '/Users/alexbraun/Documents/data/texture_classifier/data/arroway'
spec = ['texture', 'image_id', 'image_class', 'common_name', 'pass_', 'source_extension']

info = get_info(source, spec)
info = info[(info.pass_ == 'diffuse')] # & (info.image_class == 'a')]
# mask = info.common_name.apply(lambda x: x in wdb_info.common_name.tolist())

mask = info.common_name.apply(lambda x: x in wood_mask)
info = info[mask]
info.reset_index(drop=True, inplace=True)

params = {
            'min_resolution':   (min_res, min_res),
            'max_resolution':   (max_res, max_res),
            'patches':                         100,
            'patch_resolution': (min_res, min_res),
            'normalize': True,
            'bins': 256
#             'rotation':         'random'
}
info['params'] = None
info.params = info.params.apply(lambda x: params)

# data = get_data(info)
%time test = _get_data(info, None, 0.2)
# %time train, test = get_data(info, test_size=0.2)

hdf = HDFStore(
    '/Users/alexbraun/Documents/data/texture_classifier/data/hdf/arroway.data.' + version + '.hdf')
# hdf['train'] = train
hdf['test'] = test
hdf['info'] = info
hdf.close()

CPU times: user 46.4 s, sys: 1.76 s, total: 48.1 s
Wall time: 48.2 s


In [108]:
def drop_features(data, features):
    """Return ``data`` without the columns whose names match ``features``.

    Args:
        data (DataFrame): Table whose column names are strings.
        features (str): Regular expression tested against every column name
            with ``re.search``.

    Returns:
        DataFrame: A new frame without the matching columns. ``data`` itself
        is not modified, so callers must use the return value.
    """
    matching = [col for col in data.columns.tolist() if re.search(features, col)]
    return data.drop(matching, axis=1)

# ARROWAY-DATABASE
aw_hdf = HDFStore(
    '/Users/alexbraun/Documents/data/texture_classifier/data/hdf/arroway.data.' + version + '.hdf')
# aw_train = aw_hdf['train']
# aw_train_x = aw_train.drop('y', axis=1)
# aw_train_y = aw_train.y


aw_test = aw_hdf['test']
aw_test_x = aw_test.drop('y', axis=1)
aw_test_y = aw_test.y

aw_info = aw_hdf['info']

# WOOD-DATABASE
wd_hdf = HDFStore(
    '/Users/alexbraun/Documents/data/texture_classifier/data/hdf/wood-database.data.' + version + '.hdf')
wd_train = wd_hdf['train']
wd_train_x = wd_train.drop('y', axis=1)
wd_train_y = wd_train.y

wd_test = wd_hdf['test']
wd_test_x = wd_test.drop('y', axis=1)
wd_test_y = wd_test.y

wd_info = wd_hdf['info']

# features = 'h|s|v'
# map(lambda x: drop_features(x, features), [aw_test_x, wd_test_x, wd_train_x])
    
# clf = RandomForestClassifier()
# params = {
#     'n_estimators':        range(3, 20), #[10, 43, 100],
# #     'criterion':           ['gini'],
# #     'max_depth':           [None],
# #     'min_samples_split':   [2],
# #     'min_samples_leaf':    [1],
#     'max_features':        ['auto', 100, 500, 1000],
# #     'max_leaf_nodes':      [None],
# #     'bootstrap':           [True],
# #     'oob_score':           [False],
#     'n_jobs':              [-1]
# #     'random_state':        [42]
# #     'verbose':             [0],
# #     'min_density':         [None],
# #     'compute_importances': [None]
# }

clf = SVC()
params = {
    'C':            [5], #np.arange(0.1, 1, 0.1),
    'kernel':       ['linear'],#, 'rbf'],
#     'degree':       [3],
#     'gamma':        [0.0],
#     'coef0':        [0.0],
#     'shrinking':    [True],
#     'probability':  [False],
#     'tol':          [0.001],
#     'cache_size':   [200],
#     'class_weight': [None],
#     'verbose':      [False],
#     'max_iter':     [1],
#     'random_state': [None]
}

grid = GridSearchCV(clf, params, cv=5)
%time grid.fit(wd_train_x, wd_train_y)
# print(grid.best_score_)
pred = grid.best_estimator_.predict(wd_test_x)
wd_report = get_report(wd_test_y, pred)
pred = grid.best_estimator_.predict(aw_test_x)
print(grid.best_params_)
print(grid.best_estimator_.score(aw_test_x, aw_test_y))
report = get_report(aw_test_y, pred)
report

CPU times: user 8.38 s, sys: 43.7 ms, total: 8.43 s
Wall time: 8.43 s
{'kernel': 'linear', 'C': 5}
0.579166666667


Unnamed: 0_level_0,precision,recall,f1-score,support
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
african-mahogany,0.42,0.87,0.56,300
european-ash,0.66,0.81,0.73,300
european-lime,0.67,0.59,0.62,300
kingwood,0.91,0.8,0.85,300
moabi,0.82,0.97,0.89,300
olive,0.75,0.32,0.44,300
peruvian-walnut,0.18,0.2,0.19,300
sapele,0.43,0.07,0.12,300
total,0.6,0.58,0.55,2400


In [109]:
# Majority vote: for every true species, the label predicted most often.
d = DataFrame([aw_test_y, pred]).T
d.columns = ['ytrue', 'yhat']

# Encode the predicted labels as integers for the mode computation. The
# lookup table is built from the labels that actually occur, so a prediction
# that is missing from wood_mask cannot raise a KeyError.
labels = sorted(set(d.yhat))
lut = {k: i for i, k in enumerate(labels)}
ilut = {v: k for k, v in lut.items()}
d.yhat = d.yhat.apply(lambda x: lut[x])

# mode() may return several values on a tie; keep the first one.
d = d.groupby('ytrue').agg(lambda x: x.mode().iloc[0]).yhat.apply(lambda x: ilut[x])
d = DataFrame(d)

# Fraction of species whose most frequent prediction is the species itself.
print(d[d.index == d.yhat].shape[0] / float(d.shape[0]))
d

0.625


Unnamed: 0_level_0,yhat
ytrue,Unnamed: 1_level_1
african-mahogany,african-mahogany
european-ash,european-ash
european-lime,european-lime
kingwood,kingwood
moabi,moabi
olive,african-mahogany
peruvian-walnut,african-mahogany
sapele,peruvian-walnut


In [61]:
# Per-class rows, highest precision first. sort_values replaces the
# deprecated DataFrame.sort. get_report stores the metrics as text, so the
# ordering is lexicographic.
report.drop('total', axis=0).sort_values('precision', ascending=False)

  if __name__ == '__main__':


Unnamed: 0_level_0,precision,recall,f1-score,support
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
european-ash,1.0,0.47,0.64,300
kingwood,1.0,0.52,0.68,300
peruvian-walnut,1.0,0.06,0.12,300
african-mahogany,0.9,0.94,0.92,300
moabi,0.74,1.0,0.85,300
sapele,0.62,1.0,0.76,300
european-lime,0.53,0.9,0.67,300
olive,0.49,0.62,0.55,300


In [64]:
# Total feature importance per colour channel.
# Requires an estimator that exposes feature_importances_ (for example the
# RandomForestClassifier imported at the top); an SVC does not provide it.
def func(item):
    """Return the first of 'r', 'g', 'b', 'h', 's', 'v' found in ``item``, else None."""
    for char in 'rgbhsv':
        if char in item:
            return char
    return None

# The model was fitted on wd_train_x, so its columns label the importances.
importances = Series(grid.best_estimator_.feature_importances_, index=wd_train_x.columns)
# Grouping by a function applies it to each index label.
x = importances.groupby(func).sum().sort_values(ascending=False)
x

In [None]:
PRECISION BEST TO WORST
olive-ash
moabi
european-lime
kingwood
european-ash
african-mahogany
olive
goncalo-alves
macassar-ebony
peruvian-walnut
merbau


RECALL BEST TO WORST
kingwood
goncalo-alves
african-mahogany
moabi
macassar-ebony
european-lime
olive-ash
peruvian-walnut
european-ash
olive
merbau