In [1]:
%cd ~/google_drive/code/projects/sparse/python
from sparse.core.sparse_dataframe import SparseDataFrame
from sparse.utilities.utils import *

%cd ~/google_drive/code/projects/texture_classifier/python
# %cd /home/ubuntu/texture_classifier/python
import multiprocessing
import PIL
from sklearn.cluster import KMeans
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist, squareform
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
import cv2
from pandas.io.pytables import HDFStore
# from plotly import plotly
# from plotly.graph_objs import *

import core.utils
reload(core.utils)
from core.utils import *

import core.image_scanner
reload(core.image_scanner)
from core.image_scanner import ImageScanner

/Users/alexbraun/google_drive/code/projects/sparse/python
/Users/alexbraun/google_drive/code/projects/texture_classifier/python


In [2]:
def get_report(y_true, y_pred):
    x = classification_report(y_true, y_pred)
    x = re.sub('avg / total', 'total', x)
    x = map(lambda x: re.split(' +', x), x.split('\n'))
    x = map(lambda x: filter(lambda x: x != '', x), x)
    x = filter(lambda x: x != [], x)
    report = DataFrame(x[1:])
    report.set_index(0, inplace=True)
    report.columns = x[0]
    return report

def info_split(info, test_size=0.2):
    def _info_split(info, test_size=0.2):
        train_x, test_x, train_y, test_y = train_test_split(info, info.common_name, test_size=test_size)
        return DataFrame(train_x, columns=info.columns), DataFrame(test_x, columns=info.columns)
    
    train = []
    test = []
    for name in info.common_name.unique():
        x, y = _info_split(info[info.common_name == name], test_size=test_size)
        train.append(x)
        test.append(y)
    return pd.concat(train, axis=0), pd.concat(test, axis=0)

def pil_to_opencv(image):
    return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

def generate_samples(image, y, params):
    scan = ImageScanner(image, **params)
    func = getattr(scan, params['scan_method'])
    return [[x, y, params] for x in func(**params)]

def get_data(info, features=['r', 'g', 'b', 'h', 's', 'v', 'fft_var', 'fft_max']):
    # create data from info
    data = info.copy()
    data = data[['source', 'common_name', 'params']]
    data.source = data.source.apply(lambda x: PIL.Image.open(x))
    data = data.apply(lambda x: 
        generate_samples(x['source'], x['common_name'], x['params']),
        axis=1
    )
    # create new expanded dataframe
    data = list(chain(*data.tolist()))
    data = DataFrame(data, columns=['x', 'y', 'params'])
    data['bgr'] = data.x.apply(pil_to_opencv)
    
    del data['x']
    
    # create feature lists
    rgb = filter(lambda x: x in list('rgb'), features)
    hsv = filter(lambda x: x in list('hsv'), features)
    fft = filter(lambda x: x in ['fft_var', 'fft_max'], features)
    
    # rgb distributions
    if rgb:
        temp = data[['bgr', 'params']].apply(lambda x: (x['bgr'], x['params']), axis=1)
        for chan in rgb:
            c = temp.apply(lambda x: get_channel_histogram(x[0], chan, **x[1]))
            data[chan] = c.apply(lambda x: x.tolist())

    # hsv distributions
    if hsv:
        data['hsv'] = data.bgr.apply(lambda x: cv2.cvtColor(x, cv2.COLOR_BGR2HSV))
        temp = data[['hsv', 'params']].apply(lambda x: (x['hsv'], x['params']), axis=1)
        for chan in hsv:
            c = temp.apply(lambda x: get_channel_histogram(x[0], chan, **x[1]))
            data[chan] = c.apply(lambda x: x.tolist())
    
        del data['hsv']
    
    # grain frequency
    if fft:
        data['gray'] = data.bgr.apply(lambda x: cv2.cvtColor(x, cv2.COLOR_BGR2GRAY))
        data.gray = data.gray.apply(lambda x: np.fft.hfft(x).astype(float))
        data.gray = data.gray.apply(lambda x: np.histogram(x.ravel(), bins=256)[0])
        data.gray = data.gray.apply(lambda x: StandardScaler().fit_transform(x))
        if 'fft_var' in fft:
            data['fft_var'] = data.gray.apply(lambda x: x.var())
        if 'fft_max' in fft:
            data['fft_max'] = data.gray.apply(lambda x: x.max())

        del data['gray']
    
    del data['bgr']
    del data['params']
    
    # expand columns that contain lists
    if rgb or hsv:
        sdf = SparseDataFrame(data)
        data = sdf.flatten(dtype=list)

    # shuffle data to destroy serial correlations
    index = data.index.tolist()
    np.random.shuffle(index)
    data = data.ix[index]
    data.reset_index(drop=True, inplace=True)
    
    return data

def _get_data(args):
    return get_data(args[0], features=args[1])

def get_data_multi(info, features=['r', 'g', 'b', 'h', 's', 'v', 'fft_var', 'fft_max'], processes=24):
    pool = multiprocessing.Pool(processes=processes)
    iterable = [(row.to_frame().T, features) for i, row in info.iterrows()]
    data = pool.map(_get_data, iterable)
    pool.close()
    data = pd.concat(data, axis=0)

    # shuffle data to destroy serial correlations
    index = data.index.tolist()
    np.random.shuffle(index)
    data = data.ix[index]
    data.reset_index(drop=True, inplace=True)
    
    return data

In [3]:
source = '/Users/alexbraun/Documents/data/texture_classifier/data/texture'
spec = [
    'material',
    'image_id',
    'common_name',
    'origin',
    'desc',
    'extension'
]

wood_mask = [
    'moabi',
    'sapele',
#     'olive-ash',
    'european-ash',
    'kingwood',
    'european-lime',
    'african-mahogany',
    'olive'

#     'macassar-ebony',
#     'peruvian-walnut'
#     'bog-oak',
#     'goncalo-alves',
#     'merbau'
]

In [5]:
# random scan
# min_res = 10
# max_res = 100
# params = {
#             'scan_method':      'random_scan',
#             'min_resolution':   (min_res, min_res),
#             'max_resolution':   (max_res, max_res),
#             'patches':          100,
#             'patch_resolution': (min_res, min_res),
#             'normalize':        True,
#             'bins':             256
# #             'rotation':         'random'
# }

# grid scan
min_res = 400
max_res = 400
params = {
            'scan_method':      'grid_scan',
            'min_resolution':   (min_res, min_res),
            'max_resolution':   (max_res, max_res),
            'resolutions':      1,
            'spacing':          'even',
#             'patch_resolution': (min_res, min_res),
            'normalize':        True,
            'bins':             256
}

info = get_info(source, spec)
info = info[info.common_name.apply(lambda x: x in wood_mask)]

# dataframes won't allow direct assignment of dicts
info['params'] = None
info.params = info.params.apply(lambda x: params)
    
# train, test = info_split(info)

train = info[info.desc.apply(lambda x: '_a_' in x)]
test = info[info.origin != 'arroway-textures']

In [None]:
# %%snakeviz

def get_all_data(train, test, hdf_path=None):
    hdf = {}
    if hdf_path:
        hdf = HDFStore(hdf_path)

    train = get_data(train)
    train_x, valid_x, train_y, valid_y = train_test_split(train.drop('y', axis=1), train.y, test_size=0.2)
    hdf['train_x'] = train_x
    hdf['valid_x'] = valid_x
    hdf['train_y'] = train_y
    hdf['valid_y'] = valid_y

    test = get_data(test)
    test_x = test.drop('y', axis=1)
    test_y = test.y

    hdf['test_x'] = test_x
    hdf['test_y'] = test_y

    if write_hdf:
        hdf.close()
    
    return train_x, valid_x, train_y, valid_y, test_x, test_y

version = '1'.zfill(3)
hdf_path = '/Users/alexbraun/Documents/data/texture_classifier/data/hdf/data'
hdf_path += '.' + version + '.hdf'

# %time
train_x, valid_x, train_y, valid_y, test_x, test_y = get_all_data(train, test, hdf_path)

In [8]:
train = get_data_multi(train, features=['fft_var', 'fft_max'])
train_x, valid_x, train_y, valid_y = train_test_split(train.drop('y', axis=1), train.y, test_size=0.2)

In [9]:
clf = SVC()
params = {
    'C':            [5], #np.arange(0.1, 1, 0.1),
    'kernel':       ['linear'],#, 'rbf'],
#     'degree':       [3],
#     'gamma':        [0.0],
#     'coef0':        [0.0],
#     'shrinking':    [True],
#     'probability':  [False],
#     'tol':          [0.001],
#     'cache_size':   [200],
#     'class_weight': [None],
#     'verbose':      [False],
#     'max_iter':     [1],
#     'random_state': [None]
}

grid = GridSearchCV(clf, params, cv=5)
%time grid.fit(train_x, train_y)
pred = grid.best_estimator_.predict(valid_x)
valid_report = get_report(valid_y, pred)
# pred = grid.best_estimator_.predict(test_x)
# print(grid.best_params_)
# print(grid.best_estimator_.score(test_x, test_y))
# report = get_report(test_y, pred)
# report
valid_report

CPU times: user 35.8 s, sys: 864 ms, total: 36.7 s
Wall time: 36.7 s


  'precision', 'predicted', average, warn_for)


Unnamed: 0_level_0,precision,recall,f1-score,support
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
african-mahogany,0.15,0.1,0.12,416
european-ash,0.07,0.06,0.07,428
european-lime,0.0,0.0,0.0,404
kingwood,0.0,0.0,0.0,408
moabi,0.0,0.0,0.0,415
olive,0.2,0.9,0.33,403
sapele,0.24,0.29,0.26,414
total,0.09,0.19,0.11,2888


In [10]:
d = DataFrame([valid_y, pred]).T
d.columns = ['ytrue', 'yhat']
lut = {k:i for i, k in enumerate(wood_mask)}
ilut = {v:k for k, v in lut.iteritems()}
d.yhat = d.yhat.apply(lambda x: lut[x])
d = d.groupby('ytrue').agg(lambda x: x.mode()).yhat.apply(lambda x: ilut[x])
d = DataFrame(d)
print(d[d.index == d.yhat].shape[0] / float(d.shape[0]))
d

0.142857142857


Unnamed: 0_level_0,yhat
ytrue,Unnamed: 1_level_1
african-mahogany,olive
european-ash,olive
european-lime,olive
kingwood,olive
moabi,olive
olive,olive
sapele,european-ash
