This is an example of feature extraction-model building flow

# Build SVM with Color and Texture Features

In [1]:
import os
import re
import time
import random
from glob import glob
import itertools
import pickle
import pandas as pd

import numpy as np
import multiprocessing

import skimage
from skimage import io

from sklearn import cross_validation
from sklearn import svm
from sklearn import preprocessing
from sklearn.linear_model.logistic import LogisticRegression

In [4]:
def shuffle_images(dir):
    """ Given a directory, returns a shuffled list files
    
    """
    random.seed(1)
    image_filenames = glob('{}/*.jpg'.format(dir))
    image_filenames.sort()
    random.shuffle(image_filenames)
    return image_filenames

# Data Preparation 

In [174]:
# change here the path to your data
file_list = shuffle_images("/train")
print len(file_list)
pickle.dump(file_list, open("file_list.pkl","wb"))

23484


In [181]:
train_pic_biz = pd.read_csv('/data/train_photo_to_biz_ids.csv')
train_pic_biz.ix[:10]

Unnamed: 0,photo_id,business_id
0,204149,3034
1,52779,2805
2,278973,485
3,195284,485
4,19992,485
5,80748,485
6,444996,1783
7,200285,35
8,90572,35
9,27565,1313


In [182]:
train_biz_label = pd.read_csv('/data/train.csv')
train_biz_label = train_biz_label.dropna()
train_biz_label[:10]

Unnamed: 0,business_id,labels
0,1000,1 2 3 4 5 6 7
1,1001,0 1 6 8
2,100,1 2 4 5 6 7
3,1006,1 2 4 5 6
4,1010,0 6 8
5,101,1 2 3 4 5 6
6,1011,2 3 5 6
7,1012,1 2 3 5 6
8,1014,1 2 4 5 6
9,1015,1 5 6 7


In [186]:
file_list[0].split("/")[-1]
new_list = [ int(x.split("/")[-1].strip('.jpg')) for x in file_list]
photo_list = pd.DataFrame(new_list, columns=["photo_id"])  
print photo_list[0:5]

   photo_id
0    107694
1    100025
2    160315
3    312701
4    139182


In [187]:
pic_biz = pd.merge(photo_list, train_pic_biz, on='photo_id', how='left')
pic_label = pd.merge(pic_biz, train_biz_label, on='business_id', how='left')

In [191]:
print pic_label[:5]

   photo_id  business_id       labels
0    107694         2127            8
1    100025         1504  1 2 3 4 5 6
2    160315         1114  1 2 4 5 6 7
3    312701           75          0 8
4    139182         3065  1 2 4 5 6 7


In [224]:
def get_label(labels, level):
    y_label_1 = [0]*len(labels)
    for i in range(len(labels)):
        try:
            y_label_1[i] = int(level in labels[i])
        except:
            pass
    return y_label_1
####only consider label 1 : good_for_dinner
y_label_1 = get_label(pic_label['labels'].tolist(), '1')
print len(y_label_1)

23484


In [None]:
print y_label_1[0:300]

In [22]:
from PIL import Image

In [35]:
from skimage.transform import resize

In [40]:
def file_to_rgb(filename):
    """ Returns an image in rgb format
  
      a gray scale image will be converted, a rgb image will be left untouched
    """
    img = io.imread(filename)
    img = resize(img, (250, 250))
    if (img.ndim == 2):
        return skimage.color.gray2rgb(img)
    return img

In [57]:
# testing file_to_rgb
filename = file_list[0]
file_rgb = file_to_rgb(filename)
#print file_rgb
img_hsv = skimage.color.rgb2hsv(file_rgb)

[[[ 0.09799714  0.56553398  0.20196078]]]


# Color Feature Extraction 

In [155]:
def hsv_to_feature_new(hsv, N, C_h, C_s, C_v):
    """ Takes an hsv picture and returns a feature vector.
  
    The vector is built as described in the paper 
    'Machine Learning Attacks Against the Asirra CAPTCHA'
    """  
    feature_vec = np.zeros((N * N, C_h * C_s * C_v), dtype=int)
    cell_size = 250/N
    h_range = np.arange(0.0,1.0,1.0/C_h)
    h_range = np.append(h_range,1.0)
    print h_range
    s_range = np.arange(0.0,1.0,1.0/C_s)
    s_range = np.append(s_range,1.0)
    v_range = np.arange(0.0,1.0,1.0/C_v)
    v_range = np.append(v_range,1.0)
    index_n = 0
    for i in range(N):
        for j in range(N):
            cell = hsv[i*cell_size:i*cell_size+cell_size,j*cell_size:j*cell_size+cell_size,:]
            res_ij = np.zeros(C_h * C_s * C_v, dtype=int)
            index = 0
            for h in range(C_h):
                h_cell = np.logical_and(cell[:,:,0]>=h_range[h],cell[:,:,0]<h_range[h+1])
                for s in range(C_s): 
                    s_cell = np.logical_and(cell[:,:,1]>=s_range[s],cell[:,:,1]<s_range[s+1])
                    for v in range(C_v):
                        v_cell = np.logical_and(cell[:,:,2]>=v_range[v],cell[:,:,2]<v_range[v+1])
                        res_ij[index] = np.logical_and(np.logical_and(h_cell,s_cell),v_cell).any()
                        index+=1
            print res_ij
            # here transform in h s v
            feature_vec[index_n, : ] = res_ij
            index_n+=1
    return feature_vec.flatten()

In [234]:
def build_color_feature_vector((filename, N, C_h, C_s, C_v)):
    """ Builds color feature vector 
  
    Takes a jpeg file and the parameters of the feature vector
    """
    rgb_img = file_to_rgb(filename)
    assert (rgb_img.shape[2] == 3)
    hsv_img = skimage.color.rgb2hsv(rgb_img)
    return hsv_to_feature(hsv_img, N, C_h, C_s, C_v)
    
def run_all_images(file_list, N, C_h, C_s, C_v):
    """ Builds the feature matrix of the jpegs in file list
    
    return featurematrix where the i-th row corresponds to the feature
    in the i-th image of the file list"
    """
    F_size = N * N * (C_h * C_s * C_v)
    X = [(f, N, C_h, C_s, C_v) for f in file_list]
    feature_arr = np.zeros((len(file_list), F_size), dtype=int)
    for i in range(len(X)):
        feature_arr[i] = build_color_feature_vector(X[i])
    return feature_arr

In [243]:
%time fea = build_color_feature_vector((file_list[0], 1, 10, 10, 10))

CPU times: user 277 ms, sys: 4.88 ms, total: 282 ms
Wall time: 283 ms


In [240]:
def build_color_feature_matrices_or_load(file_list):
    F1 = run_all_images(file_list,1,10,10,10)
    F2 = run_all_images(file_list,3,10,8,8)
    F3 = run_all_images(file_list,5,10,6,6)
    return F1, F2, F3

In [242]:
file_list = pickle.load(open("file_list.pkl","rb"))
%time F1,F2,F3 =build_color_feature_matrices_or_load(file_list[:300])
# np.save("F1",F1) 
# np.save("F2",F2)
# np.save("F3",F3)

CPU times: user 4min 23s, sys: 5.53 s, total: 4min 28s
Wall time: 4min 30s


In [244]:
np.save("F1",F1) 
np.save("F2",F2)
np.save("F3",F3)

# Color Modeling 

In [285]:
def classify_color_feature(F,y):
    start = time.time()
    clf = svm.SVC(kernel='poly',gamma=0.001)
    scores = cross_validation.cross_val_score(clf, F, y, cv=5) 
    f1score = cross_validation.cross_val_score(clf, F, y, cv=5,scoring='f1_weighted')
    time_diff = time.time() - start 
    print "Accuracy: %.1f F1 Score %.1f "   % (np.mean(scores)*100,np.mean(f1score) *100)

In [286]:
F1=np.load("F1.npy")
F2=np.load("F2.npy")
F3=np.load("F3.npy")

union = np.hstack((F1,F2,F3))

classify_color_feature(F1[:300],y_label_1[:300])
classify_color_feature(F2[:300],y_label_1[:300])
classify_color_feature(F3[:300],y_label_1[:300])
classify_color_feature(union[:300],y_label_1[:300])
#print y_label_1[:300]

Accuracy: 61.3 F1 Score 46.6 
Accuracy: 61.7 F1 Score 55.6 
Accuracy: 58.7 F1 Score 52.8 
Accuracy: 58.0 F1 Score 53.6 


In [4]:
np.save('temp',np.zeros((10,5,5,3)))

In [5]:
np.load('temp.npy')

array([[[[ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.]],

        [[ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.]],

        [[ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.]],

        [[ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.]],

        [[ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.]]],


       [[[ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.]],

        [[ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.]],

        [[ 0.,  0.,  0.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0

# Building Texture

In [263]:
def texture_distance(T1,T2):
    y=np.linalg.norm(T1-T2,axis=2)
    return np.mean(y)

def build_tiles(number_of_tiles,files,threshold):
    """ Returns a number_of_tiles*5*5 - Matrix, where every 5*5_texture is at least threshold from each other """
    current=0
    textures = np.zeros((number_of_tiles,5,5,3))
    while(current<number_of_tiles):
        file_index = random.randint(0,len(files)-1)
        i = random.randint(0,49)
        j = random.randint(0,49)
        bild = io.imread(files[file_index])
        bild = resize(bild, (250, 250))
        if (bild.ndim==2):
            rgb_bild= skimage.color.gray2rgb(bild)
        else:
            rgb_bild = bild
            cell = rgb_bild[i*5:i*5+5,j*5:j*5+5,:] 
            close = False
        for i in range(current):
            T = textures[i,:,:] 
            if(texture_texture_distance(cell,T)<threshold):
                close=True
                break
        if(not close):
            textures[current,:,:]=cell
            current+=1
    return textures
    
def build_textures_or_load(number_of_tiles,files,threshold):
    try:
        textures = np.load("textures.npy")
    except IOError:
        textures = build_tiles(number_of_tiles,files,threshold)
    return textures
    

def texture_image_distance_simple(rgb,T):
    bigtile = np.tile(T,(50,50,1))
    distances = np.linalg.norm(rgb-bigtile,axis=2)
    splitted = [np.hsplit(x,50) for x in np.vsplit(distances,50)]
    merged = list(itertools.chain.from_iterable(splitted)) # flatten the list
    maxvalues=[np.max(x) for x in merged]
    return min(maxvalues)

def build_texture_feature_vector(pars):
    filename,textures=pars
    bild = io.imread(filename)
    if (bild.ndim==2):
        rgb= skimage.color.gray2rgb(bild)
    else:
        rgb = bild
    res=[]
    for t in textures:
        res.append(texture_image_distance_simple(rgb,t))
    return res

def build_texture_feature_matrix(file_list,texture):
    pool = multiprocessing.Pool()
    res = pool.map(build_texture_feature_vector,[(f,texture) for f in file_list])
    return np.array(res)

def build_texture_feature_matrix_or_load(file_list,textures):
    try:
        G = np.load("G.npy")
    except IOError:
        print "Building matrix"
        G = build_texture_feature_matrix(file_list,textures)
    return G

In [265]:
#file_list = pickle.load(open("file_list.pkl","rb"))

%time textures = build_textures_or_load(500,file_list,40)
#print textures
np.save("textures",textures)

Process PoolWorker-158:
Process PoolWorker-157:
Process PoolWorker-154:
Process PoolWorker-180:
Process PoolWorker-179:
Process PoolWorker-177:
Process PoolWorker-175:
Process PoolWorker-166:
Process PoolWorker-178:
Process PoolWorker-174:
Process PoolWorker-168:
Process PoolWorker-172:
Process PoolWorker-169:
Process PoolWorker-170:
Process PoolWorker-164:
Process PoolWorker-162:
Process PoolWorker-156:
Process PoolWorker-163:
Process PoolWorker-161:
Process PoolWorker-173:
Process PoolWorker-159:
Process PoolWorker-165:
Process PoolWorker-176:
Process PoolWorker-171:
Process PoolWorker-160:
Process PoolWorker-153:
Process PoolWorker-155:
Process PoolWorker-167:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (mo

KeyboardInterrupt: 

  File "/Users/s92wang/anaconda/lib/python2.7/multiprocessing/queues.py", line 376, in get
  File "/Users/s92wang/anaconda/lib/python2.7/multiprocessing/queues.py", line 376, in get
  File "/Users/s92wang/anaconda/lib/python2.7/multiprocessing/queues.py", line 376, in get
  File "/Users/s92wang/anaconda/lib/python2.7/multiprocessing/queues.py", line 376, in get
  File "/Users/s92wang/anaconda/lib/python2.7/multiprocessing/queues.py", line 376, in get
  File "/Users/s92wang/anaconda/lib/python2.7/multiprocessing/queues.py", line 376, in get
  File "/Users/s92wang/anaconda/lib/python2.7/multiprocessing/queues.py", line 376, in get
  File "/Users/s92wang/anaconda/lib/python2.7/multiprocessing/queues.py", line 376, in get
  File "/Users/s92wang/anaconda/lib/python2.7/multiprocessing/queues.py", line 376, in get
  File "/Users/s92wang/anaconda/lib/python2.7/multiprocessing/queues.py", line 376, in get
  File "/Users/s92wang/anaconda/lib/python2.7/multiprocessing/queues.py", line 378, in get

In [None]:
%time G=build_texture_feature_matrix_or_load(file_list[:300],textures)
np.save("G",G)

# Texture Modeling 

In [None]:
def classify_texture_feature(G,y):
    start = time.time()
    clf = svm.SVC(kernel='rbf',gamma=0.01)
    scores = cross_validation.cross_val_score(clf, G, y, cv=5) 
    time_diff = time.time() - start 
    f1score = cross_validation.cross_val_score(clf, F, y, cv=5,scoring='f1_weighted')
    print "Accuracy: %.1f F1 Score %.1f " % (np.mean(scores)*100,np.mean(f1score) *100)

In [None]:
G=np.load("G.npy")
classify_texture_feature(G[:300],y_label_1[:300])

# Combined Classifiers

In [None]:
class Combined:
    def __init__(self,clf1,clf2):
        self.clf1=clf1
        self.clf2=clf2
    def predict(self,F,G):
        y1=self.clf1.predict_proba(F)
        y2=self.clf2.predict_proba(G)
        y_out= 2*y1/3+y2/3
        m=np.argmax(y_out, axis=1)
        m[m==1]=1.0
        m[m==0]=-1.0
        return m

In [None]:
clf_color = svm.SVC(kernel='rbf',gamma=0.001,probability=True)
clf_color.fit(union[:300],y_label_1[:300])

In [None]:
clf_texture = svm.SVC(kernel='rbf',gamma=0.001,probability=True)
clf_texture.fit(G[:300],y_label_1[:300])

In [None]:
combined=Combined(clf_color,clf_texture)
print "Accuracy: ", np.mean(combined.predict(union[:300],G[:300])==y[:300])