In [17]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os
import sys

import csv
import datetime
import itertools
import numpy as np
import pandas as pd
import random
import re
from collections import defaultdict

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context(rc={
       "figure.figsize": (16, 10),
       "axes.titlesize": 14})

import sklearn

from IPython.display import Image, display
from IPython.core.display import HTML
# Widen the notebook container to the full browser width.
# A bare HTML(...) expression is only rendered when it is the LAST expression
# of a cell; here it is followed by more statements, so it must be displayed
# explicitly or the style is silently discarded.
display(HTML("<style>.container { width:100% !important; }</style>"))

from os.path import expanduser
sys.path.insert(1, '{}/datsci'.format(expanduser('~')))
from datsci import eda
from datsci import kaggle as kg

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from PIL import Image as PImage

# Create images of various colors

In [3]:
H, W = 10, 10
# PIL takes the size as (width, height); the output is unchanged here because
# the image is square, but the order matters as soon as H != W.
PImage.new("RGB", (W, H), "#FF0000")

<PIL.Image.Image image mode=RGB size=10x10 at 0x107880150>

In [11]:
def create_image_dir(dirname):
    '''Create the directory ``Images/<dirname>`` (relative to the current
    working directory), including missing parents.

    Calling this for a directory that already exists is a no-op.

    Parameters
    ----------
    dirname : str
        Name of the sub-directory to create under ``Images``.

    Raises
    ------
    OSError
        If the directory cannot be created and does not already exist as a
        directory (e.g. permission problem, or a regular file is in the way).
    '''
    fullpath = os.path.join("Images", dirname)
    try:
        # Create first and inspect the failure afterwards: this avoids the
        # check-then-create race of `if not exists: makedirs`.
        os.makedirs(fullpath)
    except OSError:
        if not os.path.isdir(fullpath):
            raise

def get_hex_generator():
    '''Return a fresh iterator over the 16 lowercase hexadecimal digits,
    in order: '0'-'9' followed by 'a'-'f'.

    Each call returns a new, independent iterator.
    '''
    return iter('0123456789abcdef')

def generate_hex_colors(skip=0):
    '''Lazily generate color strings from '#000000' to '#ffffff' in
    ascending order.

    Parameters
    ----------
    skip : int, optional
        If non-zero, yield only every `skip`-th color (those whose numeric
        value is a multiple of `skip`), always starting with '#000000'.
        A falsy value (0 or None) yields all 16**6 colors. The sign of
        `skip` is ignored.

    Yields
    ------
    str
        '#' followed by six lowercase hex digits.
    '''
    num_colors = 16 ** 6
    step = abs(skip) if skip else 1
    # Step straight to the wanted values rather than enumerating all
    # 16.7M digit combinations and discarding most of them. The manual
    # loop stays lazy on both Python 2 and Python 3.
    value = 0
    while value < num_colors:
        yield '#{:06x}'.format(value)
        value += step

def create_solid_image(height, width, color, outfile=None):
    '''Create a solid-color RGB image and optionally save it.

    Parameters
    ----------
    height, width : int
        Image dimensions in pixels.
    color : str
        Fill color, passed through to PIL (e.g. "#00ff00").
    outfile : str or file object, optional
        If given, the image is saved there using the "web_high" quality
        preset; the file format is left for PIL to determine from `outfile`.

    Returns
    -------
    PIL.Image.Image
        The created image.
    '''
    # PIL expects the size as (width, height). Passing (height, width)
    # produced images transposed relative to the h.../w... values encoded
    # in the filenames whenever height != width.
    im = PImage.new("RGB", (width, height), color)
    if outfile is not None:
        im.save(outfile, quality="web_high")
    return im

In [5]:
# Create a lot of images of different color

dirname = 'solid-colors'
create_image_dir(dirname)
filenames = []

# One 100x100 image for every 3000th color between #000000 and #ffffff
h, w = 100, 100
for c in generate_hex_colors(skip=3000):
    fname = "Images/{}/h{}w{}_{}.jpg".format(dirname, h, w, c.replace('#', ''))
    create_solid_image(h, w, c, outfile=fname)
    filenames.append(fname)

# Write out the filenames, one per line. The listing directory is created on
# demand, and the file is opened in text mode because the lines are str
# (writing str to a 'wb' handle raises TypeError on Python 3).
if not os.path.isdir('Images_filenames'):
    os.makedirs('Images_filenames')
with open('Images_filenames/{}.txt'.format(dirname), 'w') as f:
    f.writelines(name + '\n' for name in filenames)

In [6]:
# Create a lot of square images of different size for white color

dirname = 'solid-whites'
create_image_dir(dirname)
filenames = []

c = "#ffffff"
# Side lengths 101..2499 pixels (`range` works on both Python 2 and 3)
for s in range(101, 2500):
    fname = "Images/{}/h{}w{}_{}.jpg".format(dirname, s, s, c.replace('#', ''))
    create_solid_image(s, s, c, outfile=fname)
    filenames.append(fname)

# Write out the filenames, one per line. The listing directory is created on
# demand, and the file is opened in text mode because the lines are str
# (writing str to a 'wb' handle raises TypeError on Python 3).
if not os.path.isdir('Images_filenames'):
    os.makedirs('Images_filenames')
with open('Images_filenames/{}.txt'.format(dirname), 'w') as f:
    f.writelines(name + '\n' for name in filenames)

In [7]:
# Black square images of various sizes

dirname = 'solid-blacks'
create_image_dir(dirname)
filenames = []

c = "#000000"
# Side lengths 101..2499 pixels (`range` works on both Python 2 and 3)
for s in range(101, 2500):
    fname = "Images/{}/h{}w{}_{}.jpg".format(dirname, s, s, c.replace('#', ''))
    create_solid_image(s, s, c, outfile=fname)
    filenames.append(fname)

# Write out the filenames, one per line. The listing directory is created on
# demand, and the file is opened in text mode because the lines are str
# (writing str to a 'wb' handle raises TypeError on Python 3).
if not os.path.isdir('Images_filenames'):
    os.makedirs('Images_filenames')
with open('Images_filenames/{}.txt'.format(dirname), 'w') as f:
    f.writelines(name + '\n' for name in filenames)

In [8]:
# Green square images of various sizes

dirname = 'solid-greens'
create_image_dir(dirname)
filenames = []

c = "#00ff00"
# Side lengths 101..2499 pixels (`range` works on both Python 2 and 3)
for s in range(101, 2500):
    fname = "Images/{}/h{}w{}_{}.jpg".format(dirname, s, s, c.replace('#', ''))
    create_solid_image(s, s, c, outfile=fname)
    filenames.append(fname)

# Write out the filenames, one per line. The listing directory is created on
# demand, and the file is opened in text mode because the lines are str
# (writing str to a 'wb' handle raises TypeError on Python 3).
if not os.path.isdir('Images_filenames'):
    os.makedirs('Images_filenames')
with open('Images_filenames/{}.txt'.format(dirname), 'w') as f:
    f.writelines(name + '\n' for name in filenames)

In [None]:
# Different shapes, not square

dirname = 'solidshapes-green'
create_image_dir(dirname)
filenames = []

c = '#00ff00'

# Varying width (101..2299) at a fixed height of 100
h = 100
for w in range(101, 2300):
    fname = "Images/{}/h{}w{}_{}.jpg".format(dirname, h, w, c.replace('#', ''))
    create_solid_image(h, w, c, outfile=fname)
    filenames.append(fname)

# Varying height (101..2299) at a fixed width of 100
w = 100
for h in range(101, 2300):
    fname = "Images/{}/h{}w{}_{}.jpg".format(dirname, h, w, c.replace('#', ''))
    create_solid_image(h, w, c, outfile=fname)
    filenames.append(fname)

# Write out the filenames, one per line. The listing directory is created on
# demand, and the file is opened in text mode because the lines are str
# (writing str to a 'wb' handle raises TypeError on Python 3).
if not os.path.isdir('Images_filenames'):
    os.makedirs('Images_filenames')
with open('Images_filenames/{}.txt'.format(dirname), 'w') as f:
    f.writelines(name + '\n' for name in filenames)

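## Generate features for all dirname groups

Run the command below once per `dirname` group created above; `solid-blacks` is shown as the example.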

```bash
dirname=solid-blacks
python image_deep_features.py --image_files_listfile Images_filenames/$dirname.txt --output_deep_features Images_features/$dirname.df.txt
```

In [52]:
# PCA on marginal data

## Format data
# Image paths look like "Images/<dirname>/h<height>w<width>_<rrggbb>.jpg";
# each pattern captures one of those three fields.
hpat = re.compile(r'.*h(\d+)w\d+.*')
wpat = re.compile(r'.*h\d+w(\d+).*')
cpat = re.compile(r'.*h\d+w\d+_([0-9a-f]{6})\.jpg')

def load_data(dirname):
    '''Load the deep features of one image group together with the image
    metadata encoded in the filenames.

    Reads ``Images_filenames/<dirname>.txt`` (one image path per line) and
    ``Images_features/<dirname>.df.txt`` (header-less CSV, one row per image,
    in the same order as the filename listing).

    Parameters
    ----------
    dirname : str
        Name of the image group, e.g. 'solid-colors'.

    Returns
    -------
    df : pandas.DataFrame
        Columns ``feature0..feature<N-1>`` followed by ``filepath``, ``h``
        (int height), ``w`` (int width) and ``c`` (6-digit hex color string).
    feature_cols : list of str
        Names of the feature columns only.

    Raises
    ------
    ValueError
        If the number of listed filenames differs from the number of
        feature rows.
    '''
    # Plain 'r' instead of 'rU': the 'U' flag was removed in Python 3.11, and
    # strip() already discards any stray '\r'. Blank lines are ignored so a
    # trailing newline cannot shift the filename/feature alignment.
    with open('Images_filenames/{}.txt'.format(dirname), 'r') as f:
        filenames = [line.strip() for line in f if line.strip()]
    df = pd.read_csv('Images_features/{}.df.txt'.format(dirname), header=None)
    if len(filenames) != len(df):
        raise ValueError(
            "{!r}: {} filenames listed but {} feature rows found".format(
                dirname, len(filenames), len(df)))
    feature_cols = ['feature{}'.format(i) for i in range(df.shape[1])]
    df.columns = feature_cols
    df['filepath'] = filenames
    df['h'] = df['filepath'].apply(lambda _fp: int(hpat.match(_fp).group(1)))
    df['w'] = df['filepath'].apply(lambda _fp: int(wpat.match(_fp).group(1)))
    df['c'] = df['filepath'].apply(lambda _fp: cpat.match(_fp).group(1))
    return df, feature_cols

# Images of various colors (fixed 100x100 size)

In [62]:
df, feature_cols = load_data('solid-colors')

In [63]:
df.shape

(5593, 2052)

In [64]:
df[df.columns[-5:]].head()

   feature2047                                 filepath    h    w       c
0     0.144750  Images/solid-colors/h100w100_000000.jpg  100  100  000000
1     0.176099  Images/solid-colors/h100w100_000bb8.jpg  100  100  000bb8
2     0.225654  Images/solid-colors/h100w100_001770.jpg  100  100  001770
3     0.255043  Images/solid-colors/h100w100_002328.jpg  100  100  002328
4     0.213345  Images/solid-colors/h100w100_002ee0.jpg  100  100  002ee0

In [68]:
# Fit a 5-component PCA directly on the feature columns as loaded
# (no standardization applied at this point).
from sklearn.decomposition import PCA
pca = PCA(n_components=5, whiten=True).fit(df[feature_cols])

In [66]:
pca.components_

array([[-0.01183274,  0.00106169, -0.00209195, ...,  0.01517026,
        -0.0174829 ,  0.01523232],
       [-0.0508863 ,  0.0004355 ,  0.01992389, ...,  0.00245636,
         0.00311535,  0.01819104],
       [-0.05274458,  0.00253087, -0.00456185, ..., -0.00859057,
         0.03545388, -0.00853666],
       [ 0.01944707,  0.00322937, -0.01220475, ..., -0.00927544,
         0.01323637, -0.00601408],
       [-0.0127572 ,  0.00628936,  0.00650778, ..., -0.03395036,
         0.00917629, -0.04412621]])

In [67]:
pca.explained_variance_ratio_

array([ 0.27514579,  0.18777733,  0.11312552,  0.08802827,  0.07529514])

In [70]:
# `import sklearn` alone does not load the `preprocessing` submodule; import
# it explicitly instead of relying on another import having pulled it in.
import sklearn.preprocessing

# Standardize every feature column (zero mean, unit variance) before PCA.
df_feature_cols_scaled = sklearn.preprocessing.scale(df[feature_cols])

In [74]:
# Refit the PCA on the standardized features, keeping 10 components.
pca_scaled = PCA(n_components=10, whiten=True).fit(df_feature_cols_scaled)

In [72]:
pca_scaled.components_

array([[ 0.03168516, -0.00685311, -0.03700053, ..., -0.01432242,
         0.01055312, -0.02365171],
       [-0.00579936, -0.00599648,  0.02101187, ..., -0.02726263,
         0.02241021, -0.01256568],
       [ 0.04800691, -0.02257235, -0.00638297, ...,  0.02289722,
        -0.02459563,  0.01583467],
       [-0.01605782, -0.03980402,  0.01413739, ...,  0.04008179,
         0.00020987,  0.03256104],
       [ 0.01000163,  0.0135203 , -0.02931602, ...,  0.0044705 ,
         0.03863625,  0.00511308]])

In [73]:
pca_scaled.explained_variance_ratio_

array([ 0.12286456,  0.12113521,  0.09209193,  0.05913861,  0.04981343])

In [75]:
pca_scaled.explained_variance_ratio_

array([ 0.12286456,  0.12113521,  0.09209193,  0.05913861,  0.04981343,
        0.04216726,  0.03588744,  0.02759877,  0.02299013,  0.02116537])

In [None]:
# Wrap the fitted component matrix in a DataFrame so it can be sliced and
# displayed as a table (one row per principal component).
foo =  pd.DataFrame( pca_scaled.components_)

In [81]:
foo[foo.columns[:10]]

          0         1         2         3         4         5         6  \
0  0.031685 -0.006853 -0.037001  0.006251 -0.008930  0.045349  0.013358   
1 -0.005799 -0.005996  0.021012 -0.041690 -0.027340  0.024839  0.026681   
2  0.048007 -0.022572 -0.006383 -0.012504 -0.003535  0.032305  0.046001   
3 -0.016058 -0.039804  0.014137 -0.020117 -0.036291  0.008196  0.004093   
4  0.010002  0.013520 -0.029316  0.001803 -0.006594 -0.015116  0.005406   
5 -0.007558 -0.022196  0.025172 -0.025810 -0.023321  0.004696  0.008362   
6  0.022458  0.027974  0.029940  0.028660  0.032905 -0.011844 -0.012510   
7 -0.014392  0.005965 -0.019621  0.013631  0.023321  0.013258  0.033376   
8  0.018034  0.005402 -0.010094  0.009525  0.013187 -0.010086 -0.004841   
9 -0.008522 -0.024218 -0.007096 -0.033878 -0.019306 -0.008426  0.001750   

          7         8         9  
0  0.012847  0.020677 -0.013548  
1  0.036480 -0.015606 -0.028998  
2 -0.032299 -0.041258  0.009015  
3 -0.030876 -0.002148  0.005867  
4 -0

In [84]:
pca_scaled.explained_variance_ratio_.sum()

0.59485269674660457

# White images of various square sizes

In [124]:
df, feature_cols = load_data('solid-whites')
df.shape

(2399, 2052)

In [125]:
df[feature_cols].drop_duplicates().shape

(4, 2048)

In [91]:
from sklearn.decomposition import PCA
# Fit on the feature columns as loaded; standardization is deliberately not
# applied before this fit.
pca = PCA(n_components=5, whiten=True).fit(df[feature_cols])

In [96]:
sum(pca.explained_variance_ratio_[:2])

0.93411701149264548

In [106]:
def biplot(df):
    '''Draw a PCA biplot of `df` on its first two principal components.

    The rows of `df` are projected onto PC1/PC2 and drawn as a scatter plot;
    every column of `df` is drawn as an orange arrow from the origin to its
    (PC1, PC2) loading and annotated with the column name.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data, one row per sample and one column per feature.

    Returns
    -------
    matplotlib axes
        The axes holding the plot, so callers can adjust limits etc.
    '''
    # Fit on 2 components
    pca = PCA(n_components=2, whiten=True).fit(df)

    # Plot transformed/projected data
    scores = pd.DataFrame(pca.transform(df), columns=['PC1', 'PC2'])
    ax = scores.plot(kind='scatter', x='PC1', y='PC2', figsize=(10, 8), s=0.8)

    # Plot one arrow and label per input feature
    loadings = zip(df.columns, pca.components_[0], pca.components_[1])
    for name, pc1, pc2 in loadings:
        ax.arrow(0, 0, pc1, pc2, width=0.001, fc='orange', ec='orange')
        ax.annotate(name, (pc1, pc2), size=12)
    return ax

In [107]:
ax = biplot(df[feature_cols])
ax.set_xlim([-1.0, 1.0])
ax.set_ylim([-1.0, 1.0])

<matplotlib.figure.Figure at 0x1170b8450>

(-1.0, 1.0)

In [108]:
# `import sklearn` alone does not load the `preprocessing` submodule; import
# it explicitly instead of relying on another import having pulled it in.
import sklearn.preprocessing

# Standardize every feature column (zero mean, unit variance).
df_feature_cols_scaled = sklearn.preprocessing.scale(df[feature_cols])



# Black images of various square sizes

In [126]:
df, feature_cols = load_data('solid-blacks')
df.shape

(2399, 2052)

In [120]:
pca = PCA(n_components=5, whiten=True).fit(df[feature_cols])

In [139]:
pca.explained_variance_ratio_

array([  1.00000000e+00,   3.70681363e-27,   2.72632089e-27,
         2.50115646e-27,   2.12532290e-27])

In [121]:
sum(pca.explained_variance_ratio_[:2])

1.0

In [123]:
df[feature_cols].drop_duplicates().shape

(1, 2048)

In [127]:
ax = biplot(df[feature_cols])
ax.set_xlim([-1.0, 1.0])
ax.set_ylim([-1.0, 1.0])

<matplotlib.figure.Figure at 0x10d0a89d0>

(-1.0, 1.0)

# Green images of various square sizes

In [135]:
df, feature_cols = load_data('solid-greens')
df.shape

(2399, 2052)

In [138]:
df[feature_cols].drop_duplicates().shape

(4, 2048)

In [141]:
ax = biplot(df[feature_cols])
ax.set_xlim([-1.0, 1.0])
ax.set_ylim([-1.0, 1.0])

<matplotlib.figure.Figure at 0x10fdcb210>

(-1.0, 1.0)

In [148]:
# Biplot of the standardized features of the currently loaded image group.
# NOTE(review): this cell previously scaled `foo`, which in top-to-bottom run
# order holds the PCA components of the solid-colors fit rather than features
# of the group loaded in this section. Scaling `df[feature_cols]` is assumed
# to be the intent; confirm.
import sklearn.preprocessing

df_features_scaled = pd.DataFrame(
    sklearn.preprocessing.scale(df[feature_cols]), columns=feature_cols)
ax = biplot(df_features_scaled)
ax.set_xlim([-1.0, 1.0])
ax.set_ylim([-1.0, 1.0])

<matplotlib.figure.Figure at 0x10ded5650>

(-1.0, 1.0)

In [160]:
imagefile = 'Images/n02106662-German_shepherd/n02106662_22245.jpg'
im = PImage.open(imagefile)

In [161]:
im.size

(2272, 1704)

In [None]:
create_image_dir('husky-squaresize')